# Modélisation avec Azure ML service

> https://docs.microsoft.com/en-us/azure/machine-learning/

In [1]:
import azureml.core
from azureml.core import Experiment, Workspace

# Report the installed Azure ML SDK version next to the release that was
# current when this notebook was written. The reference value was previously
# mistyped as "10.0.17"; the SDK release it refers to is 1.0.17.
print("Version Azure ML service : ", azureml.core.VERSION, " (Version actuelle au 04/03/2019 : 1.0.17)")

Version Azure ML service :  1.0.17  (Version actuelle au 04/03/2019 : 10.0.17)


In [2]:
# Load the workspace from the local configuration file, then show a short
# summary (one field per line) so the reader can confirm the right workspace
# is targeted.
ws = Workspace.from_config()
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


Found the config file in: C:\Users\seretkow\notebooks\Labs Azure ML service\aml_config\config.json
Workspace name: MLServiceWorkspace
Azure region: westeurope
Resource group: mlserviceresourcegroup


## Chargement des données

In [3]:
%%time
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

X, y = load_diabetes(return_X_y = True)
columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
data = {
    "train":{"X": X_train, "y": y_train},        
    "test":{"X": X_test, "y": y_test}
}

print ("Data contains", len(data['train']['X']), "training samples and",len(data['test']['X']), "test samples")

Data contains 353 training samples and 89 test samples
Wall time: 1.5 s


## Modélisation

In [4]:
%%time
# Get an experiment object from Azure Machine Learning
experiment = Experiment(workspace=ws, name="ExempleDiabete")

# Create a run object in the experiment
run = experiment.start_logging()
# Log the algorithm parameter alpha to the run
run.log('alpha', 0.03)

# Create, fit, and test the scikit-learn Ridge regression model
regression_model = Ridge(alpha=0.03)
regression_model.fit(data['train']['X'], data['train']['y'])
preds = regression_model.predict(data['test']['X'])

# Output the Mean Squared Error to the notebook and to the run
print('Mean Squared Error is', mean_squared_error(data['test']['y'], preds))
run.log('mse', mean_squared_error(data['test']['y'], preds))

# Save the model to the outputs directory for capture
joblib.dump(value=regression_model, filename='outputs/monmodele.pkl')

# Take a snapshot of the directory containing this notebook
#run.take_snapshot('./')

# Complete the run
run.complete()

Mean Squared Error is 3424.9003158960168
Wall time: 9.94 s


## Visualisation de l'expérimentation sur le portail Azure

In [5]:
# Display the run object; the notebook renders it as a summary table
# (experiment, id, status, links).
run

Experiment,Id,Type,Status,Details Page,Docs Page
ExempleDiabete,7365efec-c485-4f0d-a604-caac764caa68,,Completed,Link to Azure Portal,Link to Documentation


## Recherche du paramètre optimal

In [7]:
import os

import numpy as np
from tqdm import tqdm

model_name = "monmodele.pkl"
# Location of the serialized model, built from model_name so the file name is
# defined in exactly one place.
model_path = 'outputs/' + model_name

# joblib.dump does not create missing directories; ensure the folder exists
# once, before the sweep starts.
os.makedirs('outputs', exist_ok=True)

# 20 candidate values from 0.0 to 0.95 in steps of 0.05
# (np.arange excludes the stop value, so 1.0 itself is not tried)
alphas = np.arange(0.0, 1.0, 0.05)

# try a bunch of alpha values in a Linear Regression (Ridge) model
for alpha in tqdm(alphas):
    # create a bunch of runs, each train a model with a different alpha value;
    # the context manager closes each run when its block exits
    with experiment.start_logging() as run:
        # Use Ridge algorithm to build a regression model
        regression_model = Ridge(alpha=alpha)
        regression_model.fit(X=data["train"]["X"], y=data["train"]["y"])
        preds = regression_model.predict(X=data["test"]["X"])
        mse = mean_squared_error(y_true=data["test"]["y"], y_pred=preds)

        # log alpha and the mean squared error in run history
        run.log(name="alpha", value=alpha)
        run.log(name="mse", value=mse)

        # Save the model to the outputs directory for capture
        joblib.dump(value=regression_model, filename=model_path)
        
        # Capture this notebook with the run
        #run.take_snapshot('./')

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:46<00:00,  2.07s/it]


## Visualisation de l'expérimentation Azure ML service sur le portail Azure

In [8]:
# Display the experiment object; the notebook renders it as a summary table
# (name, workspace, links).
experiment

Name,Workspace,Report Page,Docs Page
ExempleDiabete,MLServiceWorkspace,Link to Azure Portal,Link to Documentation


## Visualisation des résultats

In [9]:
# Index every run of the experiment that logged an 'mse' metric, keyed by run id:
# `runs` keeps the run objects, `run_metrics` keeps their metric dictionaries.
runs, run_metrics = {}, {}
for candidate in tqdm(experiment.get_runs()):
    logged = candidate.get_metrics()
    if 'mse' not in logged:
        continue
    runs[candidate.id] = candidate
    run_metrics[candidate.id] = logged

# The best run is the one with the smallest mean squared error
best_run_id, best_metrics = min(run_metrics.items(), key=lambda entry: entry[1]['mse'])
best_run = runs[best_run_id]
print('Best run is:', best_run_id)
print('Metrics:', best_metrics)

# Mark the winner with a tag so it can be identified later
best_run.tag("Best Run")

88it [00:09,  9.46it/s]


Best run is: d2b3cd32-21a8-478c-a007-861abe3cba32
Metrics: {'alpha': 0.4, 'mse': 3295.741064355809}


In [10]:
# List the files attached to the best run, one per line
for file_name in best_run.get_file_names():
    print(file_name)

# Register the serialized model of the best run with the workspace
model = best_run.register_model(
    model_name='best_model',
    model_path='outputs/monmodele.pkl',
)

outputs/model.pkl
outputs/monmodele.pkl
