# Cloud Workshop Microsoft
## 2. Modélisation avec Azure ML service

> https://docs.microsoft.com/en-us/azure/machine-learning/

In [1]:
import os

import azureml.core
from azureml.core import Experiment, Workspace

# Show which Azure ML SDK release this notebook is running against
sdk_version = azureml.core.VERSION
print("Version Azure ML service : ", sdk_version)

Version Azure ML service :  1.0.21


In [2]:
# Load the Azure ML workspace from the local configuration file
ws = Workspace.from_config()

# Print one identifying property of the workspace per line
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

Found the config file in: /home/nbuser/library/aml_config/config.json
Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code FX488FQWX to authenticate.
Interactive authentication successfully completed.
Workspace name: azuremlservice
Azure region: westeurope
Resource group: azuremlserviceresourcegroup


## Chargement des données

In [3]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

# Load the diabetes dataset as a (features, target) pair
X, y = load_diabetes(return_X_y=True)

# Labels for the ten feature columns (no visible cell reads this list)
columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Group both splits in a single dictionary used by the modelling cells
data = dict(
    train=dict(X=X_train, y=y_train),
    test=dict(X=X_test, y=y_test),
)

n_train = len(data['train']['X'])
n_test = len(data['test']['X'])
print("Data contains", n_train, "training samples and", n_test, "test samples")

Data contains 353 training samples and 89 test samples


## Modélisation

In [7]:
%%time
# Name of the Azure ML service experiment that groups every run of this notebook
experiment = Experiment(workspace=ws, name="workshop-ExempleDiabete")

# Regularisation strength used for this single reference run
alpha = 0.03

# joblib.dump does not create missing directories, so make sure the target exists
os.makedirs('outputs', exist_ok=True)

# Use the run as a context manager (same pattern as the alpha sweep further down)
# so the run is closed when the block exits instead of staying open if a step raises.
with experiment.start_logging() as run:
    # Log the alpha hyperparameter
    run.log('alpha', alpha)

    # Train a Ridge regression and predict on the held-out split
    regression_model = Ridge(alpha=alpha)
    regression_model.fit(data['train']['X'], data['train']['y'])
    preds = regression_model.predict(data['test']['X'])

    # Compute the metric once, then both display and log it
    mse = mean_squared_error(data['test']['y'], preds)
    print('Mean Squared Error is', mse)
    run.log('mse', mse)

    # Save the fitted model into the outputs directory
    joblib.dump(value=regression_model, filename='outputs/modele.pkl')

Mean Squared Error is 3424.900315896017
CPU times: user 247 ms, sys: 227 ms, total: 474 ms
Wall time: 11.7 s


## Visualisation de l'expérimentation sur le portail Azure

In [8]:
# Evaluate the run object as the cell's last expression so the notebook displays its summary
run

Experiment,Id,Type,Status,Details Page,Docs Page
workshop-ExempleDiabete,16d0c6d5-e16f-4730-bbc6-8feae7a63b80,,Completed,Link to Azure Portal,Link to Documentation


## Recherche du paramètre optimal

In [9]:
import numpy as np
from tqdm import tqdm

# File name of the serialized model and the directory it is written to
model_name = "modele.pkl"
output_dir = "outputs"
model_path = os.path.join(output_dir, model_name)

# joblib.dump does not create missing directories, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

# Candidate alpha values: 0.0 up to (but not including) 1.0, in steps of 0.05
alphas = np.arange(0.0, 1.0, 0.05)

# Try each alpha value in a Ridge regression model, one run per value
for alpha in tqdm(alphas):
    # Each iteration opens its own run inside the experiment
    with experiment.start_logging() as run:
        # Fit a Ridge model with the current alpha and score it on the test split
        regression_model = Ridge(alpha=alpha)
        regression_model.fit(X=data["train"]["X"], y=data["train"]["y"])
        preds = regression_model.predict(X=data["test"]["X"])
        mse = mean_squared_error(y_true=data["test"]["y"], y_pred=preds)

        # Log alpha and the mean squared error in the run history
        run.log(name="alpha", value=alpha)
        run.log(name="mse", value=mse)

        # Save the model to the outputs directory for capture
        # (the same file is overwritten on every iteration)
        joblib.dump(value=regression_model, filename=model_path)

        # Optional, currently disabled: capture this notebook with the run
        #run.take_snapshot('./')


  0%|          | 0/20 [00:00<?, ?it/s][A
  5%|▌         | 1/20 [00:14<04:31, 14.29s/it][A
 10%|█         | 2/20 [00:27<04:13, 14.07s/it][A
 15%|█▌        | 3/20 [00:51<04:50, 17.06s/it][A
 20%|██        | 4/20 [01:04<04:11, 15.71s/it][A
 25%|██▌       | 5/20 [01:16<03:40, 14.68s/it][A
 30%|███       | 6/20 [01:27<03:09, 13.55s/it][A
 35%|███▌      | 7/20 [01:45<03:13, 14.92s/it][A
 40%|████      | 8/20 [01:54<02:37, 13.13s/it][A
 45%|████▌     | 9/20 [02:05<02:15, 12.34s/it][A
 50%|█████     | 10/20 [02:17<02:04, 12.44s/it][A
 55%|█████▌    | 11/20 [02:29<01:50, 12.30s/it][A
 60%|██████    | 12/20 [02:42<01:39, 12.44s/it][A
 65%|██████▌   | 13/20 [02:55<01:27, 12.55s/it][A
 70%|███████   | 14/20 [03:07<01:15, 12.53s/it][A
 75%|███████▌  | 15/20 [03:20<01:02, 12.44s/it][A
 80%|████████  | 16/20 [03:31<00:48, 12.25s/it][A
 85%|████████▌ | 17/20 [03:44<00:36, 12.29s/it][A
 90%|█████████ | 18/20 [03:57<00:24, 12.43s/it][A
 95%|█████████▌| 19/20 [04:08<00:12, 12.21s/it]

## Visualisation de l'expérimentation Azure ML service sur le portail Azure

In [10]:
# Evaluate the experiment object as the cell's last expression so the notebook displays its summary
experiment

Name,Workspace,Report Page,Docs Page
workshop-ExempleDiabete,azuremlservice,Link to Azure Portal,Link to Documentation


## Visualisation des résultats

In [11]:
# Both dictionaries are keyed by run id: the run objects and their logged metrics
runs = {}
run_metrics = {}

# Keep only the runs of the experiment that logged an 'mse' metric
for candidate in tqdm(experiment.get_runs()):
    candidate_metrics = candidate.get_metrics()
    if 'mse' not in candidate_metrics:
        continue
    runs[candidate.id] = candidate
    run_metrics[candidate.id] = candidate_metrics

# The best run is the one with the lowest mean squared error
best_run_id, best_metrics = min(
    run_metrics.items(),
    key=lambda item: item[1]['mse'],
)
best_run = runs[best_run_id]
print('Best run is:', best_run_id)
print('Metrics:', best_metrics)

# Tag the best run so it can be identified later
best_run.tag("Best Run")


0it [00:00, ?it/s][A
1it [00:01,  1.87s/it][A
2it [00:02,  1.51s/it][A
3it [00:03,  1.28s/it][A
4it [00:03,  1.11s/it][A
5it [00:04,  1.02it/s][A
6it [00:05,  1.10it/s][A
7it [00:06,  1.16it/s][A
8it [00:06,  1.22it/s][A
9it [00:07,  1.27it/s][A
10it [00:08,  1.34it/s][A
11it [00:08,  1.35it/s][A
12it [00:09,  1.42it/s][A
13it [00:10,  1.38it/s][A
14it [00:10,  1.44it/s][A
15it [00:11,  1.46it/s][A
16it [00:12,  1.51it/s][A
17it [00:12,  1.50it/s][A
18it [00:13,  1.54it/s][A
19it [00:14,  1.50it/s][A
20it [00:14,  1.47it/s][A
21it [00:15,  1.45it/s][A
22it [00:16,  1.47it/s][A
23it [00:17,  1.45it/s][A
24it [00:17,  1.44it/s][A
25it [00:18,  1.42it/s][A
26it [00:19,  1.43it/s][A
27it [00:19,  1.44it/s][A
28it [00:20,  1.37it/s][A
29it [00:21,  1.32it/s][A
30it [00:22,  1.29it/s][A
31it [00:23,  1.28it/s][A
32it [00:23,  1.31it/s][A
33it [00:24,  1.29it/s][A
34it [00:25,  1.30it/s][A
35it [00:26,  1.32it/s][A
36it [00:26,  1.31it/s][A
37it [00:27,  

Best run is: adb3cb09-b627-40ad-ae22-7baaa40845b7
Metrics: {'alpha': 0.4, 'mse': 3295.741064355809}


In [14]:
 # Register the model file saved by the best run under the name 'best_model'
model = best_run.register_model(
    model_name='best_model',
    model_path='outputs/modele.pkl',
)

In [15]:
# Print the name of every file attached to the best run, one per line
for file_name in best_run.get_file_names():
    print(file_name)

outputs/model.pkl
outputs/modele.pkl
outputs/monmodele.pkl


> Fin