# Modélisation avec Azure ML service

<img src='https://cdn.thenewstack.io/media/2018/10/2e4f0988-az-ml-0.png'>

> https://docs.microsoft.com/en-us/azure/machine-learning/

In [18]:
# Display the version string of the Python interpreter running this kernel,
# so the recorded output documents the environment used for the notebook.
import sys
sys.version

'3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 17:14:51) \n[GCC 7.2.0]'

In [2]:
import azureml.core
from azureml.core import Experiment, Workspace

# Print azureml.core.VERSION so the notebook output records which release of
# the Azure ML SDK produced the results shown in the cells below.
print("Version Azure ML service : ", azureml.core.VERSION)

Version Azure ML service :  1.0.69


In [3]:
# Obtain the workspace handle through Workspace.from_config(); `ws` is the
# object the experiment created later in this notebook is attached to.
ws = Workspace.from_config()

# One line of text per workspace attribute, printed in this order.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

Workspace name: azuremlservice
Azure region: westeurope
Resource group: azuremlserviceresourcegroup


## Chargement des données

In [4]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

In [5]:
# Load the diabetes dataset directly as a (features, target) pair.
X, y = load_diabetes(return_X_y=True)

# Ten feature labels, kept for reference; no code visible in this notebook
# reads this list.
columns = 'age gender bmi bp s1 s2 s3 s4 s5 s6'.split()

# Hold out 20% of the observations for testing; the fixed random_state keeps
# the split identical from one execution to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Nested lookup used by the training cells: data[<split>]["X" | "y"].
data = dict(
    train=dict(X=X_train, y=y_train),
    test=dict(X=X_test, y=y_test),
)


In [8]:
# Report how many observations ended up in each split.
n_train_obs = len(data['train']['X'])
n_test_obs = len(data['test']['X'])
print("Données : ", n_train_obs, "obs. = Base Training. ", n_test_obs, "obs. = Base Test")

Données :  353 obs. = Base Training.  89 obs. = Base Test


## Modélisation

In [9]:
# Handle on the "workshoptest" experiment in workspace `ws`; every run started
# in the cells below is created through this object (experiment.start_logging()).
experiment = Experiment(workspace=ws, name="workshoptest")

## Les étapes
1. Logs d'informations
2. Modélisation
3. Logs des résultats du modèle
4. Sauvegarde du modèle

In [10]:
from datetime import datetime
# Timestamp captured once here; the first training run logs it (as a string)
# under the 'date_log' metric. datetime.now() returns naive local time.
maintenant = datetime.now()
print("Maintenant : ", maintenant)

Maintenant :  2019-10-22 08:41:17.638090


In [12]:
%%time
# 1. Run object
run = experiment.start_logging()

# 2. Logs de valeurs
run.log('alpha', 0.1)
run.log('date_log', str(maintenant))

# 3. Modèle ML
regression_model = Ridge(alpha=0.1)
regression_model.fit(data['train']['X'], data['train']['y'])
preds = regression_model.predict(data['test']['X'])

# 4. Output
print('MSE ou Mean Squared Error du modèle =', mean_squared_error(data['test']['y'], preds))
run.log('mse', mean_squared_error(data['test']['y'], preds))

# 5. Export modèle
joblib.dump(value=regression_model, filename='outputs/model.pkl')

# 6. Fin
run.complete()

MSE ou Mean Squared Error du modèle = 3372.6496278100326
CPU times: user 694 ms, sys: 2.63 s, total: 3.32 s
Wall time: 10.9 s


## Visualisation de l'expérimentation sur le portail Azure

In [13]:
# Display the run object: the notebook renders it as a summary table
# (experiment, id, status, links to the Azure portal and documentation).
run

Experiment,Id,Type,Status,Details Page,Docs Page
workshoptest,373012c5-2848-411a-ae7b-3443a6a4048b,,Running,Link to Azure Portal,Link to Documentation


## Recherche du paramètre optimal

In [14]:
%%time
import numpy as np
from tqdm import tqdm

model_name = "model.pkl"

# list of numbers from 0 to 1.0 with a 0.1 interval
alphas = np.arange(0.0, 1.0, 0.1)

# try a bunch of alpha values in a Linear Regression (Ridge) model
for alpha in tqdm(alphas):
    # create a bunch of runs, each train a model with a different alpha value
    with experiment.start_logging() as run:
        # Use Ridge algorithm to build a regression model
        regression_model = Ridge(alpha=alpha)
        regression_model.fit(X=data["train"]["X"], y=data["train"]["y"])
        preds = regression_model.predict(X=data["test"]["X"])
        mse = mean_squared_error(y_true=data["test"]["y"], y_pred=preds)
       
        # log alpha, mean_squared_error and feature names in run history
        run.log(name="alpha", value=alpha)
        run.log(name="mse", value=mse)
        
        # Save the model to the outputs directory for capture
        joblib.dump(value=regression_model, filename='outputs/model.pkl')
        

100%|██████████| 10/10 [02:06<00:00, 12.53s/it]

CPU times: user 11 s, sys: 11.3 s, total: 22.3 s
Wall time: 2min 6s





## Visualisation de l'expérimentation Azure ML service sur le portail Azure

In [15]:
# Display the experiment object: the notebook renders it as a summary table
# (name, workspace, links to the Azure portal report page and documentation).
experiment

Name,Workspace,Report Page,Docs Page
workshoptest,azuremlservice,Link to Azure Portal,Link to Documentation


## Visualisation des résultats

In [16]:
%%time

runs = {}
run_metrics = {}

# Create dictionaries containing the runs and the metrics for all runs containing the 'mse' metric
for r in tqdm(experiment.get_runs()):
    metrics = r.get_metrics()
    if 'mse' in metrics.keys():
        runs[r.id] = r
        run_metrics[r.id] = metrics

# Find the run with the best (lowest) mean squared error and display the id and metrics
best_run_id = min(run_metrics, key = lambda k: run_metrics[k]['mse'])
best_run = runs[best_run_id]
print('Best run :', best_run_id)
print('Metrics :', run_metrics[best_run_id])

# Tag the best run for identification later
best_run.tag("Best Run")

12it [00:11,  1.07it/s]


Best run : 3c49b631-289d-4cef-81c0-fb36b19bd138
Metrics : {'alpha': 0.4, 'mse': 3295.741064355809}
CPU times: user 1.4 s, sys: 158 ms, total: 1.56 s
Wall time: 12.7 s


In [17]:
# List the files attached to the best run, one name per line.
best_run_files = best_run.get_file_names()
for file_name in best_run_files:
    print(file_name)

# Register the best run's saved model file with the workspace under the name
# 'best_model_exemple'.
model = best_run.register_model(
    model_name='best_model_exemple',
    model_path='outputs/model.pkl',
)

logs/user_log.txt
outputs/model.pkl


> Fin