In [1]:
import pandas as pd
import numpy as np
import warnings
from math import sqrt
warnings.filterwarnings('ignore')
from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.model import Model
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.train.automl import AutoMLConfig
import pickle
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import mlflow

In [2]:
from azureml.core import Workspace, Dataset

subscription_id = '33e364b2-cde3-41b5-9724-c8f6cd7d22d9'
resource_group = 'MLOpsCog'
workspace_name = 'MLOPs-CoE-Cog-MSIT-19-10'

workspace = Workspace(subscription_id, resource_group, workspace_name)

In [3]:
uri = workspace.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(uri)

In [4]:
# Importing pre-processed dataset
dataset = Dataset.get_by_name(workspace, name='processed_weather_data_portofTurku')
print(dataset.name, dataset.version)

processed_weather_data_portofTurku 1


In [5]:
df = dataset.to_pandas_dataframe()

In [6]:
df.head()

Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition
0,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1
1,2006-04-01 03:00:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,1,1
2,2006-04-01 04:00:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,1,1
3,2006-04-01 05:00:00,"Port of Turku, Finland",8.772222,0.89,14.1519,260,9.982,1016.84,1,1
4,2006-04-01 06:00:00,"Port of Turku, Finland",10.822222,0.82,11.3183,259,9.982,1017.37,1,1


# Spliting Pre-Processed data into Training and Validation datasets

In [73]:
# Validation set is used later to evaluate model performance post training. 

In [9]:
df_training = df.iloc[:77160]

In [17]:
df_training.shape

(77160, 10)

In [12]:
df_validation = df.drop(df_training.index)

In [15]:
df_validation.shape

(19289, 10)

# Registering Training and Validation data to the datastore on the workspace. 

In [139]:
!mkdir Data

In [18]:
df_training.to_csv('Data/training_data.csv',index=False)

In [19]:
df_validation.to_csv('Data/validation_data.csv',index=False)

In [20]:
datastore = workspace.get_default_datastore()

In [21]:
datastore.upload(src_dir='Data', target_path='data')

Uploading an estimated of 2 files
Uploading Data/training_data.csv
Uploading Data/validation_data.csv
Uploaded Data/validation_data.csv, 1 files out of an estimated total of 2
Uploaded Data/training_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files


$AZUREML_DATAREFERENCE_b7b7a8c0fca647cc9e3a2e1a341963c5

In [22]:
training_dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/training_data.csv'))

In [23]:
validation_dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/validation_data.csv'))

In [24]:
training_ds = training_dataset.register(workspace=workspace,
                                 name='training_dataset',
                                 description='Dataset to use for ML training')

In [25]:
validation_ds = validation_dataset.register(workspace=workspace,
                                 name='validation_dataset',
                                 description='Dataset for validation ML models')

# Data ingestion step - Training dataset

In [7]:
dataset = Dataset.get_by_name(workspace, name='training_dataset')
print(dataset.name, dataset.version)

training_dataset 1


In [8]:
df = dataset.to_pandas_dataframe()

In [9]:
df.head()

Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition
0,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1
1,2006-04-01 03:00:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,1,1
2,2006-04-01 04:00:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,1,1
3,2006-04-01 05:00:00,"Port of Turku, Finland",8.772222,0.89,14.1519,260,9.982,1016.84,1,1
4,2006-04-01 06:00:00,"Port of Turku, Finland",10.822222,0.82,11.3183,259,9.982,1017.37,1,1


In [10]:
df.shape

(77160, 10)

#### Feature Selection and scaling

In [11]:
X = df[['Temperature_C', 'Humidity', 'Wind_speed_kmph', 'Wind_bearing_degrees', 'Visibility_km', 'Pressure_millibars', 'Current_weather_condition']].values
y = df['Future_weather_condition'].values
y

array([1, 1, 1, ..., 1, 1, 1])

In [12]:
# Splitting the Training dataset into Train and Test set for ML training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [14]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Model training and Testing Step

## 1. Support Vector Machine

In [15]:
myexperiment = Experiment(workspace, "support-vector-machine")
mlflow.set_experiment("mlflow-support-vector-machine")

In [16]:
#from sklearn.svm import SVC
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [17]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

In [18]:
svc = svm.SVC()

In [19]:
# initialize a run in Azureml and mlflow experiments
run = myexperiment.start_logging()
mlflow.start_run()


run.log("dataset name", dataset.name)
run.log("dataset Version", dataset.version)

In [20]:
svc_grid = GridSearchCV(svc, parameters)

In [21]:
%%time
svc_grid.fit(X_train, y_train)

CPU times: user 6min 40s, sys: 1.71 s, total: 6min 41s
Wall time: 6min 45s


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
svc_grid.get_params(deep=True)

{'cv': 'warn',
 'error_score': 'raise-deprecating',
 'estimator__C': 1.0,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'auto_deprecated',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
   kernel='rbf', max_iter=-1, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False),
 'fit_params': None,
 'iid': 'warn',
 'n_jobs': None,
 'param_grid': {'kernel': ('linear', 'rbf'), 'C': [1, 10]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': 'warn',
 'scoring': None,
 'verbose': 0}

In [23]:
from sklearn.svm import SVC

In [25]:
svc = SVC(C=svc_grid.get_params(deep=True)['estimator__C'], kernel=svc_grid.get_params(deep=True)['estimator__kernel'])

In [26]:
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [27]:
# Logging training parameters to AzureML and MLFlow experiments
run.log("C", svc_grid.get_params(deep=True)['estimator__C'])
run.log("Kernel", svc_grid.get_params(deep=True)['estimator__kernel'])

In [28]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [29]:
predicted_svc = svc.predict(X_test)

In [30]:
acc = accuracy_score(y_test, predicted_svc)

In [31]:
fscore = f1_score(y_test, predicted_svc, average="macro")
precision = precision_score(y_test, predicted_svc, average="macro")
recall = recall_score(y_test, predicted_svc, average="macro")

In [32]:
import git
repo = git.Repo(search_parent_directories=True)
sha = repo.head.object.hexsha

In [33]:
# Log to AzureML and MLflow
run.log("Test_accuracy", acc)
run.log("Precision", precision)
run.log("Recall", recall)
run.log("F-Score", fscore)
run.log("Git-sha", sha)

In [34]:
run.complete()
print ("run id:", run.id)

run id: 5f2d4100-19aa-4ddc-9bf5-402117da8c77


In [None]:
mlflow.end_run()

In [35]:
run.get_metrics()

{'dataset name': 'training_dataset',
 'dataset Version': 1,
 'C': 1.0,
 'Kernel': 'rbf',
 'Test_accuracy': 0.9519180922757906,
 'Precision': 0.8869828453699851,
 'Recall': 0.8859050416892464,
 'F-Score': 0.8864428755463128,
 'Git-sha': 'bb282af9afe9422cb0986c873bf881e1c994e580'}

In [36]:
workspace.get_details()

{'id': '/subscriptions/6faa9ede-4786-48dc-9c1e-0262e2844ebf/resourceGroups/Learn_MLOps/providers/Microsoft.MachineLearningServices/workspaces/MLOps_WS',
 'name': 'MLOps_WS',
 'location': 'northeurope',
 'type': 'Microsoft.MachineLearningServices/workspaces',
 'tags': {},
 'sku': 'Basic',
 'workspaceid': 'e8674bfa-cb69-4989-8a02-de19862bbbd0',
 'description': '',
 'friendlyName': 'MLOps_WS',
 'creationTime': '2020-07-29T05:13:13.4348343+00:00',
 'keyVault': '/subscriptions/6faa9ede-4786-48dc-9c1e-0262e2844ebf/resourcegroups/learn_mlops/providers/microsoft.keyvault/vaults/mlopsws6106784693',
 'applicationInsights': '/subscriptions/6faa9ede-4786-48dc-9c1e-0262e2844ebf/resourcegroups/learn_mlops/providers/microsoft.insights/components/mlopsws8072806275',
 'identityPrincipalId': '4f27e823-96d9-4f95-8782-8f5a34535ad1',
 'identityTenantId': 'd22095da-de0a-479f-adc4-47380feb19a1',
 'identityType': 'SystemAssigned',
 'storageAccount': '/subscriptions/6faa9ede-4786-48dc-9c1e-0262e2844ebf/resourc

In [124]:
import mlflow.sklearn
mlflow.sklearn.log_model(svc, 'outputs')



Random Forest classifier 

In [53]:
myexperiment = Experiment(workspace, "random-forest-classifier")
mlflow.set_experiment("mlflow-random-forest-classifier")

In [54]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
rf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=100)

In [56]:
# initialize runs in Azureml and mlflow
run = myexperiment.start_logging()
mlflow.start_run()


# Log dataset used 
run.log("dataset name", dataset.name)
run.log("dataset Version", dataset.version)

In [57]:
%%time
rf.fit(X_train, y_train)

CPU times: user 6.52 s, sys: 3.97 ms, total: 6.52 s
Wall time: 6.59 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [58]:
# Logging training parameters to AzureML and MLFlow experiments
run.log("max_depth", 10)
run.log("random_state", 0)
run.log("n_estimators", 100)

In [59]:
predicted_rf = rf.predict(X_test)

In [60]:
acc = accuracy_score(y_test, predicted_rf)
fscore = f1_score(y_test, predicted_rf, average="macro")
precision = precision_score(y_test, predicted_rf, average="macro")
recall = recall_score(y_test, predicted_rf, average="macro")

In [61]:
run.log("Test_accuracy", acc)
run.log("Precision", precision)
run.log("Recall", recall)
run.log("F-Score", fscore)
run.log("Git-sha", sha)

In [62]:
run.complete()
print ("run id:", run.id)

run id: 5596d473-fa15-4925-ab9b-8155845b560b


In [63]:
mlflow.end_run()

In [64]:
run.get_metrics()

{'dataset name': 'training_dataset',
 'dataset Version': 1,
 'max_depth': 10,
 'n_estimators': 100,
 'random_state': 0,
 'Test_accuracy': 0.9548989113530326,
 'Precision': 0.9018705246237031,
 'Recall': 0.8804084310202218,
 'F-Score': 0.8907272822498857,
 'Git-sha': 'bb282af9afe9422cb0986c873bf881e1c994e580'}

# Model Packaging Step

pickle file or onnx

In [51]:
# Convert into SVC model into ONNX format file
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 7]))]
onx = convert_sklearn(svc, initial_types=initial_type)
with open("outputs/svc.onnx", "wb") as f:
    f.write(onx.SerializeToString())



In [65]:
# Convert into RF model into ONNX format file
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 7]))]
onx = convert_sklearn(rf, initial_types=initial_type)
with open("outputs/rf.onnx", "wb") as f:
    f.write(onx.SerializeToString())



# Model Registering Step

In [68]:
# Register Model on AzureML WS
model = Model.register(model_path = './outputs/svc.onnx', # this points to a local file 
                       model_name = "support-vector-classifier", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, 'hyparameter-C': '1', 'testdata-accuracy': '0.9519'}, 
                       model_framework='pandas==0.23.4',
                       description = "Support vector classifier to predict weather at port of Turku",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model support-vector-classifier
Name: support-vector-classifier
Version: 1


In [69]:
# Register Model on AzureML WS
model = Model.register(model_path = './outputs/rf.onnx', # this points to a local file 
                       model_name = "random-forest-classifier", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, 'hyparameter-C': '1', 'testdata-accuracy': '0.9548'}, 
                       model_framework='pandas==0.23.4',
                       description = "Random forest classifier to predict weather at port of Turku",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model random-forest-classifier
Name: random-forest-classifier
Version: 1


In [70]:
import mlflow.sklearn

In [71]:
# Save the model to the outputs directory for capture
mlflow.sklearn.log_model(svc, 'outputs/svc.onnx')



In [72]:
# Save the model to the outputs directory for capture
mlflow.sklearn.log_model(rf, 'outputs/rf.onnx')



# Save model artefacts

In [76]:
import pickle

with open('./outputs/scaler.pkl', 'wb') as scaler_pkl:
    pickle.dump(sc, scaler_pkl)

In [77]:
# Register Model on AzureML WS
scaler = Model.register(model_path = './outputs/scaler.pkl', # this points to a local file 
                       model_name = "scaler", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version}, 
                       model_framework='pandas==0.23.4',
                       description = "Scaler used for scaling incoming inference data",
                       workspace = workspace)

print('Name:', scaler.name)
print('Version:', scaler.version)

Registering model scaler
Name: scaler
Version: 1
