In [None]:
import azureml.core
from azureml.core import Workspace

# Connect to the Azure ML workspace described by the saved config file
ws = Workspace.from_config()

# Report the SDK version and the workspace we are connected to
print(
    'Ready to use Azure ML {} to work with {}'.format(
        azureml.core.VERSION,
        ws.name,
    )
)

In [None]:
# TODO: Name your artifacts -- set `userid` to your own alias; it is embedded
# in the dataset and experiment names built below.
# userid = <userid>
userid = ''

# Artifact names are assembled from fixed prefixes/suffixes around the user id
tabular_dataset_name = ''.join(['diabetes-data-', userid, '-raw'])
experiment_name = ''.join(['pipeline-train-diabetes-', userid, '-prod'])


## Training Pipeline

In [None]:
import os

# Working directory that will hold the training script and its environment file
experiment_folder = 'experiment_folder'

# exist_ok=True keeps this cell safe to re-run when the folder is already there
os.makedirs(name=experiment_folder, exist_ok=True)

print(experiment_folder, 'folder created')

In [None]:
%%writefile $experiment_folder/diabetes_training.py
"""Train a logistic regression diabetes classifier inside an Azure ML run.

The script logs the regularization rate, accuracy and AUC to the run, saves the
fitted model under outputs/, and registers it when either no model is published
under `model_name` yet or its accuracy beats the published model's accuracy.
"""
# Import libraries
import os
import argparse
import azureml.core
from azureml.core import Run, Dataset, Model, Workspace
from azureml.core.authentication import MsiAuthentication
from azureml.exceptions import WebserviceException
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
# NOTE: --input-data is accepted but not read below; the data comes from the
# run's named input 'training_data'.
parser.add_argument("--input-data", type=str, dest='training_dataset_id', help='training dataset')
args = parser.parse_args()

# Set these variables
model_name = ''
experiment_name = ''
subscription_id = ''
resource_group = ''
workspace_name = ''


# Set regularization hyperparameter (passed as an argument to the script)
reg = args.reg_rate

# Get the experiment run context
run = Run.get_context()

# Get the training dataset
print("Loading Data...")
diabetes = run.input_datasets['training_data'].to_pandas_dataframe()

# Separate features and labels
feature_columns = ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
                   'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age']
X, y = diabetes[feature_columns].values, diabetes['Diabetic'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a logistic regression model
# (the builtin float replaces np.float, an alias that newer NumPy releases removed)
print('Training a logistic regression model with regularization rate of', reg)
run.log('Regularization Rate', float(reg))
# C is the inverse of the regularization rate
model = LogisticRegression(C=1/reg, solver="liblinear").fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes_model.pkl')

# NOTE(review): presumably uploaded explicitly so the file is already in the run
# record when register_model is called below -- confirm.
run.upload_file('outputs/diabetes_model.pkl', 'outputs/diabetes_model.pkl')

run.complete()


# Get current model version accuracy.
msi_auth = MsiAuthentication()

ws = Workspace(subscription_id=subscription_id,
               resource_group=resource_group,
               workspace_name=workspace_name,
               auth=msi_auth)

try:
    model_current = Model(ws, name=model_name)
    published_model_accuracy = float(model_current.properties['Accuracy'])
except WebserviceException as error:
    # Only the "no such model" case is treated as a first run; any other
    # service error is re-raised unchanged.
    if 'ModelNotFound' not in str(error):
        raise
    published_model_accuracy = None


# Compare current published model to model trained in this run.
metrics = run.get_metrics()
new_model_accuracy = float(metrics['Accuracy'])

if published_model_accuracy is None or new_model_accuracy > published_model_accuracy:
    run.register_model(model_path='./outputs/diabetes_model.pkl', model_name=model_name,
                       tags={'Training context': 'Pipeline Training'},
                       properties={'AUC': metrics['AUC'],
                                   'Accuracy': metrics['Accuracy'],
                                   'Experiment': experiment_name,
                                   'ExperimentRunId': run.get_details()['runId']})
    print('Published new model')


In [None]:
%%writefile $experiment_folder/train_environment.yml
# Conda environment specification used by the training step (diabetes_training.py).
name: train_environment
dependencies:
# NOTE(review): Python 3.6 has reached end of life -- confirm this pin is still required.
- python=3.6.2
# NOTE(review): the training script also imports joblib, which is not listed here;
# presumably it arrives as a scikit-learn dependency -- verify.
- scikit-learn
- pandas
- numpy
- pip
# Packages below are installed with pip rather than conda.
- pip:
  - azureml-defaults

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute

# Name of the compute cluster the training pipeline runs on
aml_compute_target = "prod-cluster"

# Bind to the cluster with that name in the workspace
aml_compute = AmlCompute(workspace=ws, name=aml_compute_target)

print(
    "found existing compute target.",
    "Azure Machine Learning Compute attached",
    sep="\n",
)

In [None]:
from azureml.core import Dataset

# Look up the registered tabular dataset by the name built from `userid`.
diabetes_ds = ws.datasets.get(tabular_dataset_name)

# `.get` yields None for an unknown name; fail here with a clear message rather
# than with an AttributeError on the print below.
if diabetes_ds is None:
    raise ValueError(
        "Dataset '{}' is not registered in workspace '{}'; "
        "check `userid` and register the dataset first.".format(tabular_dataset_name, ws.name)
    )

print('Found registered dataset ' + diabetes_ds.name)

In [None]:
from azureml.core import Environment
from azureml.core import ScriptRunConfig

# Build the training environment from the conda specification file
train_env = Environment.from_conda_specification(
    file_path="./experiment_folder/train_environment.yml",
    name="diabetes-train-env",
)

# Describe how the training script is launched: which script, from which
# folder, with which arguments, on which compute and in which environment
train_cfg = ScriptRunConfig(
    script="diabetes_training.py",
    source_directory='experiment_folder',
    compute_target=aml_compute,
    environment=train_env,
    arguments=['--regularization', 0.1],
)

In [None]:
from azureml.pipeline.steps import PythonScriptStep

# Wrap the training script in a pipeline step.
# The script arguments configured on `train_cfg` are forwarded explicitly:
# NOTE(review): PythonScriptStep appears to take script arguments only from its
# own `arguments` parameter, so without this the '--regularization' value set on
# the ScriptRunConfig would not reach the script -- confirm against the SDK docs.
# Values are converted to str because the list holds a float.
train_step = PythonScriptStep(
    name='train_step',
    source_directory=train_cfg.source_directory,
    script_name=train_cfg.script,
    arguments=[str(argument) for argument in train_cfg.arguments],
    runconfig=train_cfg.run_config,
    # The script reads this dataset via run.input_datasets['training_data']
    inputs=[diabetes_ds.as_named_input('training_data')],
    allow_reuse=False,
)

print("Step1 created")

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble a one-step pipeline and submit it under the training experiment
pipeline = Pipeline(steps=[train_step], workspace=ws)
training_experiment = Experiment(workspace=ws, name=experiment_name)
pipeline_run = training_experiment.submit(pipeline, regenerate_outputs=False)

# Show the run widget, then block until the run has finished
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion()

In [None]:
# Publish the training pipeline under the same name as the experiment
pipeline.publish(
    name=experiment_name,
    description="Train new model. Validate against current model and deploy.",
)

## Data Drift Pipeline

In [None]:
import os

# Working directory for the drift exercise, with one sub-folder per dataset
data_drift_folder = 'data_drift_folder'

for subfolder in ('/baseline', '/target'):
    # exist_ok=True keeps this cell safe to re-run
    os.makedirs(data_drift_folder + subfolder, exist_ok=True)

print(data_drift_folder, 'folder created')

In [None]:
import datetime as dt
from datetime import timedelta
import pandas as pd
import numpy as np

# The baseline is the reference data as-is
df_baseline = pd.read_csv('../../data/diabetes2.csv')

# Modify a copy of the data to create some drift in three features
df_target = df_baseline.assign(
    Pregnancies=df_baseline['Pregnancies'] + 1,
    Age=(df_baseline['Age'] * 1.2).round().astype(int),
    BMI=df_baseline['BMI'] * 1.1,
)

row_count = len(df_baseline)

# Every baseline row is stamped with one date and every target row with another
baseline_date = dt.date(2022, 1, 1)
target_date = dt.date(2022, 2, 1)

baseline_date_column = [baseline_date] * row_count
target_date_column = [target_date] * row_count

df_baseline['Datetime'] = baseline_date_column
df_target['Datetime'] = target_date_column

# Write both frames into the sub-folders of the drift working directory
df_baseline.to_csv(data_drift_folder + '/baseline/diabetes_baseline.csv')
df_target.to_csv(data_drift_folder + '/target/diabetes_target.csv')

In [None]:
from azureml.core import Datastore, Dataset
from azureml.data.datapath import DataPath

adls_name = 'adlsraw'

datastore = Datastore.get(ws, adls_name)

# Derive the datastore folders and dataset names from `userid` once, so the
# upload target, the path the dataset reads from and the registered name always
# agree (the read paths and names were previously hard-coded to one user).
baseline_path = 'diabetes/' + userid + '/baseline'
target_path = 'diabetes/' + userid + '/target'

# Register baseline dataset
Dataset.File.upload_directory(src_dir=data_drift_folder + '/baseline',
                              target=DataPath(datastore, baseline_path + '/'),
                              show_progress=True)

diabetes_baseline_ds = Dataset.Tabular.from_delimited_files(path=(datastore, baseline_path))
diabetes_baseline_ds.register(ws, name='diabetes-data-baseline-' + userid, create_new_version=False)

# Register target dataset
Dataset.File.upload_directory(src_dir=data_drift_folder + '/target',
                              target=DataPath(datastore, target_path + '/'),
                              show_progress=True)

diabetes_target_ds = Dataset.Tabular.from_delimited_files(path=(datastore, target_path))
# The target dataset is registered with 'Datetime' set as its timestamp column
diabetes_target_ds.with_timestamp_columns('Datetime').register(ws, name='diabetes-data-target-' + userid, create_new_version=False)

In [None]:
%%writefile $data_drift_folder/diabetes_drift_monitor.py
"""Run the diabetes data drift monitor for one target date and print the drift percentage."""
import argparse
from datetime import datetime

from azureml.core import Workspace
from azureml.core.authentication import MsiAuthentication
from azureml.datadrift import DataDriftDetector

# Command line: the date (YYYY-MM-DD) the drift analysis should be run for
cli = argparse.ArgumentParser()
cli.add_argument('--target-date', type=str, dest='target_date', default='2022-02-01', help='target date')
args = cli.parse_args()

# Workspace coordinates -- left blank here, to be filled in before use
subscription_id = ''
resource_group = ''
workspace_name = ''

target_date = args.target_date
target_datetime = datetime.strptime(target_date, '%Y-%m-%d')

# Connect to the workspace using MSI authentication
msi_auth = MsiAuthentication()
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
    auth=msi_auth,
)

# Look up the existing drift monitor by name
monitor = DataDriftDetector.get_by_name(ws, name='diabetes-jodobrze-drift-monitor')
print(monitor)

# Start the run for the requested date and wait for it to finish
monitor_run = monitor.run(target_date=target_datetime)
monitor_run.wait_for_completion()

# Pull the drift percentage out of the run metrics and print it
drift_metrics = monitor_run.get_metrics()['Datadrift percentage']
drift_percent = drift_metrics['drift_percentage']
print(drift_percent)


In [None]:
%%writefile $data_drift_folder/drift_environment.yml
# Conda environment specification used by the drift monitor step (diabetes_drift_monitor.py).
# NOTE(review): the name below is the same as in the training environment file;
# looks like a copy-paste leftover -- confirm whether it should be drift-specific.
name: train_environment
dependencies:
# NOTE(review): Python 3.6 has reached end of life -- confirm this pin is still required.
- python=3.6.2
- scikit-learn
- pandas
- numpy
- pip
# Packages below are installed with pip rather than conda.
- pip:
  - azureml-defaults
  # Provides the azureml.datadrift module imported by the drift monitor script.
  - azureml-datadrift

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute

# Name of the compute cluster the drift pipeline runs on
# (this reassigns the variable set earlier for the training cluster)
aml_compute_target = "dev-cluster"

# Bind to the cluster with that name in the workspace
aml_compute_drift_target = AmlCompute(workspace=ws, name=aml_compute_target)

print(
    "found existing compute target.",
    "Azure Machine Learning Compute attached",
    sep="\n",
)

In [None]:
from azureml.datadrift import DataDriftDetector
# Fetch the existing drift monitor from the workspace by name
monitor = DataDriftDetector.get_by_name(
    ws,
    name='diabetes-jodobrze-drift-monitor',
)

# Display the Python type of the object that came back
type(monitor)



In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from datetime import datetime

# Date whose data the ad hoc drift analysis should cover
target_date = '2022-02-01'
target_datetime = datetime.strptime(target_date, '%Y-%m-%d')

# NOTE(review): DataDriftDetector.run() submits the analysis itself and returns
# a Run, not a pipeline step, so it cannot be passed to Pipeline(steps=[...]).
# The run handle is kept directly instead; the pipeline-based variant of this
# analysis is built from a PythonScriptStep further down in the notebook.
drift_monitor_run = monitor.run(target_date=target_datetime)

# RunDetails(drift_monitor_run).show()
# drift_monitor_run.wait_for_completion()

In [None]:
# Submit the drift pipeline under the monitor's experiment name.
# NOTE(review): `drift_pipeline` is not assigned anywhere above this cell (its only
# visible assignment is in a later cell), so on a fresh Restart & Run All this line
# raises NameError. That later cell performs the same submission; this cell looks
# like a leftover -- confirm, then remove it or move it below the definition.
drift_pipeline_run = Experiment(ws, 'diabetes-jodobrze-drift-monitor-Monitor-Runs').submit(drift_pipeline, regenerate_outputs=False)

In [None]:
from azureml.core import Environment
from azureml.core import ScriptRunConfig

# Build the drift environment from the conda specification file
drift_env = Environment.from_conda_specification(
    file_path="./data_drift_folder/drift_environment.yml",
    name="diabetes-drift-env",
)

# Describe how the drift monitor script is launched: which script, from which
# folder, with which arguments, on which compute and in which environment
drift_cfg = ScriptRunConfig(
    script="diabetes_drift_monitor.py",
    source_directory='data_drift_folder',
    compute_target=aml_compute_drift_target,
    environment=drift_env,
    arguments=['--target-date', '2022-02-01'],
)

In [None]:
from azureml.pipeline.steps import PythonScriptStep

# Wrap the drift monitor script in a pipeline step, reusing the folder, script
# name and run configuration held by `drift_cfg`
drift_monitor_step = PythonScriptStep(
    name='drift_monitor_step',
    script_name=drift_cfg.script,
    source_directory=drift_cfg.source_directory,
    runconfig=drift_cfg.run_config,
    allow_reuse=False,
)

print("Step1 created")

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble a one-step pipeline and submit it under the monitor's experiment name
drift_pipeline = Pipeline(steps=[drift_monitor_step], workspace=ws)
drift_experiment = Experiment(workspace=ws, name='diabetes-jodobrze-drift-monitor-Monitor-Runs')
drift_pipeline_run = drift_experiment.submit(drift_pipeline, regenerate_outputs=False)

# Show the run widget, then block until the run has finished
RunDetails(drift_pipeline_run).show()
drift_pipeline_run.wait_for_completion()

In [None]:
# Publish the drift monitoring pipeline
drift_pipeline.publish(
    name='pipeline-drift-monitor-diabetes-jodobrze-prod',
    description="Execute data drift monitor for diabetes.",
)

### Clean-up local workspace
Remove the files and directories created during this exercise.

In [None]:
import os
import shutil

# Delete each scratch directory in turn; ignore_errors=True means a directory
# that is missing does not stop the clean-up
for scratch_dir in (
    'experiment_folder',
    'diabetes_training_pipeline',
    'azureml-models',
    'outputs',
    'data_drift_folder',
):
    shutil.rmtree(scratch_dir, ignore_errors=True)
