In [150]:

import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.49.0 to work with saurav_aml


In [151]:
from azureml.core import Dataset
from azureml.data.datapath import DataPath

default_ds = ws.get_default_datastore()

if 'insurance dataset' not in ws.datasets:
    Dataset.File.upload_directory(src_dir='data',
                              target=DataPath(default_ds, 'insurance-data/')
                              )

    #Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'insurance-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws, 
                                name='insurance dataset',
                                description='insurance data',
                                tags = {'format':'CSV'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

Dataset already registered.


In [152]:
import os
# Create a folder for the pipeline step files
experiment_folder = 'insurance_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

insurance_pipeline


In [153]:
%%writefile $experiment_folder/prep_insurance.py
# Import libraries
import os
import argparse
# import pandas as pd
# import numpy as np
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import scipy.stats as stats #It has all the probability distributions available along with many statistical functions.
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

# import seaborn as sns
# import warnings
# warnings.filterwarnings('ignore') # To supress warnings
# sns.set(style="darkgrid") # set the background for the graphs
from scipy.stats import skew
from statsmodels.stats.proportion import proportions_ztest # For proportion Z-test
from statsmodels.formula.api import ols      # For n-way ANOVA
from statsmodels.stats.anova import anova_lm # For n-way ANOVA
from   scipy.stats import chi2_contingency   # For Chi-Sq 


# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
prep_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
df = run.input_datasets['raw_data'].to_pandas_dataframe()
# df = df.replace('?',np.NaN)
# Log raw row count
row_count = (len(df))
run.log('raw_rows', row_count)

def iqr_outlier_cap(df,column):
    q1 = df[column].quantile(0.25)
    q2 = df[column].quantile(0.75)
    #finding out the value of Inter Quartile Range
    # IQR= np.subtract(q2,q1)
    IQR= (q2.astype(np.float32) - q1.astype(np.float32)).astype(np.bool)
    #defining max and min limits
    max_limit = q2 + (1.5 * IQR)
    min_limit = np.subtract(q1, (1.5 * IQR)) 
    #capping
    df_new = pd.DataFrame(np.where(df[column] > max_limit, max_limit, 
             (np.where(df[column] < min_limit, min_limit, df[column]))), columns=[column])
    return df_new

def Preprocessing(df):
    """Data Pre-processing"""
    # if '?' in the datset which we have to remove by NaN Values
    df = df.replace('?',np.NaN)

    df['collision_type'].fillna(df['collision_type'].mode()[0], inplace = True)
    df['property_damage'].fillna('NO', inplace = True)
    df['police_report_available'].fillna('NO', inplace = True)

    # let's extrat days, month and year from policy bind date
    # df['policy_bind_date'] = pd.to_datetime(df['policy_bind_date'], errors = 'coerce')

    # dropping unimportant columns
    # df = df.drop(columns = [
    #     'umbrella_limit', 
    #     '_c39'])

    df.drop(['_c39'], axis=1, inplace=True)    

    numeric_data = df._get_numeric_data()
    cat_data = df.select_dtypes(include=['object'])

    lst=[]
    for i in numeric_data.columns:
        lst.append(iqr_outlier_cap(numeric_data,i))
    numeric_data_cap=pd.concat(lst,axis=1)

    for c in cat_data:
        lbl = LabelEncoder()
        lbl.fit(cat_data[c].values)
        cat_data[c] = lbl.transform(cat_data[c].values)


    # Normalize the numeric columns
    # scaler = MinMaxScaler()

    # num_data_clean = scaler.fit_transform(numeric_data_cap)

    clean_data = pd.concat([numeric_data,cat_data],axis=1)

    # clean_data = scaler.fit_transform(clean_data)
    
    
    return clean_data

dataPrep = Preprocessing(df)

# # Log processed rows
row_count = (len(df))
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(prep_folder, exist_ok=True)
save_path_1 = os.path.join(prep_folder,'prep_data.csv')
dataPrep.to_csv(save_path_1, index=False, header=True)

# End the run
run.complete()

Overwriting insurance_pipeline/prep_insurance.py


In [154]:
%%writefile $experiment_folder/train_insurance.py
# Import libraries


from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
# from azureml.core import Run
import argparse, joblib, os
import argparse
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report, cohen_kappa_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
# from sklearn.tree import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-data", type=str, dest='training_data', help='training data')
args = parser.parse_args()
training_data = args.training_data

# Get the experiment run context
run = Run.get_context()

# load the prepared data file in the training folder
print("Loading Data...")
file_path = os.path.join(training_data,'prep_data.csv')
data_prep = pd.read_csv(file_path)


# Get parameters
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--n_estimators", type=int)
parser.add_argument("--min_samples_leaf", type=int)
parser.add_argument("--datafolder", type=str)

args, unknown = parser.parse_known_args()

ne = args.n_estimators
msl = args.min_samples_leaf

print(ne, msl)

# X = data_prep.iloc[:, 0:-1]

# y = data_prep.iloc[:, -1]
X = data_prep.drop("fraud_reported",axis=1)
y=data_prep["fraud_reported"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1234)
#smote

sm = SMOTE(random_state = 2)
X_train, y_train= sm.fit_resample(X_train, y_train.ravel())

# Baseline Random forest based Model
# rfc = RandomForestClassifier(n_estimators=ne, min_samples_leaf=msl)
rfc = RandomForestClassifier()

rfcg = rfc.fit(X_train, y_train) # fit on training data
Y_predict = rfcg.predict(X_test)

# Get the probability score - Scored Probabilities
Y_prob = rfcg.predict_proba(X_test)[:, 1]

# Get Confusion matrix and the accuracy/score - Evaluate

cm    = confusion_matrix(y_test, Y_predict)
accuracy = accuracy_score(y_test, Y_predict)

# Create the confusion matrix dictionary
cm_dict = {"schema_type": "confusion_matrix",
           "schema_version": "v1",
           "data": {"class_labels": ["N", "Y"],
                    "matrix": cm.tolist()}
           }

run.log("TotalObservations", len(data_prep))
run.log_confusion_matrix("ConfusionMatrix", cm_dict)
run.log("Accuracy", accuracy)


# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'insurance_model.pkl')
joblib.dump(value=rfc, filename=model_file)

# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'insurance_model',
               tags={'Training context':'Pipeline'},
               properties={'Accuracy': np.float(accuracy)})


run.complete()

Overwriting insurance_pipeline/train_insurance.py


In [155]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "saurav-compute-cluster"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)
    

Found existing cluster, use it.


In [156]:
%%writefile $experiment_folder/experiment_env.yml
name: experiment_env
dependencies:
- python=3.8
- scikit-learn
- ipykernel
- matplotlib
- pandas
- numpy
- statsmodels
- scipy
- pip
- pip:
  - azureml-defaults
  - pyarrow
  - imblearn
  

Overwriting insurance_pipeline/experiment_env.yml


In [157]:

from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Create a Python environment for the experiment (from a .yml file)
experiment_env = Environment.from_conda_specification("experiment_env", experiment_folder + "/experiment_env.yml")

# Register the environment 
experiment_env.register(workspace=ws)
registered_env = Environment.get(ws, 'experiment_env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


In [158]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset
insurance_ds = ws.datasets.get("insurance dataset")

# Create an OutputFileDatasetConfig (temporary Data Reference) for data passed from step 1 to step 2
prepped_data = OutputFileDatasetConfig("prepped_data")

# Step 1, Run the data prep script
prep_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = experiment_folder,
                                script_name = "prep_insurance.py",
                                arguments = ['--input-data', insurance_ds.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 2, run the training script
train_step = PythonScriptStep(name = "Train and Register Model",
                                source_directory = experiment_folder,
                                script_name = "train_insurance.py",
                                arguments = ['--training-data', prepped_data.as_input()],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


In [159]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [prep_step, train_step]
pipeline_new = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment_new = Experiment(workspace=ws, name = 'saurav-insurance-pipeline')
pipeline_run = experiment_new.submit(pipeline_new, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)
     

Pipeline is built.
Created step Prepare Data [8e3aa793][45940dff-b098-480f-ae27-d12d8f10d9e1], (This step will run and generate new outputs)
Created step Train and Register Model [a99673a2][daee9143-c396-4d2f-9e9c-7a014e21e1db], (This step will run and generate new outputs)
Submitted PipelineRun 4d0b4e07-0eba-4770-bcf0-6a8b7ef331f2
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4d0b4e07-0eba-4770-bcf0-6a8b7ef331f2?wsid=/subscriptions/7c248226-48dc-4d36-baa0-0f0883669328/resourcegroups/saurv_01/workspaces/saurav_aml&tid=0563aea9-b886-4b62-9457-a85924a13bd1
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 4d0b4e07-0eba-4770-bcf0-6a8b7ef331f2
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4d0b4e07-0eba-4770-bcf0-6a8b7ef331f2?wsid=/subscriptions/7c248226-48dc-4d36-baa0-0f0883669328/resourcegroups/saurv_01/workspaces/saurav_aml&tid=0563aea9-b886-4b62-9457-a85924a13bd1
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: e367da67-c468-406c-b731-c6cd56843004
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/e367da67-c468-406c-b731-c6cd56843004?wsid=/subscriptions/7c248226-48dc-4d36-baa0-0f0883669328/resourcegroups/saurv_01/workspaces/saurav_aml&tid=0563aea9-b886-4b62-9457-a85924a13bd1
StepRun( Prepare Data ) Status: NotStarted
StepRun( Prepare Data ) Status: Running

StepRun(Prepare Data) Execution Summary
StepRun( Prepare Data ) Status: Failed

AzureMLCompute job failed.
ExecutionFailed: [REDACTED]
	exit_codes: 1
	Appinsights Reachable: Some(true)
Execution failed. User process '/azureml-envs/azureml_95e68210fe9605c8784b2606

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "Execution failed. User process '/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/bin/python' exited with status code 1. Please check log file 'user_logs/std_log.txt' for error details. Error:     return quantile_with_mask(values, mask, fill_value, qs, interpolation)\n  File \"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/pandas/core/array_algos/quantile.py\", line 95, in quantile_with_mask\n    result = _nanpercentile(\n  File \"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/pandas/core/array_algos/quantile.py\", line 216, in _nanpercentile\n    return np.percentile(\n  File \"<__array_function__ internals>\", line 180, in percentile\n  File \"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\", line 4166, in percentile\n    return _quantile_unchecked(\n  File \"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\", line 4424, in _quantile_unchecked\n    r, k = _ureduce(a,\n  File \"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\", line 3725, in _ureduce\n    r = func(a, **kwargs)\n  File \"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\", line 4593, in _quantile_ureduce_func\n    result = _quantile(arr,\n  File \"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\", line 4710, in _quantile\n    result = _lerp(previous,\n  File \"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\", line 4527, in _lerp\n    diff_b_a = subtract(b, a)\nTypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.\n\n",
        "messageParameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z",
    "componentName": "CommonRuntime"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"Execution failed. User process '/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/bin/python' exited with status code 1. Please check log file 'user_logs/std_log.txt' for error details. Error:     return quantile_with_mask(values, mask, fill_value, qs, interpolation)\\n  File \\\"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/pandas/core/array_algos/quantile.py\\\", line 95, in quantile_with_mask\\n    result = _nanpercentile(\\n  File \\\"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/pandas/core/array_algos/quantile.py\\\", line 216, in _nanpercentile\\n    return np.percentile(\\n  File \\\"<__array_function__ internals>\\\", line 180, in percentile\\n  File \\\"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\\\", line 4166, in percentile\\n    return _quantile_unchecked(\\n  File \\\"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\\\", line 4424, in _quantile_unchecked\\n    r, k = _ureduce(a,\\n  File \\\"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\\\", line 3725, in _ureduce\\n    r = func(a, **kwargs)\\n  File \\\"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\\\", line 4593, in _quantile_ureduce_func\\n    result = _quantile(arr,\\n  File \\\"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\\\", line 4710, in _quantile\\n    result = _lerp(previous,\\n  File \\\"/azureml-envs/azureml_95e68210fe9605c8784b26066c59b46a/lib/python3.8/site-packages/numpy/lib/function_base.py\\\", line 4527, in _lerp\\n    diff_b_a = subtract(b, a)\\nTypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.\\n\\n\",\n        \"messageParameters\": {},\n        \"details\": []\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\",\n    \"componentName\": \"CommonRuntime\"\n}"
    }
}

In [None]:
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name in metrics:
        print('\t',metric_name, ":", metrics[metric_name])

In [None]:
from azureml.core import Model

for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')