# Project 3: Capstone Project

## AutoML

### Sandeep Pawar

In [1]:
# !pip install --upgrade --upgrade-strategy eaget azureml-sdk[automl,widgets,notebooks]

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.impute import SimpleImputer

from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier

from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.decomposition import KernelPCA
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

from sklearn.decomposition import PCA

from azureml.core import Workspace, Experiment

# from azureml.widgets import RunDetails
# from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive import PrimaryMetricGoal
from azureml.train.hyperdrive import BanditPolicy
from azureml.train.hyperdrive import RandomParameterSampling, BayesianParameterSampling
from azureml.train.hyperdrive import HyperDriveConfig
from azureml.train.hyperdrive import choice
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.widgets import RunDetails
from azureml.train.automl import AutoMLConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core import ScriptRunConfig
from azureml.core import Workspace, Environment
from interpret.glassbox import ExplainableBoostingClassifier

import os
from sklearn.feature_selection import SelectFromModel


seed = 123


import warnings
warnings.filterwarnings("ignore")

In [3]:
path1 = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
X = pd.read_table(path1, header=None, delim_whitespace=True)

path2 = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
y = pd.read_table(path2, header=None, usecols=[0], squeeze=True, delim_whitespace=True)

In [4]:
x1,x2,y1,y2 = train_test_split(X,y, random_state=seed, shuffle=True, stratify=y)

In [5]:
x1.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
871,3212.46,2522.41,2200.2333,1173.8377,1.3281,100.0,101.6111,0.1211,1.465,0.0035,...,,,0.4936,0.0131,0.0032,2.6457,0.0117,0.0262,0.0089,223.1018
385,3067.89,2570.93,2196.8,1090.0084,1.327,100.0,99.3944,0.1212,1.5001,-0.0199,...,0.0025,127.2483,0.4968,0.0096,0.0029,1.9411,0.0056,0.0071,0.0025,127.2483
611,2967.54,2573.09,2160.6,1124.5821,1.5257,100.0,98.7122,0.1246,1.475,0.0248,...,,,0.4973,0.0129,0.003,2.6016,0.0252,0.0157,0.0046,62.3881


# Create Azure ML Hyperdrive

### Define Azure Workspace & Compute

In [6]:
ws =Workspace.from_config()
exp = Experiment(workspace=ws, name="Project3")

print(ws.get_details()['id'])

compute_name = "DS2"



try:
    vm = ComputeTarget(ws, compute_name)
    print(f"{compute_name} exists already")
except:
    compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_V2", max_nodes=4)
    vm = ComputeTarget.create(ws, compute_name, compute_config)
    
vm.wait_for_completion(show_output=True)

Performing interactive authentication. Please follow the instructions on the terminal.


Note, we have launched a browser for you to login. For old experience with device code, use "az login --use-device-code"


You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
/subscriptions/6b4af8be-9931-443e-90f6-c4c34a1f9737/resourceGroups/aml-quickstarts-136429/providers/Microsoft.MachineLearningServices/workspaces/quick-starts-ws-136429
DS2 exists already

Running


### AUTOML

#### Import data and register the dataset

In [7]:
path1 = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
X = pd.read_table(path1, header=None, delim_whitespace=True)

path2 = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
y = pd.read_table(path2, header=None, usecols=[0], squeeze=True, delim_whitespace=True)

x_df = X.copy()
x_df['y'] = y

In [8]:
default_ds = ws.get_default_datastore()
amlds = (TabularDatasetFactory
        .register_pandas_dataframe(x_df,target=(default_ds,'aml'),
                                           name='amldata',
                                           show_progress=True)
        )

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to aml/c6d56f85-ae36-46e9-85e3-1a751bbda9fe/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


#### Define AutoML Config

In [9]:
automl_config = AutoMLConfig(
    experiment_timeout_minutes=20,
    task='classification',
    primary_metric='AUC_weighted',
    compute_target=vm,
    training_data=amlds,
    label_column_name='y',
    n_cross_validations=5)

#### Start AutoML

In [10]:
#Submit aml experiment

print('Starting AutoML...')
automl_experiment = Experiment(ws, 'automl')
automl_run = automl_experiment.submit(automl_config)
RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)

Starting AutoML...
Running on remote.


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of 


****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more about high cardinality feature handling: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION 

{'runId': 'AutoML_d19356e5-7ca0-4836-bc8b-2ece446278d5',
 'target': 'DS2',
 'status': 'Completed',
 'startTimeUtc': '2021-01-28T23:28:45.840056Z',
 'endTimeUtc': '2021-01-28T23:56:30.263265Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'DS2',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"6eb1a0c8-c586-412c-a662-0d12cd0f42ec\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"aml/c6d56f85-ae36-46e9-85e3-1a751bbda9fe/\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-136429\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"6b4af8be-9931-443e-90f6-c4c34a1f9737\\\\\\", \

In [11]:
best_run, model = automl_run.get_output()

In [23]:
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl,AutoML_d19356e5-7ca0-4836-bc8b-2ece446278d5_16,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [22]:
model

PipelineWithYTransformations(Pipeline={'memory': None,
                                       'steps': [('datatransformer',
                                                  DataTransformer(enable_dnn=None,
                                                                  enable_feature_sweeping=None,
                                                                  feature_sweeping_config=None,
                                                                  feature_sweeping_timeout=None,
                                                                  featurization_config=None,
                                                                  force_text_dnn=None,
                                                                  is_cross_validation=None,
                                                                  is_onnx_compatible=None,
                                                                  logger=None,
                                                              

In [12]:
model.steps[1][1].estimators


[('8',
  Pipeline(memory=None,
           steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                  ('extratreesclassifier',
                   ExtraTreesClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='entropy',
                                        max_depth=None, max_features=0.3,
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=0.01,
                                        min_samples_split=0.056842105263157895,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=25, n_jobs=1, oob_score=True,
                                        random_state=None, verbose=0,
                                        w

### Model Deployment

In [13]:
metrics = best_run.get_metrics()

In [14]:
metrics

{'f1_score_micro': 0.9336216194216641,
 'matthews_correlation': 0.0,
 'recall_score_micro': 0.9336216194216641,
 'AUC_weighted': 0.7670637167144247,
 'accuracy': 0.9336216194216641,
 'norm_macro_recall': 0.0,
 'f1_score_macro': 0.48281210840980704,
 'weighted_accuracy': 0.9947200235979221,
 'balanced_accuracy': 0.5,
 'average_precision_score_micro': 0.9651313973262992,
 'recall_score_weighted': 0.9336216194216641,
 'precision_score_micro': 0.9336216194216641,
 'average_precision_score_weighted': 0.9270152187950098,
 'precision_score_weighted': 0.8718193140891846,
 'f1_score_weighted': 0.9016190220237142,
 'log_loss': 0.22468976670665075,
 'precision_score_macro': 0.46681080971083205,
 'AUC_micro': 0.9661555312292203,
 'average_precision_score_macro': 0.5976067425775142,
 'recall_score_macro': 0.5,
 'AUC_macro': 0.767063716714425,
 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_d19356e5-7ca0-4836-bc8b-2ece446278d5_16/confusion_matrix',
 'accuracy_table': 'aml://artifact

### Register the automl model

In [16]:
#Register the automl model
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

bestmodel = best_run.register_model(model_path = 'outputs/model.pkl',
                                    model_name = 'automlmodel', 
                                    resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=0.5),
                                    model_framework=Model.Framework.MULTI,
                                    
                                    )



In [17]:
from azureml.core import Model

mymodel = Model(ws, 'automlmodel')

In [21]:
mymodel

Model(workspace=Workspace.create(name='quick-starts-ws-136429', subscription_id='6b4af8be-9931-443e-90f6-c4c34a1f9737', resource_group='aml-quickstarts-136429'), name=automlmodel, id=automlmodel:1, version=1, tags={}, properties={})