# Project 3: Capstone Project

## AutoML

### Sandeep Pawar

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.impute import SimpleImputer

from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier

from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.decomposition import KernelPCA
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

from sklearn.decomposition import PCA

from azureml.core import Workspace, Experiment

# from azureml.widgets import RunDetails
# from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive import PrimaryMetricGoal
from azureml.train.hyperdrive import BanditPolicy
from azureml.train.hyperdrive import RandomParameterSampling, BayesianParameterSampling
from azureml.train.hyperdrive import HyperDriveConfig
from azureml.train.hyperdrive import choice
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.widgets import RunDetails
from azureml.train.automl import AutoMLConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core import ScriptRunConfig
from azureml.core import Workspace, Environment
from interpret.glassbox import ExplainableBoostingClassifier

import os
from sklearn.feature_selection import SelectFromModel


import warnings

# Fixed seed so that randomized steps in this notebook are repeatable.
seed = 123

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

In [3]:
# SECOM feature table: whitespace-separated values with no header row.
# `sep=r"\s+"` replaces the deprecated `delim_whitespace=True`.
path1 = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
X = pd.read_table(path1, header=None, sep=r"\s+")

# SECOM labels: only the first column is read. The `squeeze=True` keyword was
# removed from the pandas readers, so the single-column frame is collapsed to a
# Series explicitly with `.squeeze("columns")`.
path2 = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
y = pd.read_table(path2, header=None, usecols=[0], sep=r"\s+").squeeze("columns")

In [4]:
# Shuffled, label-stratified split with a fixed seed; no test_size is passed,
# so the library's default split proportion applies.
x1, x2, y1, y2 = train_test_split(
    X,
    y,
    stratify=y,
    shuffle=True,
    random_state=seed,
)

In [5]:
# Preview the first three rows of the first feature split.
x1.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
871,3212.46,2522.41,2200.2333,1173.8377,1.3281,100.0,101.6111,0.1211,1.465,0.0035,...,,,0.4936,0.0131,0.0032,2.6457,0.0117,0.0262,0.0089,223.1018
385,3067.89,2570.93,2196.8,1090.0084,1.327,100.0,99.3944,0.1212,1.5001,-0.0199,...,0.0025,127.2483,0.4968,0.0096,0.0029,1.9411,0.0056,0.0071,0.0025,127.2483
611,2967.54,2573.09,2160.6,1124.5821,1.5257,100.0,98.7122,0.1246,1.475,0.0248,...,,,0.4973,0.0129,0.003,2.6016,0.0252,0.0157,0.0046,62.3881


# Create Azure ML Hyperdrive

### Define Azure Workspace & Compute

In [10]:
from azureml.core.compute_target import ComputeTargetException

# Connect to the workspace described by the local Azure ML config and create
# the experiment handle for this project.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="Project3")

print(ws.get_details()['id'])

compute_name = "DS2"

try:
    # Reuse the compute target when one with this name is already attached.
    vm = ComputeTarget(ws, compute_name)
    print(f"{compute_name} exists already")
except ComputeTargetException:
    # Provision only when the lookup itself fails. A bare `except:` would also
    # swallow authentication errors, network failures and KeyboardInterrupt,
    # and then try to create a cluster regardless.
    compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_V2", max_nodes=4)
    vm = ComputeTarget.create(ws, compute_name, compute_config)

# Block until the (existing or newly created) compute target is ready.
vm.wait_for_completion(show_output=True)

/subscriptions/81cefad3-d2c9-4f77-a466-99a7f541c7bb/resourceGroups/aml-quickstarts-136322/providers/Microsoft.MachineLearningServices/workspaces/quick-starts-ws-136322
DS2 exists already
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### AUTOML

#### Import data and register the dataset

In [8]:
# SECOM feature table: whitespace-separated values with no header row.
# `sep=r"\s+"` replaces the deprecated `delim_whitespace=True`.
path1 = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
X = pd.read_table(path1, header=None, sep=r"\s+")

# SECOM labels: only the first column is read. The `squeeze=True` keyword was
# removed from the pandas readers, so the single-column frame is collapsed to a
# Series explicitly with `.squeeze("columns")`.
path2 = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
y = pd.read_table(path2, header=None, usecols=[0], sep=r"\s+").squeeze("columns")

# Work on a copy so X stays untouched, then attach the labels as column 'y'.
x_df = X.copy()
x_df['y'] = y

In [11]:
# Upload the combined frame to the workspace's default datastore under the
# 'aml' path and register it as the tabular dataset 'amldata'.
default_ds = ws.get_default_datastore()
amlds = TabularDatasetFactory.register_pandas_dataframe(
    x_df,
    target=(default_ds, 'aml'),
    name='amldata',
    show_progress=True,
)

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to aml/acc6d3b8-0df7-4206-8e40-d9aaeb68d74d/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


#### Define AutoML Config

In [12]:
# AutoML classification run on the registered dataset, scored by weighted AUC
# with 5-fold cross-validation and a 20-minute experiment timeout.
automl_config = AutoMLConfig(
    # What to learn and how to score it.
    task='classification',
    primary_metric='AUC_weighted',
    # Data: registered dataset and the name of its label column.
    training_data=amlds,
    label_column_name='y',
    n_cross_validations=5,
    # Where to run and for how long.
    compute_target=vm,
    experiment_timeout_minutes=20,
)

#### Start AutoML

In [13]:
# Submit the AutoML configuration under its own experiment, show the run
# widget, and block until the run finishes (streaming its output).
print('Starting AutoML...')

automl_experiment = Experiment(workspace=ws, name='automl')
automl_run = automl_experiment.submit(automl_config)

RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)

Starting AutoML...
Running on remote.


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of 

{'runId': 'AutoML_4d818fd0-ee9f-4193-b92d-5a619e59ec18',
 'target': 'DS2',
 'status': 'Completed',
 'startTimeUtc': '2021-01-28T02:00:12.204043Z',
 'endTimeUtc': '2021-01-28T02:29:54.673908Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'DS2',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"8ea33772-d4c5-4c91-8485-d61b4934b7cc\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"aml/acc6d3b8-0df7-4206-8e40-d9aaeb68d74d/\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-136322\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"81cefad3-d2c9-4f77-a466-99a7f541c7bb\\\\\\", \

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…