# Automated ML

In [1]:
# Install opendatasets package to download the dataset from Kaggle
!pip install opendatasets
import opendatasets

import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset
from azureml.core.datastore import Datastore

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 4.0 MB/s eta 0:00:011
Collecting python-slugify
  Downloading python_slugify-6.1.2-py2.py3-none-any.whl (9.4 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.8 MB/s  eta 0:00:01
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73052 sha256=c84f7173ed85d1fba48f55a628d6bc15cc44c04160f0de9a68ea0831acdcded9
  Stored in directory: /home/azureuser/.cache/pip/wheels/29/da/11/144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106
Successfully built kaggle
Installing collected packages: text-unidecode, python-slugify, kaggle, opendatasets
Successfully installed kaggle-1.5.12 op

## Dataset

### Overview
The dataset that will be used for this project is the Credit Card Churn Prediction dataset from Kaggle. According to Kaggle, the goal of the dataset is to identify the cause of customer attrition from a consumer credit card bank. The dataset and additional information can be found here: https://www.kaggle.com/datasets/anwarsan/credit-card-bank-churn.

To create the Dataset for this project, the data was first downloaded from Kaggle using the opendatasets package and entering my username and API Key when prompted. The dataset contains two Naive Bayes Classifier fields which the uploader suggested to be deleted and not be considered for analysis; as such, I read the csv to a pandas Dataframe and dropped the two columns before proceeding. Once the dataset was cleaned, I proceeded to register the Dataframe as a TabularDataset in the Workspace's Datastore for use in this experiment.

In [3]:
# Download the dataset from Kaggle. This will prompt user input for a username and API Key.
opendatasets.download('https://www.kaggle.com/datasets/anwarsan/credit-card-bank-churn')

# Clean the dataset. Need to drop last two columns that are from someone else's analysis and should not be included as predictors.
df = pd.read_csv('credit-card-bank-churn/credit_card_churn.csv')
df = df.drop(['CLIENTNUM', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'], axis=1)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: rglegge2
Your Kaggle Key: ········
Downloading credit-card-bank-churn.zip to ./credit-card-bank-churn


100%|██████████| 379k/379k [00:00<00:00, 2.73MB/s]







In [4]:
ws = Workspace.from_config()

# choose a name for experiment
experiment_name = 'capstone-experiment'
experiment=Experiment(ws, experiment_name)

found = False
key = "credit_card_churn"
description_text = "Credit Card Churn Prediction dataset for Machine Learning Engineer with Microsoft Azure"

if key in ws.datasets.keys():
    print(f'Dataset with key: {key} found in Workspace.')
    found = True
    dataset = ws.datasets[key]

if not found:
    print(f'Dataset with key: {key} not found in Workspace.')
    # Get the Datastore where the Dataset will be registered
    datastore = Datastore.get(ws, 'workspaceblobstore')
    dataset = Dataset.Tabular.register_pandas_dataframe(dataframe=df, target=datastore, name=key, description=description_text)

df = dataset.to_pandas_dataframe()
df.describe()

Dataset with key: credit_card_churn not found in Workspace.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/11220365-e927-4b61-80b1-66a1a5f157cc/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0
mean,46.32596,2.346203,35.928409,3.81258,2.341167,2.455317,8631.953698,1162.814061,7469.139637,0.759941,4404.086304,64.858695,0.712222,0.274894
std,8.016814,1.298908,7.986416,1.554408,1.010622,1.106225,9088.77665,814.987335,9090.685324,0.219207,3397.129254,23.47257,0.238086,0.275691
min,26.0,0.0,13.0,1.0,0.0,0.0,1438.3,0.0,3.0,0.0,510.0,10.0,0.0,0.0
25%,41.0,1.0,31.0,3.0,2.0,2.0,2555.0,359.0,1324.5,0.631,2155.5,45.0,0.582,0.023
50%,46.0,2.0,36.0,4.0,2.0,2.0,4549.0,1276.0,3474.0,0.736,3899.0,67.0,0.702,0.176
75%,52.0,3.0,40.0,5.0,3.0,3.0,11067.5,1784.0,9859.0,0.859,4741.0,81.0,0.818,0.503
max,73.0,5.0,56.0,6.0,6.0,6.0,34516.0,2517.0,34516.0,3.397,18484.0,139.0,3.714,0.999


In [5]:
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


## AutoML Configuration

For the AutoML configurations, I chose to set the `experiment_timeout_minutes` to 30 in order to ensure that the model would finish training in a timely manner. I used 4 for the `max_concurrent_iterations` as this was the same number of nodes that I provisioned for my Compute Cluster. The `primary_metric` I chose to use was AUC_weighted in order to account for the imbalanced classes in this dataset as only 16% of the records in the dataset are `Attrited Customers` compared to 84% being `Existing Customers`. Lastly, I enabled `enable_early_stopping` in order to save compute resources if the models are no longer improving.

In [6]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# The name of the CPU cluster to use
amlcompute_cluster_name = "rlegge-compute-cluster"

# Verify that the cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print(f'Found existing cluster with name: {amlcompute_cluster_name}, will use it')
except ComputeTargetException:
    print(f'Compute cluster with name: {amlcompute_cluster_name} not found, will create it')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS3_V2', max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True, min_node_count=1, timeout_in_minutes=10)

Compute cluster with name: rlegge-compute-cluster not found, will create it
InProgress.
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded......................................................................................................................
AmlCompute wait for completion finished

Wait timeout has been reached
Current provisioning state of AmlCompute is "Succeeded" and current node count is "0"


In [7]:
automl_settings = {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'AUC_weighted'
}

automl_config = AutoMLConfig(
    compute_target=compute_target,
    task='classification',
    training_data=dataset,
    enable_early_stopping=True,
    featurization='auto',
    debug_log='automl_errors.log',
    label_column_name='Attrition_Flag',
    n_cross_validations=5,
    **automl_settings
)

In [8]:
# TODO: Submit your experiment
remote_run = experiment.submit(automl_config)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
capstone-experiment,AutoML_7ec42a0e-ef4a-4651-8a50-c808bca08781,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Run Details

AutoML trains multiple models with varying algorithms and hyperparameters in order to identify the best model based on a particular metric. In this case, there were 38 scheduled iterations with 4 being cancelled due to early stopping being enabled. The best model from this run was the VotingEnsemble which had an AUC_weighted score of 0.99301462. Some of the other models that were trained included Logistic Regressions, XGBoostClassifiers, and RandomForest. While the scores were lower than the best model, it can be noted that all of them had fairly acceptable perfomances, only going as low as 0.88905727 for the AUC_weighted score. The plot of all of metrics for all of the models can be seen below for reference.

In [10]:
from azureml.widgets import RunDetails
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Best Model

In [12]:
automl_run_job, best_automl_model = remote_run.get_output()
print(automl_run_job.id)
print(best_automl_model)

Package:azureml-automl-runtime, training version:1.44.0, current version:1.43.0
Package:azureml-core, training version:1.44.0, current version:1.43.0
Package:azureml-dataprep, training version:4.2.2, current version:4.0.4
Package:azureml-dataprep-rslex, training version:2.8.1, current version:2.6.3
Package:azureml-dataset-runtime, training version:1.44.0, current version:1.43.0.post2
Package:azureml-defaults, training version:1.44.0, current version:1.43.0
Package:azureml-inference-server-http, training version:0.7.4, current version:0.4.13
Package:azureml-interpret, training version:1.44.0, current version:1.43.0
Package:azureml-mlflow, training version:1.44.0, current version:1.43.0.post1
Package:azureml-pipeline-core, training version:1.44.0, current version:1.43.0
Package:azureml-responsibleai, training version:1.44.0, current version:1.43.0
Package:azureml-telemetry, training version:1.44.0, current version:1.43.0
Package:azureml-train-automl-client, training version:1.44.0, curre

AutoML_7ec42a0e-ef4a-4651-8a50-c808bca08781_38
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/mount...
                 PreFittedSoftVotingClassifier(classification_labels=array([0, 1]), estimators=[('0', Pipeline(memory=None, steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('lightgbmclassifier', LightGBMClassifier(min_data_in_leaf=20, n_jobs=1, problem_info=ProblemInfo(gpu_training_param_dict={'processing_unit_type': 'cpu'}), random_state=None))], verbose=False)), ('1', Pipeline(memory=None, steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('xgboostclassifier', XGBoostClassifier(n_jobs=1, problem_info=ProblemInfo(gpu_training_param_dict={'processing_un

In [13]:
import joblib

# Save the best model
joblib.dump(value=best_automl_model, filename='automl_model.joblib')

['automl_model.joblib']

## Model Deployment

Remember you have to deploy only one of the two models you trained but you still need to register both the models. Perform the steps in the rest of this notebook only if you wish to deploy this model.

In [23]:
from azureml.core.model import Model, InferenceConfig
from azureml.core.resource_configuration import ResourceConfiguration

# Register the model
model = automl_run_job.register_model(model_name='best-automl-model',
                                      model_path='outputs/model.pkl',
                                      description='Best model from the output of AutoML for Credit Card Churn Prediction')

In [65]:
from azureml.core import Environment
from azureml.core.webservice import AciWebservice

# Get the output files from the AutoML run job for the best model to use for deployment
automl_run_job.download_files(output_directory='automl_output')

env = Environment.from_conda_specification('automl-env', './automl_output/outputs/conda_env_v_1_0_0.yml')

inference_config = InferenceConfig(environment=env,
                                   entry_script='./automl_output/outputs/scoring_file_v_1_0_0.py')

deployment_config = AciWebservice.deploy_configuration(cpu_cores=1,
                                                       memory_gb=1,
                                                       enable_app_insights=True)

# Deploy the model
deployment_name = 'automl-model-deployment'
service = Model.deploy(workspace=ws,
                       name=deployment_name,
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config,
                       overwrite=True)
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2022-08-12 09:00:22+00:00 Creating Container Registry if not exists.
2022-08-12 09:00:22+00:00 Registering the environment.
2022-08-12 09:00:24+00:00 Use the existing image.
2022-08-12 09:00:25+00:00 Generating deployment configuration.
2022-08-12 09:00:25+00:00 Submitting deployment to compute.
2022-08-12 09:00:29+00:00 Checking the status of deployment automl-model-deployment..
2022-08-12 09:02:42+00:00 Checking the status of inference endpoint automl-model-deployment.
Succeeded
ACI service creation operation finished, operation "Succeeded"


TODO: In the cell below, print the logs of the web service and delete the service

In [76]:
# Testing the endpoint
import requests
import json

rest_endpoint = service.scoring_uri
headers = {'Content-type': 'application/json'}
response = requests.post(rest_endpoint,
                         headers=headers,
                         json={
                             'data': [{
                                 "Customer_Age": 26,
                                 "Gender": "F",
                                 "Dependent_count": 1,
                                 "Education_Level": "Graduate",
                                 "Marital_Status": "Single",
                                 "Income_Category": "$60K - $80K",
                                 "Card_Category": "Blue",
                                 "Months_on_book": 69,
                                 "Total_Relationship_Count": 5,
                                 "Months_Inactive_12_mon": 1,
                                 "Contacts_Count_12_mon": 3,
                                 "Credit_Limit": 42069.00,
                                 "Total_Revolving_Bal": 777,
                                 "Avg_Open_To_Buy": 11914.00,
                                 "Total_Amt_Chng_Q4_Q1": 1.33,
                                 "Total_Trans_Amt": 1144,
                                 "Total_Trans_Ct": 40,
                                 "Total_Ct_Chng_Q4_Q1": 1.69,
                                 "Avg_Utilization_Ratio": 0.06
                             }]
                         }
                        )


In [77]:
print(response.content)

b'"{\\"result\\": [\\"Existing Customer\\"]}"'


In [75]:
# Print logs for the Webservice
logs = service.get_logs()
for line in logs.split('\n'):
    print(line)

/bin/bash: /azureml-envs/azureml_aef7569aa4179bd7346618f9699ef669/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /azureml-envs/azureml_aef7569aa4179bd7346618f9699ef669/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /azureml-envs/azureml_aef7569aa4179bd7346618f9699ef669/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /azureml-envs/azureml_aef7569aa4179bd7346618f9699ef669/lib/libtinfo.so.6: no version information available (required by /bin/bash)
2022-08-12T09:02:33,300976000+00:00 - iot-server/run 
2022-08-12T09:02:33,302678700+00:00 - rsyslog/run 
2022-08-12T09:02:33,301246200+00:00 - gunicorn/run 
bash: /azureml-envs/azureml_aef7569aa4179bd7346618f9699ef669/lib/libtinfo.so.6: no version information available (required by bash)
2022-08-12T09:02:33,322543700+00:00 | gunicorn/run | 
2022-08-12T09:02:33,329623500+00:00 | gunicorn/run | ####################################

In [None]:
# Delete the Webservice and Compute Cluster
service.delete()
compute_target.delete()

**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shutdown all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
