### Initialize the workspace

In [1]:
# Config file already exist in compute instance 
from azureml.core import Workspace

ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

ADS_AMLworkspace
ADS_Book
westus2
ab8f5415-63b3-4fd4-8a8a-9213316abb6e


### If running on local, supply the below information from Azure machine learning workspace overview

In [1]:
import os

subscription_id = os.getenv("SUBSCRIPTION_ID", default="<my-subscription-id>")
resource_group = os.getenv("RESOURCE_GROUP", default="<my-resource-group>")
workspace_name = os.getenv("WORKSPACE_NAME", default="<my-workspace-name>")
workspace_region = os.getenv("WORKSPACE_REGION", default="eastus2")

### Access your workspace

In [2]:
from azureml.core import Workspace

try:
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config()
    print("Workspace configuration succeeded. Skip the workspace creation steps below")
except:
    print("Workspace not accessible. Change your parameters or create a new workspace below")

Workspace configuration succeeded. Skip the workspace creation steps below


### Create an Experiment

In [2]:
from azureml.core import Experiment
experiment_name = 'Bankmarketing-train-on-amlcompute'
experiment = Experiment(workspace = ws, name = experiment_name)
experiment

Name,Workspace,Report Page,Docs Page
Bankmarketing-train-on-amlcompute,ADS_AMLworkspace,Link to Azure Machine Learning studio,Link to Documentation


### Create Azure machine learning Compute clusters

In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "ninjacpucluster"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


### Workspace Datastores

In [4]:
datastores = ws.datastores
datastores

{'kaggledatabook': {
   "name": "kaggledatabook",
   "container_name": "opendata",
   "account_name": "kaggledatabook",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'azureml_globaldatasets': {
   "name": "azureml_globaldatasets",
   "container_name": "globaldatasets",
   "account_name": "mmstoragewestus2",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'opendata': {
   "name": "opendata",
   "container_name": "datasets",
   "account_name": "holstoragev2",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspacefilestore': {
   "name": "workspacefilestore",
   "container_name": "azureml-filestore-6f61da3d-8d28-4c47-b40c-e2743511c301",
   "account_name": "adsamlworkspac8022875928",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspaceblobstore': {
   "name": "workspaceblobstore",
   "container_name": "azureml-blobstore-6f61da3d-8d28-4c47-b40c-e2743511c301",
   "account_name": "adsamlworkspac8022875928",
   "protocol":

### Default datastore

In [4]:
datastore = ws.get_default_datastore()
datastore

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-6f61da3d-8d28-4c47-b40c-e2743511c301",
  "account_name": "adsamlworkspac8022875928",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

### Register a new datastore

In [6]:
from azureml.core import Datastore
blob_datastore_name='kaggledatabook' # Name of the datastore to workspace
container_name=os.getenv("BLOB_CONTAINER", "opendata") # Name of Azure blob container
account_name=os.getenv("BLOB_ACCOUNTNAME", "kaggledatabook") # Storage account name
account_key=os.getenv("BLOB_ACCOUNT_KEY", "QGmWeGNpXKFtmU7cnXW5Dg0LwX7L2SCbfjsZlBKKHHgsdhABgTfFo5Vh4ja3KTFdCfDrh7Q6n3SGpVlE4g/eXA==") # Storage account access key

blob_datastore = Datastore.register_azure_blob_container(workspace=ws, 
                                                         datastore_name=blob_datastore_name, 
                                                         container_name=container_name, 
                                                         account_name=account_name,
                                                         account_key=account_key)
blob_datastore

{
  "name": "kaggledatabook",
  "container_name": "opendata",
  "account_name": "kaggledatabook",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

### Upload data to datastore as dataset

In [4]:
blob_datastore.upload_files(files = ['./Bank.csv'], overwrite = True, show_progress = True)

Uploading an estimated of 1 files
Uploading ./Bank.csv
Uploaded ./Bank.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_kaggledatabook

### Read existing data from datastore as a dataset

In [7]:
from azureml.core.dataset import Dataset
bank_dataset = Dataset.Tabular.from_delimited_files(path=blob_datastore.path('Bank.csv'))
bank_dataset

{
  "source": [
    "('kaggledatabook', 'Bank.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

In [170]:
#quick view of dataset
bank_dataset.to_pandas_dataframe().head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


### Register the datasets in workspace to share with others and reuse in experiments

In [17]:
bank_ds = bank_dataset.register(workspace=ws,
                                 name='bank_dataset',
                                 description='Bank_Marketing_Kaggledata')

### Create a folder to submit to remote compute

In [15]:
import os
script_folder = os.path.join(os.getcwd(), "sklearn-bankmarketing")
os.makedirs(script_folder, exist_ok=True)

### Create a Training Script

In [17]:
%%writefile $script_folder/train_bankmarketing.py

import os
import argparse
import pickle
import pandas as pd
from azureml.core import Dataset, Run
import numpy as np
from sklearn.metrics import accuracy_score #metrics
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier

# sklearn.externals.joblib is removed in 0.23
from sklearn import __version__ as sklearnver
from packaging.version import Version
if Version(sklearnver) < Version("0.23.0"):
    from sklearn.externals import joblib
else:
    import joblib
    
run = Run.get_context()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--learning_rate', type=float, default=0.2,
                        help='learning_rate parameter to be used in the algorithm')
    parser.add_argument('--n_estimators', type=int, default=100,
                        help='n_estimators to be used in the algorithm')
    parser.add_argument('--max_depth', type=int, default=3,
                        help='max_depth parameter to be used in the algorithm')
    parser.add_argument('--min_samples_split', type=int, default=100,
                        help='min_samples_split to be used in the algorithm')
    parser.add_argument('--min_samples_leaf', type=int, default=100,
                        help='min_samples_leaf to be used in the algorithm')
    parser.add_argument('--subsample', type=float, default=3,
                        help='subsample parameter to be used in the algorithm')
    parser.add_argument('--random_state', type=int, default=0.7,
                        help='random_state to be used in the algorithm')
    parser.add_argument('--max_features', type=int, default=0.0,
                        help='max_features parameter to be used in the algorithm')
    

    args = parser.parse_args()  
    run.log('learning_rate', np.float(args.learning_rate))
    run.log('n_estimators', np.int(args.n_estimators))
    run.log('max_depth', np.int(args.max_depth))
    run.log('min_samples_split', np.int(args.min_samples_split))
    run.log('min_samples_leaf', np.int(args.min_samples_leaf))
    run.log('subsample', np.float(args.subsample))
    run.log('random_state', np.int(args.subsample))
    run.log('max_features', np.int(args.subsample))


    # get input dataset by name
    bank_dataset = run.input_datasets['bank_dataset']
    data = bank_dataset.to_pandas_dataframe()

    
    # Data Cleaning
    cat_col = ['default', 'housing', 'loan', 'deposit', 'job', 
                'marital', 'education', 'contact', 'month', 'poutcome']
    for column in cat_col:
        label_encoder = LabelEncoder()
        label_encoder = label_encoder.fit(data[column])
        label_encoded_y = label_encoder.transform(data[column])
        data[column + '_cat'] = label_encoded_y
    #     data[column + '_bool'] = data[column].eq('yes').mul(1)
    data = data.drop(columns = cat_col)
    
    #drop irrelevant columns
    data = data.drop(columns = ['pdays'])
    #impute incorrect values and drop original columns
    def get_correct_values(row, column_name, threshold, df):
        ''' Returns mean value if value in column_name is above threshold'''
        if row[column_name] <= threshold:
            return row[column_name]
        else:
            mean = df[df[column_name] <= threshold][column_name].mean()
            return mean
    data['campaign_cleaned'] = data.apply(lambda row: get_correct_values(row, 'campaign', 50, data),axis=1)
    data['previous_cleaned'] = data.apply(lambda row: get_correct_values(row, 'previous', 50, data),axis=1)
    data = data.drop(columns = ['campaign', 'previous'])


    # Model Training
    X = data.drop(columns = 'deposit_cat')
    y = data[['deposit_cat']]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, seed = 300)
    Params = {'learning_rate': np.float(args.learning_rate),
              'n_estimators': np.int(args.n_estimators),
              'max_depth': np.int(args.max_depth),
              'min_samples_split': np.int(args.min_samples_split),
              'min_samples_leaf': np.int(args.min_samples_leaf),
              'subsample': np.float(args.subsample),
              'random_state': np.int(args.random_state),
              'max_features': np.int(args.max_features)}
        
    # GradientBoostingClassifier
    clf = GradientBoostingClassifier(**Params)
    clf.fit(X_train,y_train.squeeze().values)
    
    #calculate and print scores for the model 
    y_train_preds = clf.predict(X_train)
    y_test_preds = clf.predict(X_test)


    model_file_name = 'joblibGB_bankmarketing.sav'

    accuracy_score_train = accuracy_score(y_train, y_train_preds)
    accuracy_score_test = accuracy_score(y_test, y_test_preds)
    run.log('Gradient Boosting Accuracy Score for training', accuracy_score_train)
    run.log('Graident Boosting Accuracy Score for testing', accuracy_score_test)

    # Save the trained model
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=clf, filename='outputs/' + model_file_name)    

if __name__ == '__main__':
    main()

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/adscompute/code/Users/prsing/BankMarketingAnalysis/sklearn-bankmarketing/train_bankmarketing.py


### Create an Estimator

In [33]:
from azureml.train.sklearn import SKLearn
# from azureml.core import Dataset
script_params = {
    '--learning_rate': 0.01,
    '--n_estimators' : 600,
    '--max_depth': 9,
    '--min_samples_split': 1200,
    '--min_samples_leaf': 60,
    '--subsample': 0.85,
    '--random_state': 10,
    '--max_features': 7,
}

estimator = SKLearn(source_directory=script_folder, 
              script_params=script_params,
              entry_script='train_bankmarketing.py', 
              # pass dataset object as an input with name 'titanic'
              inputs=[bank_dataset.as_named_input('bank_dataset')],
              pip_packages=['azureml-dataset-runtime[fuse]', 'packaging', 'numpy==1.16.2'],
              compute_target=cpu_cluster_name)

In [34]:
run = experiment.submit(estimator)
run.tag("GB_BankMArketing_joblibsave")

In [35]:
from azureml.widgets import RunDetails

# monitor the run
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [36]:
run.get_metrics()

{'learning_rate': 0.01,
 'n_estimators': 600,
 'max_depth': 9,
 'min_samples_split': 1200,
 'min_samples_leaf': 60,
 'random_state': 0,
 'subsample': 0.85,
 'max_features': 0,
 'Gradient Boosting Accuracy Score for training': 0.8590703067355329,
 'Graident Boosting Accuracy Score for testing': 0.8334328358208956}

In [37]:
print(run.get_file_names())

['azureml-logs/20_image_build_log.txt', 'azureml-logs/55_azureml-execution-tvmps_8c8e1fd4b1fd49e9ae4431b8ebd162e2f6868cd895fa2103c0abce24a92e2360_d.txt', 'azureml-logs/65_job_prep-tvmps_8c8e1fd4b1fd49e9ae4431b8ebd162e2f6868cd895fa2103c0abce24a92e2360_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_8c8e1fd4b1fd49e9ae4431b8ebd162e2f6868cd895fa2103c0abce24a92e2360_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/109_azureml.log', 'logs/azureml/dataprep/backgroundProcess.log', 'logs/azureml/dataprep/backgroundProcess_Telemetry.log', 'logs/azureml/dataprep/engine_spans_l_f3102ee7-397f-4aef-9612-60615be58aa9.jsonl', 'logs/azureml/dataprep/python_span_l_f3102ee7-397f-4aef-9612-60615be58aa9.jsonl', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/joblibGB_bankmarketing.sav']


### Register the moodel

In [47]:
# register model
model = run.register_model(model_name='Gradientboosting_bankmarketing',
                           model_path='outputs/joblibGB_bankmarketing.sav')
print(model.name, model.id, model.version, sep='\t')

Gradientboosting_bankmarketing	Gradientboosting_bankmarketing:5	5


## Deploy the model to Azure Container Instance

In [39]:
from azureml.core.model import Model
model_path = Model.get_model_path('Gradientboosting_bankmarketing', version=4, _workspace=ws)
model = joblib.load(model_path)

'azureml-models/Gradientboosting_bankmarketing/4/joblibGB_bankmarketing.sav'

In [40]:
model = joblib.load(model_path)
# loaded_model = joblib.load('outputs/joblibGB_bankmarketing.sav')
# loaded_model.predict(X_test)
# sv = SaveLoad()
# model = sv.load('models/{id}.mdl'.format(id=id)).
# model = load(model_path)

### Create Scoring Script

In [88]:
%%writefile score.py
import json
import numpy as np
import os
import pickle
from sklearn.externals import joblib
from sklearn import __version__ as sklearnver
from packaging.version import Version
if Version(sklearnver) < Version("0.23.0"):
    from sklearn.externals import joblib
else:
    import joblib
from azureml.core.model import Model

def init():
    global model
    # AZUREML_MODEL_DIR is an environment variable created during deployment.
    # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)
    # For multiple models, it points to the folder containing all deployed models (./azureml-models)
    model_filename = 'joblibGB_bankmarketing.sav'
    model_path = os.path.join(os.environ['AZUREML_MODEL_DIR'], model_filename)
    model = joblib.load(model_path)

def run(raw_data):
    data = np.array(json.loads(raw_data)['data'])
    # make prediction
    y_hat = model.predict(data)
    # you can return any data type as long as it is JSON-serializable
    return y_hat.tolist()

Overwriting score.py


### Create configuration file

In [89]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
import sklearn


environment = Environment('my-sklearn-environment')
environment.python.conda_dependencies = CondaDependencies.create(pip_packages=[
    'azureml-defaults',
    'inference-schema[numpy-support]',
    'joblib',
    'numpy==1.16.2',
    'packaging',
    'scikit-learn=={}'.format(sklearn.__version__)
])
# ['azureml-dataset-runtime[fuse]', 'packaging', 'numpy==1.16.2'],

### Deploy to Azure Container Instance

In [90]:
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice


service_name = 'joblibgb-bankmarketing'

inference_config = InferenceConfig(entry_script='score.py', environment=environment)
aci_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

service = Model.deploy(workspace=ws,
                       name=service_name,
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=aci_config,
                       overwrite=True)
service.wait_for_deployment(show_output=True)

Running...........................
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [92]:
service.get_logs()

'2020-08-29T15:41:54,583695133+00:00 - iot-server/run \n2020-08-29T15:41:54,586339411+00:00 - nginx/run \n2020-08-29T15:41:54,587586701+00:00 - rsyslog/run \n/usr/sbin/nginx: /azureml-envs/azureml_032341806dd16cc7d4cebbeb5ef1817c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_032341806dd16cc7d4cebbeb5ef1817c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_032341806dd16cc7d4cebbeb5ef1817c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_032341806dd16cc7d4cebbeb5ef1817c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n/usr/sbin/nginx: /azureml-envs/azureml_032341806dd16cc7d4cebbeb5ef1817c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)\n2020-08-29T15:41:54,583315136+00:00 - gunicorn/run 

In [106]:
scoring_uri = service.scoring_uri
print(scoring_uri)

http://764223ca-901a-4b72-be70-7ec48213d18f.westus2.azurecontainer.io/score


### Test the Web Service

In [154]:
data = Dataset.get_by_name(ws, name='bank_dataset')
data = data.to_pandas_dataframe()

### Use the same Cleaning data code which we used to train our model. 
Note: We will use AzureML pipelines to wrap the clean data code and automate the process for reusability

In [155]:
# Data Cleaning
cat_col = ['default', 'housing', 'loan', 'deposit', 'job', 
            'marital', 'education', 'contact', 'month', 'poutcome']
for column in cat_col:
    label_encoder = LabelEncoder()
    label_encoder = label_encoder.fit(data[column])
    label_encoded_y = label_encoder.transform(data[column])
    data[column + '_cat'] = label_encoded_y
#     data[column + '_bool'] = data[column].eq('yes').mul(1)
data = data.drop(columns = cat_col)

#drop irrelevant columns
data = data.drop(columns = ['pdays'])
#impute incorrect values and drop original columns
def get_correct_values(row, column_name, threshold, df):
    ''' Returns mean value if value in column_name is above threshold'''
    if row[column_name] <= threshold:
        return row[column_name]
    else:
        mean = df[df[column_name] <= threshold][column_name].mean()
        return mean
data['campaign_cleaned'] = data.apply(lambda row: get_correct_values(row, 'campaign', 50, data),axis=1)
data['previous_cleaned'] = data.apply(lambda row: get_correct_values(row, 'previous', 50, data),axis=1)
data = data.drop(columns = ['campaign', 'previous'])


# Model Training
X = data.drop(columns = 'deposit_cat')
y = data[['deposit_cat']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 300)
X_test.head()

Unnamed: 0,age,balance,day,duration,default_cat,housing_cat,loan_cat,job_cat,marital_cat,education_cat,contact_cat,month_cat,poutcome_cat,campaign_cleaned,previous_cleaned
1911,59,145,12,556,0,1,0,0,1,1,1,8,2,2.0,13.0
306,36,3057,16,2769,0,0,0,2,1,2,2,6,3,4.0,0.0
8092,45,1529,30,160,0,0,0,4,1,2,0,6,3,1.0,0.0
5366,42,-67,9,128,0,1,0,1,1,0,2,8,3,1.0,0.0
4591,62,6,13,103,0,1,0,6,0,2,0,0,2,1.0,3.0


In [156]:
import pandas as pd
import requests
import json

df_list = X_test.values.tolist()
data = {}
data['data'] = df_list
input_data = json.dumps(data)

# Set the content type
headers = {'Content-Type': 'application/json'}
# # If authentication is enabled, set the authorization header
# headers['Authorization'] = f'Bearer {key}'

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)
# resp_json = resp.text
X_test['pred_termdeposit'] = resp.json()
# retrieve first 10 predictions from the response
resp.json()[:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[1, 1, 0, 0, 1, 0, 1, 1, 1, 0]

### Deploy on Azure Kubernetes Service

##### Create AKS Cluster

In [157]:
from azureml.core.compute import AksCompute, ComputeTarget

prov_config = AksCompute.provisioning_configuration(vm_size = "STANDARD_DS3_V2",
                                                       agent_count = 1,
                                                       location = "westus2",
                                                    cluster_purpose = AksCompute.ClusterPurpose.DEV_TEST)

aks_name = 'myaks'
# Creates the cluster
aks_target = ComputeTarget.create(workspace = ws,
                                    name = aks_name,
                                    provisioning_configuration = prov_config)

# Waits for the create process to complete
aks_target.wait_for_completion(show_output = True)

Creating........................................................
SucceededProvisioning operation finished, operation "Succeeded"


##### Deploy

In [161]:
from azureml.core.webservice import AksWebservice, Webservice
from azureml.core.model import Model
from azureml.core.compute import AksCompute

aks_target = AksCompute(ws,cluster_name)
aks_target

AksCompute(workspace=Workspace.create(name='ADS_AMLworkspace', subscription_id='ab8f5415-63b3-4fd4-8a8a-9213316abb6e', resource_group='ADS_Book'), name=myaks, id=/subscriptions/ab8f5415-63b3-4fd4-8a8a-9213316abb6e/resourceGroups/ADS_Book/providers/Microsoft.MachineLearningServices/workspaces/ADS_AMLworkspace/computes/myaks, type=AKS, provisioning_state=Succeeded, location=westus2, tags=None)

In [167]:
deployment_config = AksWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1)
model = Model(ws, name='Gradientboosting_bankmarketing', version= 5)
service = Model.deploy(ws, "gb-bankmarketing", [model], inference_config, deployment_config, aks_target)
service.wait_for_deployment(show_output = True)

# inference_config = InferenceConfig(entry_script='score.py', environment=environment)

Running........
Succeeded
AKS service creation operation finished, operation "Succeeded"
