In [8]:
pip show xgboost

Name: xgboost
Version: 1.3.3
Summary: XGBoost Python Package
Home-page: https://github.com/dmlc/xgboost
Author: None
Author-email: None
License: Apache-2.0
Location: /anaconda/envs/azureml_py38/lib/python3.8/site-packages
Requires: numpy, scipy
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [4]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Identifiers of the Azure ML workspace this notebook works against
subscription_id = "13457d73-d2df-4297-b63a-2632b1c1b881"
resource_group = "rg-dic"
workspace = "mlw-dic"

# Workspace handle used by the cells below
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)

Class FeatureStoreOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class FeatureSetOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class FeatureStoreEntityOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


### Register Dataset

In [15]:
# Register the cleaned CSV as a data asset in the workspace
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data

my_path = './azure_dic_project/cleaned_data.csv'

# The asset is a single file (URI_FILE) located at my_path.
my_data = Data(
    name="credit-score-data-cleaned",
    description="Credit score brackets - Multi class classification dataset, cleaned",
    type=AssetTypes.URI_FILE,
    path=my_path,
)

# Kept as the last expression so the notebook displays the returned asset.
ml_client.data.create_or_update(my_data)

[32mUploading cleaned_data.csv[32m (< 1 MB): 100%|██████████| 22.8M/22.8M [00:00<00:00, 49.8MB/s]
[39m



Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'name': 'credit-score-data-cleaned', 'description': 'Credit score brackets - Multi class classification dataset, cleaned', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/13457d73-d2df-4297-b63a-2632b1c1b881/resourceGroups/rg-dic/providers/Microsoft.MachineLearningServices/workspaces/mlw-dic/data/credit-score-data-cleaned/versions/1', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/dic-compute/code/Users/pradeepsurya', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f3da4a7f7f0>, 'serialize': <msrest.serialization.Serializer object at 0x7f3d92672fe0>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/13457d73-d2df-4297-b63a-2632b1c1b881/resourcegroups/rg-dic/workspaces/mlw-dic/datastores/workspaceblobsto

In [7]:
# Print the name of every data asset returned by the workspace listing.
datasets = ml_client.data.list()
for data_asset in datasets:
    print(data_asset.name)

credit-score-data-cleaned


### Training Script

In [6]:
import os

# Folder that will hold the training script
script_folder = 'src'

# exist_ok=True lets this cell be re-run when the folder is already there.
os.makedirs(script_folder, exist_ok=True)
print(f"{script_folder} folder created")

src folder created


In [12]:
%%writefile $script_folder/train-model-mlflow.py
# import libraries
import mlflow
import argparse
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.metrics import f1_score
import xgboost as xgb
import matplotlib.pyplot as plt

def main(args):
    """Train and evaluate an XGBoost credit-score classifier, logging to MLflow.

    Reads the CSV at ``args.training_data``, encodes and scales the features,
    trains ``xgb.XGBClassifier`` with the hyperparameters stored in the JSON
    file at ``args.hparam_file``, then logs accuracy, weighted F1 and a
    feature-importance plot.

    Args:
        args: Parsed command-line namespace with ``training_data`` and
            ``hparam_file`` attributes (see ``parse_args``).

    Raises:
        ValueError: If ``Credit_Score`` holds a value other than 'Good',
            'Standard' or 'Poor'.
    """
    # enable autologging
    mlflow.autolog()

    # Read data asset
    print("Reading data...")
    df = pd.read_csv(args.training_data)

    # Read model hyperparameters from json file
    with open(args.hparam_file) as f:
        hyperparameters = json.load(f)

    # Label Encoding
    # NOTE(review): LabelEncoder numbers classes in sorted order, so if 'Month'
    # holds month names the codes are alphabetical, not chronological - confirm.
    le = LabelEncoder()
    df['Month'] = le.fit_transform(df['Month'])

    # One hot encoding
    encoded_df = pd.get_dummies(df, columns=['Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour'])

    # split data
    print("Splitting data...")
    label_map = {'Good':2, 'Standard':1, 'Poor':0}
    y = encoded_df['Credit_Score'].map(label_map)
    if y.isna().any():
        # .map() turns unknown labels into NaN; fail here with a clear message
        # instead of letting NaN targets reach the split / the model.
        unexpected = sorted(encoded_df.loc[y.isna(), 'Credit_Score'].astype(str).unique())
        raise ValueError(f"Unexpected Credit_Score labels: {unexpected}")
    X = encoded_df.drop(['Credit_Score'], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=587)

    # Scaling numerical features
    # The scaler is fit on the training split only and then applied to the test
    # split, so no test-set statistics leak into preprocessing.
    cols = ['Month','Age','Annual_Income','Monthly_Inhand_Salary','Num_Bank_Accounts','Num_Credit_Card','Interest_Rate',
        'Num_of_Loan', 'Delay_from_due_date','Num_of_Delayed_Payment','Changed_Credit_Limit','Num_Credit_Inquiries',
        'Outstanding_Debt', 'Credit_Utilization_Ratio','Credit_History_Age','Total_EMI_per_month','Amount_invested_monthly']
    scaler = MinMaxScaler()
    # Work on explicit copies so the column assignments below do not write
    # through to (or warn about) slices of the original frame.
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[cols] = scaler.fit_transform(X_train[cols])
    X_test[cols] = scaler.transform(X_test[cols])

    # train model
    print("Training model...")
    model = xgb.XGBClassifier(**hyperparameters)
    model.fit(X_train, y_train)

    # evaluate model
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)
    print('Accuracy:', acc)
    mlflow.log_metric("Accuracy", acc)

    f1 = f1_score(y_test, y_hat, average='weighted')
    print('F1 score:', f1)
    mlflow.log_metric("F1-score", f1)

    # Feature importances, most important first
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    feature_names = X_train.columns

    fig, ax = plt.subplots(figsize=(6, 4))
    positions = list(range(len(importances)))
    ax.bar(positions, importances[indices], align='center')
    ax.set_xticks(positions)
    ax.set_xticklabels([feature_names[i] for i in indices], rotation='vertical')
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    ax.set_title('XGBoost Feature Importance')
    # bbox_inches="tight" keeps the vertical tick labels inside the saved image.
    fig.savefig("xgb_feature_importance.png", bbox_inches="tight")
    # Release the figure once it is on disk.
    plt.close(fig)
    mlflow.log_artifact("xgb_feature_importance.png")


def parse_args(argv=None):
    """Parse the command-line options of the training script.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads them from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``training_data`` (value of
        ``--training_data``) and ``hparam_file`` (value of ``--json_file``).

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            an unknown option is given.
    """
    parser = argparse.ArgumentParser()
    # Both options are required: main() reads both paths unconditionally, so a
    # missing one should fail here with a usage message rather than later.
    parser.add_argument("--training_data", dest='training_data', type=str, required=True,
                        help="Path to the CSV file containing the training data")
    parser.add_argument("--json_file", dest="hparam_file", type=str, required=True,
                        help="Path to the JSON file containing hyperparameters")

    return parser.parse_args(argv)


if __name__ == "__main__":
    # Frame the run's console output between two banner lines.
    banner = "*" * 60

    print("\n\n")
    print(banner)

    main(parse_args())

    print(banner)
    print("\n\n")

Overwriting src/train-model-mlflow.py


### Model Training

In [13]:
!pwd

/mnt/batch/tasks/shared/LS_root/mounts/clusters/dic-compute/code/Users/pradeepsurya


In [14]:
from azure.ai.ml import Input, command
from azure.ai.ml.constants import AssetTypes

# Version 1 of the registered data asset is handed to the script as a single file.
job_input = {
    "data": Input(
        path="azureml:credit-score-data-cleaned:1",
        type=AssetTypes.URI_FILE,
    )
}

# Command job that runs the training script from ./src on the cluster.
# NOTE(review): --json_file is "xgb_hyperparameters" with no extension and the
# script opens that path as given - confirm a file with exactly this name is in ./src.
# NOTE(review): the environment is a LightGBM image while the script imports
# xgboost - confirm the image provides it.
job = command(
    experiment_name="credit-score-xgb-mlflow",
    display_name="xgb-train-optimal",
    compute="anl-cluster",
    environment="AzureML-lightgbm-3.2-ubuntu18.04-py37-cpu@latest",
    code="./src",
    inputs=job_input,
    command="python train-model-mlflow.py --training_data ${{inputs.data}} --json_file xgb_hyperparameters",
)

# Submit the job and print the Studio link for following its progress.
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor job at", aml_url)

[32mUploading src (0.0 MBs): 100%|██████████| 3628/3628 [00:00<00:00, 48812.28it/s]
[39m



Monitor job at https://ml.azure.com/runs/happy_tooth_krzs2642pg?wsid=/subscriptions/13457d73-d2df-4297-b63a-2632b1c1b881/resourcegroups/rg-dic/workspaces/mlw-dic&tid=234a6691-d3c4-40e4-86a9-01996278dd47


### Register the Model

In [None]:
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes

job_name = returned_job.name

# The model artifacts only exist once the training job has finished, so block
# here (streaming the job's logs) until it completes before registering.
ml_client.jobs.stream(job_name)

# Register the MLflow model stored under the job's "model" artifact path.
run_model = Model(
    path=f"azureml://jobs/{job_name}/outputs/artifacts/paths/model/",
    name="credit-score-xgb",
    description="XGBoost credit score classifier created from the training run.",
    type=AssetTypes.MLFLOW_MODEL,
)
ml_client.models.create_or_update(run_model)

### Create Endpoint

In [None]:
from azure.ai.ml.entities import ManagedOnlineEndpoint
import datetime

# Timestamp-based suffix so each run of this cell gets a different endpoint name.
online_endpoint_name = "endpoint-" + datetime.datetime.now().strftime("%m%d%H%M%f")

# create an online endpoint with key-based authentication
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="Online endpoint for MLflow credit score model",
    auth_mode="key",
)

# .result() blocks until the create/update operation has finished.
ml_client.begin_create_or_update(endpoint).result()

### Configure Deployment

In [None]:
from azure.ai.ml.entities import Model, ManagedOnlineDeployment
from azure.ai.ml.constants import AssetTypes

# create a blue deployment
# NOTE(review): this deploys a local ./model folder rather than the model
# registered from the training job - confirm which one is intended.
model = Model(
    path="./model",
    type=AssetTypes.MLFLOW_MODEL,
    description="my sample mlflow model",
)

blue_deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name=online_endpoint_name,
    model=model,
    instance_type="Standard_F4s_v2",
    instance_count=1,
)

# .result() blocks until the deployment operation has finished.
ml_client.online_deployments.begin_create_or_update(blue_deployment).result()

# Send all endpoint traffic to the blue deployment; this cell did not assign
# it any traffic before.
endpoint.traffic = {"blue": 100}
ml_client.begin_create_or_update(endpoint).result()

In [None]:
# Fetch the endpoint's details from the workspace
endpoint = ml_client.online_endpoints.get(name=online_endpoint_name)

# Print the existing traffic details, then the scoring URI, one per line.
print(endpoint.traffic, endpoint.scoring_uri, sep="\n")