In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from plotnine import ggplot,geom_bar,theme_bw,labs,coord_flip,aes
import time

## Basic EDA

In [None]:
# Load the raw income dataset; index_col=False tells pandas not to promote
# the first CSV column to the index.
df = pd.read_csv(
    "../data/income_data.csv",
    index_col=False,
)

In [None]:
# First look at the data: leading rows, schema/dtypes, and summary statistics.
print(df.head(), '\n')
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print()
print(df.describe(), '\n')

In [None]:
# Draw one proportional bar chart per categorical predictor and save it to disk.
plot_dir = "categorical_variable_plots"
# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable.
os.makedirs(plot_dir, exist_ok=True)

# The last column is excluded — presumably it is the Target label; verify against the CSV.
categorical_columns = df.iloc[:, :-1].select_dtypes(include='object').columns
for col in categorical_columns:
    print(f'Variable {col}  \n ')
    print(df[col].value_counts())
    # position='fill' scales every bar to 1 so the Target share is comparable across levels.
    plot = (
        ggplot(df)
        + geom_bar(aes(x=df[col], fill=df.Target), position='fill')
        + theme_bw()
        + labs(title=f'Variable {col} ~ Target')
        + coord_flip()
    )
    print(plot)
    plot.save(f"{plot_dir}/Variable {col}")


## Setting Up MLflow

In [None]:
import mlflow
import uuid

In [None]:
# Point MLflow at the tracking server, then select the experiment to log to.
# set_experiment creates the experiment when no experiment with that name exists.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "income-classifier"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)


In [None]:
# Repeat the categorical EDA inside an MLflow run so the plots are tracked as artifacts.
with mlflow.start_run(run_name=f"eda-{uuid.uuid4()}"):
    plot_dir = "categorical_variable_plots"
    # Create the output directory once, before the loop, rather than re-checking per column.
    os.makedirs(plot_dir, exist_ok=True)

    for col in df.iloc[:, :-1].select_dtypes(include='object').columns:
        print(f'Variable {col}  \n ')
        print(df[col].value_counts())

        plot = (
            ggplot(df)
            + geom_bar(aes(x=df[col], fill=df.Target), position='fill')
            + theme_bw()
            + labs(title=f'Variable {col} ~ Target')
            + coord_flip()
        )
        print(plot)
        plot.save(f"{plot_dir}/Variable {col}")

    # Upload the directory a single time, after every plot has been written.
    # Logging it inside the loop would re-upload all earlier plots on each pass.
    mlflow.log_artifacts(plot_dir)


### Model Training

Train Test Split

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np
# Fixed seed so the split, and every metric derived from it, is reproducible
# under Restart Kernel -> Run All.
RANDOM_STATE = 42

target = df.Target
feature_df = df.drop('Target', axis=1)

# Encode the target. With drop='if_binary' a two-category Target becomes a
# single 0/1 column; the result is a 2-D array of shape (n_rows, 1) in that case.
encoder = OneHotEncoder(sparse_output=False, drop='if_binary')
target = encoder.fit_transform(np.array(target).reshape(-1, 1))

# One-hot encode the categorical predictors, dropping the first level of each.
dummyfied_df = pd.get_dummies(feature_df, drop_first=True, sparse=False, dtype=float)
col_list = dummyfied_df.columns.to_list()

# dummyfied_df already has exactly col_list as its columns, so it is passed
# as-is (re-indexing it on its own columns would change nothing).
X_train, X_test, y_train, y_test = train_test_split(
    dummyfied_df,
    target,
    train_size=0.80,
    shuffle=True,
    random_state=RANDOM_STATE,
)

Utilities

In [None]:
from minio import Minio
# MinIO client used to store dataset snapshots. Endpoint and credentials are
# read from the environment so real secrets never have to live in the
# notebook; the fallbacks are the local-development values.
minioClient = Minio(
    os.environ.get("MINIO_ENDPOINT", "localhost:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "minio"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "minio123"),
    secure=False,
)

# The training cells upload into this bucket; warn early if it is missing.
found = minioClient.bucket_exists("mlflow-datasets")
if not found:
    print("PLEASE CREATE BUCKET IN MINIO BEFORE PROCEEDING")


In [None]:
from io import BytesIO
def save_df_to_minio(df, bucket_name, path, client=None):
    """Serialise a DataFrame to CSV and upload it as a single MinIO object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to upload. It is written without its index.
    bucket_name : str
        Name of the destination bucket.
    path : str
        Object name (key) inside the bucket.
    client : optional
        Any object exposing a MinIO-compatible ``put_object`` method.
        Defaults to the notebook-level ``minioClient``.
    """
    if client is None:
        client = minioClient

    csv_bytes = df.to_csv(index=False).encode('utf-8')
    client.put_object(
        bucket_name,
        path,
        data=BytesIO(csv_bytes),
        # Byte length of the encoded payload, not the character count.
        length=len(csv_bytes),
        # text/csv is the registered media type for CSV (RFC 4180).
        content_type='text/csv',
    )

### Training with MLflow

Decision Tree Classifier

In [None]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import  roc_auc_score
BUCKET_NAME = "mlflow-datasets"

# Train a decision tree inside an MLflow run, snapshotting the exact datasets
# used in MinIO and recording them as inputs of the run.
with mlflow.start_run() as run:
    tree = DecisionTreeClassifier()
    run_id = run.info.run_id

    # Attach the labels with assign(): X_train / X_test keep the shuffled row
    # index produced by train_test_split, so concatenating a Series that has a
    # fresh 0..n-1 index along axis=1 would align on index, put labels on the
    # wrong rows and pad the frame with NaN.
    train_df = X_train.assign(Target=y_train.ravel())
    test_df = X_test.assign(Target=y_test.ravel())

    # Upload run-specific snapshots so each run's data can be retrieved later.
    feature_df_path = f"income-classifier-datasets/feature_df-{run_id}.csv"
    train_df_path = f"income-classifier-datasets/train-{run_id}.csv"
    test_df_path = f"income-classifier-datasets/test-{run_id}.csv"
    save_df_to_minio(feature_df, BUCKET_NAME, feature_df_path)
    save_df_to_minio(train_df, BUCKET_NAME, train_df_path)
    save_df_to_minio(test_df, BUCKET_NAME, test_df_path)

    # Wrap the frames as MLflow datasets whose source is the uploaded object.
    training_dataset = mlflow.data.from_pandas(train_df, source=f"{BUCKET_NAME}/{train_df_path}")
    test_dataset = mlflow.data.from_pandas(test_df, source=f"{BUCKET_NAME}/{test_df_path}")
    feature_dataset = mlflow.data.from_pandas(feature_df, source=f"{BUCKET_NAME}/{feature_df_path}")

    # Link the datasets to the run, tagged by the role they play.
    mlflow.log_input(training_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")
    mlflow.log_input(feature_dataset, context="reference")

    tree.fit(X_train, y_train.ravel())

    # Column 1 of predict_proba is the probability of the class labelled 1.
    roc_auc_score_train = roc_auc_score(y_train == 1, tree.predict_proba(X_train)[:, 1])
    roc_auc_score_test = roc_auc_score(y_test == 1, tree.predict_proba(X_test)[:, 1])
    training_accuracy = tree.score(X_train, y_train)
    test_accuracy = tree.score(X_test, y_test)

    mlflow.log_metric("roc_auc_score_train", roc_auc_score_train)
    print(f'Roc Auc Score train: {roc_auc_score_train}  \n')
    mlflow.log_metric("roc_auc_score_test", roc_auc_score_test)
    print(f'Roc Auc Score test: {roc_auc_score_test}  \n')
    mlflow.log_metric("training_accuracy", training_accuracy)
    print(f'Accuracy train : {training_accuracy}')
    mlflow.log_metric("test_accuracy", test_accuracy)
    print(f'Accuracy test : {test_accuracy}')

    # Persist the fitted model and the hyper-parameters it was built with.
    mlflow.sklearn.log_model(tree, "income-classifier")
    mlflow.log_params(tree.get_params())


Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest inside its own MLflow run, snapshotting the datasets
# in MinIO and recording them as inputs of the run.
with mlflow.start_run() as run:
    forest = RandomForestClassifier()
    # Take the id from THIS run. Reusing a run_id left over from an earlier
    # cell would overwrite that run's objects in MinIO and mislabel these.
    run_id = run.info.run_id

    # Attach the labels with assign(): X_train / X_test keep the shuffled row
    # index produced by train_test_split, so concatenating a Series that has a
    # fresh 0..n-1 index along axis=1 would align on index, put labels on the
    # wrong rows and pad the frame with NaN.
    train_df = X_train.assign(Target=y_train.ravel())
    test_df = X_test.assign(Target=y_test.ravel())

    # Upload run-specific snapshots so each run's data can be retrieved later.
    feature_df_path = f"income-classifier-datasets/feature_df-{run_id}.csv"
    train_df_path = f"income-classifier-datasets/train-{run_id}.csv"
    test_df_path = f"income-classifier-datasets/test-{run_id}.csv"
    save_df_to_minio(train_df, BUCKET_NAME, train_df_path)
    save_df_to_minio(feature_df, BUCKET_NAME, feature_df_path)
    save_df_to_minio(test_df, BUCKET_NAME, test_df_path)

    # Each dataset's source must point at its own uploaded object.
    feature_dataset = mlflow.data.from_pandas(feature_df, source=f"{BUCKET_NAME}/{feature_df_path}")
    training_dataset = mlflow.data.from_pandas(train_df, source=f"{BUCKET_NAME}/{train_df_path}")
    test_dataset = mlflow.data.from_pandas(test_df, source=f"{BUCKET_NAME}/{test_df_path}")

    # Same context labels as the decision tree run so runs can be compared.
    mlflow.log_input(training_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")
    mlflow.log_input(feature_dataset, context="reference")

    # Time only the fit itself and record it, so the measurement is not
    # dominated by the dataset uploads.
    start = time.time()
    forest.fit(X_train, y_train.ravel())
    end = time.time()
    mlflow.log_metric("time_to_fit", end - start)

    # Column 1 of predict_proba is the probability of the class labelled 1.
    roc_auc_score_train = roc_auc_score(y_train == 1, forest.predict_proba(X_train)[:, 1])
    roc_auc_score_test = roc_auc_score(y_test == 1, forest.predict_proba(X_test)[:, 1])
    training_accuracy = forest.score(X_train, y_train)
    test_accuracy = forest.score(X_test, y_test)

    mlflow.log_metric("roc_auc_score_train", roc_auc_score_train)
    print(f'Roc Auc Score train: {roc_auc_score_train}  \n')
    mlflow.log_metric("roc_auc_score_test", roc_auc_score_test)
    print(f'Roc Auc Score test: {roc_auc_score_test}  \n')
    mlflow.log_metric("training_accuracy", training_accuracy)
    print(f'Accuracy train : {training_accuracy}')
    mlflow.log_metric("test_accuracy", test_accuracy)
    print(f'Accuracy test : {test_accuracy}')

    # Persist the fitted model and the hyper-parameters it was built with.
    mlflow.sklearn.log_model(forest, "income-classifier")
    mlflow.log_params(forest.get_params())

XGBoost with MLflow autologging

In [None]:
import xgboost as xgb
from sklearn.metrics import  accuracy_score
# Train an XGBoost booster inside an MLflow run with autologging enabled,
# then log the same four evaluation metrics as the scikit-learn runs.
with mlflow.start_run():
    # Enable MLflow's XGBoost autologging for the xgb.train call below.
    mlflow.xgboost.autolog()

    n_round = 30
    dtrain = xgb.DMatrix(data=X_train, label=y_train.ravel())
    dtest = xgb.DMatrix(data=X_test, label=y_test.ravel())
    params = {
        "objective": "binary:logistic",
        'colsample_bytree': 1,
        'learning_rate': 1,
        'max_depth': 10,
        'subsample': 1,
    }

    # Train exactly once. A second xgb.train with a different objective in the
    # same run would replace this model, ignore n_round, and leave the
    # importance plot and the logged metrics describing different models.
    model = xgb.train(params, dtrain, n_round)

    ax = xgb.plot_importance(model, max_num_features=10, importance_type='cover')
    fig = ax.figure
    fig.set_size_inches(10, 8)

    # binary:logistic predicts probabilities: rank-based ROC AUC uses them
    # directly, while accuracy needs hard labels (0.5 decision threshold).
    proba_train = model.predict(dtrain)
    proba_test = model.predict(dtest)
    pred_train = (proba_train >= 0.5).astype(int)
    pred_test = (proba_test >= 0.5).astype(int)

    roc_auc_score_train = roc_auc_score(y_train.ravel() == 1, proba_train)
    roc_auc_score_test = roc_auc_score(y_test.ravel() == 1, proba_test)
    training_accuracy = accuracy_score(y_train.ravel(), pred_train)
    test_accuracy = accuracy_score(y_test.ravel(), pred_test)

    # Logged explicitly under the same names as the other runs so that runs
    # can be ranked against each other by these metrics.
    mlflow.log_metric("roc_auc_score_train", roc_auc_score_train)
    print(f'Roc Auc Score train: {roc_auc_score_train}  \n')
    mlflow.log_metric("roc_auc_score_test", roc_auc_score_test)
    print(f'Roc Auc Score test: {roc_auc_score_test}  \n')
    mlflow.log_metric("training_accuracy", training_accuracy)
    print(f'Accuracy train : {training_accuracy}')
    mlflow.log_metric("test_accuracy", test_accuracy)
    print(f'Accuracy test : {test_accuracy}')


## MLflow Client

In [None]:
from mlflow import MlflowClient

# Find the run with the best test ROC AUC in the experiment and register its model.
mlflow_client = MlflowClient()
experiment_name = "income-classifier"

experiment = mlflow_client.get_experiment_by_name(experiment_name)
if experiment is None:
    # get_experiment_by_name returns None for an unknown name; fail with a
    # clear message instead of an AttributeError on the next line.
    raise LookupError(f"MLflow experiment {experiment_name!r} was not found")

# Highest roc_auc_score_test first; only runs scoring above 0.8 qualify.
candidate_runs = mlflow_client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="metrics.roc_auc_score_test > 0.8",
    max_results=1,
    order_by=["metrics.roc_auc_score_test DESC"],
)
if not candidate_runs:
    raise LookupError(
        f"No run in experiment {experiment_name!r} has roc_auc_score_test > 0.8"
    )
run_object = candidate_runs[0]

# NOTE(review): this assumes the winning run logged its model under the
# artifact path "income-classifier" and is the random forest. Runs that used
# autologging may store the model under a different path — confirm before
# relying on the registered name.
model_uri = f"runs:/{run_object.info.run_id}/{experiment_name}"
mlflow.register_model(model_uri, "random-forest-classifier")