### Research and Debugging

In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

import mlflow
from mlflow.models import infer_signature

import os
import yaml
import pickle

In [2]:
# Setting mlflow configuration with dagshub
# The tracking URI and the username are not secret, so they are kept as
# defaults that an already-exported environment variable can override.
os.environ.setdefault('MLFLOW_TRACKING_URI', "https://dagshub.com/siddmirjank2696/Diabetes-Detection-MLflow.mlflow")
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', "siddmirjank2696")

# SECURITY: the access token must never be written into the notebook, because
# notebooks are committed and shared. It has to be exported in the environment
# before Jupyter is launched. Any token that was previously committed here
# should be treated as leaked and revoked.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    print("WARNING: MLFLOW_TRACKING_PASSWORD is not set; export your DagsHub access token before running the MLflow cells.")

In [3]:
# Accessing the contents of the yaml file
# (the context manager closes the file handle as soon as parsing is done,
# instead of leaving it open for the lifetime of the kernel)
with open("../params.yaml") as params_file:
    params = yaml.safe_load(params_file)['train']

# Loading the data from the path configured under train.input
df = pd.read_csv(params['input'])

df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.000000,0.500745,0.234415,0.483333,1
1,0.058824,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667,0
2,0.470588,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.183333,1
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000,0
4,0.000000,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000,1
...,...,...,...,...,...,...,...,...,...
763,0.588235,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000,0
764,0.117647,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000,0
765,0.294118,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000,0
766,0.058824,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333,1


In [4]:
# Separating the target column from the feature columns
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Holding out 20% of the rows as the test set; the seed is read from params
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=params['random_state'],
)

# Displaying the shapes of the four resulting splits
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((614, 8), (614,), (154, 8), (154,))

In [5]:
# Creating a function to train the model
def train(X_train, y_train, X_test, func, y_test=None):
    """Fit a default-configured model and return its accuracy on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test
        Held-out features, passed to ``model.predict``.
    func
        Zero-argument callable (for example an estimator class) returning an
        unfitted model that exposes ``fit`` and ``predict``.
    y_test : optional
        True labels matching ``X_test``. When omitted, the notebook-level
        ``y_test`` variable is used, which is what this function previously
        always relied on implicitly.

    Returns
    -------
    The value returned by ``accuracy_score(y_test, y_pred)``.
    """

    # Backward compatibility: existing four-argument calls keep working by
    # falling back to the global split; new callers should pass the labels
    # explicitly so the score cannot silently use a mismatched ``y_test``.
    if y_test is None:
        y_test = globals()["y_test"]

    # Creating a model object
    model = func()

    # Fitting the data to the model
    model.fit(X_train, y_train)

    # Making predictions on the unseen test data
    y_pred = model.predict(X_test)

    # Calculating and returning the accuracy of the model
    return accuracy_score(y_test, y_pred)

In [6]:
# Scoring Naive Bayes with its default settings
nb_accuracy = train(X_train, y_train, X_test, MultinomialNB)
print(f"Naïve Bayes Classification : {nb_accuracy}")

# Scoring Random Forest with its default settings
rf_accuracy = train(X_train, y_train, X_test, RandomForestClassifier)
print(f"Random Forest Classification : {rf_accuracy}")

# Scoring XGBoost with its default settings
xgb_accuracy = train(X_train, y_train, X_test, XGBClassifier)
print(f"XGBoost Classification : {xgb_accuracy}")

# Announcing the selected model after an empty separator line
print("")
print("Going ahead with Random Forest Classification!")

Naïve Bayes Classification : 0.6428571428571429
Random Forest Classification : 0.7402597402597403
XGBoost Classification : 0.7077922077922078

Going ahead with Random Forest Classification!


In [7]:
# Creating a function to perform hyperparameter tuning
def hyperparameter_tuning(X_train, y_train, X_test):
    """Grid-search Random Forest hyperparameters with 3-fold cross-validation.

    The search covers every combination of three ``n_estimators`` values,
    three split criteria and three ``max_features`` settings, using all
    available cores (``n_jobs=-1``).

    ``X_test`` is accepted to keep the call signature stable but is not used
    by the search itself.

    Returns the ``GridSearchCV`` object after it has been fitted on
    ``X_train`` / ``y_train``.
    """

    # Base estimator, seeded from the notebook parameters
    forest = RandomForestClassifier(random_state=params['random_state'])

    # Search space and cross-validation settings
    search = GridSearchCV(
        estimator=forest,
        param_grid={
            'n_estimators': [100, 200, 300],
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_features': ['sqrt', 'log2', None],
        },
        n_jobs=-1,
        cv=3,
    )

    # Running the search on the training data only
    search.fit(X_train, y_train)

    return search

# Setting mlflow tracking uri
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI'))

# Setting mlflow experiment name
mlflow.set_experiment(params['experiment_name'])

# Defining the model signature from the training features and labels
signature = infer_signature(X_train, y_train)

# Starting mlflow experiment
with mlflow.start_run():

    # Performing hyperparameter tuning
    grid_search = hyperparameter_tuning(X_train, y_train, X_test)

    # Retrieving the best model
    best_model = grid_search.best_estimator_

    # Retrieving the best parameters
    best_params = grid_search.best_params_

    # Logging the best parameters
    mlflow.log_params(best_params)

    # Making predictions on the unseen test data
    y_pred = best_model.predict(X_test)

    # Calculating the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)

    # Logging the accuracy
    mlflow.log_metric("accuracy", accuracy)

    # Logging the model
    model_info = mlflow.sklearn.log_model(
        sk_model = best_model,
        artifact_path = "diabetes_model",
        signature = signature
    )

# Displaying the accuracy
# (a bare expression is only rendered when it is the last top-level statement
# of the cell; while it sat inside the `with` body it was silently discarded)
accuracy



🏃 View run invincible-hound-493 at: https://dagshub.com/siddmirjank2696/Diabetes-Detection-MLflow.mlflow/#/experiments/0/runs/4b6d351e6249472eaa7556bff4490929
🧪 View experiment at: https://dagshub.com/siddmirjank2696/Diabetes-Detection-MLflow.mlflow/#/experiments/0


In [8]:
# Creating a PCA object that keeps 4 components
pca = PCA(n_components=4)

# Performing dimensionality reduction on the train data
X_train_pca = pca.fit_transform(X_train)

# Performing dimensionality reduction on the test data
# (transform only: the projection is learned from the training split)
X_test_pca = pca.transform(X_test)

# NOTE: every result of this experiment is stored under a dedicated *_pca
# name. Reusing `grid_search` / `best_model` / `accuracy` here would overwrite
# the model tuned on the full feature set, and the save cell further down
# pickles whatever `best_model` currently points to.

# Performing hyperparameter tuning on the reduced features
grid_search_pca = hyperparameter_tuning(X_train_pca, y_train, X_test_pca)

# Retrieving the best model of the PCA experiment
best_model_pca = grid_search_pca.best_estimator_

# Making predictions on the unseen (reduced) test data
y_pred_pca = best_model_pca.predict(X_test_pca)

# Calculating the accuracy of the PCA-based model
accuracy_pca = accuracy_score(y_test, y_pred_pca)

# Displaying the accuracy
accuracy_pca

0.6753246753246753

In [9]:
# Displaying to the user
# Conclusion of the PCA experiment: in the recorded run the model tuned on the
# 4-component projection reached about 0.675 test accuracy, lower than the
# about 0.740 obtained earlier by the default Random Forest on all features.
print("PCA is not necessary!")

PCA is not necessary!


In [10]:
# Creating the directory that will hold the pickled model
# (derived from params['output'] so the folder always matches the configured
# target path; "." covers a bare file name with no directory part)
os.makedirs(os.path.dirname(params['output']) or ".", exist_ok=True)

# Saving the best model in the desired directory
# (the context manager flushes and closes the file even if pickling fails)
with open(params['output'], 'wb') as model_file:
    pickle.dump(best_model, model_file)

### Actual Implementation

In [11]:
# Importing the required libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import mlflow
from mlflow.models import infer_signature

import os
import yaml
import pickle

# Setting mlflow configuration with dagshub
# The tracking URI and the username are not secret, so they are kept as
# defaults that an already-exported environment variable can override.
os.environ.setdefault('MLFLOW_TRACKING_URI', "https://dagshub.com/siddmirjank2696/Diabetes-Detection-MLflow.mlflow")
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', "siddmirjank2696")

# SECURITY: the access token must never be written into the notebook, because
# notebooks are committed and shared. It has to be exported in the environment
# before Jupyter is launched. Any token that was previously committed here
# should be treated as leaked and revoked.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    print("WARNING: MLFLOW_TRACKING_PASSWORD is not set; export your DagsHub access token before running the MLflow cells.")

# Accessing the contents of the yaml file
# (the context manager closes the file handle as soon as parsing is done)
with open("../params.yaml") as params_file:
    params = yaml.safe_load(params_file)['train']

# Loading the data from the path configured under train.input
df = pd.read_csv(params['input'])

# Splitting the data and labels
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Splitting the data and labels into train and test (20% held out, seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=params['random_state'])

# Creating a function to perform hyperparameter tuning
def hyperparameter_tuning(X_train, y_train, X_test):
    """Run a 3-fold grid search over Random Forest settings.

    Candidates are the full cross product of ``n_estimators``, ``criterion``
    and ``max_features`` listed below; folds are evaluated in parallel on all
    cores (``n_jobs=-1``).

    ``X_test`` is part of the signature but is not used by the search.

    Returns the ``GridSearchCV`` object fitted on ``X_train`` / ``y_train``.
    """

    # Candidate values for each tuned hyperparameter
    search_space = dict(
        n_estimators=[100, 200, 300],
        criterion=['gini', 'entropy', 'log_loss'],
        max_features=['sqrt', 'log2', None],
    )

    # Seeded base estimator wrapped in the cross-validated search
    tuner = GridSearchCV(
        estimator=RandomForestClassifier(random_state=params['random_state']),
        param_grid=search_space,
        n_jobs=-1,
        cv=3,
    )

    # Fitting every candidate on the training data
    tuner.fit(X_train, y_train)

    return tuner

# Creating a function to train a Machine Learning Model
def train(X_train, y_train, X_test, y_test=None):
    """Tune a Random Forest, log the run to MLflow and pickle the best model.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used for the grid search and for
        inferring the logged model signature.
    X_test
        Held-out features used to score the best estimator.
    y_test : optional
        True labels matching ``X_test``. When omitted, the notebook-level
        ``y_test`` variable is used, which is what this function previously
        always relied on implicitly.

    Side effects
    ------------
    Starts an MLflow run that logs the best parameters, the test accuracy and
    the fitted model; writes the fitted model to ``params['output']``; prints
    a confirmation message. Returns ``None``.
    """

    # Backward compatibility: three-argument calls keep working by falling
    # back to the global split; passing the labels explicitly avoids scoring
    # against a mismatched ``y_test`` left over in the kernel.
    if y_test is None:
        y_test = globals()["y_test"]

    # Setting mlflow tracking uri
    mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI'))

    # Setting mlflow experiment name
    mlflow.set_experiment(params['experiment_name'])

    # Defining the model signature
    signature = infer_signature(X_train, y_train)

    # Starting mlflow experiment
    with mlflow.start_run():

        # Performing hyperparameter tuning
        grid_search = hyperparameter_tuning(X_train, y_train, X_test)

        # Retrieving the best model
        best_model = grid_search.best_estimator_

        # Logging the best parameters
        mlflow.log_params(grid_search.best_params_)

        # Making predictions on the unseen test data
        y_pred = best_model.predict(X_test)

        # Calculating the accuracy of the model
        accuracy = accuracy_score(y_test, y_pred)

        # Logging the accuracy
        mlflow.log_metric("accuracy", accuracy)

        # Logging the model
        mlflow.sklearn.log_model(
            sk_model = best_model,
            artifact_path = "diabetes_model",
            signature = signature
        )

    # Creating the directory that will hold the pickled model
    # (derived from params['output'] so it always matches the target path;
    # "." covers a bare file name with no directory part)
    os.makedirs(os.path.dirname(params['output']) or ".", exist_ok=True)

    # Saving the best model in the desired directory
    # (the context manager flushes and closes the file even if pickling fails)
    with open(params['output'], 'wb') as model_file:
        pickle.dump(best_model, model_file)

    # Displaying the success statement
    print("\nThe model was successfully saved!")

# Running the full pipeline, scoring against the explicit test labels
train(X_train, y_train, X_test, y_test)



🏃 View run righteous-carp-697 at: https://dagshub.com/siddmirjank2696/Diabetes-Detection-MLflow.mlflow/#/experiments/0/runs/ff58b3185b614f61bc393b8ad2b67989
🧪 View experiment at: https://dagshub.com/siddmirjank2696/Diabetes-Detection-MLflow.mlflow/#/experiments/0

The model was successfully saved!
