### Research

In [1]:
# Importing the required libraries
import pandas as pd
import yaml
import pickle
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

import mlflow

In [2]:
# Accessing the params.yaml file
# The context manager closes the file handle as soon as the YAML is parsed,
# instead of leaving it open for the lifetime of the kernel.
with open('params.yaml') as params_file:
    params = yaml.safe_load(params_file)['train']

In [3]:
# Reading the features and the labels from the paths configured in params.yaml
features_frame = pd.read_csv(params['input_data_path'])
labels_frame = pd.read_csv(params['input_label_path'])

# Converting both frames to numpy arrays; the labels are flattened to 1-D
X = features_frame.values
y = labels_frame.values.ravel()

# Holding out 20% of the rows for validation (fixed seed keeps the split repeatable)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Displaying the shapes of the four resulting arrays
tuple(part.shape for part in (X_train, y_train, X_val, y_val))

((46914, 26), (46914,), (11729, 26), (11729,))

In [4]:
# Creating a function to calculate accuracy
def calc_accuracy(X_train, y_train, Model, X_eval=None, y_eval=None):
    """Fit a default-configured ``Model`` and return its accuracy on held-out data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``Model().fit``.
    Model
        Zero-argument callable (typically an estimator class) that returns an
        object exposing ``fit`` and ``predict``.
    X_eval, y_eval : optional
        Held-out features and labels to score against. When both are omitted,
        the notebook-level ``X_val`` / ``y_val`` are used, which is what the
        three-argument call form has always relied on.

    Returns
    -------
    The value of ``accuracy_score(y_eval, predictions)``.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.
    """
    # Half-specified evaluation data would silently mix caller data with globals
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")

    # Falling back to the notebook-level validation split
    if X_eval is None:
        X_eval, y_eval = X_val, y_val

    # Creating a model object
    model = Model()

    # Training the model
    model.fit(X_train, y_train)

    # Predicting the validation results
    y_pred = model.predict(X_eval)

    # Calculating and returning the accuracy
    return accuracy_score(y_eval, y_pred)

# Displaying the various accuracies
# One (label, estimator class) row per candidate keeps the report format in a
# single place; adding a model to the comparison is a one-line change.
candidate_models = (
    ("Multinomial Naive Bayes", MultinomialNB),
    ("Random Forest", RandomForestClassifier),
    ("XGBoost", XGBClassifier),
)

for model_label, model_class in candidate_models:
    print(f"{model_label} Accuracy: {calc_accuracy(X_train, y_train, model_class)}")

Multinomial Naive Bayes Accuracy: 0.8879699889163611
Random Forest Accuracy: 0.9523403529712678
XGBoost Accuracy: 0.9530224230539688


In [5]:
# Search space for the classifier: every combination of these values is tried
xgb_params = dict(
    learning_rate=[None, 0.1, 0.01],
    n_estimators=[None, 200, 400, 500],
    max_depth=[None, 3, 4, 5, 6, 7],
)

# Base estimator whose hyperparameters are being searched
xgb = XGBClassifier()

# Exhaustive 3-fold cross-validated search, run on all available cores
search_settings = dict(estimator=xgb, param_grid=xgb_params, cv=3, n_jobs=-1, verbose=2)
grid_search = GridSearchCV(**search_settings)

# Running the search on the training split
grid_search.fit(X_train, y_train)

# Predicting on the validation split with the fitted search object
y_pred = grid_search.predict(X_val)

# Scoring those predictions against the validation labels
accuracy = accuracy_score(y_val, y_pred)

accuracy

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] END learning_rate=None, max_depth=None, n_estimators=None; total time=   1.9s
[CV] END learning_rate=None, max_depth=None, n_estimators=None; total time=   1.9s
[CV] END learning_rate=None, max_depth=None, n_estimators=None; total time=   1.9s
[CV] END .learning_rate=None, max_depth=3, n_estimators=None; total time=   1.3s
[CV] END .learning_rate=None, max_depth=3, n_estimators=None; total time=   1.3s
[CV] END .learning_rate=None, max_depth=3, n_estimators=None; total time=   1.3s
[CV] END learning_rate=None, max_depth=None, n_estimators=200; total time=   3.6s
[CV] END learning_rate=None, max_depth=None, n_estimators=200; total time=   3.8s
[CV] END learning_rate=None, max_depth=None, n_estimators=200; total time=   3.6s
[CV] END ..learning_rate=None, max_depth=3, n_estimators=200; total time=   2.1s
[CV] END ..learning_rate=None, max_depth=3, n_estimators=200; total time=   2.1s
[CV] END ..learning_rate=None, max_dep

0.9529371642936312

In [6]:
# Setting mlflow configuration with dagshub
# The tracking URI is defined once so that the environment variable and the
# explicit mlflow call cannot drift apart (they previously named two different
# repositories).
MLFLOW_TRACKING_URI = "https://dagshub.com/siddmirjank2696/Loan-Approval-Prediction-AWS-Deployment.mlflow"
os.environ['MLFLOW_TRACKING_URI'] = MLFLOW_TRACKING_URI

# The username is not a secret; an already-exported value takes precedence
os.environ.setdefault('MLFLOW_TRACKING_USERNAME', "siddmirjank2696")

# The access token must never be stored in the notebook: it is read from the
# environment (exported in the shell, or loaded from an untracked file).
# NOTE(review): the token that used to be hard-coded in this cell should be
# treated as compromised and revoked.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    raise RuntimeError(
        "MLFLOW_TRACKING_PASSWORD is not set; export your DagsHub access token "
        "before running this cell."
    )

# Setting the mlflow tracking uri
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Setting the experiment name
mlflow.set_experiment("XgBoost Tracking")

# Starting an mlflow run
with mlflow.start_run():

    # Creating an xgboost object
    xgb = XGBClassifier()

    # Fitting the data to the model
    xgb.fit(X_train, y_train)

    # Performing predictions on the validation data
    y_pred = xgb.predict(X_val)

    # Calculating the accuracy
    accuracy = accuracy_score(y_val, y_pred)

    # Logging the accuracy
    mlflow.log_metric("accuracy", accuracy)

# Displaying the accuracy: only a top-level trailing expression is rendered as
# the cell output, so it has to sit outside the `with` block
accuracy

🏃 View run sassy-goat-552 at: https://dagshub.com/siddmirjank2696/Loan-Approval-Prediction-AWS-Deployment.mlflow/#/experiments/0/runs/29066c6f20f74adaa46dde3e5c5648ae
🧪 View experiment at: https://dagshub.com/siddmirjank2696/Loan-Approval-Prediction-AWS-Deployment.mlflow/#/experiments/0


In [7]:
# Loading the test data
person_test_data = pd.read_csv(params['person_data_path'])
loan_test_data = pd.read_csv(params['loan_data_path'])

# Merging the test data to form one csv file
predictions = pd.merge(person_test_data, loan_test_data, how="inner", on="id")

# Dropping id from the test data because it does not contribute to the prediction
test_data = predictions.drop("id", axis=1)

# Loading the transformer
# NOTE: unpickling executes code embedded in the file; only load a transformer
# that was produced by this project's own pipeline.
with open(params['transformer_path'], 'rb') as transformer_file:
    transformer = pickle.load(transformer_file)

# Transforming the test data into the same format as the train data
X_test = transformer.transform(test_data)

# Predicting the loan status on the test data
y_test = xgb.predict(X_test)

# Adding the predictions to the csv file
predictions["loan_status"] = y_test

# Creating the directories that will actually receive the predictions and the
# model, derived from the configured paths ("." covers a bare file name)
for artifact_path in (params['output_path'], params['model_path']):
    os.makedirs(os.path.dirname(artifact_path) or ".", exist_ok=True)

# Saving the predictions as a csv file
predictions.to_csv(params['output_path'], index=False)

# Saving the model as a pickle file; the context manager flushes and closes it
with open(params['model_path'], 'wb') as model_file:
    pickle.dump(xgb, model_file)

### Actual Implementation

In [8]:
# Importing the required libraries
import pandas as pd
import yaml
import pickle
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

# Accessing the params.yaml file
# The context manager closes the file handle as soon as the YAML is parsed,
# instead of leaving it open for the lifetime of the process.
with open('params.yaml') as params_file:
    params = yaml.safe_load(params_file)['train']

# Creating a function to train the data
def train_data(input_data_path, input_label_path, person_test_data_path, loan_test_data_path, transformer_path, output_path, model_path):
    """Train an XGBoost classifier, log its accuracy to mlflow, and save test predictions.

    Steps: read the features/labels, hold out 20% for validation, fit a
    default ``XGBClassifier`` inside an mlflow run and log its validation
    accuracy, then predict on the merged person/loan test data and write the
    predictions CSV and the pickled model.

    Args:
        input_data_path: CSV file with the training features.
        input_label_path: CSV file with the training labels.
        person_test_data_path: CSV file with the person half of the test data.
        loan_test_data_path: CSV file with the loan half of the test data.
        transformer_path: Pickle file holding the fitted transformer.
        output_path: Destination CSV for the test predictions.
        model_path: Destination pickle file for the trained model.

    Returns:
        None.

    Raises:
        RuntimeError: if the MLFLOW_TRACKING_PASSWORD environment variable is
            unset or empty.
    """
    # Local import: this cell's import list does not include mlflow, so without
    # it the function only works when an earlier cell has already imported it.
    import mlflow

    # Importing the data and labels
    X = pd.read_csv(input_data_path).values
    y = pd.read_csv(input_label_path).values.ravel()

    # Splitting the data into train and validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Setting mlflow configuration with dagshub
    # A single URI feeds both the environment variable and the explicit mlflow
    # call (they previously named two different repositories).
    tracking_uri = "https://dagshub.com/siddmirjank2696/Loan-Approval-Prediction-AWS-Deployment.mlflow"
    os.environ['MLFLOW_TRACKING_URI'] = tracking_uri

    # The username is not a secret; an already-exported value takes precedence
    os.environ.setdefault('MLFLOW_TRACKING_USERNAME', "siddmirjank2696")

    # The access token must come from the environment, never from source code.
    # NOTE(review): the token that used to be hard-coded here should be treated
    # as compromised and revoked.
    if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
        raise RuntimeError(
            "MLFLOW_TRACKING_PASSWORD is not set; export your DagsHub access token "
            "before training."
        )

    # Setting the mlflow tracking uri
    mlflow.set_tracking_uri(tracking_uri)

    # Setting the experiment name
    mlflow.set_experiment("XgBoost Tracking")

    # Starting an mlflow run
    with mlflow.start_run():

        # Creating an xgboost object
        xgb = XGBClassifier()

        # Fitting the data to the model
        xgb.fit(X_train, y_train)

        # Performing predictions on the validation data
        y_pred = xgb.predict(X_val)

        # Calculating the accuracy
        accuracy = accuracy_score(y_val, y_pred)

        # Logging the accuracy
        mlflow.log_metric("accuracy", accuracy)

    # Loading the test data
    person_test_data = pd.read_csv(person_test_data_path)
    loan_test_data = pd.read_csv(loan_test_data_path)

    # Merging the test data to form one csv file
    predictions = pd.merge(person_test_data, loan_test_data, how="inner", on="id")

    # Dropping id from the test data because it does not contribute to the prediction
    test_data = predictions.drop("id", axis=1)

    # Loading the transformer
    # NOTE: unpickling executes code embedded in the file; only load a
    # transformer that was produced by this project's own pipeline.
    with open(transformer_path, 'rb') as transformer_file:
        transformer = pickle.load(transformer_file)

    # Transforming the test data into the same format as the train data
    X_test = transformer.transform(test_data)

    # Predicting the loan status on the test data
    y_test = xgb.predict(X_test)

    # Adding the predictions to the csv file
    predictions["loan_status"] = y_test

    # Creating the directories that will actually receive the outputs, derived
    # from the paths the caller passed in ("." covers a bare file name)
    for artifact_path in (output_path, model_path):
        os.makedirs(os.path.dirname(artifact_path) or ".", exist_ok=True)

    # Saving the predictions as a csv file
    predictions.to_csv(output_path, index=False)

    # Saving the model as a pickle file; the context manager flushes and closes it
    with open(model_path, 'wb') as model_file:
        pickle.dump(xgb, model_file)

    # Displaying the success message
    print("\nThe model and predictions were saved successfully!")

# Running the training pipeline with every path taken from the 'train' section of params.yaml
train_data(
    input_data_path=params['input_data_path'],
    input_label_path=params['input_label_path'],
    person_test_data_path=params['person_data_path'],
    loan_test_data_path=params['loan_data_path'],
    transformer_path=params['transformer_path'],
    output_path=params['output_path'],
    model_path=params['model_path'],
)

🏃 View run tasteful-zebra-211 at: https://dagshub.com/siddmirjank2696/Loan-Approval-Prediction-AWS-Deployment.mlflow/#/experiments/0/runs/b6ebce83a615464a8a0a1e1c1d34544f
🧪 View experiment at: https://dagshub.com/siddmirjank2696/Loan-Approval-Prediction-AWS-Deployment.mlflow/#/experiments/0

The model and predictions were saved successfully!
