# Machine Learning Model Experiments

## Experiment Setup

In [1]:
# Setting up execution path
import os

print(f"Current working directory: {os.path.basename(os.getcwd())}")

# Change to root directory
os.chdir("../")
print(f"Current working directory (Changed): {os.path.basename(os.getcwd())}")

Current working directory: notebooks
Current working directory (Changed): Lending-Approval-Predictor


In [2]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# module setup
%matplotlib inline
pd.options.display.precision = 3
warnings.filterwarnings("ignore")

In [3]:
from os.path import dirname, normpath

In [4]:
from src.constants import CONFIGS
from src.exception import CustomException
from src.logger import logger
from src.utils.basic_utils import create_directories, read_yaml, save_as_pickle

In [5]:
# Read the configuration files
configs = read_yaml(CONFIGS).model_evaluation

# Train and test array paths
train_array_path = normpath(configs.train_array_path)
test_array_path = normpath(configs.test_array_path)

[2024-02-13 07:06:21 PM]:ProjectLogger INFO:basic_utils 43 - yaml file: conf\configs.yaml loaded successfully


In [6]:
# Load the training and test set array
train_array = np.load(train_array_path)
test_array = np.load(test_array_path)

# Split train_array into features and target
x_train, y_train = train_array[:, :-1], train_array[:, -1]
x_test, y_test = test_array[:, :-1], test_array[:, -1]

# Log the shapes
print(f"The shape of x_train: {x_train.shape}")
print(f"The shape of y_train: {y_train.shape}")

print(f"The shape of x_test: {x_test.shape}")
print(f"The shape of y_test: {y_test.shape}")

The shape of x_train: (7662, 19)
The shape of y_train: (7662,)
The shape of x_test: (1916, 19)
The shape of y_test: (1916,)


In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [8]:
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
}

In [12]:
hyper_params = {
    "Decision Tree": {
        "criterion": ["gini", "entropy", "log_loss"],
        "splitter": ["best", "random"],
        "max_features": ["sqrt", "log2"],
    },
    "Random Forest": {
        "criterion": ["gini", "entropy", "log_loss"],
        "max_features": ["sqrt", "log2"],
        "n_estimators": [8, 16, 32, 64, 128, 256],
    },
}

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV


def evaluate_classification_models(X_train: np.array, y_train: np.array, X_test: np.array, y_test: np.array, models: dict, params: dict, binary_classification:bool=True) -> list[dict]:
    """_summary_

    Args:
        X_train (np.array): _description_
        y_train (np.array): _description_
        X_test (np.array): _description_
        y_test (np.array): _description_
        models (dict): _description_
        params (dict): _description_
        binary_classification (bool, optional): _description_. Defaults to True.

    Returns:
        list[dict]: _description_
    """
    model_scores = []
    
    for model_name in models:
        model = models[model_name]
        hyperparameters = params[model_name]
        
        # Perform Grid Search
        grid_search = GridSearchCV(model, hyperparameters, cv=3)
        grid_search.fit(X_train, y_train)
        
        # Fetch the best parameters and fit the model on training set
        model.set_params(**grid_search.best_params_)
        model.fit(X_train, y_train)
        
        # Perform prediction over the training and test set
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        # Evaluate the model over training set
        accuracy_score_train = accuracy_score(y_train, y_train_pred)
        precision_score_train = precision_score(y_train, y_train_pred) if binary_classification else precision_score(y_train, y_train_pred, average="weighted")
        recall_score_train = recall_score(y_train, y_train_pred) if binary_classification else recall_score(y_train, y_train_pred, average="weighted")
        f1_score_train = f1_score(y_train, y_train_pred) if binary_classification else f1_score(y_train, y_train_pred, average="weighted")
        
        # Evaluate the model over test set
        accuracy_score_test = accuracy_score(y_test, y_test_pred)
        precision_score_test = precision_score(y_test, y_test_pred) if binary_classification else precision_score(y_test, y_test_pred, average="weighted")
        recall_score_test = recall_score(y_test, y_test_pred) if binary_classification else recall_score(y_test, y_test_pred, average="weighted")
        f1_score_test = f1_score(y_test, y_test_pred) if binary_classification else f1_score(y_test, y_test_pred, average="weighted")
        
        # Fetch the best hyperparameter values
        model_params = model.get_params()
        best_hyperparameters = {k: v for k, v in model_params.items() 
                                if k in list(hyperparameters)}
        
        # Append the results into the list
        model_scores.append({
            "model_name": model_name,
            "model": model,
            "hyperparameters": best_hyperparameters,
            "accuracy_score_train": accuracy_score_train,
            "precision_score_train": precision_score_train,
            "recall_score_train": recall_score_train,
            "f1_score_train": f1_score_train,
            "accuracy_score_test": accuracy_score_test,
            "precision_score_test": precision_score_test,
            "recall_score_test": recall_score_test,
            "f1_score_test": f1_score_test
        })
    return model_scores

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [52]:
model_scores = []

for model_name in models:
    model = models[model_name]
    hyperparams = hyper_params[model_name]

    gs = GridSearchCV(model, hyperparams, cv=3)
    gs.fit(x_train, y_train)

    model.set_params(**gs.best_params_)
    model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    train_model_scores = f1_score(y_train, y_train_pred)
    test_model_scores = f1_score(y_test, y_test_pred)

    model_params = model.get_params()
    req_params = list(hyperparams)
    model_hyper_params = {k: v for k, v in model_params.items() if k in req_params}

    model_scores.append(
        {
            "model_name": model_name,
            "model": model,
            "hyperparamers": model_hyper_params,
            "f1_score_train": train_model_scores,
            "f1_score_test": test_model_scores,
        }
    )

In [None]:
def get_best_model(scores_df: pd.DataFrame, evaluation_metric: str="f1_score_test") -> list:
    best_model_row = scores_df.nlargest(1, evaluation_metric).squeeze()
    
    best_model_name = best_model_row["model_name"]
    best_model = best_model_row["model"]
    best_model_hyperparameters = best_model_row["hyperparameters"]
    
    return {
        model_name: best_model_name,
        model = best_model,
        hyperparameters: best_model_hyperparameters
    }
    

In [53]:
scores_df = pd.DataFrame(model_scores)
scores_df

Unnamed: 0,model_name,model,hyperparamers,f1_score_train,f1_score_test
0,Decision Tree,"DecisionTreeClassifier(criterion='entropy', ma...","{'criterion': 'entropy', 'max_features': 'log2...",1.0,0.977
1,Random Forest,"(DecisionTreeClassifier(max_features='log2', r...","{'criterion': 'gini', 'max_features': 'log2', ...",1.0,0.993


In [71]:
best_model_row = scores_df.nlargest(1, "f1_score_test").squeeze()
best_model_row

model_name                                            Random Forest
model             (DecisionTreeClassifier(max_features='log2', r...
hyperparamers     {'criterion': 'gini', 'max_features': 'log2', ...
f1_score_train                                                  1.0
f1_score_test                                                 0.993
Name: 1, dtype: object

In [72]:
best_model = best_model_row["model"]
best_model_hyperparams = best_model_row["hyperparamers"]

In [73]:
best_model

In [74]:
best_model_hyperparams

{'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 64}

In [30]:
hyper_params["Decision Tree"].keys()

dict_keys(['criterion', 'splitter', 'max_features'])

In [41]:
scores_df["model_all_params"] = scores_df["model"].apply(lambda x: x.get_params())
scores_df["required_model_params"] = scores_df.apply(
    lambda x: {
        k: v
        for k, v in x["model_all_params"]
        if k in hyper_params[x["model_name"]].keys()
    },
    axis=1,
)

ValueError: too many values to unpack (expected 2)