_Based on our previous training runs, we determined that Naive Bayes and Decision Tree were the top-performing models. Here, we tune their hyperparameters further to see whether we can achieve a higher score._


# Imports and Dependencies

In [3]:
import pandas as pd
import os

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

from helper_functions import custom_metric, calculate_metrics

In [4]:
# Directory holding the pre-split CSV files
folder_path = 'split_data/'

# Read the feature matrices (ND and DD variants, train and validation) as DataFrames
X_ND_train, X_DD_train, X_ND_val, X_DD_val = (
    pd.read_csv(os.path.join(folder_path, filename))
    for filename in ("X_ND_train.csv", "X_DD_train.csv", "X_ND_val.csv", "X_DD_val.csv")
)

# Read the targets; squeeze("columns") collapses a single-column frame into a Series
y_train, y_val = (
    pd.read_csv(os.path.join(folder_path, filename)).squeeze("columns")
    for filename in ("y_train.csv", "y_val.csv")
)

In [5]:
# Scorer wrapping custom_metric for scikit-learn model selection. make_scorer
# forwards custom_weight=0.7 to custom_metric on every call, and higher values
# are treated as better.
custom_scorer = make_scorer(custom_metric, custom_weight=0.7, greater_is_better=True)

# Validation metrics keyed by model name, one dict per feature set
# (presumably ND = non-discriminatory, DD = discriminatory -- confirm against the data prep).
nd_results = dict()
dd_results = dict()

# Decision Tree

In [6]:
%%timeit -n 1 -r 1
# Discriminatory Data

# Define the parameter grid for hyperparameter tuning
decisiontree_dd_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [None, 'sqrt', 'log2']
}

# Perform grid search using cross-validation
decisiontree_dd_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=5),
                                           param_grid=decisiontree_dd_param_grid, cv=5, n_jobs=-1)
decisiontree_dd_grid_search.fit(X_DD_train, y_train)

# Best model and estimator
decisiontree_dd = decisiontree_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {decisiontree_dd_grid_search.best_params_}")

decisiontree_dd_val_pred = decisiontree_dd.predict(X_DD_val)
dd_results['Decision Tree'] = calculate_metrics(y_val, decisiontree_dd_val_pred)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 10}
{'Accuracy': 0.5884743258219431, 'Precision': 0.5750595710881652, 'Recall': 0.555640828856485, 'Score': 0.5614664515259891}
33.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [7]:
%%timeit -n 1 -r 1
# Non Discriminatory Data

# Define the parameter grid for hyperparameter tuning
decisiontree_nd_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [None, 'sqrt', 'log2']
}

# Perform grid search using cross-validation
decisiontree_nd_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=5),
                                           param_grid=decisiontree_nd_param_grid, cv=5, n_jobs=-1)
decisiontree_nd_grid_search.fit(X_ND_train, y_train)

# Best model and estimator
decisiontree_nd = decisiontree_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {decisiontree_nd_grid_search.best_params_}")

decisiontree_nd_val_pred = decisiontree_nd.predict(X_ND_val)
nd_results['Decision Tree'] = calculate_metrics(y_val, decisiontree_nd_val_pred)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 20}
{'Accuracy': 0.5838566678980421, 'Precision': 0.5627444009953786, 'Recall': 0.6074443591711435, 'Score': 0.594034371718414}
19.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
