_Based on the results of our previous training runs, we determined that Naive Bayes and Decision Tree were the top-performing models. Here, we further tune the hyperparameters of these two models to see if we can achieve a higher score._


# Imports and Dependencies

In [3]:
import pandas as pd
import os

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

from helper_functions import custom_metric, calculate_metrics

In [13]:
# Directory that holds the pre-split CSV files
folder_path = 'split_data/'

# --- Training split: discriminatory (DD) / non-discriminatory (ND) features and labels ---
X_DD_train = pd.read_csv(os.path.join(folder_path, "X_DD_train.csv"))
X_ND_train = pd.read_csv(os.path.join(folder_path, "X_ND_train.csv"))
# Labels are squeezed along the column axis (a single-column frame becomes a Series)
y_train = pd.read_csv(os.path.join(folder_path, "y_train.csv")).squeeze("columns")

# --- Validation split ---
X_DD_val = pd.read_csv(os.path.join(folder_path, "X_DD_val.csv"))
X_ND_val = pd.read_csv(os.path.join(folder_path, "X_ND_val.csv"))
y_val = pd.read_csv(os.path.join(folder_path, "y_val.csv")).squeeze("columns")

# --- Held-out test split ---
X_DD_test = pd.read_csv(os.path.join(folder_path, "X_DD_test.csv"))
X_ND_test = pd.read_csv(os.path.join(folder_path, "X_ND_test.csv"))
y_test = pd.read_csv(os.path.join(folder_path, "y_test.csv")).squeeze("columns")

# --- Train and validation stacked row-wise (train first), used to fit the grid searches ---
X_DD_train_val = pd.concat([X_DD_train, X_DD_val])
X_ND_train_val = pd.concat([X_ND_train, X_ND_val])
y_train_val = pd.concat([y_train, y_val])

In [5]:
# Scorer for GridSearchCV: wraps custom_metric (higher is better) and
# forwards custom_weight=0.7 to it on every call.
custom_scorer = make_scorer(
    custom_metric,
    greater_is_better=True,
    custom_weight=0.7,
)

# Test-set metrics keyed by model name, one dict per feature set (DD / ND)
dd_results, nd_results = {}, {}

# Decision Tree

In [15]:
%%timeit -n 1 -r 1
# Discriminatory Data

# Define the parameter grid for hyperparameter tuning
decisiontree_dd_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [None, 'sqrt', 'log2']
}

# Perform grid search using cross-validation
decisiontree_dd_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=5),
                                           param_grid=decisiontree_dd_param_grid, cv=5, n_jobs=-1)
decisiontree_dd_grid_search.fit(X_DD_train_val, y_train_val)

# Best model and estimator
decisiontree_dd = decisiontree_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {decisiontree_dd_grid_search.best_params_}")

decisiontree_dd_test_pred = decisiontree_dd.predict(X_DD_test)
dd_results['Decision Tree'] = calculate_metrics(y_test, decisiontree_dd_test_pred)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
{'Accuracy': 0.5941252540181046, 'Precision': 0.5688311688311688, 'Recall': 0.6684471575734453, 'Score': 0.6385623609507624}
48.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [16]:
%%timeit -n 1 -r 1
# Discriminatory Data

# Define the parameter grid for hyperparameter tuning
decisiontree_nd_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [None, 'sqrt', 'log2']
}

# Perform grid search using cross-validation
decisiontree_nd_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=5),
                                           param_grid=decisiontree_nd_param_grid, cv=5, n_jobs=-1)
decisiontree_nd_grid_search.fit(X_ND_train_val, y_train_val)

# Best model and estimator
decisiontree_nd = decisiontree_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {decisiontree_nd_grid_search.best_params_}")

decisiontree_nd_test_pred = decisiontree_nd.predict(X_ND_test)
nd_results['Decision Tree'] = calculate_metrics(y_test, decisiontree_nd_test_pred)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 20}
{'Accuracy': 0.5883983003879549, 'Precision': 0.5778834720570749, 'Recall': 0.5562762304463945, 'Score': 0.5627584029295987}
27.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Naive Bayes

In [17]:
%%timeit -n 1 -r 1
# Discriminatory Data

# Define the parameter grid for hyperparameter tuning
naivebayes_dd_param_grid = {
    'priors': [None, [0.5, 0.5], [0.3, 0.7], [0.7, 0.3]],  # Vary the prior probabilities
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-5]  # Vary the smoothing parameter
}

# Perform grid search using cross-validation
naivebayes_dd_grid_search = GridSearchCV(estimator=GaussianNB(),
                                         param_grid=naivebayes_dd_param_grid, cv=5, n_jobs=-1,
                                         scoring=custom_scorer)
naivebayes_dd_grid_search.fit(X_DD_train_val, y_train_val)

# Get the best hyperparameters and model
naivebayes_dd = naivebayes_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {naivebayes_dd_grid_search.best_params_}")

naivebayes_dd_test_pred = naivebayes_dd.predict(X_DD_test)
dd_results['Naive Bayes'] = calculate_metrics(y_test, naivebayes_dd_test_pred)

Best Hyperparameters: {'priors': [0.3, 0.7], 'var_smoothing': 1e-05}
{'Accuracy': 0.5294661001293183, 'Precision': 0.5080330004342163, 'Recall': 0.8927890118275468, 'Score': 0.7773622084095476}
2.21 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [18]:
%%timeit -n 1 -r 1
# Non Discriminatory Data

# Define the parameter grid for hyperparameter tuning
naivebayes_nd_param_grid = {
    'priors': [None, [0.5, 0.5], [0.3, 0.7], [0.7, 0.3]],  # Vary the prior probabilities
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-5]  # Vary the smoothing parameter
}

# Perform grid search using cross-validation
naivebayes_nd_grid_search = GridSearchCV(estimator=GaussianNB(),
                                         param_grid=naivebayes_nd_param_grid, cv=5, n_jobs=-1,
                                         scoring=custom_scorer)
naivebayes_nd_grid_search.fit(X_ND_train_val, y_train_val)

# Get the best hyperparameters and model
naivebayes_nd = naivebayes_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {naivebayes_nd_grid_search.best_params_}")

naivebayes_nd_test_pred = naivebayes_nd.predict(X_ND_test)
nd_results['Naive Bayes'] = calculate_metrics(y_test, naivebayes_nd_test_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Best Hyperparameters: {'priors': [0.3, 0.7], 'var_smoothing': 1e-05}
{'Accuracy': 0.50822094956586, 'Precision': 0.495735385895569, 'Recall': 0.9091949637542922, 'Score': 0.7851570903966751}
919 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [21]:
# Test-set metrics of the models tuned on the discriminatory (DD) features: one column per model
pd.DataFrame(data=dd_results)

Unnamed: 0,Decision Tree,Naive Bayes
Accuracy,0.594125,0.529466
Precision,0.568831,0.508033
Recall,0.668447,0.892789
Score,0.638562,0.777362


In [20]:
# Test-set metrics of the models tuned on the non-discriminatory (ND) features: one column per model
pd.DataFrame(data=nd_results)

Unnamed: 0,Decision Tree,Naive Bayes
Accuracy,0.588398,0.508221
Precision,0.577883,0.495735
Recall,0.556276,0.909195
Score,0.562758,0.785157
