# Imports and Dependencies

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import sklearn

from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
import tensorflow

from helper_functions import custom_metric, calculate_metrics

In [2]:
# Directory that holds the pre-split CSV files
folder_path = 'split_data/'


def _read_split(stem):
    """Read ``<folder_path>/<stem>.csv`` into a DataFrame."""
    return pd.read_csv(os.path.join(folder_path, stem + ".csv"))


# Feature matrices: ND (non-discriminatory) and DD (discriminatory) variants
# of the training and validation splits
X_ND_train = _read_split("X_ND_train")
X_DD_train = _read_split("X_DD_train")
X_ND_val = _read_split("X_ND_val")
X_DD_val = _read_split("X_DD_val")

# Targets: squeeze("columns") turns a one-column frame into a Series
y_train = _read_split("y_train").squeeze("columns")
y_val = _read_split("y_val").squeeze("columns")

In [3]:
# Wrap the project's custom_metric as a scikit-learn scorer so it can be passed
# as `scoring=`; higher values are better, and custom_weight=0.7 is forwarded
# to custom_metric on every call.
custom_scorer = make_scorer(
    custom_metric,
    greater_is_better=True,
    custom_weight=0.7,
)

In [4]:
# Validation metrics collected per model name:
# dd_results for the discriminatory data, nd_results for the non-discriminatory data.
dd_results, nd_results = {}, {}

# Logistic Regression

In [5]:
%%timeit -n 1 -r 1
# Discriminatory Data

logreg_dd = LogisticRegression(random_state=0, max_iter=500).fit(X_DD_train, y_train)
logreg_dd_val_pred = logreg_dd.predict(X_DD_val)
dd_results["Logistic Regression"] = calculate_metrics(y_val, logreg_dd_val_pred)

{'Accuracy': 0.58348725526413, 'Precision': 0.5727310401989225, 'Recall': 0.5303146584804298, 'Score': 0.5430395729959776}
535 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [7]:
%%timeit -n 1 -r 1
# Non Discriminatory Data

logreg_nd = LogisticRegression(random_state=0, max_iter=500).fit(X_ND_train, y_train)
logreg_nd_val_pred = logreg_nd.predict(X_ND_val)
nd_results["Logistic Regression"] = calculate_metrics(y_val, logreg_nd_val_pred)

{'Accuracy': 0.5792390099741411, 'Precision': 0.5726950354609929, 'Recall': 0.4957789716039908, 'Score': 0.5188537907610915}
156 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Random Forest

In [8]:
%%timeit -n 1 -r 1
# Discriminatory Data

# Define the parameter grid for grid search
randomforest_dd_param_grid = {
    'n_estimators': [10, 33, 66, 100],
    'max_depth': [None, 5, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
randomforest_dd_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                                           param_grid=randomforest_dd_param_grid, cv=5, n_jobs=-1,
                                           scoring=custom_scorer)
randomforest_dd_grid_search.fit(X_DD_train, y_train)

# Best model and estimator
randomforest_dd = randomforest_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {randomforest_dd_grid_search.best_params_}")

randomforest_dd_val_pred = randomforest_dd.predict(X_DD_val)
dd_results['Random Forest'] = calculate_metrics(y_val, randomforest_dd_val_pred)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
{'Accuracy': 0.6200591060214259, 'Precision': 0.6116307442049613, 'Recall': 0.5771297006907138, 'Score': 0.587480013744988}
1min 10s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [9]:
%%timeit -n 1 -r 1
# Non Discriminatory Data

# Define the parameter grid for grid search
randomforest_nd_param_grid = {
    'n_estimators': [10, 30, 50, 80],
    'max_depth': [None, 5, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
randomforest_nd_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                                           param_grid=randomforest_nd_param_grid, cv=5, n_jobs=-1,
                                           scoring=custom_scorer)
randomforest_nd_grid_search.fit(X_ND_train, y_train)

# Best model and estimator
randomforest_nd = randomforest_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {randomforest_nd_grid_search.best_params_}")

randomforest_nd_val_pred = randomforest_nd.predict(X_ND_val)
nd_results['Random Forest'] = calculate_metrics(y_val, randomforest_nd_val_pred)

Best Hyperparameters: {'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10}
{'Accuracy': 0.59235315847802, 'Precision': 0.5783889980353635, 'Recall': 0.5648503453568687, 'Score': 0.5689119411604172}
40.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Naive Bayes

In [10]:
%%timeit -n 1 -r 1
# Discriminatory Data

# Define the parameter grid for hyperparameter tuning
naivebayes_dd_param_grid = {
    'priors': [None, [0.5, 0.5], [0.3, 0.7], [0.7, 0.3]],  # Vary the prior probabilities
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-5]  # Vary the smoothing parameter
}

# Perform grid search using cross-validation
naivebayes_dd_grid_search = GridSearchCV(estimator=GaussianNB(), param_grid=naivebayes_dd_param_grid, cv=5, n_jobs=-1)
naivebayes_dd_grid_search.fit(X_DD_train, y_train)

# Get the best hyperparameters and model
naivebayes_dd = naivebayes_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {naivebayes_dd_grid_search.best_params_}")

naivebayes_dd_val_pred = naivebayes_dd.predict(X_DD_val)
dd_results['Naive Bayes'] = calculate_metrics(y_val, naivebayes_dd_val_pred)

Best Hyperparameters: {'priors': None, 'var_smoothing': 1e-09}
{'Accuracy': 0.5775766531215367, 'Precision': 0.5480277024992473, 'Recall': 0.6983883346124329, 'Score': 0.6532801449784772}
555 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [11]:
%%timeit -n 1 -r 1
# Non Discriminatory Data

# Define the parameter grid for hyperparameter tuning
naivebayes_nd_param_grid = {
    'priors': [None, [0.5, 0.5], [0.3, 0.7], [0.7, 0.3]],  # Vary the prior probabilities
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-5]  # Vary the smoothing parameter
}

# Perform grid search using cross-validation
naivebayes_nd_grid_search = GridSearchCV(estimator=GaussianNB(), param_grid=naivebayes_nd_param_grid, cv=5, n_jobs=-1)
naivebayes_nd_grid_search.fit(X_ND_train, y_train)

# Get the best hyperparameters and model
naivebayes_nd = naivebayes_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {naivebayes_nd_grid_search.best_params_}")

naivebayes_nd_val_pred = naivebayes_nd.predict(X_ND_val)
nd_results['Naive Bayes'] = calculate_metrics(y_val, naivebayes_nd_val_pred)

Best Hyperparameters: {'priors': None, 'var_smoothing': 1e-08}
{'Accuracy': 0.5387883265607684, 'Precision': 0.5144294413555732, 'Recall': 0.7455871066768994, 'Score': 0.6762398070805016}
486 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Decision Tree

In [12]:
%%timeit -n 1 -r 1
# Discriminatory Data

# Define the parameter grid for hyperparameter tuning
decisiontree_dd_param_grid = {
    'criterion': ['gini', 'entropy'],  # Criterion for splitting
    'max_depth': [None, 5, 10, 15],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required at each leaf node
}

# Perform grid search using cross-validation
decisiontree_dd_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=5),
                                           param_grid=decisiontree_dd_param_grid, cv=5, n_jobs=-1)
decisiontree_dd_grid_search.fit(X_DD_train, y_train)

# Best model and estimator
decisiontree_dd = decisiontree_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {decisiontree_dd_grid_search.best_params_}")

decisiontree_dd_val_pred = decisiontree_dd.predict(X_DD_val)
dd_results['Decision Tree'] = calculate_metrics(y_val, decisiontree_dd_val_pred)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
{'Accuracy': 0.5925378647949759, 'Precision': 0.5664451827242525, 'Recall': 0.6542594013814275, 'Score': 0.6279151357842749}
10.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
%%timeit -n 1 -r 1
# Non Discriminatory Data

# Define the parameter grid for hyperparameter tuning
decisiontree_nd_param_grid = {
    'criterion': ['gini', 'entropy'],  # Criterion for splitting
    'max_depth': [None, 5, 10, 15],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required at each leaf node
}

# Perform grid search using cross-validation
decisiontree_nd_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=5),
                                           param_grid=decisiontree_nd_param_grid, cv=5, n_jobs=-1)
decisiontree_nd_grid_search.fit(X_ND_train, y_train)

# Best model and estimator
decisiontree_nd = decisiontree_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {decisiontree_nd_grid_search.best_params_}")

decisiontree_nd_val_pred = decisiontree_nd.predict(X_ND_val)
nd_results['Decision Tree'] = calculate_results(y_val, decisiontree_nd_val_pred)

# Results

In [None]:
# Tabulate the validation metrics gathered for the discriminatory-data models;
# the keys of dd_results (model names) become the columns.
dd_results_table = pd.DataFrame(dd_results)
dd_results_table

In [None]:
# Tabulate the validation metrics gathered for the non-discriminatory-data models;
# the keys of nd_results (model names) become the columns.
nd_results_table = pd.DataFrame(nd_results)
nd_results_table