# Imports and Dependencies

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.metrics import make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
import tensorflow

from helper_functions import custom_metric, calculate_results

In [2]:
# Directory that holds the pre-split CSV files
folder_path = 'split_data/'


def _read_split(filename):
    """Read one CSV file located inside ``folder_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(folder_path, filename))


# Feature matrices for the train / validation splits.
# "ND" and "DD" are the non-discriminatory and discriminatory feature sets
# (naming used by the modelling cells further down).
X_ND_train = _read_split("X_ND_train.csv")
X_DD_train = _read_split("X_DD_train.csv")
X_ND_val = _read_split("X_ND_val.csv")
X_DD_val = _read_split("X_DD_val.csv")

# Targets: squeeze("columns") collapses a single-column frame into a Series
y_train = _read_split("y_train.csv").squeeze("columns")
y_val = _read_split("y_val.csv").squeeze("columns")

In [3]:
# Wrap the project's custom_metric as a scikit-learn scorer so it can be passed
# as `scoring=` to the hyperparameter searches. make_scorer forwards extra
# keyword arguments (here custom_weight) to the metric on every call.
# NOTE(review): the meaning of custom_weight is defined in helper_functions — confirm there.
custom_scorer = make_scorer(
    custom_metric,
    greater_is_better=True,  # higher metric values are treated as better
    custom_weight=0.7,
)

In [6]:
# Validation metrics collected per model, keyed by model name:
#   dd_results -> models fitted on the discriminatory (DD) features
#   nd_results -> models fitted on the non-discriminatory (ND) features
dd_results, nd_results = {}, {}

# Logistic Regression

In [7]:
# Discriminatory Data

# Logistic regression on the DD features; the seed is fixed and the solver is
# allowed up to 500 iterations.
logreg_dd = LogisticRegression(max_iter=500, random_state=0)
logreg_dd.fit(X_DD_train, y_train)

# Predict the validation split and record the metrics under the model's name
logreg_dd_val_pred = logreg_dd.predict(X_DD_val)
dd_results["Logistic Regression"] = calculate_results(y_val, logreg_dd_val_pred)

{'Accuracy': 0.58348725526413, 'Precision': 0.5727310401989225, 'Recall': 0.5303146584804298, 'Score': 0.5430395729959776}


In [9]:
# Non Discriminatory Data

# Logistic regression on the ND features; the seed is fixed and the solver is
# allowed up to 500 iterations.
logreg_nd = LogisticRegression(max_iter=500, random_state=0)
logreg_nd.fit(X_ND_train, y_train)

# Predict the validation split and record the metrics under the model's name
logreg_nd_val_pred = logreg_nd.predict(X_ND_val)
nd_results["Logistic Regression"] = calculate_results(y_val, logreg_nd_val_pred)

{'Accuracy': 0.5792390099741411, 'Precision': 0.5726950354609929, 'Recall': 0.4957789716039908, 'Score': 0.5188537907610915}


# Random Forest

In [11]:
# Discriminatory Data

# Hyperparameter candidates explored exhaustively by the grid search
param_grid = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, 5, 8],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search over a seeded random forest, ranked by the
# custom scorer and parallelised over all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring=custom_scorer,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_DD_train, y_train)

# Report the winning hyperparameter combination
print("Best parameters: ", grid_search.best_params_)

# best_estimator_ is the winning configuration refitted on the whole training
# set (GridSearchCV's default refit behaviour); evaluate it on the validation split
randomforest_dd = grid_search.best_estimator_
randomforest_dd_val_pred = randomforest_dd.predict(X_DD_val)
dd_results['Random Forest'] = calculate_results(y_val, randomforest_dd_val_pred)

Best parameters:  {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
{'Accuracy': 0.6200591060214259, 'Precision': 0.6116307442049613, 'Recall': 0.5771297006907138, 'Score': 0.587480013744988}


In [12]:
# Non Discriminatory Data

# Hyperparameter candidates explored exhaustively by the grid search
param_grid = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, 5, 8],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search over a seeded random forest, ranked by the
# custom scorer and parallelised over all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring=custom_scorer,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_ND_train, y_train)

# Report the winning hyperparameter combination
print("Best parameters: ", grid_search.best_params_)

# best_estimator_ is the winning configuration refitted on the whole training
# set (GridSearchCV's default refit behaviour); evaluate it on the validation split
randomforest_nd = grid_search.best_estimator_
randomforest_nd_val_pred = randomforest_nd.predict(X_ND_val)
nd_results['Random Forest'] = calculate_results(y_val, randomforest_nd_val_pred)

Best parameters:  {'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10}
{'Accuracy': 0.59235315847802, 'Precision': 0.5783889980353635, 'Recall': 0.5648503453568687, 'Score': 0.5689119411604172}


# Results Summary

In [15]:
# Side-by-side validation metrics for the models fitted on the DD features:
# one column per model, one row per metric.
pd.DataFrame.from_dict(dd_results, orient="columns")

Unnamed: 0,Logistic Regression,Random Forest
Accuracy,0.583487,0.620059
Precision,0.572731,0.611631
Recall,0.530315,0.57713
Score,0.54304,0.58748


In [16]:
# Side-by-side validation metrics for the models fitted on the ND features:
# one column per model, one row per metric.
pd.DataFrame.from_dict(nd_results, orient="columns")

Unnamed: 0,Logistic Regression,Random Forest
Accuracy,0.579239,0.592353
Precision,0.572695,0.578389
Recall,0.495779,0.56485
Score,0.518854,0.568912
