# Imports and Dependencies

In [1]:
import pandas as pd
import os

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

from helper_functions import custom_metric, calculate_metrics

In [2]:
# Folder containing the CSV files
folder_path = 'split_data/'

# Access the loaded DataFrames
X_ND_train = pd.read_csv(os.path.join(folder_path, "X_ND_train.csv"))
X_DD_train = pd.read_csv(os.path.join(folder_path, "X_DD_train.csv"))
X_ND_val = pd.read_csv(os.path.join(folder_path, "X_ND_val.csv"))
X_DD_val = pd.read_csv(os.path.join(folder_path, "X_DD_val.csv"))
y_train = pd.read_csv(os.path.join(folder_path, "y_train.csv")).squeeze("columns")
y_val = pd.read_csv(os.path.join(folder_path, "y_val.csv")).squeeze("columns")

In [3]:
custom_scorer = make_scorer(custom_metric, greater_is_better=True, custom_weight=0.7)

In [4]:
dd_results = {}
nd_results = {}

# Logistic Regression

In [39]:
%%timeit -n 1 -r 1
# Discriminatory Data

logreg_dd_param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000, 1500]
}

# Create the GridSearchCV object
logreg_dd_grid_search = GridSearchCV(estimator=LogisticRegression(random_state=42),
                                           param_grid=logreg_dd_param_grid, cv=5, n_jobs=-1,
                                           scoring=custom_scorer)
logreg_dd_grid_search.fit(X_DD_train, y_train)

# Best model and estimator
logreg_dd = logreg_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {logreg_dd_grid_search.best_params_}")

logreg_dd_val_pred = logreg_dd.predict(X_DD_val)
dd_results['Log Reg'] = calculate_metrics(y_val, logreg_dd_val_pred)



Best Hyperparameters: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
{'Accuracy': 0.6281861839674917, 'Precision': 0.6126091910368401, 'Recall': 0.6189562547966232, 'Score': 0.6170521356686883}
2min 37s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [40]:
%%timeit -n 1 -r 1
# Discriminatory Data

logreg_nd_param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000, 1500]
}

# Create the GridSearchCV object
logreg_nd_grid_search = GridSearchCV(estimator=LogisticRegression(random_state=42),
                                     param_grid=logreg_nd_param_grid, cv=5, n_jobs=-1,
                                     scoring=custom_scorer)
logreg_nd_grid_search.fit(X_DD_train, y_train)

# Best model and estimator
logreg_nd = logreg_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {logreg_nd_grid_search.best_params_}")

logreg_nd_val_pred = logreg_nd.predict(X_ND_val)
nd_results['Log Reg'] = calculate_metrics(y_val, logreg_nd_val_pred)



Best Hyperparameters: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}


Feature names seen at fit time, yet now missing:
- CNT_CHILDREN
- CNT_FAM_MEMBERS
- CODE_GENDER_F
- CODE_GENDER_M
- CODE_GENDER_XNA
- ...



ValueError: X has 46 features, but LogisticRegression is expecting 72 features as input.

# Random Forest

In [7]:
%%timeit -n 1 -r 1
# Discriminatory Data

# Define the parameter grid for grid search
randomforest_dd_param_grid = {
    'n_estimators': [10, 33, 66, 100],
    'max_depth': [None, 5, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
randomforest_dd_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                                           param_grid=randomforest_dd_param_grid, cv=5, n_jobs=-1,
                                           scoring=custom_scorer)
randomforest_dd_grid_search.fit(X_DD_train, y_train)

# Best model and estimator
randomforest_dd = randomforest_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {randomforest_dd_grid_search.best_params_}")

randomforest_dd_val_pred = randomforest_dd.predict(X_DD_val)
dd_results['Random Forest'] = calculate_metrics(y_val, randomforest_dd_val_pred)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
{'Accuracy': 0.6200591060214259, 'Precision': 0.6116307442049613, 'Recall': 0.5771297006907138, 'Score': 0.587480013744988}
1min 6s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [8]:
%%timeit -n 1 -r 1
# Non Discriminatory Data

# Define the parameter grid for grid search
randomforest_nd_param_grid = {
    'n_estimators': [10, 30, 50, 80],
    'max_depth': [None, 5, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
randomforest_nd_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                                           param_grid=randomforest_nd_param_grid, cv=5, n_jobs=-1,
                                           scoring=custom_scorer)
randomforest_nd_grid_search.fit(X_ND_train, y_train)

# Best model and estimator
randomforest_nd = randomforest_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {randomforest_nd_grid_search.best_params_}")

randomforest_nd_val_pred = randomforest_nd.predict(X_ND_val)
nd_results['Random Forest'] = calculate_metrics(y_val, randomforest_nd_val_pred)

Best Hyperparameters: {'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10}
{'Accuracy': 0.59235315847802, 'Precision': 0.5783889980353635, 'Recall': 0.5648503453568687, 'Score': 0.5689119411604172}
43.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Naive Bayes

In [28]:
%%timeit -n 1 -r 1
# Discriminatory Data

# Define the parameter grid for hyperparameter tuning
naivebayes_dd_param_grid = {
    'priors': [None, [0.5, 0.5], [0.3, 0.7], [0.7, 0.3]],  # Vary the prior probabilities
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-5]  # Vary the smoothing parameter
}

# Perform grid search using cross-validation
naivebayes_dd_grid_search = GridSearchCV(estimator=GaussianNB(),
                                         param_grid=naivebayes_dd_param_grid, cv=5, n_jobs=-1,
                                         scoring=custom_scorer)
naivebayes_dd_grid_search.fit(X_DD_train, y_train)

# Get the best hyperparameters and model
naivebayes_dd = naivebayes_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {naivebayes_dd_grid_search.best_params_}")

naivebayes_dd_val_pred = naivebayes_dd.predict(X_DD_val)
dd_results['Naive Bayes'] = calculate_metrics(y_val, naivebayes_dd_val_pred)

Best Hyperparameters: {'priors': [0.3, 0.7], 'var_smoothing': 1e-05}
{'Accuracy': 0.5264130033247137, 'Precision': 0.5045297670405522, 'Recall': 0.897544128933231, 'Score': 0.7796398203654273}
1.92 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [29]:
%%timeit -n 1 -r 1
# Non Discriminatory Data

# Define the parameter grid for hyperparameter tuning
naivebayes_nd_param_grid = {
    'priors': [None, [0.5, 0.5], [0.3, 0.7], [0.7, 0.3]],  # Vary the prior probabilities
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-5]  # Vary the smoothing parameter
}

# Perform grid search using cross-validation
naivebayes_nd_grid_search = GridSearchCV(estimator=GaussianNB(),
                                         param_grid=naivebayes_nd_param_grid, cv=5, n_jobs=-1,
                                         scoring=custom_scorer)
naivebayes_nd_grid_search.fit(X_ND_train, y_train)

# Get the best hyperparameters and model
naivebayes_nd = naivebayes_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {naivebayes_nd_grid_search.best_params_}")

naivebayes_nd_val_pred = naivebayes_nd.predict(X_ND_val)
nd_results['Naive Bayes'] = calculate_metrics(y_val, naivebayes_nd_val_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Best Hyperparameters: {'priors': [0.3, 0.7], 'var_smoothing': 1e-05}
{'Accuracy': 0.5038788326560768, 'Precision': 0.4917423616845582, 'Recall': 0.9140445126630852, 'Score': 0.7873538673695271}
751 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Decision Tree

In [30]:
%%timeit -n 1 -r 1
# Discriminatory Data

# Define the parameter grid for hyperparameter tuning
decisiontree_dd_param_grid = {
    'criterion': ['gini', 'entropy'],  # Criterion for splitting
    'max_depth': [None, 5, 10, 15],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required at each leaf node
}

# Perform grid search using cross-validation
decisiontree_dd_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=5),
                                           param_grid=decisiontree_dd_param_grid, cv=5, n_jobs=-1,
                                           scoring=custom_scorer)
decisiontree_dd_grid_search.fit(X_DD_train, y_train)

# Best model and estimator
decisiontree_dd = decisiontree_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {decisiontree_dd_grid_search.best_params_}")

decisiontree_dd_val_pred = decisiontree_dd.predict(X_DD_val)
dd_results['Decision Tree'] = calculate_metrics(y_val, decisiontree_dd_val_pred)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
{'Accuracy': 0.5925378647949759, 'Precision': 0.5664451827242525, 'Recall': 0.6542594013814275, 'Score': 0.6279151357842749}
11.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [31]:
%%timeit -n 1 -r 1
# Non Discriminatory Data

# Define the parameter grid for hyperparameter tuning
decisiontree_nd_param_grid = {
    'criterion': ['gini', 'entropy'],  # Criterion for splitting
    'max_depth': [None, 5, 10, 15],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required at each leaf node
}

# Perform grid search using cross-validation
decisiontree_nd_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=5),
                                           param_grid=decisiontree_nd_param_grid, cv=5, n_jobs=-1,
                                           scoring=custom_scorer)
decisiontree_nd_grid_search.fit(X_ND_train, y_train)

# Best model and estimator
decisiontree_nd = decisiontree_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {decisiontree_nd_grid_search.best_params_}")

decisiontree_nd_val_pred = decisiontree_nd.predict(X_ND_val)
nd_results['Decision Tree'] = calculate_metrics(y_val, decisiontree_nd_val_pred)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
{'Accuracy': 0.5823790173623938, 'Precision': 0.5608465608465608, 'Recall': 0.6101304681504222, 'Score': 0.5953452959592638}
6.32 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Support Vector Machine

In [13]:
%%timeit -n 1 -r 1
# Discriminatory Data

svm_dd = svm.SVC().fit(X_DD_train, y_train)
svm_dd_val_pred = svm_dd.predict(X_DD_val)
dd_results["SVM"] = calculate_metrics(y_val, svm_dd_val_pred)

{'Accuracy': 0.5718507572958995, 'Precision': 0.5767590618336887, 'Recall': 0.4151957022256332, 'Score': 0.4636647101080499}
40.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [14]:
%%timeit -n 1 -r 1
# Non Discriminatory Data

svm_nd = svm.SVC().fit(X_ND_train, y_train)
svm_nd_val_pred = svm_nd.predict(X_ND_val)
nd_results["SVM"] = calculate_metrics(y_val, svm_nd_val_pred)

{'Accuracy': 0.5672330993719985, 'Precision': 0.5700586041555674, 'Recall': 0.41059094397544127, 'Score': 0.45843124202947916}
30.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# XGBoost

In [None]:
%%timeit -n 1 -r 1
# Discriminatory Data

xgb_dd_param_grid = {
    'max_depth': [3, 5, 7],  # Maximum depth of a tree
    'learning_rate': [0.1, 0.01, 0.001],  # Learning rate of the model
    'n_estimators': [100, 200, 300],  # Number of trees (boosting rounds)
}

# Perform grid search with custom scorer
xgb_dd_grid_search = GridSearchCV(xgb.XGBClassifier(), xgb_dd_param_grid, scoring=custom_scorer, cv=5)
xgb_dd_grid_search.fit(X_DD_train, y_train)

# Best model and estimator
xgb_dd = xgb_dd_grid_search.best_estimator_
print(f"Best Hyperparameters: {xgb_dd_grid_search.best_params_}")

xgb_dd_val_pred = xgb_dd.predict(X_DD_val)
dd_results["XGBoost"] = calculate_metrics(y_val, xgb_dd_val_pred)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300}
{'Accuracy': 0.6276320650166236, 'Precision': 0.6136363636363636, 'Recall': 0.61128165771297, 'Score': 0.6119880694899881}
6min 55s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)



KeyboardInterrupt



In [20]:
%%timeit -n 1 -r 1
# Discriminatory Data

xgb_nd_param_grid = {
    'max_depth': [3, 5, 7],  # Maximum depth of a tree
    'learning_rate': [0.1, 0.01, 0.001],  # Learning rate of the model
    'n_estimators': [100, 200, 300],  # Number of trees (boosting rounds)
}

# Perform grid search with custom scorer
xgb_nd_grid_search = GridSearchCV(xgb.XGBClassifier(), xgb_nd_param_grid, scoring=custom_scorer, cv=5)
xgb_nd_grid_search.fit(X_ND_train, y_train)

# Best model and estimator
xgb_nd = xgb_nd_grid_search.best_estimator_
print(f"Best Hyperparameters: {xgb_nd_grid_search.best_params_}")

xgb_nd_val_pred = xgb_nd.predict(X_ND_val)
nd_results["XGBoost"] = calculate_metrics(y_val, xgb_nd_val_pred)

Best Hyperparameters: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 200}
{'Accuracy': 0.5930919837458442, 'Precision': 0.5754964406144624, 'Recall': 0.5894090560245587, 'Score': 0.5852352714015299}
4min 21s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Results

In [32]:
pd.DataFrame(dd_results)

Unnamed: 0,Logistic Regression,Random Forest,Naive Bayes,Decision Tree,SVM,XGBoost
Accuracy,0.583487,0.620059,0.526413,0.592538,0.571851,0.627632
Precision,0.572731,0.611631,0.50453,0.566445,0.576759,0.613636
Recall,0.530315,0.57713,0.897544,0.654259,0.415196,0.611282
Score,0.54304,0.58748,0.77964,0.627915,0.463665,0.611988


In [33]:
pd.DataFrame(nd_results)

Unnamed: 0,Logistic Regression,Random Forest,Naive Bayes,Decision Tree,SVM,XGBoost
Accuracy,0.579239,0.592353,0.503879,0.582379,0.567233,0.593092
Precision,0.572695,0.578389,0.491742,0.560847,0.570059,0.575496
Recall,0.495779,0.56485,0.914045,0.61013,0.410591,0.589409
Score,0.518854,0.568912,0.787354,0.595345,0.458431,0.585235


In [34]:
pd.DataFrame(dd_results).to_csv('dd_results.csv')

In [35]:
pd.DataFrame(nd_results).to_csv('nd_results.csv')

/Users/shunyao/uni/Internship/citi/neueda/hackathon
