In [2]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_curve, auc
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE


In [3]:
X_train = pd.read_csv('../X_train.csv')
X_val = pd.read_csv('../X_val.csv')
X_test = pd.read_csv('../X_test.csv')
y_train = pd.read_csv('../y_train.csv')
y_val = pd.read_csv('../y_val.csv')
y_test = pd.read_csv('../y_test.csv')



In [6]:
X_train.head()

Unnamed: 0,hour_of_day,day_of_week,category,gender,age_group,state,job_category,distance,amount(usd)
0,15,Monday,kids_pets,F,21-40,NY,C2,52.770575,75.72
1,10,Tuesday,gas_transport,F,41-60,CA,C2,68.181551,79.74
2,20,Monday,home,M,41-60,MT,C4,66.90438,74.92
3,0,Saturday,gas_transport,F,0-20,IL,C2,65.195819,44.3
4,18,Saturday,home,M,41-60,MD,C2,71.833218,28.02


In [7]:
X_train_encoded = pd.get_dummies(X_train,columns = ['category', 'gender', 'state', 'day_of_week', 'job_category', 'age_group'])
X_val_encoded = pd.get_dummies(X_val,columns = ['category', 'gender', 'state', 'day_of_week', 'job_category', 'age_group'])
X_test_encoded = pd.get_dummies(X_test,columns = ['category', 'gender', 'state', 'day_of_week', 'job_category', 'age_group'])


Dimensionality is pretty high, might want to use PCA

## MLP Model no under/oversampling

In [9]:
scaler = StandardScaler()

columns_to_scale = ['hour_of_day', 'distance', 'amount(usd)']

X_train_scaled = X_train_encoded.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

X_val_scaled = X_val_encoded.copy()
X_val_scaled[columns_to_scale] = scaler.fit_transform(X_val[columns_to_scale])

X_test_scaled = X_test_encoded.copy()
X_test_scaled[columns_to_scale] = scaler.fit_transform(X_test[columns_to_scale])

In [10]:
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=50, activation = 'relu',
                    solver='adam', verbose=0, random_state=2,learning_rate = 'invscaling', learning_rate_init=0.001, alpha = 0.01, validation_fraction=0.1)
mlp.fit(X_train_scaled, y_train)

  y = column_or_1d(y, warn=True)


In [11]:
y_pred = mlp.predict(X_val_scaled)
f1 = f1_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
precision = precision_score(y_val, y_pred, average='binary')

y_pred_probs = mlp.predict_proba(X_val_scaled)
roc_auc = roc_auc_score(y_val, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_val, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_base = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_base)

[0.8884615384615384, 0.7181347150259068, 0.7942693409742121, 0.8172850119813182, 0.9882867599560713]


## MLP Model with PCA

In [12]:
pca = PCA().fit(X_train_scaled)
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_explained_variance >= 0.95) + 1 
n_components

50

In [13]:
pca = PCA(n_components=n_components)
fit = pca.fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled) 
X_test_pca = pca.transform(X_test_scaled) 
explained_variance_ratio = pca.explained_variance_ratio_
print(f'Explained variance ratio: {explained_variance_ratio}')

Explained variance ratio: [0.1418236  0.13346578 0.13186342 0.06645766 0.05094044 0.0435478
 0.02643279 0.02581272 0.02249296 0.01922904 0.01687332 0.01516168
 0.01455947 0.01365796 0.0130504  0.01234575 0.01192939 0.01092307
 0.01025243 0.00964728 0.00959758 0.00941185 0.00906757 0.00887952
 0.00869554 0.00842818 0.00800145 0.00758367 0.00722656 0.0063168
 0.00591068 0.00555888 0.00505057 0.00470645 0.00462963 0.00447808
 0.00426348 0.00410619 0.00403767 0.00374759 0.00346468 0.00319902
 0.00307581 0.00305962 0.00298728 0.00295126 0.00290769 0.00289028
 0.00279182 0.00273185]


In [14]:
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=50, activation = 'relu',
                    solver='adam', verbose=0, random_state=2,learning_rate = 'invscaling', learning_rate_init=0.001, alpha = 0.01, validation_fraction=0.1)
mlp.fit(X_train_pca, y_train)

  y = column_or_1d(y, warn=True)


In [15]:
y_pred = mlp.predict(X_val_pca)
f1 = f1_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
precision = precision_score(y_val, y_pred, average='binary')

y_pred_probs = mlp.predict_proba(X_val_pca)
roc_auc = roc_auc_score(y_val, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_val, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_base_pca = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_base_pca)

[0.8733509234828496, 0.6860103626943005, 0.7684271619268718, 0.8181896195732928, 0.9914745900183219]


## Results

In [22]:
all_metrics = [metrics_base, metrics_base_pca]

df_metrics = pd.DataFrame(all_metrics, columns=['precision', 'recall', 'f1', 'PR_AUC', 'ROC_AUC'])

row_names = ['MLP Base Model','MLP Base Model with PCA']

df_metrics.index = row_names

df_metrics

Unnamed: 0,precision,recall,f1,PR_AUC,ROC_AUC
MLP Base Model,0.888462,0.718135,0.794269,0.817285,0.988287
MLP Base Model with PCA,0.873351,0.68601,0.768427,0.81819,0.991475


Stick to model without PCA based on lower f1 score, lower ROC_AUC and PR_AUC score

## MLP with random oversampling

In [17]:
ros = RandomOverSampler(random_state=42)

In [27]:
X_over, y_over = ros.fit_resample(X_train_scaled, y_train)

In [28]:
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=50, activation = 'relu',
                    solver='adam', verbose=0, random_state=2,learning_rate = 'invscaling', learning_rate_init=0.001, alpha = 0.01, validation_fraction=0.1)
mlp.fit(X_over, y_over)

  y = column_or_1d(y, warn=True)


In [29]:
y_pred = mlp.predict(X_val_scaled)
f1 = f1_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
precision = precision_score(y_val, y_pred, average='binary')

y_pred_probs = mlp.predict_proba(X_val_scaled)
roc_auc = roc_auc_score(y_val, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_val, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_oversampling = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_oversampling)

[0.1812732534105002, 0.9088082901554404, 0.30225745304153023, 0.7941185653333881, 0.9926223683238987]


## Random undersampling

In [30]:
rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X_train_scaled, y_train)

In [31]:
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=50, activation = 'relu',
                    solver='adam', verbose=0, random_state=2,learning_rate = 'invscaling', learning_rate_init=0.001, alpha = 0.01, validation_fraction=0.1)
mlp.fit(X_under, y_under)

  y = column_or_1d(y, warn=True)


In [32]:
y_pred = mlp.predict(X_val_scaled)
f1 = f1_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
precision = precision_score(y_val, y_pred, average='binary')

y_pred_probs = mlp.predict_proba(X_val_scaled)
roc_auc = roc_auc_score(y_val, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_val, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_undersampling = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_undersampling)

[0.07375932013148401, 0.9533678756476683, 0.136925137669296, 0.6596101505876483, 0.9888954446692667]


## MLP with SMOTE

In [33]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

In [34]:
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=50, activation = 'relu',
                    solver='adam', verbose=0, random_state=2,learning_rate = 'invscaling', learning_rate_init=0.001, alpha = 0.01, validation_fraction=0.1)
mlp.fit(X_train_smote, y_train_smote)

  y = column_or_1d(y, warn=True)


In [35]:
y_pred = mlp.predict(X_val_scaled)
f1 = f1_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
precision = precision_score(y_val, y_pred, average='binary')

y_pred_probs = mlp.predict_proba(X_val_scaled)
roc_auc = roc_auc_score(y_val, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_val, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_smote = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_smote)

[0.6257408975444538, 0.7658031088082902, 0.6887232059645852, 0.7831155591371928, 0.9889320988046579]


## Results on validation

In [36]:
all_metrics =[metrics_base, metrics_oversampling, metrics_undersampling, metrics_smote]

df_metrics = pd.DataFrame(all_metrics, columns=['precision', 'recall', 'f1', 'PR_AUC', 'ROC_AUC'])

row_names = ['MLP Base Model','MLP Oversampling', 'MLP Undersampling', 'MLP SMOTE']

df_metrics.index = row_names

df_metrics 

Unnamed: 0,precision,recall,f1,PR_AUC,ROC_AUC
MLP Base Model,0.888462,0.718135,0.794269,0.817285,0.988287
MLP Oversampling,0.181273,0.908808,0.302257,0.794119,0.992622
MLP Undersampling,0.073759,0.953368,0.136925,0.65961,0.988895
MLP SMOTE,0.625741,0.765803,0.688723,0.783116,0.988932


## Hyperparameter tuning based on results:

In [37]:
param_grid = {
    'hidden_layer_sizes': [(10,10),(20, 20)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.01, 0.1],
    'learning_rate': ['invscaling', 'adaptive'],
    'max_iter': [50, 1000],
    'random_state': [1],
    'learning_rate_init': [0.001, 0.1],
    'validation_fraction': [0.1, 0.2]
}

mlp = MLPClassifier()

grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='f1', n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

best_mlp = grid_search.best_estimator_

  y = column_or_1d(y, warn=True)


In [38]:
best_params = best_mlp.get_params()

for param_name, param_value in best_params.items():
    print(f'{param_name}: {param_value}')

activation: tanh
alpha: 0.0001
batch_size: auto
beta_1: 0.9
beta_2: 0.999
early_stopping: False
epsilon: 1e-08
hidden_layer_sizes: (20, 20)
learning_rate: invscaling
learning_rate_init: 0.001
max_fun: 15000
max_iter: 50
momentum: 0.9
n_iter_no_change: 10
nesterovs_momentum: True
power_t: 0.5
random_state: 1
shuffle: True
solver: adam
tol: 0.0001
validation_fraction: 0.1
verbose: False
warm_start: False


In [39]:
y_pred = best_mlp.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
precision = precision_score(y_test, y_pred, average='binary')

y_pred_probs = best_mlp.predict_proba(X_test_scaled)

roc_auc = roc_auc_score(y_test, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_test, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_after_tuning = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_after_tuning)

[0.9348591549295775, 0.550259067357513, 0.6927592954990215, 0.7105937802133576, 0.9798609056778314]


In [40]:
final =[metrics_after_tuning]
df_final_metrics = pd.DataFrame(final, columns=['precision', 'recall', 'f1', 'PR_AUC', 'ROC_AUC'])

df_final_metrics

Unnamed: 0,precision,recall,f1,PR_AUC,ROC_AUC
0,0.934859,0.550259,0.692759,0.710594,0.979861
