In [1]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import precision_recall_curve, auc
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE


In [2]:
X = pd.read_csv(r"C:\Users\ngche\Downloads\X.csv")
y = pd.read_csv(r"C:\Users\ngche\Downloads\y.csv")


In [3]:
len(X['job'].unique())

497

transformer used for job classification

In [7]:
data = pd.read_csv(r"C:\Users\ngche\Downloads\fraudDataset_jobs_classified.csv")


In [6]:
data.columns

Index(['transaction_time', 'credit_card_number', 'merchant', 'category',
       'amount(usd)', 'first', 'last', 'gender', 'street', 'city', 'state',
       'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'transaction_id',
       'merch_lat', 'merch_long', 'is_fraud', 'time', 'hour_of_day',
       'day_of_week', 'month', 'year', 'age', 'age_group',
       'latitudinal_distance', 'longitudinal_distance', 'job_category'],
      dtype='object')

In [8]:
Y = data['is_fraud']

In [71]:
X_encoded = pd.get_dummies(X,columns = ['category', 'gender', 'state', 'day_of_week', 'job_categories'])
X_encoded = X_encoded.drop('job', axis=1)


In [72]:
len(X_encoded.columns)

90

Dimensionality is pretty high, might want to use PCA

In [73]:
y = y.values.ravel() if hasattr(y, "values") else y.ravel()

X_train, X_temp, y_train, y_temp = train_test_split(X_encoded, y, stratify=y, test_size=0.2, random_state=42) 

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, stratify=y_temp, test_size=0.5, random_state=42)


## MLP Model no under/oversampling

In [74]:
scaler = StandardScaler()

columns_to_scale = ['age', 'lat', 'long', 'amount(usd)']

X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])

X_val_scaled = X_val.copy()
X_val_scaled[columns_to_scale] = scaler.fit_transform(X_val[columns_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.fit_transform(X_test[columns_to_scale])

In [75]:
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=50, activation = 'relu',
                    solver='adam', verbose=0, random_state=2,learning_rate = 'invscaling', learning_rate_init=0.001, alpha = 0.01, validation_fraction=0.1)
mlp.fit(X_train_scaled, y_train)

In [76]:
y_pred = mlp.predict(X_val_scaled)
f1 = f1_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
precision = precision_score(y_val, y_pred, average='binary')

y_pred_probs = mlp.predict_proba(X_val_scaled)
roc_auc = roc_auc_score(y_val, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_val, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_base = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_base)

[0.900497512437811, 0.5626943005181347, 0.9970988693269317, 0.715025659624779, 0.9783188933397838]


## MLP Model with PCA

In [77]:
pca = PCA().fit(X_train_scaled)
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_explained_variance >= 0.95) + 1 
n_components

3

In [78]:
pca = PCA(n_components=n_components)
fit = pca.fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled) 
X_test_pca = pca.transform(X_test_scaled) 
explained_variance_ratio = pca.explained_variance_ratio_
print(f'Explained variance ratio: {explained_variance_ratio}')

Explained variance ratio: [0.70624366 0.17231495 0.10034361]


In [79]:
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=50, activation = 'relu',
                    solver='adam', verbose=0, random_state=2,learning_rate = 'invscaling', learning_rate_init=0.001, alpha = 0.01, validation_fraction=0.1)
mlp.fit(X_train_pca, y_train)

In [80]:
y_pred = mlp.predict(X_val_pca)
f1 = f1_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
precision = precision_score(y_val, y_pred, average='binary')

y_pred_probs = mlp.predict_proba(X_val_pca)
roc_auc = roc_auc_score(y_val, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_val, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_base_pca = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_base_pca)

[0.0, 0.0, 0.9921925732199355, 0.020937082381013127, 0.8157217195322058]


  _warn_prf(average, modifier, msg_start, len(result))


## Results

In [81]:
all_metrics = [metrics_base, metrics_base_pca]

df_metrics = pd.DataFrame(all_metrics, columns=['precision', 'recall', 'weighted f1', 'PR_AUC', 'ROC_AUC'])

row_names = ['MLP Base Model','MLP Base Model with PCA']

df_metrics.index = row_names

df_metrics

Unnamed: 0,precision,recall,weighted f1,PR_AUC,ROC_AUC
MLP Base Model,0.900498,0.562694,0.997099,0.715026,0.978319
MLP Base Model with PCA,0.0,0.0,0.992193,0.020937,0.815722


Stick to model without PCA based on lower f1 score, lower ROC_AUC and PR_AUC score

## MLP with random oversampling

In [82]:
ros = RandomOverSampler(random_state=42)

In [83]:
X_over, y_over = ros.fit_resample(X_train, y_train)

In [84]:
X_over = scaler.fit_transform(X_over)

In [85]:
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=50, activation = 'relu',
                    solver='adam', verbose=0, random_state=2,learning_rate = 'invscaling', learning_rate_init=0.001, alpha = 0.01, validation_fraction=0.1)
mlp.fit(X_over, y_over)

In [86]:
y_pred = mlp.predict(X_val_scaled)
f1 = f1_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
precision = precision_score(y_val, y_pred, average='binary')

y_pred_probs = mlp.predict_proba(X_val_scaled)
roc_auc = roc_auc_score(y_val, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_val, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_oversampling = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_oversampling)



[0.461839530332681, 0.24455958549222798, 0.9937497417898857, 0.25857458791303484, 0.827915976214964]




## Random undersampling

In [87]:
rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X_train, y_train)

In [88]:
X_under = scaler.fit_transform(X_under)

In [89]:
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=50, activation = 'relu',
                    solver='adam', verbose=0, random_state=2,learning_rate = 'invscaling', learning_rate_init=0.001, alpha = 0.01, validation_fraction=0.1)
mlp.fit(X_under, y_under)



In [90]:
y_pred = mlp.predict(X_val_scaled)
f1 = f1_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
precision = precision_score(y_val, y_pred, average='binary')

y_pred_probs = mlp.predict_proba(X_val_scaled)
roc_auc = roc_auc_score(y_val, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_val, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_undersampling = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_undersampling)



[0.004769526940797301, 0.9139896373056995, 0.002550033131364358, 0.21008233823050318, 0.6234503435158313]




## MLP with SMOTE

In [91]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [92]:
X_train_smote = scaler.fit_transform(X_train_smote)

In [93]:
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=50, activation = 'relu',
                    solver='adam', verbose=0, random_state=2,learning_rate = 'invscaling', learning_rate_init=0.001, alpha = 0.01, validation_fraction=0.1)
mlp.fit(X_train_smote, y_train_smote)

In [94]:
y_pred = mlp.predict(X_val_scaled)
f1 = f1_score(y_val, y_pred, average='binary')
recall = recall_score(y_val, y_pred, average='binary')
precision = precision_score(y_val, y_pred, average='binary')

y_pred_probs = mlp.predict_proba(X_val_scaled)
roc_auc = roc_auc_score(y_val, y_pred_probs[:, 1])

precision_points, recall_points, thresholds = precision_recall_curve(y_val, y_pred_probs[:, 1])

pr_auc = auc(recall_points, precision_points)

metrics_smote = [precision, recall, f1, pr_auc, roc_auc]

print(metrics_smote)



[0.0051524398476951744, 0.9886010362694301, 0.0008412166291088607, 0.33456867785828237, 0.41423831238917097]




## Results on validation

In [95]:
all_metrics =[metrics_base, metrics_oversampling, metrics_undersampling, metrics_smote]

df_metrics = pd.DataFrame(all_metrics, columns=['precision', 'recall', 'f1', 'PR_AUC', 'ROC_AUC'])

row_names = ['MLP Base Model','MLP Oversampling', 'MLP Undersampling', 'MLP SMOTE']

df_metrics.index = row_names

df_metrics 

Unnamed: 0,precision,recall,f1,PR_AUC,ROC_AUC
MLP Base Model,0.900498,0.562694,0.997099,0.715026,0.978319
MLP Oversampling,0.46184,0.24456,0.99375,0.258575,0.827916
MLP Undersampling,0.00477,0.91399,0.00255,0.210082,0.62345
MLP SMOTE,0.005152,0.988601,0.000841,0.334569,0.414238


## Hyperparameter tuning based on results:

In [96]:
param_grid = {
    'hidden_layer_sizes': [(10,10),(20, 20)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.01, 0.1],
    'learning_rate': ['invscaling', 'adaptive'],
    'max_iter': [50, 1000],
    'random_state': [1],
    'learning_rate_init': [0.001, 0.1],
    'validation_fraction': [0.1, 0.2]
}

mlp = MLPClassifier()

grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

best_mlp = grid_search.best_estimator_

In [None]:
best_params = best_mlp.get_params()

for param_name, param_value in best_params.items():
    print(f'{param_name}: {param_value}')

In [None]:
y_pred = best_mlp.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
precision = precision_score(y_test, y_pred, average='binary')

y_pred_probs = best_mlp.predict_proba(X_test_scaled)

roc_auc = roc_auc_score(y_test, y_pred_probs[:, 1])

metrics_after_tuning = [accuracy, f1, recall, precision, roc_auc]

print(metrics_after_tuning)

[0.9975437270567912, 0.9972086882037158, 0.5492227979274611, 0.9636363636363636, 0.9897045120810234]


In [None]:
final =[metrics_after_tuning]
df_final_metrics = pd.DataFrame(final, columns=['precision', 'recall', 'f1', 'PR_AUC', 'ROC_AUC'])

df_final_metrics

Unnamed: 0,precision,recall,f1,PR_AUC,ROC_AUC
0,0.997544,0.997209,0.549223,0.963636,0.989705
