In [1]:
# necessary libraries
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# machine learning models
from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

import cudf as df
from cuml.linear_model import LogisticRegression

# import tensorflow as tf

In [2]:
# Read the four CSV files into DataFrames: resampled training features/labels
# and the test features/labels.
csv_files = ("X_resampled.csv", "y_resampled.csv", "X_test.csv", "y_test.csv")
X_resampled, y_resampled, X_test, y_test = map(pd.read_csv, csv_files)

In [3]:
# Report the (rows, columns) shape of every loaded frame
for caption, frame in (
    ("Shape of X_resampled:", X_resampled),
    ("Shape of y_resampled:", y_resampled),
    ("Shape of X_test:", X_test),
    ("Shape of y_test:", y_test),
):
    print(caption, frame.shape)

Shape of X_resampled: (2461812, 8)
Shape of y_resampled: (2461812, 1)
Shape of X_test: (555719, 8)
Shape of y_test: (555719, 1)


#### Standardization - The process of transforming data features to have a mean of zero and a standard deviation of one

In [4]:
# Fit the standardizer on the training features only, then apply that same
# fitted transform to both splits so the test set is scaled with training
# statistics.
scaler = StandardScaler()
scaler.fit(X_resampled)
X_resampled_scaled = scaler.transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Display the training labels frame (a single `is_fraud` column) as the cell output
y_resampled

Unnamed: 0,is_fraud
0,0
1,0
2,0
3,0
4,0
...,...
2461807,1
2461808,1
2461809,1
2461810,1


#### Normalization - Scaling individual data features to a specific range, typically [0, 1] or [-1, 1], to ensure uniformity

In [6]:
# Fit the min-max scaler on the training features only, then reuse the fitted
# ranges for both splits. NOTE: this rebinds `scaler`, replacing the
# StandardScaler instance created in the previous section.
scaler = MinMaxScaler()
scaler.fit(X_resampled)
X_resampled_scaledN = scaler.transform(X_resampled)
X_test_scaledN = scaler.transform(X_test)

---
## 1) Logistic Regression (LG)

In [7]:
# Hyperparameter grid for Logistic Regression: six values of C crossed with
# the L2 penalty (6 candidates in total).
# 'solver' is not part of the search, so the estimator's default solver is used.
param_grid_lr = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l2'],
)

#### a) LG with standardized data

In [8]:
# Start the timer: wall-clock timestamp (seconds since the epoch) that the
# later elapsed-time cell subtracts from to report the duration of the
# Logistic Regression (standardized) search.
start_time = time.time()

In [None]:
# Exhaustive 3-fold grid search over param_grid_lr on the standardized features.
# NOTE(review): LogisticRegression is the cuML (GPU) estimator imported at the
# top of the notebook; confirm that n_jobs=-1 worker processes are safe with it.
grid_search_lr_scaled = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# y_resampled is a single-column frame; flatten it to the 1-D target fit() expects
train_labels = y_resampled.values.ravel()
grid_search_lr_scaled.fit(X_resampled_scaled, train_labels)

best_params_lr_scaled, best_score_lr_scaled = (
    grid_search_lr_scaled.best_params_,
    grid_search_lr_scaled.best_score_,
)

print("Best parameters for Logistic Regression with Standardized data: ", best_params_lr_scaled)
print("Best score for Logistic Regression with Standardized data: ", best_score_lr_scaled)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [None]:
# Stop the clock and report the elapsed wall-clock time in minutes
end_time = time.time()
elapsed_seconds = end_time - start_time
execution_time = elapsed_seconds / 60

print(f"Hyperparameter tuning for Logistic regression with Standardization execution completed in {execution_time:.2f} minutes.")

#### b) LG with normalized data

In [None]:
# Start the timer: wall-clock timestamp (seconds since the epoch) that the
# later elapsed-time cell subtracts from to report the duration of the
# Logistic Regression (normalized) search.
start_time = time.time()

In [None]:
# Exhaustive 3-fold grid search over param_grid_lr on the min-max scaled features.
# NOTE(review): LogisticRegression is the cuML (GPU) estimator imported at the
# top of the notebook; confirm that n_jobs=-1 worker processes are safe with it.
grid_search_lr_normalized = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# y_resampled is a single-column frame; flatten it to the 1-D target fit() expects
train_labels = y_resampled.values.ravel()
grid_search_lr_normalized.fit(X_resampled_scaledN, train_labels)

best_params_lr_normalized, best_score_lr_normalized = (
    grid_search_lr_normalized.best_params_,
    grid_search_lr_normalized.best_score_,
)

print("Best parameters for Logistic Regression with Normalized data: ", best_params_lr_normalized)
print("Best score for Logistic Regression with Normalized data: ", best_score_lr_normalized)

In [None]:
# Stop the clock and report the elapsed wall-clock time in minutes
end_time = time.time()
elapsed_seconds = end_time - start_time
execution_time = elapsed_seconds / 60

print(f"Hyperparameter tuning for Logistic regression with Normalized execution completed in {execution_time:.2f} minutes.")

In [None]:
# Score the best standardized-data Logistic Regression on the held-out test set
best_lr_scaled = grid_search_lr_scaled.best_estimator_
y_pred_lr_scaled = best_lr_scaled.predict(X_test_scaled)

# Each metric is computed and printed in turn: accuracy, per-class report, confusion matrix
for caption, metric_fn in (
    ("Logistic Regression (Standardized) Accuracy: ", accuracy_score),
    ("Logistic Regression (Standardized) Classification Report:\n", classification_report),
    ("Logistic Regression (Standardized) Confusion Matrix:\n", confusion_matrix),
):
    print(caption, metric_fn(y_test, y_pred_lr_scaled))

In [None]:
# Score the best normalized-data Logistic Regression on the held-out test set
best_lr_normalized = grid_search_lr_normalized.best_estimator_
y_pred_lr_normalized = best_lr_normalized.predict(X_test_scaledN)

# Each metric is computed and printed in turn: accuracy, per-class report, confusion matrix
for caption, metric_fn in (
    ("Logistic Regression (Normalized) Accuracy: ", accuracy_score),
    ("Logistic Regression (Normalized) Classification Report:\n", classification_report),
    ("Logistic Regression (Normalized) Confusion Matrix:\n", confusion_matrix),
):
    print(caption, metric_fn(y_test, y_pred_lr_normalized))

---
## 2) Decision Trees

In [4]:
# Hyperparameter grid for the Decision Tree search.
# 5 * 3 * 3 * 3 * 2 = 270 candidate combinations.
param_grid_dt = dict(
    max_depth=[5, 10, 20, 50, None],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
    max_features=[None, 'sqrt', 'log2'],
    criterion=['gini', 'entropy'],
)

In [11]:
# Start the timer: wall-clock timestamp (seconds since the epoch) that the
# later elapsed-time cell subtracts from to report the duration of the
# Decision Tree search.
start_time = time.time()

In [12]:
# Exhaustive 3-fold grid search over param_grid_dt on the unscaled features
grid_search_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid_dt,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# y_resampled is a single-column frame; flatten it to the 1-D target fit() expects
train_labels = y_resampled.values.ravel()
grid_search_dt.fit(X_resampled, train_labels)

best_params_dt, best_score_dt = (
    grid_search_dt.best_params_,
    grid_search_dt.best_score_,
)

print("Best parameters for Decision Tree: ", best_params_dt)
print("Best score for Decision Tree: ", best_score_dt)

Fitting 3 folds for each of 270 candidates, totalling 810 fits
Best parameters for Decision Tree:  {'criterion': 'entropy', 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
Best score for Decision Tree:  0.9918722469465582


In [13]:
# Stop the clock and report the elapsed wall-clock time in minutes
end_time = time.time()
elapsed_seconds = end_time - start_time
execution_time = elapsed_seconds / 60

print(f"Hyperparameter tuning for Decision Tree execution completed in {execution_time:.2f} minutes.")

Hyperparameter tuning for Decision Tree execution completed in 13.84 minutes.


In [14]:
# Score the best Decision Tree found by the search on the held-out test set
best_dt = grid_search_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)

# Each metric is computed and printed in turn: accuracy, per-class report, confusion matrix
for caption, metric_fn in (
    ("Decision Tree Accuracy: ", accuracy_score),
    ("Decision Tree Classification Report:\n", classification_report),
    ("Decision Tree Confusion Matrix:\n", confusion_matrix),
):
    print(caption, metric_fn(y_test, y_pred_dt))

Decision Tree Accuracy:  0.9078005250855199
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.91      0.95    553574
           1       0.03      0.65      0.05      2145

    accuracy                           0.91    555719
   macro avg       0.51      0.78      0.50    555719
weighted avg       0.99      0.91      0.95    555719

Decision Tree Confusion Matrix:
 [[503094  50480]
 [   757   1388]]


---
## 3) Random Forest

In [10]:
# Hyperparameter grid for the Random Forest search.
# 3 * 5 * 3 * 3 * 3 * 2 * 2 = 1620 candidate combinations.
param_grid_rf = dict(
    n_estimators=[100, 200, 500],
    max_depth=[5, 10, 20, 50, None],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
    max_features=[None, 'sqrt', 'log2'],
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
)

In [11]:
# Start the timer: wall-clock timestamp (seconds since the epoch) that the
# later elapsed-time cell subtracts from to report the duration of the
# Random Forest search.
start_time = time.time()

In [None]:
# Exhaustive 3-fold grid search over param_grid_rf on the unscaled features.
#
# No TensorFlow device scope is used here: `tf` is never imported in this
# notebook (both imports are commented out), so wrapping the search in
# `with tf.device(...)` raises NameError on a fresh kernel before any fit
# starts. scikit-learn estimators are also not governed by TensorFlow device
# placement, so the wrapper had no effect on where the forest is trained.
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid_rf,
    cv=3,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)

# y_resampled is a single-column frame; flatten it to the 1-D target fit() expects
grid_search_rf.fit(X_resampled, y_resampled.values.ravel())

# Keep the winning configuration and its mean cross-validated accuracy
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_

print("Best parameters for Random Forest: ", best_params_rf)
print("Best score for Random Forest: ", best_score_rf)

Fitting 3 folds for each of 1620 candidates, totalling 4860 fits


In [None]:
# Stop the clock and report the elapsed wall-clock time in minutes
end_time = time.time()
elapsed_seconds = end_time - start_time
execution_time = elapsed_seconds / 60

print(f"Hyperparameter tuning for Random Forest execution completed in {execution_time:.2f} minutes.")

In [None]:
# Score the best Random Forest found by the search on the held-out test set
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Each metric is computed and printed in turn: accuracy, per-class report, confusion matrix
for caption, metric_fn in (
    ("Random Forest Accuracy: ", accuracy_score),
    ("Random Forest Classification Report:\n", classification_report),
    ("Random Forest Confusion Matrix:\n", confusion_matrix),
):
    print(caption, metric_fn(y_test, y_pred_rf))

---
## 4) Gradient Boosting Machines (GBM)

In [None]:
# Hyperparameter grid for the Gradient Boosting search.
# 3 * 3 * 4 * 3 * 3 * 3 * 2 = 1944 candidate combinations.
param_grid_gbm = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
    max_features=[None, 'sqrt', 'log2'],
    subsample=[0.8, 1.0],
)

In [None]:
# Start the timer: wall-clock timestamp (seconds since the epoch) that the
# later elapsed-time cell subtracts from to report the duration of the
# Gradient Boosting search.
start_time = time.time()

In [None]:
# Exhaustive 3-fold grid search over param_grid_gbm on the unscaled features
grid_search_gbm = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=param_grid_gbm,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# y_resampled is a single-column frame; flatten it to the 1-D target fit() expects
train_labels = y_resampled.values.ravel()
grid_search_gbm.fit(X_resampled, train_labels)

best_params_gbm, best_score_gbm = (
    grid_search_gbm.best_params_,
    grid_search_gbm.best_score_,
)

print("Best parameters for Gradient Boosting Machine: ", best_params_gbm)
print("Best score for Gradient Boosting Machine: ", best_score_gbm)

In [None]:
# Stop the clock and report the elapsed wall-clock time in minutes
end_time = time.time()
elapsed_seconds = end_time - start_time
execution_time = elapsed_seconds / 60

print(f"Hyperparameter tuning for GBM execution completed in {execution_time:.2f} minutes.")

In [None]:
# Score the best Gradient Boosting model found by the search on the held-out test set
best_gbm = grid_search_gbm.best_estimator_
y_pred_gbm = best_gbm.predict(X_test)

# Each metric is computed and printed in turn: accuracy, per-class report, confusion matrix
for caption, metric_fn in (
    ("Gradient Boosting Machine Accuracy: ", accuracy_score),
    ("Gradient Boosting Machine Classification Report:\n", classification_report),
    ("Gradient Boosting Machine Confusion Matrix:\n", confusion_matrix),
):
    print(caption, metric_fn(y_test, y_pred_gbm))

---
## 5) XGBoost Classifier

In [None]:
# Hyperparameter grid for the XGBoost search.
# 3 * 3 * 4 * 3 * 2 * 2 * 4 = 1728 candidate combinations.
param_grid_xgb = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7, 10],
    min_child_weight=[1, 5, 10],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
)

In [None]:
# Start the timer: wall-clock timestamp (seconds since the epoch) that the
# later elapsed-time cell subtracts from to report the duration of the
# XGBoost search.
start_time = time.time()

In [None]:
# Base XGBoost classifier evaluated with log-loss during training.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent
# xgboost releases; confirm against the installed version.
xgb_estimator = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Exhaustive 3-fold grid search over param_grid_xgb on the unscaled features
grid_search_xgb = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# y_resampled is a single-column frame; flatten it to the 1-D target fit() expects
train_labels = y_resampled.values.ravel()
grid_search_xgb.fit(X_resampled, train_labels)

best_params_xgb, best_score_xgb = (
    grid_search_xgb.best_params_,
    grid_search_xgb.best_score_,
)

print("Best parameters for XGBoost: ", best_params_xgb)
print("Best score for XGBoost: ", best_score_xgb)

In [None]:
# Stop the clock and report the elapsed wall-clock time in minutes
end_time = time.time()
elapsed_seconds = end_time - start_time
execution_time = elapsed_seconds / 60

print(f"Hyperparameter tuning for XGBoost execution completed in {execution_time:.2f} minutes.")

In [None]:
# Score the best XGBoost model found by the search on the held-out test set
best_xgb = grid_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# Each metric is computed and printed in turn: accuracy, per-class report, confusion matrix
for caption, metric_fn in (
    ("XGBoost Accuracy: ", accuracy_score),
    ("XGBoost Classification Report:\n", classification_report),
    ("XGBoost Confusion Matrix:\n", confusion_matrix),
):
    print(caption, metric_fn(y_test, y_pred_xgb))