# Credit Card Fraud Detection

# Task 2: Predictive Modelling

## Required libraries 

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from adjustText import adjust_text
from geopy.distance import geodesic
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report
from xgboost import XGBClassifier
import pickle
import os
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

##

## Load training and test dataset 

In [3]:
with open("variables/X_train.pkl", "rb") as f:
    X_train = pickle.load(f)

with open("variables/y_train.pkl", "rb") as f:
    y_train = pickle.load(f)

with open("variables/X_test.pkl", "rb") as f:
    X_test = pickle.load(f)

with open("variables/y_test.pkl", "rb") as f:
    y_test = pickle.load(f)

with open("variables/kaggle_data.pkl", "rb") as f:
    kaggle_data = pickle.load(f)

with open("variables/index_mapping.pkl", "rb") as f:
    index_mapping = pickle.load(f)

## Class Imbalance - SMOTE and model pipeline

In [5]:
oversampling_rates = [0.5,0.6,0.7]  # Fraction of majority class
undersampling_rates = [0.8,0.9] # Fraction of total dataset for the majority class



def model_pipeline(model, submission_file, use_random_search=False, param_grid=None, n_iter=10):
    results = []
    best_model = None
    best_auc = 0 
    best_config = None 

    for oversampling_rate in oversampling_rates:
        for undersampling_rate in undersampling_rates:
            print(f"\nTesting Oversampling={oversampling_rate}, Undersampling={undersampling_rate}")
            
            # Define SMOTE and undersampler
            smote = SMOTE(sampling_strategy=oversampling_rate, random_state=42)
            undersampler = RandomUnderSampler(sampling_strategy=undersampling_rate, random_state=42)
            
            # Create pipeline
            pipeline = Pipeline(steps=[
                ('smote', smote),
                ('undersampler', undersampler),
                ('model', model)
            ])
            
            if use_random_search:
                search = RandomizedSearchCV(
                    estimator=pipeline,
                    param_distributions=param_grid,  # Prefix for model params
                    n_iter=n_iter,
                    scoring='roc_auc',
                    cv=5,
                    n_jobs=-1,
                    random_state=42 
                )
                # Train with hyperparameter tuning
                search.fit(X_train, y_train)
                best_pipeline = search.best_estimator_
                best_params = search.best_params_
                print(f"Best Parameters for this iteration: {best_params}")
            else:
                # Train pipeline without hyperparameter search
                pipeline.fit(X_train, y_train)
                best_pipeline = pipeline
                best_params = "Default parameters"


            
            # Predict on the test set
            y_pred = best_pipeline.predict(X_test)
            y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]  # Get probabilities for AUC

            
            # Evaluate model
            print("Classification Report:")
            report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
            
            
            # Confusion matrix
            print("Confusion Matrix:")
            print(confusion_matrix(y_test, y_pred))
            
            # Calculate AUC
            auc_score = roc_auc_score(y_test, y_pred_proba)
           

            # Store results
            results.append({
                'oversampling_rate': oversampling_rate,
                'undersampling_rate': undersampling_rate,
                'precision': report['1']['precision'],
                'recall': report['1']['recall'],
                'f1_score': report['1']['f1-score'],
                'auc': auc_score
            })

            # Check if current model is the best
            if auc_score > best_auc:
                best_auc = auc_score
                best_model = best_pipeline
                best_config = {
                    'oversampling_rate': oversampling_rate,
                    'undersampling_rate': undersampling_rate
                }

    # Print best configuration
    print("\nBest Configuration:")
    print(f"Oversampling Rate: {best_config['oversampling_rate']}")
    print(f"Undersampling Rate: {best_config['undersampling_rate']}")
    print(f"Best AUC: {best_auc:.4f}")

    # Predict probabilities for Kaggle submission
    test_probs = best_model.predict_proba(kaggle_data)[:, 1]  # Probabilities for class 1 (fraud)

    # Create submission DataFrame
    submission = pd.DataFrame({
        'index': index_mapping,
        'is_fraud': test_probs
    })

    # Save to CSV
    submission_file_name = f"{submission_file}.csv"
    submission.to_csv(f"submission/{submission_file_name}", index=False)
    print(f"Submission file created: '{submission_file_name}'")
    
    return results

## Random Forest Classifier

In [40]:
# Train a Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

model_pipeline(rf,"submission_random_forest")


Testing Oversampling=0.5, Undersampling=0.8
Classification Report:
Confusion Matrix:
[[5854   32]
 [ 114    0]]

Testing Oversampling=0.5, Undersampling=0.9
Classification Report:
Confusion Matrix:
[[5874   12]
 [ 114    0]]

Testing Oversampling=0.6, Undersampling=0.8
Classification Report:
Confusion Matrix:
[[5875   11]
 [ 114    0]]

Testing Oversampling=0.6, Undersampling=0.9
Classification Report:
Confusion Matrix:
[[5870   16]
 [ 114    0]]

Testing Oversampling=0.7, Undersampling=0.8
Classification Report:
Confusion Matrix:
[[5877    9]
 [ 114    0]]

Testing Oversampling=0.7, Undersampling=0.9
Classification Report:
Confusion Matrix:
[[5873   13]
 [ 114    0]]

Best Configuration:
Oversampling Rate: 0.5
Undersampling Rate: 0.9
Best AUC: 0.4676
Submission file created: 'submission_random_forest.csv'


[{'oversampling_rate': 0.5,
  'undersampling_rate': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.4527215933138997)},
 {'oversampling_rate': 0.5,
  'undersampling_rate': 0.9,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.4675642768150413)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.42593114199021165)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.9,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.4334206651525177)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.42143042366364436)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 0.9,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.45924241882313666)}]

## Random Search - Random Forest Classifier

In [41]:

rf = RandomForestClassifier(random_state=42)

param_distributions = {
        'model__n_estimators': [100, 200, 500,1000],      
        'model__max_depth': [None, 10, 20, 30],     
        'model__min_samples_split': [2, 5, 10],         
        'model__min_samples_leaf': [1, 2, 4, 5],                           
    }

model_pipeline(rf,"submission_random_search_random_forest",True,param_distributions)




Testing Oversampling=0.5, Undersampling=0.8


Best Parameters for this iteration: {'model__n_estimators': 100, 'model__min_samples_split': 5, 'model__min_samples_leaf': 4, 'model__max_depth': 10}
Classification Report:
Confusion Matrix:
[[5806   80]
 [ 113    1]]

Testing Oversampling=0.5, Undersampling=0.9
Best Parameters for this iteration: {'model__n_estimators': 100, 'model__min_samples_split': 5, 'model__min_samples_leaf': 4, 'model__max_depth': 10}
Classification Report:
Confusion Matrix:
[[5808   78]
 [ 114    0]]

Testing Oversampling=0.6, Undersampling=0.8
Best Parameters for this iteration: {'model__n_estimators': 100, 'model__min_samples_split': 5, 'model__min_samples_leaf': 4, 'model__max_depth': 10}
Classification Report:
Confusion Matrix:
[[5868   18]
 [ 114    0]]

Testing Oversampling=0.6, Undersampling=0.9
Best Parameters for this iteration: {'model__n_estimators': 100, 'model__min_samples_split': 5, 'model__min_samples_leaf': 4, 'model__max_depth': 10}
Classification Report:
Confusion Matrix:
[[5786  100]
 [ 114 

[{'oversampling_rate': 0.5,
  'undersampling_rate': 0.8,
  'precision': 0.012345679012345678,
  'recall': 0.008771929824561403,
  'f1_score': 0.010256410256410256,
  'auc': np.float64(0.4405584765515556)},
 {'oversampling_rate': 0.5,
  'undersampling_rate': 0.9,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.44447499567811816)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.4421985561934057)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.9,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.4528899976751256)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.4532551221751286)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 0.9,
  'precision': 0.015503875968992248,
  'recall': 0.017543859649122806,
  'f1_score': 0.01646090534979424,
  'auc': n

**Score on Kaggle:** 0.52296

## XGBOOST

In [42]:

xgb = XGBClassifier(n_estimators=500, max_depth=5, learning_rate=0.1, random_state=42)


model_pipeline(xgb,"submission_xgboost")


Testing Oversampling=0.5, Undersampling=0.8
Classification Report:
Confusion Matrix:
[[4479 1407]
 [  95   19]]

Testing Oversampling=0.5, Undersampling=0.9
Classification Report:
Confusion Matrix:
[[4418 1468]
 [  94   20]]

Testing Oversampling=0.6, Undersampling=0.8
Classification Report:
Confusion Matrix:
[[4739 1147]
 [  96   18]]

Testing Oversampling=0.6, Undersampling=0.9
Classification Report:
Confusion Matrix:
[[4543 1343]
 [  96   18]]

Testing Oversampling=0.7, Undersampling=0.8
Classification Report:
Confusion Matrix:
[[4480 1406]
 [  94   20]]

Testing Oversampling=0.7, Undersampling=0.9
Classification Report:
Confusion Matrix:
[[4469 1417]
 [  94   20]]

Best Configuration:
Oversampling Rate: 0.6
Undersampling Rate: 0.9
Best AUC: 0.4426
Submission file created: 'submission_xgboost.csv'


[{'oversampling_rate': 0.5,
  'undersampling_rate': 0.8,
  'precision': 0.01332398316970547,
  'recall': 0.16666666666666666,
  'f1_score': 0.024675324675324677,
  'auc': np.float64(0.4379914277709224)},
 {'oversampling_rate': 0.5,
  'undersampling_rate': 0.9,
  'precision': 0.013440860215053764,
  'recall': 0.17543859649122806,
  'f1_score': 0.024968789013732832,
  'auc': np.float64(0.43144899285250166)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.8,
  'precision': 0.015450643776824034,
  'recall': 0.15789473684210525,
  'f1_score': 0.028146989835809225,
  'auc': np.float64(0.4331725295229239)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.9,
  'precision': 0.01322556943423953,
  'recall': 0.15789473684210525,
  'f1_score': 0.02440677966101695,
  'auc': np.float64(0.4426158413362663)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 0.8,
  'precision': 0.014025245441795231,
  'recall': 0.17543859649122806,
  'f1_score': 0.025974025974025976,
  'auc': np.float64

**Score on Kaggle:** 

## Random Search - XGBOOST

In [43]:
xgb = XGBClassifier(n_estimators=500, max_depth=5, learning_rate=0.1, random_state=42)

param_distributions = {
        'model__n_estimators': [100, 200, 300, 500, 1000],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'model__max_depth': [3, 5, 7, 10],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__reg_alpha': [0, 0.1, 1, 10],
        'model__reg_lambda': [1, 10, 50],
        'model__min_child_weight': [1, 3, 5, 7]
    }


model_pipeline(xgb,"submission_random_search_xgboost",True, param_distributions)


Testing Oversampling=0.5, Undersampling=0.8




Best Parameters for this iteration: {'model__subsample': 1.0, 'model__reg_lambda': 1, 'model__reg_alpha': 0, 'model__n_estimators': 500, 'model__min_child_weight': 3, 'model__max_depth': 10, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.8}
Classification Report:
Confusion Matrix:
[[4178 1708]
 [  89   25]]

Testing Oversampling=0.5, Undersampling=0.9
Best Parameters for this iteration: {'model__subsample': 1.0, 'model__reg_lambda': 1, 'model__reg_alpha': 0, 'model__n_estimators': 500, 'model__min_child_weight': 3, 'model__max_depth': 10, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.8}
Classification Report:
Confusion Matrix:
[[4539 1347]
 [  97   17]]

Testing Oversampling=0.6, Undersampling=0.8
Best Parameters for this iteration: {'model__subsample': 1.0, 'model__reg_lambda': 1, 'model__reg_alpha': 0, 'model__n_estimators': 500, 'model__min_child_weight': 3, 'model__max_depth': 10, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.8}
Classification 

[{'oversampling_rate': 0.5,
  'undersampling_rate': 0.8,
  'precision': 0.014425851125216388,
  'recall': 0.21929824561403508,
  'f1_score': 0.02707092582566324,
  'auc': np.float64(0.4244505248850975)},
 {'oversampling_rate': 0.5,
  'undersampling_rate': 0.9,
  'precision': 0.012463343108504398,
  'recall': 0.14912280701754385,
  'f1_score': 0.023004059539918808,
  'auc': np.float64(0.43313825252904603)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.8,
  'precision': 0.014863258026159334,
  'recall': 0.21929824561403508,
  'f1_score': 0.02783964365256125,
  'auc': np.float64(0.4215936119605844)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.9,
  'precision': 0.016271501627150162,
  'recall': 0.30701754385964913,
  'f1_score': 0.03090507726269316,
  'auc': np.float64(0.43722541147295696)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 0.8,
  'precision': 0.01279317697228145,
  'recall': 0.15789473684210525,
  'f1_score': 0.023668639053254437,
  'auc': np.float64

**Score on Kaggle:** 0.52380

## Decision Tree


In [44]:
dt = DecisionTreeClassifier(random_state = 42)

parameter_grid = {
    'model__max_depth': [5, 10, 20, 30, 40, 50],
    'model__min_samples_split': [2, 5, 10, 20],  
    'model__min_samples_leaf': [1, 2, 4, 5],   
    'model__max_leaf_nodes': [None, 10, 20, 50, 100],
    'model__max_features': [1, 2, 3, 4, 5, 6, 7, 8,]
}

model_pipeline(dt,"submission_decision_tree",True, parameter_grid)


Testing Oversampling=0.5, Undersampling=0.8
Best Parameters for this iteration: {'model__min_samples_split': 10, 'model__min_samples_leaf': 2, 'model__max_leaf_nodes': 50, 'model__max_features': 8, 'model__max_depth': 40}
Classification Report:
Confusion Matrix:
[[4484 1402]
 [  92   22]]

Testing Oversampling=0.5, Undersampling=0.9
Best Parameters for this iteration: {'model__min_samples_split': 10, 'model__min_samples_leaf': 2, 'model__max_leaf_nodes': 50, 'model__max_features': 8, 'model__max_depth': 40}
Classification Report:
Confusion Matrix:
[[4703 1183]
 [  98   16]]

Testing Oversampling=0.6, Undersampling=0.8
Best Parameters for this iteration: {'model__min_samples_split': 5, 'model__min_samples_leaf': 4, 'model__max_leaf_nodes': None, 'model__max_features': 4, 'model__max_depth': 30}
Classification Report:
Confusion Matrix:
[[5219  667]
 [ 103   11]]

Testing Oversampling=0.6, Undersampling=0.9
Best Parameters for this iteration: {'model__min_samples_split': 10, 'model__min_

[{'oversampling_rate': 0.5,
  'undersampling_rate': 0.8,
  'precision': 0.01544943820224719,
  'recall': 0.19298245614035087,
  'f1_score': 0.02860858257477243,
  'auc': np.float64(0.4675978086568783)},
 {'oversampling_rate': 0.5,
  'undersampling_rate': 0.9,
  'precision': 0.013344453711426188,
  'recall': 0.14035087719298245,
  'f1_score': 0.024371667936024372,
  'auc': np.float64(0.4303990736269828)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.8,
  'precision': 0.016224188790560472,
  'recall': 0.09649122807017543,
  'f1_score': 0.027777777777777776,
  'auc': np.float64(0.4723302692681415)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.9,
  'precision': 0.018815331010452963,
  'recall': 0.23684210526315788,
  'f1_score': 0.03486120077469335,
  'auc': np.float64(0.4882191462345977)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 0.8,
  'precision': 0.015647226173541962,
  'recall': 0.09649122807017543,
  'f1_score': 0.02692778457772338,
  'auc': np.float64(0

## Neural Networks - Multi-Layer Perceptron (MLP)

In [45]:

mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)



model_pipeline(mlp,"submission_mlp")


Testing Oversampling=0.5, Undersampling=0.8




Classification Report:
Confusion Matrix:
[[5886    0]
 [ 114    0]]

Testing Oversampling=0.5, Undersampling=0.9




Classification Report:
Confusion Matrix:
[[5886    0]
 [ 114    0]]

Testing Oversampling=0.6, Undersampling=0.8
Classification Report:
Confusion Matrix:
[[5886    0]
 [ 114    0]]

Testing Oversampling=0.6, Undersampling=0.9
Classification Report:
Confusion Matrix:
[[5886    0]
 [ 114    0]]

Testing Oversampling=0.7, Undersampling=0.8




Classification Report:
Confusion Matrix:
[[5886    0]
 [ 114    0]]

Testing Oversampling=0.7, Undersampling=0.9
Classification Report:
Confusion Matrix:
[[5886    0]
 [ 114    0]]

Best Configuration:
Oversampling Rate: 0.5
Undersampling Rate: 0.8
Best AUC: 0.5000
Submission file created: 'submission_mlp.csv'




[{'oversampling_rate': 0.5,
  'undersampling_rate': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.5)},
 {'oversampling_rate': 0.5,
  'undersampling_rate': 0.9,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.5)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.5)},
 {'oversampling_rate': 0.6,
  'undersampling_rate': 0.9,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.5)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.5)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 0.9,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.5)}]

## Random Search - Neural Networks (MLP)

In [8]:

mlp = MLPClassifier( max_iter=300, random_state=42)


parameter_grid = {
    'model__hidden_layer_sizes': [(50,), (100,), (100, 50), (150, 100)],
    'model__activation': ['relu', 'tanh'],
    'model__solver': ['adam', 'sgd'],
    'model__alpha': [0.0001, 0.001, 0.01],
    'model__learning_rate': ['constant', 'adaptive']
}

model_pipeline(mlp,"submission_random_search_mlp",True,parameter_grid)


Testing Oversampling=0.5, Undersampling=0.8




KeyboardInterrupt: 

## Support Vector Machine (SVM)

In [None]:

svm = SVC(kernel='rbf', probability=True, random_state=42)


model_pipeline(svm,"submission_svm")


Testing Oversampling=0.5, Undersampling=0.8
Classification Report:
Confusion Matrix:
[[5886    0]
 [ 114    0]]

Testing Oversampling=0.5, Undersampling=1.0
Classification Report:
Confusion Matrix:
[[2070 3816]
 [  48   66]]

Testing Oversampling=0.7, Undersampling=0.8
Classification Report:
Confusion Matrix:
[[5886    0]
 [ 114    0]]

Testing Oversampling=0.7, Undersampling=1.0
Classification Report:
Confusion Matrix:
[[2070 3816]
 [  48   66]]

Best Configuration:
Oversampling Rate: 0.7
Undersampling Rate: 0.8
Best AUC: 0.5413
Submission file created: 'submission_svm.csv'


[{'oversampling_rate': 0.5,
  'undersampling_rate': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.4657274770344141)},
 {'oversampling_rate': 0.5,
  'undersampling_rate': 1.0,
  'precision': 0.017001545595054096,
  'recall': 0.5789473684210527,
  'f1_score': 0.03303303303303303,
  'auc': np.float64(0.5)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 0.8,
  'precision': 0.0,
  'recall': 0.0,
  'f1_score': 0.0,
  'auc': np.float64(0.541287384277888)},
 {'oversampling_rate': 0.7,
  'undersampling_rate': 1.0,
  'precision': 0.017001545595054096,
  'recall': 0.5789473684210527,
  'f1_score': 0.03303303303303303,
  'auc': np.float64(0.5)}]