## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings
from sklearn.preprocessing import StandardScaler, RobustScaler

#### Import the CSV Data as Pandas DataFrame

In [2]:
df = pd.read_csv('data/creditcard.csv')

#### Show Top 5 Records

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


#### 2. SCALING 'TIME' AND 'AMOUNT'

#### Preparing X and Y variables

In [4]:
X = df.drop(columns=['Class'], axis=1)
X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [5]:
y = df['Class']
y

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64

In [6]:
# Separate dataset into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((227845, 30), (56962, 30))

#### Prepping for data scalling

In [7]:
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numerical_features = ['Time', 'Amount']

# Create transformation pipeline for numerical value in case if we need to process numerical value through 
# multiple steps
num_pipeline = Pipeline(
    steps=[
        # Add to the pipeline if need to handle missing value
        #("imputer", SimpleImputer(strategy="median")),
        ("robust scaler", RobustScaler())
    ]
)


preprocessor = ColumnTransformer(
    [
        ("RobustScaler", num_pipeline, numerical_features)
    ]
)

In [43]:
# Drop pre process columns
train_scaled_value = preprocessor.fit_transform(X_train)
X_train.drop(['Time', 'Amount'], axis=1, inplace=True)

test_scaled_value = preprocessor.transform(X_test)
X_test.drop(['Time', 'Amount'], axis=1, inplace=True)

X_train[['scaled_time', 'scaled_value']] = train_scaled_value
X_test[['scaled_time', 'scaled_value']] = test_scaled_value

X_test


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,scaled_time,scaled_value
43428,-16.526507,8.584972,-18.649853,9.505594,-13.793819,-2.832404,-16.701694,7.517344,-8.507059,-14.110184,...,1.190739,-1.127670,-2.358579,0.673461,-1.413700,-0.462762,-2.018575,-1.042804,-0.507399,4.785874
49906,0.339812,-2.743745,-0.134070,-1.385729,-1.451413,1.015887,-0.524379,0.224060,0.899746,-0.565012,...,-0.213436,-0.942525,-0.526819,-1.156992,0.311211,-0.746647,0.040996,0.102038,-0.475031,6.966713
29474,1.399590,-0.590701,0.168619,-1.029950,-0.539806,0.040444,-0.712567,0.002299,-0.971747,0.756801,...,0.102398,0.168269,-0.166639,-0.810250,0.505083,-0.232340,0.011409,0.004634,-0.578115,0.125874
276481,-0.432071,1.647895,-1.669361,-0.349504,0.785785,-0.630647,0.276990,0.586025,-0.484715,-1.376648,...,0.358932,0.873663,-0.178642,-0.017171,-0.207392,-0.157756,-0.237386,0.001934,0.967960,-0.286713
278846,2.014160,-0.137394,-1.015839,0.327269,-0.182179,-0.956571,0.043241,-0.160746,0.363241,0.259452,...,-0.238644,-0.616400,0.347045,0.061561,-0.360196,0.174730,-0.078043,-0.070571,0.983816,-0.295245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75723,-1.994348,1.503076,-0.365560,0.780223,-0.957956,0.038648,-0.453702,1.553565,-0.561964,-0.100318,...,0.224820,0.319275,-0.081356,-0.366704,-0.269380,-0.278170,0.082042,-0.015071,-0.334539,0.750909
252263,-0.234567,0.733694,0.486250,-0.718186,0.782227,-0.788837,1.056307,-0.175016,-0.244864,-0.708527,...,-0.202040,-0.574857,-0.024845,-0.428558,-0.563551,0.159926,0.094924,0.163736,0.834105,-0.167972
221246,0.040441,-0.109737,-1.266430,1.004783,2.223390,-0.670372,0.490662,-0.033739,-0.307052,0.402303,...,0.341151,0.930041,0.162391,-1.180279,-1.484172,-0.619133,0.357845,0.354379,0.678662,-0.200979
81910,-0.495048,0.991481,1.671584,-0.342474,0.470012,-0.348503,0.996077,-0.351891,-0.219231,0.579396,...,-0.324995,-0.474178,-0.145562,-0.011279,-0.162997,0.020511,0.040529,-0.269775,-0.300080,-0.257483


#### 2.1 Splitting the Data - NearMiss (Under sampling Technique)

Due to the significant class imbalance in the dataset, machine learning techniques like Decision Trees and Logistic Regression tend to be **biased towards the majority class**.

As a result, these models are more likely to **predict transactions as valid rather than fraudulent**, simply because valid transactions dominate the dataset.

NearMiss is an under-sampling technique. It aims to balance class distribution by randomly eliminating majority class examples.

Sources: https://www.researchgate.net/profile/Rahul-Pandya-7/publication/367510232_Heuristic_Approach_of_Over-Sampling_and_Under-_Sampling_in_Fraud_Detection/links/63d54b2d64fc860638f55f64/Heuristic-Approach-of-Over-Sampling-and-Under-Sampling-in-Fraud-Detection.pdf \
''This algorithm will in general eliminate cases of majority part classes when examples of two classes that are close.''

In [44]:
# Let's try to fit logistic regression with the imbalance data

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [45]:
models = {
    "LogisiticRegression": LogisticRegression(max_iter = 1000000),
    "K-NeighborsClassifier": KNeighborsClassifier(),
}

model_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    training_accuracy = accuracy_score(y_train_pred, y_train)     
    training_precision = precision_score(y_train_pred, y_train)     
    training_recall = recall_score(y_train_pred, y_train)     
    training_f1 = f1_score(y_train_pred, y_train)     
    
    testing_accuracy = accuracy_score(y_test, y_test_pred)     
    testing_precision = precision_score(y_test, y_test_pred)     
    testing_recall = recall_score(y_test, y_test_pred)     
    testing_f1 = f1_score(y_test, y_test_pred)     
    
    #Get name of model to print
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    
    print('Model performance for Training set')
    print("- Training Accuracy Score: {:.4f}".format(training_accuracy))
    print("- Training Precision Score: {:.4f}".format(training_precision))
    print("- Training Recall Score: {:.4f}".format(training_recall))
    print("- F1 Score: {:.4f}".format(training_f1))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Testing Accuracy Score: {:.4f}".format(testing_accuracy))
    print("- Testing Precision Score: {:.4f}".format(testing_precision))
    print("- Testing Recall Score: {:.4f}".format(testing_recall))
    print("- F1 Score: {:.4f}".format(testing_f1))

    
    print('='*35)
    print('\n')


LogisiticRegression
Model performance for Training set
- Training Accuracy Score: 0.9992
- Training Precision Score: 0.6294
- Training Recall Score: 0.8953
- F1 Score: 0.7392
----------------------------------
Model performance for Test set
- Testing Accuracy Score: 0.9991
- Testing Precision Score: 0.8636
- Testing Recall Score: 0.5816
- F1 Score: 0.6951


K-NeighborsClassifier
Model performance for Training set
- Training Accuracy Score: 0.9996
- Training Precision Score: 0.7843
- Training Recall Score: 0.9537
- F1 Score: 0.8607
----------------------------------
Model performance for Test set
- Testing Accuracy Score: 0.9995
- Testing Precision Score: 0.9481
- Testing Recall Score: 0.7449
- F1 Score: 0.8343




#####  Insights
- Even though Training and Testing accuracy score is very high, Testing recall score is very low (0.5816 and 0.7449) for both models
- This suggest that models **fail to recognize** fraud transactions from non-fraud transaction
- This is due to high class imbalance, where the **model bias the domniant class** (which is non fraud in this case)

In [46]:
import numpy as np
import pandas as pd

# Sklearn imports
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, precision_score, 
                             recall_score, f1_score, classification_report)
from sklearn.model_selection import StratifiedKFold, cross_validate

# Imbalanced-learn imports
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbpipeline



# 1. Define the undersampling strategy
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)

# 2. Define the models you want to evaluate
models = {
    "LogisticRegression": LogisticRegression(max_iter=1_000_000),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    # Add more models if needed
}

# 3. Define the scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

# 4. Set up stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 5. Iterate over each model, build the pipeline, and evaluate
for model_name, model in models.items():
    # Create a pipeline for undersampling + model
    pipeline = imbpipeline(steps=[
        ('undersample', undersample),
        ('model', model)
    ])

    # Perform cross-validation to get training metrics
    cv_results = cross_validate(
        pipeline,
        X_train, 
        y_train, 
        cv=skf, 
        scoring=scoring, 
        return_train_score=False
    )

    # Fit the pipeline on the entire training set
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print(f"=== {model_name} ===")
    # Training metrics (mean of cross-validation folds)
    print("Mean Training Accuracy:   {:.3f} (+/- {:.3f})"
          .format(cv_results['test_accuracy'].mean(),
                  cv_results['test_accuracy'].std()))
    print("Mean Training Precision:  {:.3f} (+/- {:.3f})"
          .format(cv_results['test_precision'].mean(),
                  cv_results['test_precision'].std()))
    print("Mean Training Recall:     {:.3f} (+/- {:.3f})"
          .format(cv_results['test_recall'].mean(),
                  cv_results['test_recall'].std()))
    print("Mean Training F1 Score:   {:.3f} (+/- {:.3f})"
          .format(cv_results['test_f1'].mean(),
                  cv_results['test_f1'].std()))

    # Testing metrics (evaluating on the holdout set X_test, y_test)
    testing_accuracy = accuracy_score(y_test, y_pred)
    testing_precision = precision_score(y_test, y_pred)
    testing_recall = recall_score(y_test, y_pred)
    testing_f1 = f1_score(y_test, y_pred)

    print("\n--- Testing Results ---")
    print("Testing Accuracy:  {:.4f}".format(testing_accuracy))
    print("Testing Precision: {:.4f}".format(testing_precision))
    print("Testing Recall:    {:.4f}".format(testing_recall))
    print("Testing F1:        {:.4f}".format(testing_f1))

    print("===" * 30)


=== LogisticRegression ===
Mean Training Accuracy:   0.963 (+/- 0.004)
Mean Training Precision:  0.042 (+/- 0.006)
Mean Training Recall:     0.911 (+/- 0.024)
Mean Training F1 Score:   0.080 (+/- 0.010)

--- Testing Results ---
Testing Accuracy:  0.9632
Testing Precision: 0.0417
Testing Recall:    0.9286
Testing F1:        0.0799
=== KNeighborsClassifier ===
Mean Training Accuracy:   0.973 (+/- 0.005)
Mean Training Precision:  0.055 (+/- 0.008)
Mean Training Recall:     0.888 (+/- 0.027)
Mean Training F1 Score:   0.104 (+/- 0.015)

--- Testing Results ---
Testing Accuracy:  0.9668
Testing Precision: 0.0452
Testing Recall:    0.9082
Testing F1:        0.0860
=== Support Vector Classifier ===
Mean Training Accuracy:   0.984 (+/- 0.003)
Mean Training Precision:  0.090 (+/- 0.016)
Mean Training Recall:     0.883 (+/- 0.039)
Mean Training F1 Score:   0.163 (+/- 0.026)

--- Testing Results ---
Testing Accuracy:  0.9787
Testing Precision: 0.0681
Testing Recall:    0.8980
Testing F1:        0.

#### Insights
- Huge improvement for recall 0.9286 for Logistic Regression (from 0.58)
- High accuracy and recall score for testing set