In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek, SMOTEENN


In [2]:
df = pd.read_csv("Creditcard_data.csv")
print(df.head())


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [3]:
X = df.drop(columns=['Class'])
y = df['Class']

print("Original class distribution:")
print(y.value_counts())


Original class distribution:
Class
0    763
1      9
Name: count, dtype: int64


In [4]:
smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X, y)

print("Balanced class distribution:")
print(pd.Series(y_bal).value_counts())


Balanced class distribution:
Class
0    763
1    763
Name: count, dtype: int64


In [5]:
samples = []

for i in range(1, 6):
    X_train, X_test, y_train, y_test = train_test_split(
        X_bal, y_bal, test_size=0.3, random_state=i
    )
    samples.append((X_train, X_test, y_train, y_test))


In [6]:
sampling_techniques = {
    "Sampling1_RUS": RandomUnderSampler(random_state=42),
    "Sampling2_ROS": RandomOverSampler(random_state=42),
    "Sampling3_SMOTE": SMOTE(random_state=42),
    "Sampling4_SMOTETomek": SMOTETomek(random_state=42),
    "Sampling5_SMOTEENN": SMOTEENN(random_state=42)
}


In [7]:
models = {
    "M1_LogisticRegression": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(),
    "M3_RandomForest": RandomForestClassifier(),
    "M4_KNN": KNeighborsClassifier(),
    "M5_SVM": SVC()
}


In [8]:
results = pd.DataFrame(index=models.keys(), columns=sampling_techniques.keys())

for model_name, model in models.items():
    for samp_name, sampler in sampling_techniques.items():

        accuracies = []

        for X_train, X_test, y_train, y_test in samples:

            X_res, y_res = sampler.fit_resample(X_train, y_train)
            model.fit(X_res, y_res)
            y_pred = model.predict(X_test)

            acc = accuracy_score(y_test, y_pred)
            accuracies.append(acc)

        results.loc[model_name, samp_name] = round(np.mean(accuracies) * 100, 2)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [9]:
print("\nAccuracy Comparison Table (%)")
print(results)



Accuracy Comparison Table (%)
                      Sampling1_RUS Sampling2_ROS Sampling3_SMOTE  \
M1_LogisticRegression         91.75         91.83           91.79   
M2_DecisionTree               97.51         97.47           97.69   
M3_RandomForest               99.52         99.61           99.61   
M4_KNN                        84.67         84.59           84.63   
M5_SVM                        67.99         68.03           67.99   

                      Sampling4_SMOTETomek Sampling5_SMOTEENN  
M1_LogisticRegression                91.57              92.01  
M2_DecisionTree                      97.73              93.89  
M3_RandomForest                      99.04              97.16  
M4_KNN                               84.15              81.09  
M5_SVM                               67.73              66.03  


In [10]:
print("\nBest Sampling Technique for Each Model:\n")

for model in results.index:
    best_sampling = results.loc[model].astype(float).idxmax()
    best_accuracy = results.loc[model].astype(float).max()

    print(f"{model} → {best_sampling} ({best_accuracy}%)")



Best Sampling Technique for Each Model:

M1_LogisticRegression → Sampling5_SMOTEENN (92.01%)
M2_DecisionTree → Sampling4_SMOTETomek (97.73%)
M3_RandomForest → Sampling2_ROS (99.61%)
M4_KNN → Sampling1_RUS (84.67%)
M5_SVM → Sampling2_ROS (68.03%)
