In [1]:
!pip install imbalanced-learn




In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler


In [3]:
data = pd.read_csv("Creditcard_data.csv")

features = data.drop(columns=["Class"])
labels = data["Class"]


In [4]:
balancer = RandomOverSampler(random_state=101)
X_balanced, y_balanced = balancer.fit_resample(features, labels)

balanced_data = pd.concat(
    [pd.DataFrame(X_balanced, columns=features.columns),
     pd.DataFrame(y_balanced, columns=["Class"])],
    axis=1
)

print("Class distribution after balancing:")
print(balanced_data["Class"].value_counts())


Class distribution after balancing:
Class
0    763
1    763
Name: count, dtype: int64


In [11]:
# Sampling A — Simple Random
def sample_random(df):
    return df.sample(frac=0.5, random_state=101)


# Sampling B — Systematic
def sample_systematic(df):
    interval = 2
    return df.iloc[::interval]


# Sampling C — Stratified
def sample_stratified(df):
    return df.groupby("Class", group_keys=False).sample(frac=0.5, random_state=101)


# Sampling D — Cluster
def sample_cluster(df):
    clusters = df['Class'].unique()

    # pick both clusters but sample inside them
    sampled_parts = []

    for c in clusters:
        part = df[df['Class'] == c]
        sampled_parts.append(part.sample(frac=0.5, random_state=101))

    return pd.concat(sampled_parts)



# Sampling E — Bootstrap
def sample_bootstrap(df):
    return df.sample(n=len(df)//2, replace=True, random_state=101)


sampling_functions = [
    sample_random,
    sample_systematic,
    sample_stratified,
    sample_cluster,
    sample_bootstrap
]


In [12]:
model_list = [
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    SVC()
]

model_labels = [
    "M1_Logistic",
    "M2_DecisionTree",
    "M3_RandomForest",
    "M4_KNN",
    "M5_SVM"
]

sampling_labels = [
    "Sampling1",
    "Sampling2",
    "Sampling3",
    "Sampling4",
    "Sampling5"
]

accuracy_table = pd.DataFrame(
    index=model_labels,
    columns=sampling_labels
)


In [13]:
for i, sampler in enumerate(sampling_functions):

    subset = sampler(balanced_data)

    X_sub = subset.drop("Class", axis=1)
    y_sub = subset["Class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_sub, y_sub,
        test_size=0.3,
        random_state=101
    )

    for j, clf in enumerate(model_list):

        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        acc = accuracy_score(y_test, predictions) * 100
        accuracy_table.iloc[j, i] = round(acc, 2)

print("Accuracy Table:")
print(accuracy_table)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy Table:
                Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1_Logistic         93.01     86.46     94.78     94.35     87.77
M2_DecisionTree     99.13     99.56     98.26      98.7     98.69
M3_RandomForest     100.0     100.0     100.0     100.0     100.0
M4_KNN              96.51     95.63     93.91     93.91     96.51
M5_SVM              66.81     69.43      68.7     67.39     74.67


In [14]:
best_results = pd.DataFrame(columns=["Best Sampling Technique", "Accuracy"])

for model in accuracy_table.index:
    best_sampling = accuracy_table.loc[model].astype(float).idxmax()
    best_accuracy = accuracy_table.loc[model].astype(float).max()

    best_results.loc[model] = [best_sampling, best_accuracy]

print("\nBest Sampling Technique per Model:")
print(best_results)



Best Sampling Technique per Model:
                Best Sampling Technique  Accuracy
M1_Logistic                   Sampling3     94.78
M2_DecisionTree               Sampling2     99.56
M3_RandomForest               Sampling1    100.00
M4_KNN                        Sampling1     96.51
M5_SVM                        Sampling5     74.67


In [15]:
accuracy_table.to_csv("accuracy_table.csv")
best_results.to_csv("best_sampling_results.csv")


In [16]:
from google.colab import files

files.download("accuracy_table.csv")
files.download("best_sampling_results.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
accuracy_table.to_csv("accuracy_table.csv", index=True)
best_results.to_csv("best_sampling_results.csv", index=True)
