In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.cluster import KMeans
from sklearn.utils import resample

In [None]:
# Load the dataset and report how many rows fall into each target class.
df = pd.read_csv('Creditcard_data.csv')
class_counts = df['Class'].value_counts()
print(class_counts)

Class
0    763
1      9
Name: count, dtype: int64


In [None]:
# Target vector: the 'Class' column.
y = df['Class']
# Feature matrix: every column except the target.
X = df.drop(columns='Class')

# Standardise the features; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


In [9]:
#SAMPLING TECHNIQUES
# Draw one sample per technique from the scaled features and register them in
# `samples`, the mapping the training/evaluation cell iterates over.

# Simple random sampling: a random 50% of the rows, ignoring class balance.
X_r, _, y_r, _ = train_test_split(
    X_scaled, y, train_size=0.5, random_state=42
)

# Stratified sampling: a random 50% of the rows, stratified on y.
X_s, _, y_s, _ = train_test_split(
    X_scaled, y, train_size=0.5, stratify=y, random_state=42
)

# Cluster sampling: partition the rows into two KMeans clusters and keep the
# rows assigned to cluster 0.
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

cluster_df = pd.DataFrame(X_scaled)
cluster_df['cluster'] = clusters
cluster_df['Class'] = y.values  # .values: assign by position, not by index label

sample_cluster = cluster_df[cluster_df['cluster'] == 0]  #selects only one cluster

# Separate the features from the target for the retained cluster.
X_c = sample_cluster.drop(['cluster', 'Class'], axis=1)
y_c = sample_cluster['Class']

# Systematic sampling: every `step`-th row, starting from row 0.
# NOTE(review): in the recorded run this sample contained only class 0, so the
# evaluation cell skipped it -- consider a different start/step if that is
# not intended.
step = 5
X_sys = X_scaled[::step]
y_sys = y.iloc[::step]

# Registry of (features, target) pairs keyed by technique name. The keys match
# the column names the evaluation cell selects for the results table.
# Previously this mapping was never defined in the notebook, so the evaluation
# loop raised NameError on a fresh kernel.
samples = {
    "Random": (X_r, y_r),
    "Stratified": (X_s, y_s),
    "Cluster": (X_c, y_c),
    "Systematic": (X_sys, y_sys),
}

#KFold
# Shown for reference only: this splitter is not stored anywhere. The
# evaluation cell builds its own shuffled StratifiedKFold for cross-validation.
StratifiedKFold(n_splits=5)


StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [10]:
# Candidate classifiers, keyed by the short labels used as row names in the
# results table.
# The tree-based models are seeded with the same random_state=42 used for the
# data splits so that re-running the notebook yields the same accuracies.
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(random_state=42),
    "M3": RandomForestClassifier(random_state=42),
    "M4": SVC(),
    "M5": KNeighborsClassifier()
}

In [14]:
# 8. Training & Evaluation
# For every model, fit on a 70/30 hold-out split of each sample and record the
# test accuracy in percent; then add a 5-fold stratified cross-validation
# accuracy computed on the full scaled dataset.


# {model_name: {sample_name | "CrossVal": accuracy in percent, or "NA"}}
results = {}

for model_name, model in models.items():
    results[model_name] = {}

    for sample_name, (Xs, ys) in samples.items():

        # Skip if only one class exists: there is nothing to classify, so the
        # cell is recorded as "NA" instead of an accuracy.
        if len(np.unique(ys)) < 2:
            print(f"Skipping {model_name} with {sample_name} sampling: "
                  f"target variable has only one class ({np.unique(ys)[0]}).")
            results[model_name][sample_name] = "NA"
            continue

        # Hold out 30% of the sample for testing (fixed seed for repeatability).
        X_train, X_test, y_train, y_test = train_test_split(
            Xs, ys, test_size=0.3, random_state=42
        )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = round(accuracy_score(y_test, y_pred) * 100, 2)
        results[model_name][sample_name] = acc

    # ---- Cross Validation ----
    # Mean accuracy over 5 shuffled, stratified folds of the full dataset.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_score = cross_val_score(model, X_scaled, y, cv=cv).mean()
    results[model_name]["CrossVal"] = round(cv_score * 100, 2)


# 9. Convert Results to Table
# Transpose so that rows are models and columns are sampling techniques.
results_df = pd.DataFrame(results).T

# Ensure column order
results_df = results_df[
    ["Random", "Stratified", "Cluster", "Systematic", "CrossVal"]
]

print("\nFinal Accuracy Comparison Table:\n")
print(results_df)



# Rich display is best-effort: only a missing IPython is tolerated. Any other
# failure (e.g. an error while rendering) is allowed to surface instead of
# being silently swallowed by a bare `except`.
try:
    from IPython.display import display
    display(results_df)
except ImportError:
    pass

Skipping M1 with Systematic sampling: target variable has only one class (0).
Skipping M2 with Systematic sampling: target variable has only one class (0).
Skipping M3 with Systematic sampling: target variable has only one class (0).
Skipping M4 with Systematic sampling: target variable has only one class (0).
Skipping M5 with Systematic sampling: target variable has only one class (0).

Final Accuracy Comparison Table:

   Random Stratified Cluster Systematic CrossVal
M1  99.14      99.14   98.51         NA     98.7
M2  99.14      98.28   97.01         NA    97.93
M3  99.14      99.14   98.51         NA    98.83
M4  99.14      99.14   98.51         NA    98.83
M5  99.14      99.14   98.51         NA    98.83


Unnamed: 0,Random,Stratified,Cluster,Systematic,CrossVal
M1,99.14,99.14,98.51,,98.7
M2,99.14,98.28,97.01,,97.93
M3,99.14,99.14,98.51,,98.83
M4,99.14,99.14,98.51,,98.83
M5,99.14,99.14,98.51,,98.83
