In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import warnings

warnings.filterwarnings('ignore')


In [2]:
print("Loading dataset...")
df = pd.read_csv('Creditcard_data.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nClass distribution:\n{df['Class'].value_counts()}")
print(f"Class ratio: {df['Class'].value_counts()[0]/df['Class'].value_counts()[1]:.2f}:1")


Loading dataset...
Dataset shape: (772, 31)

Class distribution:
Class
0    763
1      9
Name: count, dtype: int64
Class ratio: 84.78:1


In [3]:
X = df.drop('Class', axis=1)
y = df['Class']


In [4]:
print("\n" + "="*60)
print("SPLITTING DATA")
print("="*60)

X_train_original, X_test, y_train_original, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training set size: {X_train_original.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"Training set class distribution:\n{pd.Series(y_train_original).value_counts()}")
print(f"Test set class distribution:\n{pd.Series(y_test).value_counts()}")



SPLITTING DATA
Training set size: 540 samples
Test set size: 232 samples
Training set class distribution:
Class
0    534
1      6
Name: count, dtype: int64
Test set class distribution:
Class
0    229
1      3
Name: count, dtype: int64


In [5]:
models = {
    'M1': LogisticRegression(max_iter=1000, random_state=42),
    'M2': DecisionTreeClassifier(random_state=42),
    'M3': RandomForestClassifier(n_estimators=100, random_state=42),
    'M4': SVC(kernel='linear', random_state=42),
    'M5': KNeighborsClassifier(n_neighbors=5)
}


In [6]:
print("\n" + "="*60)
print("SAMPLING TECHNIQUES")
print("="*60)

print("Sampling1: Random Over-Sampling")
print("Sampling2: Random Under-Sampling")
print("Sampling3: SMOTE")
print("Sampling4: ADASYN")
print("Sampling5: SMOTETomek")

sampling_techniques = {
    'Sampling1': RandomOverSampler(random_state=42),
    'Sampling2': RandomUnderSampler(random_state=42),
    'Sampling3': SMOTE(random_state=42),
    'Sampling4': ADASYN(random_state=42),
    'Sampling5': SMOTETomek(random_state=42)
}



SAMPLING TECHNIQUES
Sampling1: Random Over-Sampling
Sampling2: Random Under-Sampling
Sampling3: SMOTE
Sampling4: ADASYN
Sampling5: SMOTETomek


In [7]:
results = {model: {} for model in models.keys()}

print("\n" + "="*60)
print("TRAINING AND EVALUATION")
print("="*60)

for sampling_name, sampler in sampling_techniques.items():
    print(f"\nApplying {sampling_name}...")

    X_train_resampled, y_train_resampled = sampler.fit_resample(
        X_train_original, y_train_original
    )

    print(f"  Resampled training data shape: {X_train_resampled.shape}")
    print(f"  Class distribution after sampling: {pd.Series(y_train_resampled).value_counts().to_dict()}")

    for model_name, model in models.items():
        print(f"  Training {model_name}...", end=' ')

        model.fit(X_train_resampled, y_train_resampled)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred) * 100
        results[model_name][sampling_name] = accuracy

        print(f"Accuracy: {accuracy:.2f}%")



TRAINING AND EVALUATION

Applying Sampling1...
  Resampled training data shape: (1068, 30)
  Class distribution after sampling: {0: 534, 1: 534}
  Training M1... Accuracy: 91.81%
  Training M2... Accuracy: 96.98%
  Training M3... Accuracy: 99.14%
  Training M4... Accuracy: 91.81%
  Training M5... Accuracy: 97.84%

Applying Sampling2...
  Resampled training data shape: (12, 30)
  Class distribution after sampling: {0: 6, 1: 6}
  Training M1... Accuracy: 57.76%
  Training M2... Accuracy: 38.79%
  Training M3... Accuracy: 66.81%
  Training M4... Accuracy: 60.78%
  Training M5... Accuracy: 75.00%

Applying Sampling3...
  Resampled training data shape: (1068, 30)
  Class distribution after sampling: {0: 534, 1: 534}
  Training M1... Accuracy: 93.53%
  Training M2... Accuracy: 98.71%
  Training M3... Accuracy: 98.71%
  Training M4... Accuracy: 92.67%
  Training M5... Accuracy: 72.41%

Applying Sampling4...
  Resampled training data shape: (1068, 30)
  Class distribution after sampling: {0: 

In [8]:
print("\n" + "="*60)
print("RESULTS TABLE")
print("="*60)

results_df = pd.DataFrame(results).T
results_df = results_df[['Sampling1', 'Sampling2', 'Sampling3', 'Sampling4', 'Sampling5']]

print("\n", results_df.round(2))

results_df.round(2).to_csv('sampling_results.csv')
print("\nResults saved to 'sampling_results.csv'")



RESULTS TABLE

     Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1      91.81      57.76      93.53      92.67      92.67
M2      96.98      38.79      98.71      98.28      97.84
M3      99.14      66.81      98.71      99.14      98.71
M4      91.81      60.78      92.67      92.67      91.81
M5      97.84      75.00      72.41      72.84      73.71

Results saved to 'sampling_results.csv'
