In [1]:
import pandas as pd
import urllib.request
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import NearMiss
from imblearn.combine import SMOTEENN


# Load dataset
df = pd.read_csv('creditcard_data.csv')

# Separate features and target
X = df.drop(columns=['Class'])
y = df['Class']

# Scale data (optional, but can help with convergence)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Define sampling techniques
samplers = {
    'Sampling1': SMOTE(),
    'Sampling2': RandomUnderSampler(),
    'Sampling3': RandomOverSampler(),
    'Sampling4': NearMiss(),
    'Sampling5': SMOTEENN()
}

# Define machine learning models
models = {
    'M1': LogisticRegression(max_iter=500),
    'M2': RandomForestClassifier(),
    'M3': SVC(),
    'M4': KNeighborsClassifier(),
    'M5': GradientBoostingClassifier()
}

# Initialize an empty DataFrame for results
results = pd.DataFrame(columns=["Model", "Sampling", "Accuracy"])

# Loop through sampling techniques and models
for sampling_name, sampler in samplers.items():
    # Apply the sampling technique on training data
    X_sampled, y_sampled = sampler.fit_resample(X_train, y_train)
    
    for model_name, model in models.items():
        # Train the model
        model.fit(X_sampled, y_sampled)
        
        # Predict on test data
        y_pred = model.predict(X_test)
        
        # Calculate accuracy
        acc = accuracy_score(y_test, y_pred)
        
        # Append results
        new_row = pd.DataFrame([{"Model": model_name, "Sampling": sampling_name, "Accuracy": acc}])
        results = pd.concat([results, new_row], ignore_index=True)

# Display results
print(results)

# Determine which sampling technique gives the highest accuracy for each model
best_sampling_technique = results.loc[results.groupby("Model")["Accuracy"].idxmax()]

# Display best sampling techniques for each model
print("\nBest Sampling Technique for Each Model:")
print(best_sampling_technique)


  results = pd.concat([results, new_row], ignore_index=True)


   Model   Sampling  Accuracy
0     M1  Sampling1  0.896552
1     M2  Sampling1  0.987069
2     M3  Sampling1  0.948276
3     M4  Sampling1  0.922414
4     M5  Sampling1  0.974138
5     M1  Sampling2  0.478448
6     M2  Sampling2  0.676724
7     M3  Sampling2  0.560345
8     M4  Sampling2  0.409483
9     M5  Sampling2  0.784483
10    M1  Sampling3  0.896552
11    M2  Sampling3  0.987069
12    M3  Sampling3  0.943966
13    M4  Sampling3  0.943966
14    M5  Sampling3  0.982759
15    M1  Sampling4  0.331897
16    M2  Sampling4  0.211207
17    M3  Sampling4  0.250000
18    M4  Sampling4  0.758621
19    M5  Sampling4  0.443966
20    M1  Sampling5  0.883621
21    M2  Sampling5  0.987069
22    M3  Sampling5  0.939655
23    M4  Sampling5  0.900862
24    M5  Sampling5  0.969828

Best Sampling Technique for Each Model:
   Model   Sampling  Accuracy
0     M1  Sampling1  0.896552
1     M2  Sampling1  0.987069
2     M3  Sampling1  0.948276
13    M4  Sampling3  0.943966
14    M5  Sampling3  0.982759