In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from sklearn.metrics import accuracy_score

In [None]:
# Fetch the credit-card dataset and separate the predictors from the target.
DATA_URL = 'https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv'
df = pd.read_csv(DATA_URL)
# 'Class' is the label; 'Time' is excluded from the feature matrix as well.
y = df['Class']
X = df.drop(['Time', 'Class'], axis=1)

In [None]:

# Split data into training and testing sets.
# The split is stratified on y so both partitions keep the same class
# proportions. This notebook compares resampling strategies for an imbalanced
# label, and an unstratified split could leave the test set with too few
# minority-class rows to evaluate on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Define the classifiers to compare, keyed by the display name that ends up in
# the 'Model' column of the results.
# Estimators with internal randomness are seeded (same seed as the data split)
# so that re-running the notebook reproduces the same scores.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(random_state=42)
}

In [None]:
# Define sampling techniques, keyed by the display name that ends up in the
# 'Sampler' column of the results.
# Every randomised resampler is seeded so the resampled training sets (and the
# scores derived from them) are reproducible across runs.
samplers = {
    'Random Over-Sampling': RandomOverSampler(random_state=42),
    'Random Under-Sampling': RandomUnderSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42),
    'ADASYN': ADASYN(random_state=42),
    # NOTE(review): SMOTETomek combines SMOTE over-sampling with Tomek-link
    # cleaning; it is not Tomek-link under-sampling on its own. The label is
    # kept unchanged because it becomes a column name in the results table.
    'Tomek Links': SMOTETomek(random_state=42),
    'Stratified Sampling': StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42),
    # No sampler object is needed here: the evaluation loop recognises this
    # key by name and slices the training set itself.
    'Systematic Sampling': None
}

In [None]:

# Evaluate every (model, sampler) pair: build a training set with the sampler,
# fit the model on it, and score it on the untouched test set.
results = []
for model_name, model in models.items():
    for sampler_name, sampler in samplers.items():
        if sampler_name == 'Stratified Sampling':
            # Draw one stratified subsample of the training data. The splitter
            # can yield several splits, but only one training set is fitted per
            # (model, sampler) pair, so take the first split's training indices
            # explicitly instead of looping and silently keeping the last one.
            train_idx, _ = next(iter(sampler.split(X_train, y_train)))
            X_resampled, y_resampled = X_train.iloc[train_idx], y_train.iloc[train_idx]
        elif sampler_name == 'Systematic Sampling':
            # Systematic sampling: keep every `step`-th row of the training set.
            step = 2
            X_resampled, y_resampled = X_train.iloc[::step], y_train.iloc[::step]
        else:
            # imbalanced-learn resamplers expose fit_resample.
            X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        # Fit model to resampled data
        model.fit(X_resampled, y_resampled)
        # Make predictions on test data
        y_pred = model.predict(X_test)
        # Compute accuracy score on test data.
        # NOTE(review): plain accuracy is dominated by the majority class on an
        # imbalanced target; consider reporting recall/F1 alongside it.
        accuracy = accuracy_score(y_test, y_pred)
        # Append results to results list
        results.append({'Model': model_name, 'Sampler': sampler_name, 'Accuracy': accuracy})
# Convert results list to DataFrame (one row per model/sampler pair)
results_df = pd.DataFrame(results)


In [None]:
# Display the long-format results: one row per (model, sampler) pair.
results_df

In [None]:
# Pivot the long-format results into a wide comparison grid: one row per model,
# one column per sampler, each cell holding the (mean) accuracy for that pair.
table = results_df.pivot_table(index='Model', columns='Sampler', values='Accuracy')

In [None]:
# Display the model-by-sampler accuracy table.
table

Sampler,ADASYN,Random Over-Sampling,Random Under-Sampling,SMOTE,Stratified Sampling,Systematic Sampling,Tomek Links
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
K-Nearest Neighbors,0.922414,0.943966,0.336207,0.918103,0.987069,0.987069,0.918103
Logistic Regression,0.931034,0.939655,0.719828,0.926724,0.982759,0.982759,0.926724
Random Forest,0.987069,0.987069,0.74569,0.987069,0.987069,0.987069,0.987069
Support Vector Machine,0.672414,0.672414,0.262931,0.672414,0.987069,0.987069,0.672414
XGBoost,0.952586,0.965517,0.594828,0.952586,0.987069,0.982759,0.952586
