**Importing the libraries**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score

**Load the dataset**

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv')

In [3]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,575,-0.572263,0.731748,1.541254,0.150506,1.108974,0.372152,1.084879,-0.146329,-0.274447,...,-0.143508,-0.107582,-0.418263,-0.731029,0.877525,-0.364150,-0.177509,-0.256545,26.72,0
768,579,-1.296845,-0.511605,2.404726,-0.310762,-0.319551,-0.542842,-0.173310,0.260423,-1.202688,...,-0.071270,-0.161175,0.088496,0.285390,0.281069,-0.370130,0.043410,0.092318,80.00,0
769,579,1.214170,0.210481,0.484651,0.479768,-0.261955,-0.527039,0.021782,-0.106888,-0.037631,...,-0.224292,-0.594609,0.159877,0.091873,0.140964,0.227406,-0.017389,0.016030,5.98,0
770,580,1.267030,-0.071114,0.037680,0.512683,0.242392,0.705212,-0.226582,0.109483,0.657565,...,-0.164468,-0.177225,-0.222918,-1.245505,0.678360,0.525059,0.002920,-0.003333,12.36,0


**Split the dataset into training and testing**

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = {'Time', 'Class'}, axis = 1), df['Class'], test_size=0.3, random_state=42)

In [5]:
X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
541,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,-2.772272,...,0.126911,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00
333,-1.118946,-0.071366,2.807769,1.025675,-0.100748,0.508680,0.620313,-0.213137,0.333039,-0.074899,...,0.101302,-0.045782,0.455553,0.170942,0.076211,0.197637,-0.286674,-0.230530,-0.405084,100.37
306,-0.342871,-0.199546,1.976353,-0.003495,-1.170366,0.883501,-0.151879,0.160106,0.137973,-0.060122,...,-0.572162,-0.313443,0.086207,0.109600,-0.098951,-0.943009,-0.618657,0.253306,0.240271,99.82
507,-0.837689,0.777698,1.841252,3.056892,0.303627,0.615335,0.531504,-0.081955,-0.522527,1.149725,...,0.032002,-0.070069,0.556788,0.217681,0.100721,-0.332479,0.252526,0.138865,-0.085152,29.18
338,-0.216867,0.900896,1.502850,0.812492,0.193952,-0.031488,0.490795,0.120991,-0.907336,-0.106269,...,0.023362,0.147497,0.463470,-0.045124,0.224126,-0.275402,-0.415339,0.108635,0.052981,10.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,-0.378245,0.732925,-0.120154,0.185755,2.594269,3.797183,0.059088,0.976768,-0.412661,0.006754,...,0.315572,-0.107582,-0.157140,-0.194659,1.013897,0.145503,-0.237620,0.411372,0.202788,11.45
106,-0.426072,-0.060304,2.220828,0.024742,-0.584964,0.460623,-0.322526,0.434776,1.252404,-1.012094,...,-0.200077,0.149485,0.769878,-0.092634,0.150536,-0.234230,0.504710,0.069158,0.041024,21.80
270,-0.549414,0.676861,2.151950,1.014523,-0.620012,0.076154,0.041578,0.342672,0.124723,-0.048092,...,0.104755,0.212024,0.850203,-0.185597,0.544990,-0.130609,-0.196374,0.422119,0.203313,20.70
435,-0.907420,1.103912,1.288489,1.243612,-0.068032,0.214040,0.324000,0.436037,-0.437409,0.179318,...,0.028251,0.022520,0.399523,-0.049081,0.220258,-0.162924,-0.286994,0.015071,-0.104668,15.08


In [7]:
y_train

541    1
333    0
306    0
507    0
338    0
      ..
71     0
106    0
270    0
435    0
102    0
Name: Class, Length: 540, dtype: int64

In [8]:
y_train.value_counts()

y_test.value_counts()

0    229
1      3
Name: Class, dtype: int64

**Models to use**

In [9]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'XGBoost': XGBClassifier()
}

**Samplers**

In [10]:
samplers = {
    'Random Over-Sampling': RandomOverSampler(),
    'Random Under-Sampling': RandomUnderSampler(),
    'SMOTE': SMOTE(),
    'ADASYN': ADASYN(),
    'Tomek Links': SMOTETomek(),
    'Stratified Sampling': StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42),
    'Systematic Sampling': None,  
   
}

In [11]:
results = []

In [12]:
for model_name, model in models.items():
    for sampler_name, sampler in samplers.items():
        
        if sampler_name == 'Stratified Sampling':
            # Stratified Sampling
            for train_idx, val_idx in sampler.split(X_train, y_train):
                X_resampled, y_resampled = X_train.iloc[train_idx], y_train.iloc[train_idx]
                X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        elif sampler_name == 'Systematic Sampling':
            # Systematic Sampling
            step = 2
            indices = list(range(0, len(X_train), step))
            X_resampled, y_resampled = X_train.iloc[indices], y_train.iloc[indices]
      
        else:
            # All other resampling techniques
            X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        # Fit the model to the resampled data
        model.fit(X_resampled, y_resampled)
      
        # Make predictions on the test data
        y_pred = model.predict(X_test)
        
        # Compute the accuracy score of the model on the test data
        accuracy = accuracy_score(y_test, y_pred)
        # Append the results to the results list
        results.append({'Model': model_name, 'Sampler': sampler_name, 'Accuracy': accuracy})


In [13]:
results_df = pd.DataFrame(results)

In [14]:
results_df

Unnamed: 0,Model,Sampler,Accuracy
0,Logistic Regression,Random Over-Sampling,0.939655
1,Logistic Regression,Random Under-Sampling,0.775862
2,Logistic Regression,SMOTE,0.931034
3,Logistic Regression,ADASYN,0.931034
4,Logistic Regression,Tomek Links,0.926724
5,Logistic Regression,Stratified Sampling,0.982759
6,Logistic Regression,Systematic Sampling,0.982759
7,Random Forest,Random Over-Sampling,0.987069
8,Random Forest,Random Under-Sampling,0.706897
9,Random Forest,SMOTE,0.987069


In [15]:
table = pd.pivot_table(results_df, values='Accuracy', index=['Model'], columns=['Sampler'])

In [16]:
table

Sampler,ADASYN,Random Over-Sampling,Random Under-Sampling,SMOTE,Stratified Sampling,Systematic Sampling,Tomek Links
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
K-Nearest Neighbors,0.918103,0.943966,0.599138,0.918103,0.987069,0.987069,0.918103
Logistic Regression,0.931034,0.939655,0.775862,0.931034,0.982759,0.982759,0.926724
Random Forest,0.987069,0.987069,0.706897,0.987069,0.987069,0.987069,0.987069
Support Vector Machine,0.672414,0.676724,0.590517,0.672414,0.987069,0.987069,0.672414
XGBoost,0.952586,0.965517,0.827586,0.952586,0.987069,0.982759,0.948276
