In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler

In [2]:
data = pd.read_csv('/content/Creditcard_data.csv')

X = data.drop('Class', axis=1)
y = data['Class']

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
df_balanced = pd.concat([X_resampled, y_resampled], axis=1)

z = 1.96
p = 0.5
e = 0.05
n = int((z**2 * p * (1-p)) / e**2)

In [3]:
def simple_random_sampling(df, sample_size):
    return df.sample(n=sample_size, random_state=42)

In [4]:
def systematic_sampling(df, sample_size):
    step = len(df) // sample_size
    indices = np.arange(0, len(df), step)[:sample_size]
    return df.iloc[indices]

In [5]:
def stratified_sampling(df, sample_size):
    return df.groupby('Class', group_keys=False).apply(lambda x: x.sample(int(sample_size/2), random_state=42))

In [6]:
def cluster_sampling(df, sample_size):
    num_clusters = 10
    df['cluster'] = np.random.randint(0, num_clusters, size=len(df))
    selected_clusters = np.random.choice(num_clusters, size=2, replace=False)
    sample = df[df['cluster'].isin(selected_clusters)]
    return sample.drop('cluster', axis=1)

In [7]:
def bootstrap_sampling(df, sample_size):
    return df.sample(n=sample_size, replace=True, random_state=42)

In [8]:
samples = {
    'Simple Random': simple_random_sampling(df_balanced, n),
    'Systematic': systematic_sampling(df_balanced, n),
    'Stratified': stratified_sampling(df_balanced, n),
    'Cluster': cluster_sampling(df_balanced, n),
    'Bootstrap': bootstrap_sampling(df_balanced, n)
}

models = {
    'M1 (Logistic Regression)': LogisticRegression(max_iter=1000),
    'M2 (Decision Tree)': DecisionTreeClassifier(random_state=42),
    'M3 (Random Forest)': RandomForestClassifier(random_state=42),
    'M4 (SVM)': SVC(),
    'M5 (KNN)': KNeighborsClassifier()
}

results = {}

  return df.groupby('Class', group_keys=False).apply(lambda x: x.sample(int(sample_size/2), random_state=42))


In [9]:
for sample_name, sample_df in samples.items():
    X_sample = sample_df.drop('Class', axis=1)
    y_sample = sample_df['Class']

    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model_accuracies = {}
    for model_name, model in models.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        model_accuracies[model_name] = accuracy

    results[sample_name] = model_accuracies

results_df = pd.DataFrame(results)
print("Model Accuracies using different Sampling Techniques:")
print(results_df)

Model Accuracies using different Sampling Techniques:
                          Simple Random  Systematic  Stratified   Cluster  \
M1 (Logistic Regression)       0.883117    0.896104    0.909091  0.921875   
M2 (Decision Tree)             0.974026    0.974026    0.987013  0.937500   
M3 (Random Forest)             0.987013    1.000000    0.987013  1.000000   
M4 (SVM)                       0.909091    0.948052    0.987013  0.937500   
M5 (KNN)                       0.948052    0.857143    0.935065  0.906250   

                          Bootstrap  
M1 (Logistic Regression)   0.961039  
M2 (Decision Tree)         0.974026  
M3 (Random Forest)         1.000000  
M4 (SVM)                   0.961039  
M5 (KNN)                   0.948052  


In [10]:
best_technique = results_df.mean().idxmax()
print(f"The sampling technique with the highest average accuracy is: {best_technique}")

The sampling technique with the highest average accuracy is: Bootstrap
