In [42]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler

In [43]:
df = pd.read_csv('/content/Creditcard_data.csv')

X = df.drop('Class', axis=1)
y = df['Class']

#Converting dataset into balanced class dataset
ros = RandomOverSampler(random_state=42)
X_new, y_new = ros.fit_resample(X, y)
df_new = pd.concat([X_new, y_new], axis=1)

z = 1.96
p = 0.5
e = 0.05
n = int((z**2 * p * (1-p)) / e**2)

In [44]:
def simple_random_sampling(df, sample_size):
  return df.sample(n=sample_size, random_state=42)

In [45]:
def systematic_sampling(df, sample_size):
  step = len(df) // sample_size
  indices = np.arange(0, len(df), step)[:sample_size]
  return df.iloc[indices]

In [46]:
def stratified_sampling(df, sample_size):
  n = int(sample_size / len(df['Class'].unique()))
  samples = []
  for col in df['Class'].unique():
    col_sample = df[df['Class'] == col].sample(n, random_state=42)
    samples.append(col_sample)
  return pd.concat(samples)

In [47]:
def cluster_sampling(df, sample_size):
  df['cluster'] = np.random.randint(0, 10, size=len(df))
  random_clusters = np.random.choice(10, size=2, replace=False)
  sample = df[df['cluster'].isin(random_clusters)]
  return sample.drop('cluster', axis=1)

In [48]:
def bootstrap_sampling(df, sample_size):
  return df.sample(n=sample_size, replace=True, random_state=42)

In [49]:
samples = {
    'Simple Random': simple_random_sampling(df_new, n),   #Sampling1
    'Systematic': systematic_sampling(df_new, n),         #Sampling2
    'Stratified': stratified_sampling(df_new, n),         #Sampling3
    'Cluster': cluster_sampling(df_new, n),               #Sampling4
    'Bootstrap': bootstrap_sampling(df_new, n)            #Sampling5
}

models = {
    'M1 (Logistic Regression)': LogisticRegression(max_iter=1000),
    'M2 (Decision Tree)': DecisionTreeClassifier(random_state=42),
    'M3 (Random Forest)': RandomForestClassifier(random_state=42),
    'M4 (SVM)': SVC(),
    'M5 (KNN)': KNeighborsClassifier()
}

results = {}

In [50]:
#Calculating model accuracies using 5 sampling methods

for sampling_method, dff in samples.items():
  X_sample = dff.drop('Class', axis=1)
  y_sample = dff['Class']

  X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

  #scaling
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  accuracies = {}
  for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[model_name] = accuracy

  results[sampling_method] = accuracies

df_result = pd.DataFrame(results)
print("Model Accuracies using 5 Sampling Methods:")
print(df_result)

Model Accuracies using 5 Sampling Methods:
                          Simple Random  Systematic  Stratified  Cluster  \
M1 (Logistic Regression)       0.883117    0.896104    0.909091  0.84375   
M2 (Decision Tree)             0.974026    0.974026    0.987013  0.93750   
M3 (Random Forest)             0.987013    1.000000    0.987013  1.00000   
M4 (SVM)                       0.909091    0.948052    0.987013  0.96875   
M5 (KNN)                       0.948052    0.857143    0.935065  0.90625   

                          Bootstrap  
M1 (Logistic Regression)   0.961039  
M2 (Decision Tree)         0.974026  
M3 (Random Forest)         1.000000  
M4 (SVM)                   0.961039  
M5 (KNN)                   0.961039  


In [51]:
#Best technique results may change when different samples are choosen
best_method = df_result.mean().idxmax()
print(f"Sampling method with highest average accuracy: {best_method}")

Sampling method with highest average accuracy: Bootstrap
