In [1]:
import pandas as pd
import numpy as np
import random
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
df = pd.read_csv('Creditcard_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772 entries, 0 to 771
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    772 non-null    int64  
 1   V1      772 non-null    float64
 2   V2      772 non-null    float64
 3   V3      772 non-null    float64
 4   V4      772 non-null    float64
 5   V5      772 non-null    float64
 6   V6      772 non-null    float64
 7   V7      772 non-null    float64
 8   V8      772 non-null    float64
 9   V9      772 non-null    float64
 10  V10     772 non-null    float64
 11  V11     772 non-null    float64
 12  V12     772 non-null    float64
 13  V13     772 non-null    float64
 14  V14     772 non-null    float64
 15  V15     772 non-null    float64
 16  V16     772 non-null    float64
 17  V17     772 non-null    float64
 18  V18     772 non-null    float64
 19  V19     772 non-null    float64
 20  V20     772 non-null    float64
 21  V21     772 non-null    float64
 22  V2

#Data Pre-processing

In [3]:
# class count
class_count_0, class_count_1 = df['Class'].value_counts()

# Separate class
class_0 = df[df['Class'] == 0]
class_1 = df[df['Class'] == 1]

# print the shape of the class
print('class 0:', class_0.shape)
print('class 1:', class_1.shape)

class 0: (763, 31)
class 1: (9, 31)


In [4]:
# Separate the dataset into features and labels
X = df.drop('Class', axis=1)
y = df['Class']

# Apply SMOTE to balance the classes
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# Create a new balanced dataset
df_balanced = pd.concat([X_res, y_res], axis=1)

In [5]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526 entries, 0 to 1525
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    1526 non-null   int64  
 1   V1      1526 non-null   float64
 2   V2      1526 non-null   float64
 3   V3      1526 non-null   float64
 4   V4      1526 non-null   float64
 5   V5      1526 non-null   float64
 6   V6      1526 non-null   float64
 7   V7      1526 non-null   float64
 8   V8      1526 non-null   float64
 9   V9      1526 non-null   float64
 10  V10     1526 non-null   float64
 11  V11     1526 non-null   float64
 12  V12     1526 non-null   float64
 13  V13     1526 non-null   float64
 14  V14     1526 non-null   float64
 15  V15     1526 non-null   float64
 16  V16     1526 non-null   float64
 17  V17     1526 non-null   float64
 18  V18     1526 non-null   float64
 19  V19     1526 non-null   float64
 20  V20     1526 non-null   float64
 21  V21     1526 non-null   float64
 22  

#Simple Random Sampling

In [6]:
# Determine the sample size using the formula
Z = 1.96  # 95% confidence level
p = len(df_balanced[df_balanced['Class'] == 1]) / len(df_balanced)
q = 1 - p
E = 0.05  # 5% margin of error
n = int((Z**2 * p * q) / E**2)
print('sample size:', n)

# Simple Random Sampling
random_state = 42
sample_random = df_balanced.sample(n=n, replace=True, random_state=random_state)

sample size: 384


#Systematic Sampling

In [7]:
# Determine the sample size using the formula
n = 300
N = len(df_balanced)
k = int(N / n)
print('sample size:', n)
print('step size:', k)

# Systematic Sampling
start_index = 0
step = k
sample_systematic = df_balanced.iloc[start_index::step]

sample size: 300
step size: 5


#Stratified Sampling

In [8]:
# Calculate the sample size using the specified formula
Z = 1.96 # 95% confidence level
p = 0.5 # proportion of the minority class
e = 0.1 # desired margin of error
S = 2 # number of strata
n = int((Z**2 * p * (1 - p)) / ((e/S)**2))
print('sample size:', n)

# Create the StratifiedShuffleSplit object
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=n, random_state=42)

# Get the indices of the training and test sets
X = df_balanced.drop('Class', axis=1)
y = df_balanced['Class']
for train_index, test_index in stratified_split.split(X, y):
    sample_strat = df_balanced.loc[test_index]

sample size: 384


#Cluster Sampling

In [9]:
# set the values of the parameters for the formula
Z = 1.96  # z-score for 95% confidence interval
p = 0.5   # estimated proportion of population with the target characteristic
e = 0.15  # margin of error
C = 3    # average cluster size

# use the formula to calculate the required sample size
n = int((Z**2 * p * (1 - p)) / ((e/C)**2))
print('sample size:', n)

# randomly select n/C clusters as the sample
unique_clusters = df_balanced['Time'].unique()  # assuming the 'Time' column represents the cluster ID
sample_clusters = random.sample(list(unique_clusters), int(n/C))

# collect data from all units in the selected clusters
sample_cluster = pd.DataFrame(columns=df_balanced.columns)
for cluster in sample_clusters:
    cluster_data = df_balanced[df_balanced['Time'] == cluster]
    sample_cluster_data = cluster_data.sample(n=C, random_state=42, replace=True)
    sample_cluster = sample_cluster.append(sample_cluster_data)
sample_cluster['Class'] = sample_cluster['Class'].astype(int)

sample size: 384


#Convenience sample

In [10]:
# Sort the data by the index column to ensure consistent order
df_conv = df_balanced.sort_values(by='Time')

# Set the sample size
n = 300
print('sample size:', n)

# Choose a convenience sample by selecting the first n rows from the sorted data
sample_conv = df_conv.head(n=n)

sample size: 300


#Models

In [11]:
# define the 5 models
models = [RandomForestClassifier(random_state=42), LogisticRegression(max_iter=1500, random_state=42), GaussianNB(), KNeighborsClassifier(), SVC(random_state=42)]

# define the number of folds for cross-validation
k = 5

# create an empty DataFrame to store the accuracies
accuracy_df = pd.DataFrame(columns=['sample', 'RandomForestClassifier', 'LogisticRegression', 'GaussianNB', 'KNeighborsClassifier', 'SVC'])

# loop through each sample and model to perform cross-validation and store the accuracies in the DataFrame
for sample_name, sample in [('cluster', sample_cluster), ('random', sample_random), ('systematic', sample_systematic), ('strat', sample_strat), ('conv', sample_conv)]:
    X = sample.iloc[:, :-1]  # select all columns except the last one as input features
    y = sample.iloc[:, -1]   # select the last column as the binary label
    accuracies = {'sample': sample_name}
    for model in models:
        # fit the model to the data
        model.fit(X, y)
        
        # perform cross-validation and calculate the mean accuracy
        cv = KFold(n_splits=k, shuffle=True, random_state=42)
        scores = cross_val_score(model, X, y, cv=cv)
        accuracy = scores.mean()
        
        # append the accuracy to the accuracies dictionary
        accuracies[model.__class__.__name__] = accuracy
    
    # append the accuracies for this sample to the accuracy DataFrame
    accuracy_df = accuracy_df.append(accuracies, ignore_index=True)

# print the accuracy DataFrame
print(accuracy_df)

       sample  RandomForestClassifier  LogisticRegression  GaussianNB  \
0     cluster                0.994771            0.898360    0.832980   
1      random                0.987013            0.953144    0.846206   
2  systematic                0.993548            0.872871    0.830196   
3       strat                0.979050            0.940123    0.835748   
4        conv                0.986667            0.963333    0.976667   

   KNeighborsClassifier       SVC  
0              0.820198  0.689781  
1              0.799248  0.666746  
2              0.699260  0.666631  
3              0.770848  0.643301  
4              0.836667  0.690000  
