# **Sampling**
# By: Vansh Singla

# **Import all Libraries**

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.cluster import KMeans
import math
import warnings
warnings.filterwarnings('ignore')

# **Setup Data**

In [None]:
# Load the raw dataset; it is expected to contain a binary 'Class' label column.
data = pd.read_csv('Creditcard_data.csv')

# Partition the rows by label so the smaller class can be oversampled.
df_majority = data.loc[data['Class'] == 0]
df_minority = data.loc[data['Class'] == 1]

# Draw minority rows with replacement until both classes have the same count.
# NOTE(review): this oversampling happens before the train/test split performed
# later in the notebook, so copies of one minority row can land on both sides
# of that split -- confirm whether that is intended.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)

# Stack both classes, shuffle every row, and renumber the index from zero.
df_balanced = (
    pd.concat([df_majority, df_minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# **Choose Models**

In [20]:
# Classifiers to compare; each one is seeded so repeated runs give the same fit.
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Separate the label from the features.
y = df_balanced['Class']
X = df_balanced.drop(columns='Class')

# Hold out 20% (stratified on the label) as a fixed test set; the remaining 80%
# is the pool that every sampling technique draws its training rows from.
X_pool, X_test, y_pool, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Re-attach the label so the samplers can work on a single frame.
df_pool = pd.concat([X_pool, y_pool], axis=1).reset_index(drop=True)

# **Sample Size & Functions**

In [21]:
# Sample size n = ceil(z^2 * p * (1 - p) / e^2).
# NOTE(review): the constants look like Cochran's formula with z = 1.96,
# p = 0.5 and margin of error e = 0.05 -- confirm these are the intended inputs.
z_score, proportion, margin_of_error = 1.96, 0.5, 0.05
n = math.ceil((z_score**2 * proportion * (1 - proportion)) / (margin_of_error**2))
print(f"Calculated Sample Size: {n}")



# Sampling Functions
def simple_random_sampling(df, n):
    """Return ``n`` rows of ``df`` drawn at random without replacement.

    The draw uses ``random_state=42``, so the same frame always yields the
    same rows.
    """
    drawn = df.sample(n=n, random_state=42)
    return drawn

def stratified_sampling(df, n):
    """Return ``n`` rows of ``df`` split as evenly as possible across classes.

    Rows are grouped by the ``'Class'`` column and each group contributes an
    equal share of the sample. When ``n`` is not divisible by the number of
    classes, the leftover rows are assigned one each to the first groups (in
    sorted label order), so the result has exactly ``n`` rows instead of
    silently dropping the remainder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Class'`` column.
    n : int
        Total number of rows to draw.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, grouped by class, with the ``'Class'`` column kept.

    Raises
    ------
    ValueError
        Raised by pandas if a class has fewer rows than its share of ``n``
        (sampling is without replacement).
    """
    # Iterate the groups directly rather than via groupby.apply: apply's
    # handling of the grouping column is deprecated and newer pandas versions
    # drop 'Class' from the frames passed to the callback.
    groups = [group for _, group in df.groupby('Class')]
    if not groups:
        return df.iloc[0:0]

    base, remainder = divmod(n, len(groups))
    parts = []
    for position, group in enumerate(groups):
        # The first `remainder` groups take one extra row so the sizes sum to n.
        size = base + (1 if position < remainder else 0)
        parts.append(group.sample(n=size, random_state=42))
    return pd.concat(parts)

def systematic_sampling(df, n, random_state=42):
    """Return up to ``n`` rows of ``df`` taken at a fixed interval.

    The interval is ``len(df) // n`` (at least 1). A start offset is drawn
    from ``[0, interval)`` and every ``interval``-th row from there is kept,
    truncated to ``n`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from; rows are selected by position.
    n : int
        Desired number of rows. Must be positive.
    random_state : int or None, default 42
        Seed for the start offset, so the sample is reproducible like the
        other samplers. Pass ``None`` for a fresh random start.

    Returns
    -------
    pandas.DataFrame
        The selected rows. Fewer than ``n`` rows are returned when ``df`` has
        fewer than ``n`` rows.

    Raises
    ------
    ValueError
        If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("n must be a positive integer")

    # Clamp the step to 1: when n exceeds len(df) the integer division is 0,
    # which would be an empty range for the start draw and a zero step.
    k = max(1, len(df) // n)

    # Use a locally seeded generator instead of the global NumPy state so the
    # start offset does not depend on what ran earlier in the kernel.
    rng = np.random.default_rng(random_state)
    start = int(rng.integers(0, k))

    indices = np.arange(start, len(df), step=k)
    return df.iloc[indices[:n]]

def cluster_sampling(df, n, random_state=42):
    """Return whole KMeans clusters of ``df`` until at least ``n`` rows are collected.

    The feature columns (everything except ``'Class'``) are partitioned into
    10 KMeans clusters. The clusters are visited in a random order and each
    visited cluster is taken in full; the loop stops as soon as the running
    total reaches ``n``. Because clusters are never split, the result usually
    has more than ``n`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Class'`` column plus feature columns.
    n : int
        Minimum number of rows to collect.
    random_state : int or None, default 42
        Seed used for both KMeans and the cluster visiting order, so the
        sample is reproducible. Pass ``None`` for a non-deterministic draw.

    Returns
    -------
    pandas.DataFrame
        Rows of the selected clusters, with the same columns as ``df``.
    """
    # Use KMeans to create 10 clusters from the feature columns only.
    kmeans = KMeans(n_clusters=10, random_state=random_state, n_init=10)
    labels = kmeans.fit_predict(df.drop('Class', axis=1))

    # Visit the clusters in a seeded random order. The labels are kept in a
    # separate array rather than a temporary 'cluster' column, so an existing
    # column of that name in df is neither overwritten nor dropped.
    rng = np.random.default_rng(random_state)
    cluster_order = rng.permutation(pd.unique(labels))

    # Take whole clusters until the sample size is met.
    selected_data = []
    current_size = 0
    for c in cluster_order:
        cluster_data = df[labels == c]
        selected_data.append(cluster_data)
        current_size += len(cluster_data)
        if current_size >= n:
            break

    return pd.concat(selected_data)

def bootstrap_sampling(df, n):
    """Return ``n`` rows of ``df`` drawn with replacement.

    The draw is delegated to ``resample`` with ``random_state=42``, so the
    result is the same on every call; rows may appear more than once.
    """
    bootstrap_options = {'replace': True, 'n_samples': n, 'random_state': 42}
    return resample(df, **bootstrap_options)

# Display name -> sampler. Every sampler is called as sampler(df, n) and
# returns a DataFrame; insertion order fixes the column order of the results.
samplings = dict([
    ("Simple Random Sampling", simple_random_sampling),
    ("Stratified Sampling", stratified_sampling),
    ("Systematic Sampling", systematic_sampling),
    ("Cluster Sampling", cluster_sampling),
    ("Bootstrap Sampling", bootstrap_sampling),
])

Calculated Sample Size: 385


# **Sampling & Modelling**

In [22]:
# results[sampling name][model name] -> accuracy on the held-out test set, in percent.
results = {}

for s_name, s_func in samplings.items():
    # Draw this technique's training sample from the pool.
    sample_df = s_func(df_pool, n)

    y_train = sample_df['Class']
    X_train = sample_df.drop(columns='Class')

    # Refit every model on the sample and score it against the same test set.
    accuracies = {}
    for m_name, model in models.items():
        y_pred = model.fit(X_train, y_train).predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accuracies[m_name] = acc * 100

    results[s_name] = accuracies

# **Display Output**

In [23]:
# Outer keys (sampling techniques) become columns and inner keys (models)
# become rows; the values are accuracies in percent.
results_df = pd.DataFrame.from_dict(results, orient='columns')

# Persist the comparison table next to the notebook, then display it.
results_df.to_csv('sampling_results.csv')
results_df

Unnamed: 0,Simple Random Sampling,Stratified Sampling,Systematic Sampling,Cluster Sampling,Bootstrap Sampling
Logistic Regression,93.137255,93.79085,92.48366,78.75817,93.464052
Decision Tree,98.039216,99.019608,97.385621,80.392157,97.712418
Random Forest,99.346405,99.673203,99.673203,81.699346,99.673203
SVM,68.627451,67.973856,69.607843,60.130719,68.954248
Gradient Boosting,98.69281,99.019608,99.346405,82.026144,98.69281
