In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import resample

In [2]:
# Step 1: Load the dataset
file_path = "/Users/radhikarajdev/Downloads/Creditcard_data.csv"
data = pd.read_csv(file_path)
X = data.drop("Class", axis=1)
y = data["Class"]

In [3]:
# Step 2: Balance the dataset using SMOTE (oversampling)
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

In [4]:
# Step 3: Sample size calculation
def calculate_sample_size(Z, p, E, total_population=None, sampling_type=None, **kwargs):
    """
    Calculate sample size for different sampling techniques.
    """
    n = (Z**2 * p * (1 - p)) / (E**2)  # Base calculation
    if total_population:  # Adjust for finite population
        n = n / (1 + (n - 1) / total_population)
    if sampling_type == "stratified":
        strata_count = kwargs.get("strata_count", 1)
        n *= strata_count
    elif sampling_type == "cluster":
        cluster_count = kwargs.get("cluster_count", 1)
        n /= cluster_count
    return int(np.ceil(n))

# Parameters for sample size calculation
Z = 1.96  # Z-score for 95% confidence
p = 0.5   # Proportion of the population
E = 0.05  # Margin of error
total_population = len(X)  # Total population size

# Calculate sample sizes for each sampling method
sample_size_srs = calculate_sample_size(Z, p, E, total_population)
step_size = total_population // sample_size_srs
sample_size_stratified = calculate_sample_size(Z, p, E, total_population, sampling_type="stratified", strata_count=3)
sample_size_cluster = calculate_sample_size(Z, p, E, total_population, sampling_type="cluster", cluster_count=5)
sample_size_undersampling = calculate_sample_size(Z, p, E, total_population=min(y.value_counts()))

In [5]:
# Step 4: Sampling methods
def simple_random_sampling(X, y, n):
    return resample(pd.concat([X, y], axis=1), n_samples=n, random_state=42)

def systematic_sampling(X, y, step):
    data_combined = pd.concat([X, y], axis=1)
    return data_combined.iloc[::step, :]

def stratified_sampling(X, y, n):
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for train_index, _ in skf.split(X, y):
        stratified_sample = pd.concat([X.iloc[train_index], y.iloc[train_index]], axis=1)
        return stratified_sample.sample(n=n, random_state=42)

def cluster_sampling(X, y, n_clusters, n_samples_per_cluster):
    data_combined = pd.concat([X, y], axis=1)
    data_combined["Cluster"] = np.random.randint(0, n_clusters, data_combined.shape[0])
    sampled_clusters = data_combined[data_combined["Cluster"].isin(range(n_clusters))]
    return sampled_clusters.sample(n=n_samples_per_cluster, random_state=42)

def undersampling(X, y, n):
    under_sampler = RandomUnderSampler(random_state=42)
    X_under, y_under = under_sampler.fit_resample(X, y)
    return pd.concat([X_under, y_under], axis=1).sample(n=n, random_state=42)

In [6]:
# Step 5: Apply ML Models
def evaluate_models(X_train, X_test, y_train, y_test):
    results = {}
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=42),
        "SVM": SVC(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "Neural Network": MLPClassifier(max_iter=1000)
    }
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[name] = accuracy_score(y_test, y_pred)
    return results

In [7]:
samples = {
    "Sampling1": simple_random_sampling(X_balanced, y_balanced, sample_size_srs),
    "Sampling2": systematic_sampling(X_balanced, y_balanced, step=step_size),
    "Sampling3": stratified_sampling(X_balanced, y_balanced, n=sample_size_stratified),
    "Sampling4": cluster_sampling(X_balanced, y_balanced, n_clusters=5, n_samples_per_cluster=sample_size_cluster),
    "Sampling5": undersampling(X_balanced, y_balanced, n=sample_size_undersampling),
}

In [8]:
# Store results in a DataFrame
all_results = []
top_models = []

for method, sample in samples.items():
    X_sample = sample.drop("Class", axis=1)
    y_sample = sample["Class"]
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=42)
    results = evaluate_models(X_train, X_test, y_train, y_test)
    sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)  # Sort by accuracy

    # Append all results
    for model, accuracy in sorted_results:
        all_results.append({"Sampling Method": method, "Model": model, "Accuracy": accuracy})
    
    # Store top model for this sampling method
    top_model, top_accuracy = sorted_results[0]
    top_models.append({"Sampling Method": method, "Top Model": top_model, "Accuracy": top_accuracy})

# Convert results to DataFrames
results_df = pd.DataFrame(all_results)
top_models_df = pd.DataFrame(top_models)

# Save to CSV
results_df.to_csv("sampling_results.csv", index=False)
top_models_df.to_csv("top_models.csv", index=False)

# Display top models
print("Top Models for Each Sampling Method:")
print(top_models_df)

Top Models for Each Sampling Method:
  Sampling Method            Top Model  Accuracy
0       Sampling1        Random Forest  0.987179
1       Sampling2        Random Forest  0.993464
2       Sampling3        Random Forest  1.000000
3       Sampling4  Logistic Regression  0.937500
4       Sampling5       Neural Network  1.000000
