In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
import warnings

warnings.filterwarnings("ignore")

def load_and_process_data():
    url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"

    print(f"Downloading dataset from {url}...")
    try:
        data = pd.read_csv(url)
    except Exception as e:
        print(f"Error downloading data: {e}")
        print("Please ensure you have internet access or the URL is correct.")
        return None

    target_col = 'Class'
    if target_col not in data.columns:
        target_col = data.columns[-1]

    X = data.drop(target_col, axis=1)
    y = data[target_col]

    print(f"Original dataset shape: {data.shape}")
    print(f"Original class distribution:\n{y.value_counts()}")

    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(X, y)

    balanced_df = pd.concat([X_resampled, y_resampled], axis=1)

    print(f"Balanced dataset shape: {balanced_df.shape}")
    print(f"Balanced class distribution:\n{balanced_df[target_col].value_counts()}")

    return balanced_df, target_col

def calculate_sample_size(N, confidence_level=0.95, margin_of_error=0.05, p=0.5):
    z_score_map = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}
    Z = z_score_map.get(confidence_level, 1.96)

    n0 = (Z**2 * p * (1-p)) / (margin_of_error**2)

    n = n0 / (1 + ((n0 - 1) / N))

    return int(math.ceil(n))

def simple_random_sampling(df, n, random_state=42):
    return df.sample(n=n, random_state=random_state)

def systematic_sampling(df, n, random_state=42):
    step = len(df) // n
    start = np.random.randint(0, step)
    indices = np.arange(start, len(df), step)
    return df.iloc[indices].head(n)

def stratified_sampling(df, n, target_col, random_state=42):
    return df.groupby(target_col, group_keys=False).apply(lambda x: x.sample(int(n/2), random_state=random_state))

def cluster_sampling(df, n, random_state=42):
    k = 20
    df_temp = df.copy()
    df_temp['cluster_id'] = np.random.randint(0, k, size=len(df))

    clusters_needed = max(1, int(n / (len(df)/k)))
    selected_clusters = np.random.choice(k, clusters_needed, replace=False)

    sample = df_temp[df_temp['cluster_id'].isin(selected_clusters)]
    return sample.drop('cluster_id', axis=1)

def bootstrap_sampling(df, n, random_state=42):
    return df.sample(n=n, replace=True, random_state=random_state)

def main():
    balanced_df, target_col = load_and_process_data()
    if balanced_df is None:
        return

    N = len(balanced_df)
    sample_size = calculate_sample_size(N, confidence_level=0.95, margin_of_error=0.05)
    print(f"\nCalculated Sample Size (95% Conf, 5% Error): {sample_size}")

    samplings = {
        "Simple Random": lambda df: simple_random_sampling(df, sample_size),
        "Systematic": lambda df: systematic_sampling(df, sample_size),
        "Stratified": lambda df: stratified_sampling(df, sample_size, target_col),
        "Cluster": lambda df: cluster_sampling(df, sample_size),
        "Bootstrap": lambda df: bootstrap_sampling(df, sample_size)
    }

    models = {
        "M1 (Logistic Regression)": LogisticRegression(max_iter=1000),
        "M2 (Decision Tree)": DecisionTreeClassifier(random_state=42),
        "M3 (Random Forest)": RandomForestClassifier(random_state=42),
        "M4 (SVM)": SVC(),
        "M5 (KNN)": KNeighborsClassifier()
    }

    X = balanced_df.drop(target_col, axis=1)
    y = balanced_df[target_col]

    X_pool, X_test, y_pool, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pool_df = pd.concat([X_pool, y_pool], axis=1)

    results = {model_name: {} for model_name in models.keys()}

    print("\nStarting Model Evaluation...")
    print("-" * 80)

    for sample_name, sampler_func in samplings.items():
        print(f"Processing {sample_name} Sampling...")

        sample_df = sampler_func(pool_df)

        X_sample = sample_df.drop(target_col, axis=1)
        y_sample = sample_df[target_col]

        for model_name, model in models.items():
            try:
                model.fit(X_sample, y_sample)
                preds = model.predict(X_test)
                acc = accuracy_score(y_test, preds)
                results[model_name][sample_name] = acc * 100
            except Exception as e:
                print(f"Error training {model_name} on {sample_name}: {e}")
                results[model_name][sample_name] = np.nan

    print("-" * 80)
    print("\nFinal Accuracy Table (%):")

    results_df = pd.DataFrame(results).T
    cols = ["Simple Random", "Systematic", "Stratified", "Cluster", "Bootstrap"]
    available_cols = [c for c in cols if c in results_df.columns]
    results_df = results_df[available_cols]

    print(results_df)

    results_df.to_csv("sampling_assignment_results.csv")
    print("\nResults saved to 'sampling_assignment_results.csv'")

    print("\n--- Summary ---")
    for model in results_df.index:
        best_sampling = results_df.loc[model].idxmax()
        best_acc = results_df.loc[model].max()
        print(f"For {model}, best technique: {best_sampling} ({best_acc:.2f}%)者にしました。")

if __name__ == "__main__":
    main()

Downloading dataset from https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv...
Original dataset shape: (772, 31)
Original class distribution:
Class
0    763
1      9
Name: count, dtype: int64
Balanced dataset shape: (1526, 31)
Balanced class distribution:
Class
0    763
1    763
Name: count, dtype: int64

Calculated Sample Size (95% Conf, 5% Error): 308

Starting Model Evaluation...
--------------------------------------------------------------------------------
Processing Simple Random Sampling...
Processing Systematic Sampling...
Processing Stratified Sampling...
Processing Cluster Sampling...
Processing Bootstrap Sampling...
--------------------------------------------------------------------------------

Final Accuracy Table (%):
                          Simple Random  Systematic  Stratified    Cluster  \
M1 (Logistic Regression)      92.483660   90.522876   90.196078  85.947712   
M2 (Decision Tree)            98.039216   96.405229   95.751