# Assignment 2 — Sampling (Colab Notebook)

**Name:** Rohan Malhotra  
**Roll No:** 102303437  

This notebook:
- Loads the credit card dataset
- Applies SMOTE to balance classes
- Creates 5 samples (n=500) using 5 sampling techniques
- Trains 5 ML models
- Produces an accuracy table + graphs


In [None]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel; imbalanced-learn provides `imblearn.SMOTE`.
%pip install -q imbalanced-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans

## Upload dataset

In [None]:
# Colab-specific upload helper (this notebook is written for Google Colab).
from google.colab import files
# Ask for a file upload; the returned value is kept in `uploaded` but is not
# read again in this cell.
uploaded = files.upload()

# The data is loaded by a fixed file name from the working directory, so the
# uploaded file must be named exactly Creditcard_data.csv.
df = pd.read_csv("Creditcard_data.csv")
df.head()

## SMOTE balancing + split

In [None]:
# Features are every column except the target; "Class" is the label column.
X = df.drop(columns=["Class"])
y = df["Class"]

seed = 42

# Split BEFORE oversampling. Fitting SMOTE on the full dataset and splitting
# afterwards leaks information: synthetic rows interpolated from rows that end
# up in the test set are used for training, and the test set itself contains
# synthetic rows, which inflates the reported accuracy.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed, stratify=y
)

# SMOTE needs more minority samples than neighbours. The training split holds
# fewer minority rows than the full dataset did, so cap k_neighbors accordingly
# (3 is kept whenever there are at least 4 minority rows).
minority_count = int(y_train_raw.value_counts().min())
smote = SMOTE(random_state=seed, k_neighbors=min(3, max(1, minority_count - 1)))

# Only the training portion is balanced; X_test / y_test keep the original
# (real, un-resampled) rows and class distribution.
Xb, yb = smote.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = Xb, yb

print("Balanced class counts:", pd.Series(yb).value_counts().to_dict())

## Sampling techniques (n=500)

In [None]:
def sample_srs(X, y, n, seed=42):
    """Simple random sampling: pick `n` distinct rows uniformly at random.

    Args:
        X: Feature table (indexed positionally via ``.iloc``).
        y: Labels aligned row-for-row with ``X``.
        n: Number of rows to draw; must not exceed ``len(X)`` because rows are
            drawn without replacement.
        seed: Seed for the NumPy generator, making the draw reproducible.

    Returns:
        Tuple ``(X_sample, y_sample)`` holding the same selected rows.
    """
    population_size = len(X)
    positions = np.random.default_rng(seed).choice(
        population_size, size=n, replace=False
    )
    return X.iloc[positions], y.iloc[positions]

def sample_stratified(X, y, n, seed=42):
    """Stratified sampling with equal allocation across the classes in `y`.

    The `n` rows are divided evenly between the distinct labels found in `y`
    (taken in sorted order); when `n` is not divisible by the number of
    classes, the leftover rows go to the last classes. For binary 0/1 labels
    this is ``n // 2`` rows of class 0 and ``n - n // 2`` rows of class 1.
    Rows inside each class are drawn without replacement and the combined
    selection is shuffled.

    Args:
        X: Feature table (indexed positionally via ``.iloc``).
        y: Labels aligned row-for-row with ``X``; any set of label values.
        n: Total number of rows to draw.
        seed: Seed for the NumPy generator, making the draw reproducible.

    Returns:
        Tuple ``(X_sample, y_sample)`` holding the same selected rows.

    Raises:
        ValueError: If `y` is empty, or a class has fewer rows than its quota.
    """
    rng = np.random.default_rng(seed)
    labels = np.asarray(y)
    classes = np.unique(labels)
    if len(classes) == 0:
        raise ValueError("Cannot draw a stratified sample from empty labels.")

    base, extra = divmod(n, len(classes))
    first_with_extra = len(classes) - extra  # the last `extra` classes get +1

    per_class = []
    for position, label in enumerate(classes):
        quota = base + (1 if position >= first_with_extra else 0)
        members = np.where(labels == label)[0]
        per_class.append(rng.choice(members, size=quota, replace=False))

    idx = np.concatenate(per_class)
    # Shuffle so the sample is not ordered class by class.
    rng.shuffle(idx)
    return X.iloc[idx], y.iloc[idx]

def sample_systematic(X, y, n, seed=42):
    """Systematic sampling over a randomly shuffled row order.

    The row positions are shuffled, a step ``len(X) // n`` (at least 1) is
    computed, a random starting offset below that step is drawn, and every
    step-th position from the offset is taken, truncated to `n` rows.

    Args:
        X: Feature table (indexed positionally via ``.iloc``).
        y: Labels aligned row-for-row with ``X``.
        n: Number of rows wanted.
        seed: Seed for the NumPy generator, making the draw reproducible.

    Returns:
        Tuple ``(X_sample, y_sample)`` holding the same selected rows.
    """
    rng = np.random.default_rng(seed)

    total = len(X)
    order = np.arange(total)
    rng.shuffle(order)

    step = max(1, total // n)
    offset = int(rng.integers(0, step))

    every_step = order[offset::step]
    picked = every_step[:n]
    return X.iloc[picked], y.iloc[picked]

def sample_cluster(X, y, n, seed=42, k_clusters=10):
    """Cluster sampling: take whole k-means clusters until `n` rows are reached.

    Features are standardized and grouped with k-means using
    ``min(k_clusters, max(2, len(X) // 5))`` clusters. Cluster ids are visited
    in a random order and every member of each visited cluster is added until
    at least `n` rows are collected; any surplus is then trimmed by drawing
    `n` of the collected rows without replacement.

    Args:
        X: Feature table (indexed positionally via ``.iloc``).
        y: Labels aligned row-for-row with ``X``.
        n: Number of rows wanted; fewer are returned if all clusters together
            hold fewer than `n` rows.
        seed: Seed used for both k-means and the NumPy generator.
        k_clusters: Upper bound on the number of k-means clusters.

    Returns:
        Tuple ``(X_sample, y_sample)`` holding the same selected rows.
    """
    standardized = StandardScaler().fit_transform(X)
    n_clusters = min(k_clusters, max(2, len(X) // 5))
    kmeans = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10)
    labels = kmeans.fit_predict(standardized)

    rng = np.random.default_rng(seed)
    visit_order = np.unique(labels)
    rng.shuffle(visit_order)

    picked = []
    for cluster_id in visit_order:
        members = np.where(labels == cluster_id)[0]
        picked += members.tolist()
        if len(picked) >= n:
            break

    picked = np.array(picked)
    # The last cluster usually overshoots the target, so trim back down to n.
    if len(picked) > n:
        picked = rng.choice(picked, size=n, replace=False)
    return X.iloc[picked], y.iloc[picked]

def sample_bootstrap(X, y, n, seed=42):
    """Bootstrap sampling: draw `n` rows uniformly at random WITH replacement.

    Args:
        X: Feature table (indexed positionally via ``.iloc``).
        y: Labels aligned row-for-row with ``X``.
        n: Number of rows to draw; the same row may be selected more than once.
        seed: Seed for the NumPy generator, making the draw reproducible.

    Returns:
        Tuple ``(X_sample, y_sample)`` holding the same selected rows.
    """
    population_size = len(X)
    positions = np.random.default_rng(seed).choice(
        population_size, size=n, replace=True
    )
    return X.iloc[positions], y.iloc[positions]

# Registry of sampling techniques: result-table column name -> sampler.
# Every sampler is called as sampler(X, y, n, seed=...) and returns (X, y).
SAMPLERS = dict(
    [
        ("Sampling1_SRS", sample_srs),
        ("Sampling2_Stratified", sample_stratified),
        ("Sampling3_Systematic", sample_systematic),
        ("Sampling4_Cluster", sample_cluster),
        ("Sampling5_Bootstrap", sample_bootstrap),
    ]
)

## Models

In [None]:
def _with_scaler(estimator):
    """Return a pipeline that standardizes the features before `estimator`."""
    return Pipeline([("scaler", StandardScaler()), ("clf", estimator)])


# Registry of classifiers: result-table row name -> estimator.
# Logistic regression and the RBF SVC get a standardizing step in front;
# the tree-based models are used on the raw features.
MODELS = {
    "M1_LogReg": _with_scaler(LogisticRegression(max_iter=2000, random_state=seed)),
    "M2_DecisionTree": DecisionTreeClassifier(random_state=seed),
    "M3_RandomForest": RandomForestClassifier(
        n_estimators=200, random_state=seed, n_jobs=-1
    ),
    "M4_GradBoost": GradientBoostingClassifier(random_state=seed),
    "M5_SVC": _with_scaler(SVC(kernel="rbf", C=2.0, gamma="scale", random_state=seed)),
}

## Train + Evaluate (Accuracy)

In [None]:
# Number of training rows drawn by every sampling technique.
n = 500

# Accuracy table: one row per model, one column per sampling technique.
results = pd.DataFrame(index=MODELS.keys(), columns=SAMPLERS.keys(), dtype=float)

# The samplers index positionally with .iloc, so hand them pandas objects.
train_features = pd.DataFrame(X_train)
train_labels = pd.Series(y_train)

for sname, sfn in SAMPLERS.items():
    # One sample per technique, shared by all models for a fair comparison.
    Xs, ys = sfn(train_features, train_labels, n, seed=seed)
    for mname, model in MODELS.items():
        # fit() returns the fitted estimator, so predict can be chained.
        pred = model.fit(Xs, ys).predict(X_test)
        results.loc[mname, sname] = accuracy_score(y_test, pred)

results.round(4)

## Graphs

In [None]:
# Heatmap of the accuracy table: rows are models, columns are sampling techniques.
fig, ax = plt.subplots()
im = ax.imshow(results.values, aspect="auto")

n_rows, n_cols = results.shape

# Columns (sampling techniques) along the x axis.
ax.set_xticks(range(n_cols))
ax.set_xticklabels(results.columns, rotation=45, ha="right")

# Rows (models) along the y axis.
ax.set_yticks(range(n_rows))
ax.set_yticklabels(results.index)

ax.set_title("Accuracy Matrix (Models x Sampling)")
fig.colorbar(im, ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Highest accuracy each model reached over all sampling techniques, best first.
best_per_model = results.max(axis=1).sort_values(ascending=False)

fig, ax = plt.subplots()
bar_positions = range(len(best_per_model))
ax.bar(bar_positions, best_per_model.values)

# Label each bar with its model name.
ax.set_xticks(bar_positions)
ax.set_xticklabels(best_per_model.index, rotation=45, ha="right")

ax.set_ylabel("Best Accuracy")
ax.set_title("Best Accuracy per Model")
fig.tight_layout()
plt.show()

In [None]:
# Highest accuracy each sampling technique reached over all models, best first.
best_per_sampling = results.max(axis=0).sort_values(ascending=False)

fig, ax = plt.subplots()
bar_positions = range(len(best_per_sampling))
ax.bar(bar_positions, best_per_sampling.values)

# Label each bar with its sampling technique.
ax.set_xticks(bar_positions)
ax.set_xticklabels(best_per_sampling.index, rotation=45, ha="right")

ax.set_ylabel("Best Accuracy")
ax.set_title("Best Accuracy per Sampling Technique")
fig.tight_layout()
plt.show()

## Save outputs (optional)

In [None]:
# Write the accuracy table to the working directory; the row labels (model
# names) are written as the first CSV column, which is to_csv's default.
results.to_csv("accuracy_matrix.csv")
print("Saved: accuracy_matrix.csv")