<a href="https://colab.research.google.com/github/samuelhtampubolon/SDPM2025/blob/main/Make_Classification_Codeset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Goal: simulate a highly imbalanced fraud dataset (~0.5–1.0% positives) using make_classification (built-in generator),
# then address the imbalance with SMOTE (imblearn) and evaluate the model with a cost-based threshold.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, precision_recall_curve, confusion_matrix
import numpy as np

# (Opsional) pip install imbalanced-learn jika belum terpasang
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Synthetic, heavily imbalanced binary dataset: label 0 = legit, label 1 = fraud (~0.8% of rows).
X, y = make_classification(
    random_state=42,
    n_samples=120_000,
    weights=[0.992, 0.008],  # class proportions for [legit, fraud]
    # Feature layout: 30 columns in total, 8 informative, 4 redundant, no repeated ones.
    n_features=30,
    n_informative=8,
    n_redundant=4,
    n_repeated=0,
    # Class geometry and label noise.
    n_clusters_per_class=2,
    class_sep=1.5,
    flip_y=0.001,
)

# Hold out 25% of the rows for testing; stratify on y so both splits keep the same fraud rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

In [3]:
# Imbalanced-learn pipeline: scaling -> SMOTE oversampling -> logistic regression.
# - The scaler is created with with_mean=False (features are not centered).
# - SMOTE uses 5 nearest neighbours and a fixed seed.
# - NOTE(review): class_weight="balanced" is applied on top of SMOTE; if SMOTE already
#   equalises the classes this weighting is presumably redundant -- confirm before relying on it.
pipe = ImbPipeline(
    [
        ("scaler", StandardScaler(with_mean=False)),
        ("smote", SMOTE(random_state=42, k_neighbors=5)),
        (
            "logreg",
            LogisticRegression(
                class_weight="balanced",
                solver="lbfgs",
                max_iter=1000,
                n_jobs=-1,
            ),
        ),
    ]
)

pipe.fit(X_train, y_train)

In [4]:
# Score the held-out split: keep column 1 of predict_proba, i.e. the probability of class 1 (fraud).
proba = pipe.predict_proba(X_test)[:, 1]
# Precision/recall at every candidate threshold; these arrays feed the cost-based threshold search.
prec, rec, thr = precision_recall_curve(y_test, proba)
# Average precision (PR-AUC) summarises the whole precision-recall curve in one number.
ap = average_precision_score(y_test, proba)

In [5]:
# Assumed costs: a missed fraud (FN) is far more expensive than a false alarm (FP).
cost_fp, cost_fn = 1.0, 20.0


def expected_cost(p, y_true, t):
    """Return the total misclassification cost of thresholding scores `p` at `t`.

    A sample is predicted positive when its score is >= `t`. The cost is
    `cost_fp * (#false positives) + cost_fn * (#false negatives)`, using the
    module-level `cost_fp` and `cost_fn`.

    Parameters
    ----------
    p : array-like of float
        Predicted scores/probabilities for the positive class.
    y_true : array-like of int
        Ground-truth binary labels (1 = positive, anything else = negative).
    t : float
        Decision threshold.

    Returns
    -------
    float
        Total cost at threshold `t`.
    """
    actual_pos = np.asarray(y_true) == 1
    flagged = np.asarray(p) >= t
    # Count the two error types directly instead of unpacking a confusion matrix:
    # the matrix collapses to 1x1 when labels and predictions contain a single
    # class, which makes a 4-way unpack fail.
    fp = np.count_nonzero(flagged & ~actual_pos)
    fn = np.count_nonzero(~flagged & actual_pos)
    return cost_fp * fp + cost_fn * fn

# Sweep every candidate threshold returned by precision_recall_curve.
# `thr` has one entry fewer than `prec`/`rec` and contains no sentinel value, so the
# whole array is searched (dropping the last entry would skip the highest threshold).
# FP/FN counts for all thresholds are derived at once from the sorted scores of each
# class, instead of building one confusion matrix per threshold.
y_test_arr = np.asarray(y_test)
pos_scores = np.sort(proba[y_test_arr == 1])
neg_scores = np.sort(proba[y_test_arr == 0])
fn_counts = np.searchsorted(pos_scores, thr, side="left")  # positives scored below t
fp_counts = neg_scores.size - np.searchsorted(neg_scores, thr, side="left")  # negatives scored >= t
costs = cost_fp * fp_counts + cost_fn * fn_counts
best_idx = int(np.argmin(costs))  # first minimum wins on ties
best_thr = float(thr[best_idx])

# NOTE: the threshold is chosen on the same test set that is reported below, so these
# numbers are optimistic; tune it on a separate validation split for an unbiased estimate.
y_pred = (proba >= best_thr).astype(int)
# labels=[0, 1] guarantees a 2x2 matrix even if one class is absent from y_test and y_pred.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

print(f"Average Precision (PR-AUC): {ap:.3f}")
print(f"Best threshold by cost: {best_thr:.3f}")
print(f"Confusion Matrix @best_thr -> TN:{tn} FP:{fp} FN:{fn} TP:{tp}")
# prec[i] / rec[i] are the precision and recall obtained at threshold thr[i].
print(f"Precision: {prec[best_idx]:.3f} | Recall: {rec[best_idx]:.3f}")

Average Precision (PR-AUC): 0.257
Best threshold by cost: 0.835
Confusion Matrix @best_thr -> TN:28810 FP:938 FN:112 TP:140
Precision: 0.130 | Recall: 0.556
