# In [None]:
# ===============================
# Modeling Pertemuan 5 (Final, Safe untuk dataset kecil)
# ===============================

# Step 0 — Import Library
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import joblib

# Step 1 — Load the processed data and split it into train / validation / test
df = pd.read_csv("processed_kelulusan.csv")
y = df["Lulus"]                 # target column
X = df.drop(columns="Lulus")    # every remaining column is a candidate feature

# 70% goes to training; the remaining 30% is halved into validation and test.
# NOTE: no `stratify=` argument is passed, so these are plain shuffled splits
# and class proportions are not guaranteed to be preserved in any subset.
_split_kwargs = {"shuffle": True, "random_state": 42}
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, **_split_kwargs
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, **_split_kwargs
)

print("Shapes:")
print(*(part.shape for part in (X_train, X_val, X_test)))

# Step 2 — Preprocessing + baseline Logistic Regression
# Only numeric columns are modelled; any other column is dropped by the
# ColumnTransformer (remainder="drop").
num_cols = X_train.select_dtypes(include="number").columns

# Numeric branch: fill missing values with the median, then standardise.
numeric_steps = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
])
pre = ColumnTransformer([("num", numeric_steps, num_cols)], remainder="drop")

baseline_clf = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
)
pipe_lr = Pipeline([("pre", pre), ("clf", baseline_clf)])
pipe_lr.fit(X_train, y_train)
y_val_pred = pipe_lr.predict(X_val)

# Report macro-F1 and the per-class breakdown on the validation split.
print("\nBaseline Logistic Regression")
val_f1_lr = f1_score(y_val, y_val_pred, average="macro")
print("F1(val):", val_f1_lr)
val_report_lr = classification_report(y_val, y_val_pred, digits=3)
print(val_report_lr)

# Step 3 — Random Forest on top of the same preprocessing
# NOTE(review): `pre` is the very same object already placed in `pipe_lr`;
# both pipelines are fitted on X_train, so sharing it is presumably harmless
# here — confirm before fitting either pipeline on different data.
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42,
)
pipe_rf = Pipeline([("pre", pre), ("clf", rf_clf)])
pipe_rf.fit(X_train, y_train)
y_val_rf = pipe_rf.predict(X_val)

print("\nRandom Forest")
val_f1_rf = f1_score(y_val, y_val_rf, average="macro")
print("F1(val):", val_f1_rf)

# Step 4 — GridSearchCV over the Random Forest pipeline
# n_splits is kept small because the data set is tiny.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Keys use the "<step>__<param>" form to reach the "clf" step of the pipeline.
param_grid = {
    "clf__max_depth": [None, 12],
    "clf__min_samples_split": [2, 5],
}
gs = GridSearchCV(
    pipe_rf,
    param_grid=param_grid,
    cv=skf,
    scoring="f1_macro",
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

# Take the best estimator found by the search and score it on validation data.
best_rf = gs.best_estimator_
y_val_best = best_rf.predict(X_val)
print("\nBest RF Params:", gs.best_params_)
print("Best CV F1:", gs.best_score_)
val_f1_best = f1_score(y_val, y_val_best, average="macro")
print("F1(val):", val_f1_best)

# Step 5 — Evaluate the selected model on the held-out test split
final_model = best_rf
y_test_pred = final_model.predict(X_test)

print("\nTest Set Evaluation")
test_f1 = f1_score(y_test, y_test_pred, average="macro")
print("F1(test):", test_f1)
test_report = classification_report(y_test, y_test_pred, digits=3)
print(test_report)
print("Confusion matrix:")
test_cm = confusion_matrix(y_test, y_test_pred)
print(test_cm)

# ROC-AUC and the ROC curve on the test split.
# They are only computed when (a) the model exposes predict_proba, (b) the
# model was trained on exactly two classes, so column 1 of predict_proba
# exists and is the score of the second class, and (c) both classes occur in
# y_test. With a very small, unstratified test split a single-class y_test is
# quite possible, and ROC-AUC is undefined in that case.
can_score_roc = (
    hasattr(final_model, "predict_proba")
    and len(getattr(final_model, "classes_", ())) == 2
    and y_test.nunique() == 2
)
if can_score_roc:
    y_test_proba = final_model.predict_proba(X_test)[:, 1]
    print("ROC-AUC(test):", roc_auc_score(y_test, y_test_proba))
    fpr, tpr, _ = roc_curve(y_test, y_test_proba)
    plt.figure()
    plt.plot(fpr, tpr, marker='.')
    plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC Curve (Test)")
    plt.show()
else:
    # Say why the metric is missing instead of crashing the evaluation.
    print("ROC-AUC(test): skipped (needs predict_proba and two classes "
          "in both the training and test labels)")

# Step 6 — Persist the selected pipeline (preprocessing + classifier) to disk
joblib.dump(value=final_model, filename="model.pkl")
print("Model tersimpan ke model.pkl")

# Step 7 — Optional Flask endpoint (kept disabled)
# The example below lives inside a bare string literal, so none of it runs
# here. To use it, copy it into its own module: it loads model.pkl and serves
# a POST /predict route that builds a one-row DataFrame from the JSON body.
"""
from flask import Flask, request, jsonify
app = Flask(__name__)
MODEL = joblib.load("model.pkl")

@app.route("/predict", methods=["POST"])
def predict():
    data = request.get_json(force=True)
    X = pd.DataFrame([data])
    yhat = MODEL.predict(X)[0]
    proba = None
    if hasattr(MODEL, "predict_proba"):
        proba = float(MODEL.predict_proba(X)[:,1][0])
    return jsonify({"prediction": int(yhat), "proba": proba})

if __name__ == "__main__":
    app.run(port=5000)
"""
