In [1]:
# =========================================================
# LEMBAR KERJA PERTEMUAN 6 - RANDOM FOREST KLASIFIKASI
# =========================================================

# Langkah 1 — Muat Data & Split (Pilihan A)
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

# Import modul sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve

print("==========================================")
print("LANGKAH 1: Muat Data & Split 70/15/15")
print("==========================================")

# Step 1: load the dataset and split it into train / validation / test sets.
#
# The path lives in a single constant so that the error message below always
# names the file that was actually requested.
# NOTE(review): the original comment said this step uses
# processed_kelulusan.csv from Meeting 4, but the code reads dataset.csv from
# an absolute local path — confirm which file is intended and prefer a path
# relative to the notebook.
DATA_PATH = r"C:\.venv\Machine-learning\dataset.csv"

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(): raising stops the cell at the point of
    # failure, so the statements below never run against a missing `df`.
    raise FileNotFoundError(
        f"ERROR: Pastikan file '{DATA_PATH}' ada dan dapat dibaca."
    ) from err

# Features are every column except the target column "Lulus".
X = df.drop("Lulus", axis=1)
y = df["Lulus"]

# Split 1: train (70%) vs. temporary hold-out (30%), stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Split 2: the temporary 30% is halved into validation (15%) and test (15%).
# Stratification is intentionally omitted here because the hold-out is tiny
# (3 rows in y_temp), so a stratified 50/50 split is not feasible.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42
)

print(f"Train Shape: {X_train.shape}, Val Shape: {X_val.shape}, Test Shape: {X_test.shape}")
print(f"Proporsi Lulus di Train: {y_train.mean():.2f}")


# =========================================================
# STEP 2 — Preprocessing pipeline & baseline Random Forest
# =========================================================
print("\n==========================================")
print("LANGKAH 2: Pipeline & Baseline Random Forest")
print("==========================================")

# All numeric columns of the training frame are routed through the numeric branch.
num_cols = X_train.select_dtypes(include="number").columns

# Numeric branch: median imputation followed by standard scaling.
numeric_branch = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
])

# Columns not listed in `num_cols` are dropped (remainder="drop").
pre = ColumnTransformer(
    transformers=[("num", numeric_branch, num_cols)],
    remainder="drop",
)

# Baseline classifier with a fixed seed.
rf = RandomForestClassifier(
    n_estimators=300,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42,
)

# Full pipeline: preprocessing, then the classifier; fitted on the training split.
pipe = Pipeline(steps=[("pre", pre), ("clf", rf)])
pipe.fit(X_train, y_train)

# Score the baseline on the validation split.
y_val_pred = pipe.predict(X_val)
baseline_f1 = f1_score(y_val, y_val_pred, average="macro")
baseline_report = classification_report(y_val, y_val_pred, digits=3)
print("Baseline RF — F1(val):", baseline_f1)
print("Baseline Classification Report (Val):\n", baseline_report)


# =========================================================
# STEP 3 — Cross-validation on the training split
# =========================================================
print("\n==========================================")
print("LANGKAH 3: Validasi Silang (Cross-Validation)")
print("==========================================")

# Only 3 folds are used because the training set is very small.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Macro-F1 of the full pipeline across the stratified folds, run in parallel.
scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
)
print(f"CV F1-macro (train): {scores.mean():.4f} ± {scores.std():.4f}")


# =========================================================
# STEP 4 — Compact hyper-parameter tuning (GridSearchCV)
# =========================================================
print("\n==========================================")
print("LANGKAH 4: Tuning Ringkas (GridSearch)")
print("==========================================")

# The grid is kept deliberately small because the dataset is small.
# Keys use the "clf__" prefix to address the classifier step of the pipeline.
param = {
    "clf__max_depth": [None, 3, 5],
    "clf__min_samples_split": [2, 5],
}

# Search over the grid with the same folds and metric as the cross-validation step.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param,
    scoring="f1_macro",
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the validation split.
best_model = gs.best_estimator_
y_val_best = best_model.predict(X_val)
tuned_val_f1 = f1_score(y_val, y_val_best, average="macro")

print("Best params:", gs.best_params_)
print("Best RF — F1(val):", tuned_val_f1)


# =========================================================
# STEP 5 — Final evaluation on the test split
# =========================================================
print("\n==========================================")
print("LANGKAH 5: Evaluasi Akhir (Test Set)")
print("==========================================")

# Use the best model found by the grid search.
final_model = best_model

y_test_pred = final_model.predict(X_test)
print("F1(test):", f1_score(y_test, y_test_pred, average="macro"))
print("Classification Report (Test):\n", classification_report(y_test, y_test_pred, digits=3))
print("Confusion Matrix (test):\n", confusion_matrix(y_test, y_test_pred))

# ROC-AUC and curves (only when the model exposes class probabilities).
if hasattr(final_model, "predict_proba"):
    # Second probability column, used as the score for the curves below.
    y_test_proba = final_model.predict_proba(X_test)[:, 1]

    # Bind roc_auc up front: if roc_auc_score raises (e.g. the test split
    # contains a single class), the plot title below must still be buildable
    # instead of failing with a NameError.
    roc_auc = None
    try:
        roc_auc = roc_auc_score(y_test, y_test_proba)
        print("ROC-AUC(test):", roc_auc)
    except ValueError as e:
        print(f"ROC-AUC tidak dapat dihitung karena: {e}. (Perlu minimal 2 kelas di test set)")

    # A missing or NaN score is rendered as "n/a" in the title.
    auc_defined = roc_auc is not None and not np.isnan(roc_auc)
    auc_label = f"{roc_auc:.3f}" if auc_defined else "n/a"

    # ROC curve. The figure is shown before it is closed; closing it without
    # showing or saving would discard the plot entirely.
    fpr, tpr, _ = roc_curve(y_test, y_test_proba)
    fig_roc, ax_roc = plt.subplots()
    ax_roc.plot(fpr, tpr)
    ax_roc.set(xlabel="FPR", ylabel="TPR", title=f"ROC Curve (Test) - AUC: {auc_label}")
    fig_roc.tight_layout()
    plt.show()
    plt.close(fig_roc)  # release the figure so figures do not pile up

    # Precision-recall curve, handled the same way.
    prec, rec, _ = precision_recall_curve(y_test, y_test_proba)
    fig_pr, ax_pr = plt.subplots()
    ax_pr.plot(rec, prec)
    ax_pr.set(xlabel="Recall", ylabel="Precision", title="PR Curve (Test)")
    fig_pr.tight_layout()
    plt.show()
    plt.close(fig_pr)


# =========================================================
# STEP 6 — Feature importance
# =========================================================
print("\n==========================================")
print("LANGKAH 6: Pentingnya Fitur (Feature Importance)")
print("==========================================")

# Best-effort report: any failure is printed rather than aborting the notebook.
try:
    importances = final_model.named_steps["clf"].feature_importances_
    fn = final_model.named_steps["pre"].get_feature_names_out()

    # Strip only the leading transformer prefix (e.g. "num__").
    # maxsplit=1 keeps column names that themselves contain "__" intact, and
    # [-1] leaves a name without any prefix unchanged instead of raising.
    feature_names = [name.split("__", 1)[-1] for name in fn]

    # Pair each feature with its importance, highest importance first.
    top = sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True)

    print("Top feature importance:")
    for name, val in top:
        print(f"{name}: {val:.4f}")
except Exception as e:
    print("Feature importance tidak tersedia atau gagal:", e)


# =========================================================
# STEP 7 — Persist the final model
# =========================================================
print("\n==========================================")
print("LANGKAH 7: Simpan Model")
print("==========================================")

# Serialise the whole fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(value=final_model, filename="rf_model.pkl")
print("Model disimpan sebagai rf_model.pkl")


# =========================================================
# STEP 8 — Local inference smoke test
# =========================================================
print("\n==========================================")
print("LANGKAH 8: Cek Inference Lokal")
print("==========================================")

# Reload the persisted model.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
mdl = joblib.load("rf_model.pkl")

# Fictitious input. Each raw value is defined once so that the DataFrame, the
# derived features and the printed summary cannot drift apart.
ipk = 3.4
jumlah_absensi = 4
waktu_belajar_jam = 7
# Divisor used for Rasio_Absensi (the original expression was 4/14).
# NOTE(review): presumably the total number of sessions — confirm it matches
# the feature engineering used to build the training data.
ABSENSI_DIVISOR = 14

sample = pd.DataFrame([{
    "IPK": ipk,
    "Jumlah_Absensi": jumlah_absensi,
    "Waktu_Belajar_Jam": waktu_belajar_jam,
    # The engineered feature columns must be present as well.
    "Rasio_Absensi": jumlah_absensi / ABSENSI_DIVISOR,
    "IPK_x_Study": ipk * waktu_belajar_jam,
}])

# Predict for the single sample row and report the result.
prediksi = mdl.predict(sample)[0]
print(f"Input Sample: IPK {ipk}, Absensi {jumlah_absensi}, Waktu Belajar {waktu_belajar_jam}")
print("Prediksi Kelulusan (1=Lulus, 0=Tidak Lulus):", int(prediksi))

LANGKAH 1: Muat Data & Split 70/15/15
Train Shape: (7, 5), Val Shape: (1, 5), Test Shape: (2, 5)
Proporsi Lulus di Train: 0.57

LANGKAH 2: Pipeline & Baseline Random Forest
Baseline RF — F1(val): 1.0
Baseline Classification Report (Val):
               precision    recall  f1-score   support

           1      1.000     1.000     1.000         1

    accuracy                          1.000         1
   macro avg      1.000     1.000     1.000         1
weighted avg      1.000     1.000     1.000         1


LANGKAH 3: Validasi Silang (Cross-Validation)
CV F1-macro (train): 1.0000 ± 0.0000

LANGKAH 4: Tuning Ringkas (GridSearch)
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best params: {'clf__max_depth': None, 'clf__min_samples_split': 2}
Best RF — F1(val): 1.0

LANGKAH 5: Evaluasi Akhir (Test Set)
F1(test): 1.0
Classification Report (Test):
               precision    recall  f1-score   support

           0      1.000     1.000     1.000         2

    accuracy         




LANGKAH 6: Pentingnya Fitur (Feature Importance)
Top feature importance:
IPK: 0.2509
IPK_x_Study: 0.2096
Waktu_Belajar_Jam: 0.2062
Rasio_Absensi: 0.1856
Jumlah_Absensi: 0.1478

LANGKAH 7: Simpan Model
Model disimpan sebagai rf_model.pkl

LANGKAH 8: Cek Inference Lokal
Input Sample: IPK 3.4, Absensi 4, Waktu Belajar 7
Prediksi Kelulusan (1=Lulus, 0=Tidak Lulus): 1
