## **Library**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

## **Baca Dataset dan Preprocessing**

In [None]:
df = pd.read_csv("CitarumWater.csv")

# Label-encode every object-dtype column in place, with a fresh encoder per column
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

# Separate features from the target
# Replace "target" if the target column has a different name
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## **Model Bagging dan Boosting**

In [None]:
# Bagging: random forest trained on the training split (fixed seed)
bagging_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Boosting: XGBoost trained on the same split, evaluated with log-loss (fixed seed)
boosting_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
).fit(X_train, y_train)

## **Evaluasi Model**

In [None]:
def evaluate(model, name):
    """Report test-set metrics for a fitted classifier and draw its ROC curve.

    Predictions are made on the module-level ``X_test`` and scored against
    the module-level ``y_test``. Accuracy, precision, recall, F1 and ROC AUC
    are printed with two decimals, and the ROC curve is added to the current
    matplotlib figure. This function does not add a legend or show the
    figure; that is left to the caller.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        name: Label used in the printed header and as the curve's plot label.

    Returns:
        None.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the ranking score; presumably the
    # positive class of a binary target -- confirm against the dataset.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Label-based metrics, evaluated in the listed order.
    acc, prec, rec, f1 = (
        metric(y_test, predicted_labels)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    # Score-based metric.
    auc = roc_auc_score(y_test, positive_scores)

    print(
        f"\n=== Evaluasi Model: {name} ===",
        f"Akurasi   : {acc:.2f}",
        f"Presisi   : {prec:.2f}",
        f"Recall    : {rec:.2f}",
        f"F1 Score  : {f1:.2f}",
        f"AUC Score : {auc:.2f}",
        sep="\n",
    )

    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, positive_scores)
    plt.plot(false_pos_rate, true_pos_rate, label=f'{name} (AUC={auc:.2f})')

# Evaluate both models
for fitted_model, model_label in (
    (bagging_model, "Random Forest (Bagging)"),
    (boosting_model, "XGBoost (Boosting)"),
):
    evaluate(fitted_model, model_label)

# Finish the ROC figure on the current axes
roc_axes = plt.gca()
roc_axes.plot([0, 1], [0, 1], 'k--')  # dashed black diagonal as a reference line
roc_axes.set_xlabel("False Positive Rate")
roc_axes.set_ylabel("True Positive Rate")
roc_axes.set_title("ROC Curve")
roc_axes.legend()
roc_axes.grid(True)
plt.show()

# **Persamaan Matematika**

### **1. Accuracy**
Accuracy mengukur proporsi jumlah prediksi yang benar dibandingkan total data.  
Persamaan matematikanya adalah:

$$
\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}
$$

Di mana:
- $TP$ = True Positive  
- $TN$ = True Negative  
- $FP$ = False Positive  
- $FN$ = False Negative

---

### **2. Precision**
Precision mengukur ketepatan model dalam memprediksi kelas positif.  
Persamaan matematikanya adalah:

$$
\text{Precision} = \frac{TP}{TP + FP}
$$

---

### **3. Recall (Sensitivity)**
Recall mengukur seberapa banyak dari total positif yang berhasil diprediksi dengan benar.  
Persamaan matematikanya adalah:

$$
\text{Recall} = \frac{TP}{TP + FN}
$$

---

### **4. F1-Score**
F1-Score adalah rata-rata harmonik dari Precision dan Recall, cocok saat data tidak seimbang.  
Persamaan matematikanya adalah:

$$
\text{F1-Score} = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
$$

---

### **5. AUC - Area Under Curve**
AUC mengukur luas di bawah kurva ROC. Nilai AUC mendekati 1 menunjukkan performa yang sangat baik.  

$$
\text{AUC} = \int_{0}^{1} \text{TPR}\; d(\text{FPR})
$$

Dengan definisi:
- $\text{TPR} = \frac{TP}{TP + FN}$
- $\text{FPR} = \frac{FP}{FP + TN}$