In [None]:
# --- CELL 1: HEADER & INSTALL (opsional) ---
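# Run this only if a package is missing (the Colab runtime usually provides them already).
# NOTE(review): imbalanced-learn is not imported anywhere in the visible code - confirm it is still needed.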
!pip install -q imbalanced-learn

# --- CELL 2: IMPORT LIBRARIES ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import seaborn as sns
import joblib
from IPython.display import display, Markdown

# Fixed seed for reproducibility: seeds NumPy's global RNG here and is the
# value passed as random_state wherever the notebook uses RANDOM_STATE.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# --- CELL 3: UPLOAD DATASET (Colab-friendly) ---
from google.colab import files
print("Silakan upload CSV dataset (misal: student_stress.csv). Jika sudah ter-upload, masukkan nama filenya pada variable DATA_PATH.")
uploaded = files.upload()  # interactive: opens the upload dialog

# `uploaded` is a mapping keyed by file name. It can be empty (e.g. when the
# dialog is dismissed without choosing a file), so fail with a clear message
# instead of the bare IndexError that indexing an empty list would raise.
if not uploaded:
    raise FileNotFoundError(
        "Tidak ada file yang diupload. Jalankan ulang cell ini dan pilih file CSV."
    )

# If the dataset was placed on Drive manually, assign its path to DATA_PATH instead.
# Files sent through the dialog end up in the working directory.
DATA_PATH = next(iter(uploaded))  # first uploaded file
print("Menggunakan file:", DATA_PATH)

# --- CELL 4: LOAD DATA ---
# Read the CSV at DATA_PATH, preview its first rows and report its dimensions.
df = pd.read_csv(DATA_PATH)
display(df.head())
print(f"\nUkuran dataset (baris, kolom): {df.shape}")

# --- CELL 5: DATA OVERVIEW (formal academic) ---
display(Markdown("### 1. Descriptive Overview"))

# Descriptive statistics of the numeric columns, transposed to one row per column
print("Ringkasan statistik numerik:")
numeric_summary = df.describe(include=[np.number])
display(numeric_summary.T)

# For every text/categorical column: number of distinct values and up to five examples
print("\nRingkasan kolom non-numerik / unik:")
for col, values in df.select_dtypes(include=['object','category']).items():
    print(f"- {col}: {values.nunique()} unique values; contoh: {values.unique()[:5]}")

# Count of missing entries in each column
print("\nMissing values per kolom:")
display(df.isnull().sum())

# --- CELL 6: PREPROCESSING (missing, encoding, feature selection) ---
# The target column is auto-detected when its name is one of POSSIBLE_TARGETS.
# Otherwise the user is prompted for the name, which is validated right away
# so a typo fails here with the list of available columns rather than as an
# unexplained KeyError further down.
POSSIBLE_TARGETS = ['stress_level','stress','Stress','Stress_Level','stress_level_label']
target_candidates = [c for c in df.columns if c in POSSIBLE_TARGETS]
if not target_candidates:
    print("Peringatan: tidak menemukan kolom target otomatis. Silakan set TARGET_NAME sesuai dataset.")
    TARGET_NAME = input("Masukkan nama kolom target (mis. stress_level): ").strip()
    if TARGET_NAME not in df.columns:
        raise KeyError(
            f"Kolom target '{TARGET_NAME}' tidak ditemukan. Kolom tersedia: {list(df.columns)}"
        )
else:
    # Several matches are possible; the first one in column order wins.
    TARGET_NAME = target_candidates[0]
    print("Mendeteksi kolom target:", TARGET_NAME)

# Separate features and target. Rows without a target value carry no label to
# learn from, and a NaN among text labels makes sorted() fail, so drop them first.
labeled = df[TARGET_NAME].notna()
if not labeled.all():
    print(f"Menghapus {int((~labeled).sum())} baris dengan target kosong.")
y = df.loc[labeled, TARGET_NAME].copy()
X = df.loc[labeled].drop(columns=[TARGET_NAME])

# Text targets (e.g. Low/Medium/High) are mapped to integer codes assigned in
# sorted label order. Besides object/category this also recognises pandas
# string dtypes, which a plain `dtype == 'object'` comparison would miss.
target_is_text = (
    pd.api.types.is_object_dtype(y)
    or pd.api.types.is_string_dtype(y)
    or isinstance(y.dtype, pd.CategoricalDtype)
)
if target_is_text:
    le_mapping = {k:i for i,k in enumerate(sorted(y.unique()))}
    print("Mapping target label:", le_mapping)
    # astype(int): mapping a categorical Series would otherwise stay categorical.
    y = y.map(le_mapping).astype(int)

# Select numeric and categorical feature columns
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
print("\nNumerical cols:", num_cols)
print("Categorical cols:", cat_cols)

# --- CELL 7: TRAIN-TEST SPLIT ---
# The split happens BEFORE any preprocessor is fitted: imputation values,
# category codes and scaling statistics must be learned from the training rows
# only, otherwise information about the test rows leaks into the model inputs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

# Preprocessors: median / most-frequent imputation, ordinal codes for
# categorical columns, standardisation for numeric columns.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
# A category that never occurs in the training rows is encoded as -1 instead of
# raising at transform time.
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
scaler = StandardScaler()


def _preprocess(frame, fit=False):
    """Return a preprocessed copy of ``frame``.

    Numeric columns are imputed and then scaled; categorical columns are
    imputed and then ordinal-encoded. With ``fit=True`` the transformers are
    fitted on ``frame`` (training rows only); otherwise the already fitted
    transformers are applied unchanged.
    """
    out = frame.copy()
    steps_by_cols = (
        (num_cols, (num_imputer, scaler)),
        (cat_cols, (cat_imputer, enc)),
    )
    for cols, steps in steps_by_cols:
        if not cols:
            continue
        for step in steps:
            # Keep passing DataFrames so the transformers see the column names.
            out[cols] = step.fit_transform(out[cols]) if fit else step.transform(out[cols])
    return out


X_train = _preprocess(X_train, fit=True)
X_test = _preprocess(X_test)
# Full feature table transformed with the train-fitted preprocessors; kept so
# later cells can still read X.columns / X.shape and for the preview below.
X = _preprocess(X)

print("\nContoh data setelah preprocessing:")
display(X.head())

# --- CELL 8: MODEL - Logistic Regression (simple) ---
# class_weight='balanced' is used to compensate for class imbalance.
# Solver choice: liblinear is kept for binary targets. For three or more
# classes recent scikit-learn releases deprecate liblinear (announced to become
# an error), so lbfgs is used there; it fits a multinomial model and still
# exposes one coefficient row per class.
solver = 'liblinear' if y_train.nunique() <= 2 else 'lbfgs'
model = LogisticRegression(max_iter=2000, solver=solver, class_weight='balanced', random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Stratified 5-fold cross-validation on the training split only.
# cross_val_score refits clones of `model`, so the fitted model above is untouched.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
print("CV accuracy scores:", cv_scores)
print("CV accuracy mean: {:.4f}".format(cv_scores.mean()))

# --- CELL 9: EVALUATION ON TEST SET ---
y_pred = model.predict(X_test)

# Keep the probability of the second class only when the model is binary;
# y_proba doubles as the "is binary" flag for the ROC steps.
if len(model.classes_) == 2:
    y_proba = model.predict_proba(X_test)[:, 1]
else:
    y_proba = None

acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {acc:.4f}")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
display(cm)

# ROC-AUC is reported for the binary case only
if y_proba is not None:
    auc = roc_auc_score(y_test, y_proba)
    print(f"\nROC AUC: {auc:.4f}")

# --- CELL 10: VISUALIZATIONS (confusion matrix + ROC + feature importance) ---
# Confusion matrix drawn as an annotated heatmap (rows = actual, columns = predicted)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# ROC curve with the chance diagonal; only available for binary problems
if y_proba is not None:
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
    ax.plot([0, 1], [0, 1], '--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    ax.legend()
    plt.show()

# Feature importance for Logistic Regression: magnitude of the coefficients.
coef = model.coef_
# One score per feature: absolute coefficients averaged over the coefficient
# rows. With a single row (or a 1-D array) the average is just |coef|.
feat_imp = np.abs(np.atleast_2d(coef)).mean(axis=0)

feat_imp_series = pd.Series(feat_imp, index=X.columns).sort_values(ascending=False)
display(Markdown("### Feature importance (Logistic Regression coefficients - absolute)"))
display(feat_imp_series.head(20))

# Horizontal bar chart of the 15 largest scores
top_features = feat_imp_series.iloc[:15]
plt.figure(figsize=(8, 6))
sns.barplot(x=top_features.values, y=top_features.index)
plt.title('Top 15 Feature Importance (LR | abs(coef))')
plt.xlabel('Absolute Coefficient')
plt.show()

# --- CELL 11: SAVE MODEL & PREPROCESSORS ---
# Persist the model together with everything needed to preprocess new data and
# to interpret its predictions. The original keys are unchanged; the extra keys
# record which columns go through which transformer, the target column name,
# and the label -> code mapping of the target.
joblib.dump({
    'model': model,
    'scaler': scaler,
    'num_imputer': num_imputer,
    'cat_imputer': cat_imputer,
    'encoder': enc if len(cat_cols) > 0 else None,
    'feature_columns': X.columns.tolist(),
    'num_cols': num_cols,
    'cat_cols': cat_cols,
    'target_name': TARGET_NAME,
    # le_mapping only exists when the target was text; None otherwise.
    # NOTE(review): in a long-lived notebook session a mapping left over from
    # an earlier run would be picked up here - restart the runtime when switching datasets.
    'target_mapping': globals().get('le_mapping'),
}, 'student_stress_lr_pipeline.joblib')

print("Model dan pipeline disimpan ke 'student_stress_lr_pipeline.joblib'")

# --- CELL 12: PREDICT ON NEW SAMPLE (example) ---
# Demo: score the first row of the test split and compare with its true label.
# iloc[[0]] (double brackets) keeps the row as a one-row DataFrame for predict().
sample, sample_true = X_test.iloc[[0]], y_test.iloc[0]
pred = model.predict(sample)[0]

proba = None
if hasattr(model, "predict_proba"):
    proba = model.predict_proba(sample)

print("True label:", sample_true)
print("Predicted label:", pred)
if proba is not None:
    print("Predicted probabilities:", proba)

# --- CELL 13: NOTES FOR REPORT (formal academic) ---
# Renders a Markdown report skeleton filled with values computed earlier in the
# notebook. The positional {} placeholders in the template are filled, in
# order, with: the dataset file name, the row count of the raw DataFrame, the
# number of feature columns, the target column name, the mean CV accuracy, the
# test accuracy, and the five largest absolute coefficients as a dict.
# The template text is emitted verbatim, so any literal brace added to it
# would have to be doubled for str.format.
display(Markdown("## Catatan untuk Laporan (copy-paste bagian ini ke laporan formal)"))
display(Markdown(
"""
**Pendahuluan (singkat)**
- Latar belakang: stres di kalangan mahasiswa merupakan isu signifikan yang mempengaruhi performa akademik, serta kesehatan mental.
- Tujuan: memodelkan tingkat stres mahasiswa menggunakan fitur akademik dan non-akademik untuk membantu intervensi kampus.

**Dataset**
- Sumber: Kaggle / dataset lokal (file: `{}`)
- Jumlah sampel: {} baris, {} fitur (setelah preprocessing).
- Target: `{}` (dikode menjadi numerik untuk klasifikasi).

**Metodologi**
- Preprocessing: imputasi median/most frequent, encoding kategori (OrdinalEncoder), standardisasi (StandardScaler).
- Model: Logistic Regression (simpel, interpretabel) dengan class_weight='balanced'.
- Evaluasi: Accuracy, Precision, Recall, F1-score, Confusion Matrix, ROC-AUC (jika binary).

**Hasil Ringkas**
- CV accuracy (5-fold): mean {:.4f}
- Test accuracy: {:.4f}
- Insight utama: fitur paling berpengaruh (koefisien) = {}

**Kesimpulan & Saran**
- Model sederhana sudah memberi baseline; rekomendasi: evaluasi model lebih lanjut dengan data tambahan, eksperimen RandomForest/XGBoost untuk perbandingan, dan implementasi sistem peringatan dini di kampus.
""".format(DATA_PATH, df.shape[0], X.shape[1], TARGET_NAME, cv_scores.mean(), acc, feat_imp_series.head(5).to_dict())
))

# --- END OF NOTEBOOK ---


Silakan upload CSV dataset (misal: student_stress.csv). Jika sudah ter-upload, masukkan nama filenya pada variable DATA_PATH.


KeyboardInterrupt: 