Draft Notebook buat ngulik SVM (rbf) -> ini yg sementara high score (0.68659)

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE

from models.SVM import MulticlassSVM 

In [2]:
# Load Dataset
df_train = pd.read_csv('dataset/train.csv')
df_test_kaggle = pd.read_csv('dataset/test.csv')
test_ids = df_test_kaggle['Student_ID']

# Drop Fitur 'Sampah' (ID & Makro Ekonomi)
noise_cols = ['Unemployment rate', 'Inflation rate', 'GDP']
drop_cols = ['Target', 'Student_ID'] + noise_cols

X = df_train.drop(drop_cols, axis=1, errors='ignore')
y = df_train['Target']

# Bersihkan data test juga
X_test_kaggle_clean = df_test_kaggle.drop(['Student_ID'] + noise_cols, axis=1, errors='ignore')

# Imputasi Median (Standar Wajib)
num_cols = X.select_dtypes(include=np.number).columns
for col in num_cols:
    median_val = X[col].median()
    X[col] = X[col].fillna(median_val)
    X_test_kaggle_clean[col] = X_test_kaggle_clean[col].fillna(median_val)

print(f"Sisa Kolom Fitur: {X.shape[1]}")

Sisa Kolom Fitur: 33


In [None]:
# Note: ini masih simple preprocessing, bisa dicoba-coba teknik lainnya buat optimize (feature engineering, encoding, scaling, dll).

# --- ENGINEERING ---
def add_academic_ratios(df):
    data = df.copy()
    # Rasio Kelulusan Smt 1 & 2
    data['Pass_Rate_1'] = data['Curricular units 1st sem (approved)'] / (data['Curricular units 1st sem (enrolled)'] + 1e-9)
    data['Pass_Rate_2'] = data['Curricular units 2nd sem (approved)'] / (data['Curricular units 2nd sem (enrolled)'] + 1e-9)
    # Trend Nilai (Naik/Turun)
    data['Grade_Diff'] = data['Curricular units 2nd sem (grade)'] - data['Curricular units 1st sem (grade)']
    return data

X_eng = add_academic_ratios(X)
X_test_eng = add_academic_ratios(X_test_kaggle_clean)

# --- MANUAL TARGET ENCODING ---
# Mengubah Kategori -> Skor Probabilitas (0-2)
# Mencegah dimensi meledak akibat One-Hot Encoding

# 1. Identifikasi Kolom Kategorikal
potential_cats = ['Marital status', 'Application mode', 'Course', 'Previous qualification', 'Nacionality', 
                  "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation",
                  'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 
                  'Scholarship holder', 'International', 'Daytime/evening attendance\t']
cat_cols = [c for c in potential_cats if c in X_eng.columns]

# 2. Encode Target ke Angka dulu (0,1,2)
le = LabelEncoder()
y_num = le.fit_transform(y) 

# 3. Encoding
# Kita pakai seluruh data train untuk mapping (karena ini preprocessing statis)
# Tapi idealnya di dalam CV. Untuk simplifikasi kode from scratch, kita pakai Global Mean smoothing.
# TODO: belum coba implementasi K-Fold Target Encoding untuk hasil lebih baik.

target_mappings = {}
global_mean = y_num.mean()

for col in cat_cols:
    # Buat mapping: Kategori -> Rata-rata Target
    # Contoh: Jurusan A rata-rata targetnya 1.8 (hampir semua lulus), Jurusan B 0.5 (banyak dropout)
    mapping = pd.DataFrame({'feature': X_eng[col], 'target': y_num}).groupby('feature')['target'].mean()
    target_mappings[col] = mapping
    
    # Apply Mapping
    X_eng[col] = X_eng[col].map(mapping).fillna(global_mean) # Kalau ada kategori baru, isi rata-rata global
    X_test_eng[col] = X_test_eng[col].map(mapping).fillna(global_mean)

print(f"Dimensi Akhir: {X_eng.shape[1]} Fitur (Sangat Ringkas & Padat!)")

Dimensi Akhir: 36 Fitur (Sangat Ringkas & Padat!)


In [4]:
# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_eng)
X_test_final = scaler.transform(X_test_eng)

# SMOTE (k=10)
print("Sedang menyeimbangkan data dengan SMOTE...")
smote = SMOTE(random_state=42, k_neighbors=10)
X_train_bal, y_train_bal = smote.fit_resample(X_scaled, y_num)

print(f"Shape: {X_train_bal.shape}")

Sedang menyeimbangkan data dengan SMOTE...
Shape: (4638, 36)


In [None]:
from sklearn.svm import SVC

X_tr, X_val, y_tr, y_val = train_test_split(X_train_bal, y_train_bal, test_size=0.2, random_state=42, stratify=y_train_bal)

print("--- UJI COBA MODEL ---")

print("\nTraining SVM sklearn...")
clf_sklearn = SVC()
clf_sklearn.fit(X_tr, y_tr)
pred_sklearn = clf_sklearn.predict(X_val)
print(f"SVM sklearn F1-Macro: {f1_score(y_val, pred_sklearn, average='macro'):.4f}")

print("\nTraining SVM RBF...")
svm = MulticlassSVM(C=10.0, kernel='rbf', kernel_param=0.01, max_iter=2000)
svm.fit(X_tr, y_tr)
pred_svm = svm.predict(X_val)
print(f"SVM F1-Macro: {f1_score(y_val, pred_svm, average='macro'):.4f}")

# TODO: perlu tuning hyperparameter lebih lanjut dan banding model lain
# bisa jadi ini cuma karena setting tertentu cocok di validation split ini, tapi buat sementara cukup

--- UJI COBA MODEL ---

Training SVM sklearn...
SVM sklearn F1-Macro: 0.7702

Training SVM RBF...
SVM F1-Macro: 0.7869


In [7]:
best_model = svm # bisa diganti sesuai hasil uji coba model
best_model.fit(X_train_bal, y_train_bal)

print("Predicting Test Set...")
y_pred_final = best_model.predict(X_test_final)
y_pred_str = le.inverse_transform(y_pred_final.astype(int))

# Save
sub = pd.DataFrame({'Student_ID': test_ids, 'Target': y_pred_str})
sub.to_csv('submission-2.1.csv', index=False)
print("Submission Ready: submission-2.1.csv")
print("Distribusi Prediksi:")
print(sub['Target'].value_counts())

Predicting Test Set...
Submission Ready: submission-2.1.csv
Distribusi Prediksi:
Target
Graduate    627
Dropout     393
Enrolled    308
Name: count, dtype: int64
