# In [None]:
"""
STRATEGY that differs from the previous attempts:
add much more feature engineering, then filter the features with RFE (Recursive Feature Elimination).
"""

# ============================================================================
# 1. IMPORTS
# ============================================================================

import sys
import os
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import BorderlineSMOTE 

from models.SVM import MulticlassSVM

# Install a blanket "ignore" filter: no warning of any category is shown
# from this point on.
warnings.filterwarnings('ignore')
# Startup message printed once the imports above have completed.
print("Libraries imported. One-Hot Strategy Ready.")

# ============================================================================
# FEATURE ENGINEERING
# ============================================================================

class FeatureEngineer:
    """Add academic, financial and interaction features to a student table."""

    def transform(self, df):
        """Return a copy of ``df`` with ten engineered columns appended.

        The input frame is left untouched. The new columns, in order, are:
        ``Grade_Mean``, ``Grade_Trend``, ``Approval_Rate_S1``,
        ``Approval_Rate_S2``, ``Approval_Rate_Mean``, ``Failures_Total``,
        ``Eval_Participation``, ``Financial_Risk``, ``Age_Risk`` and
        ``Fin_Performance``.

        Args:
            df: DataFrame holding the raw per-semester curricular columns plus
                'Scholarship holder', 'Debtor', 'Tuition fees up to date' and
                'Age at enrollment'.

        Returns:
            A new DataFrame with the original columns followed by the
            engineered ones.
        """
        out = df.copy()

        # Raw per-semester inputs, looked up once.
        grade_s1 = out['Curricular units 1st sem (grade)']
        grade_s2 = out['Curricular units 2nd sem (grade)']
        approved_s1 = out['Curricular units 1st sem (approved)']
        approved_s2 = out['Curricular units 2nd sem (approved)']
        enrolled_s1 = out['Curricular units 1st sem (enrolled)']
        enrolled_s2 = out['Curricular units 2nd sem (enrolled)']
        evals_s1 = out['Curricular units 1st sem (evaluations)']
        evals_s2 = out['Curricular units 2nd sem (evaluations)']

        # --- 1. Academic metrics -------------------------------------------
        out['Grade_Mean'] = (grade_s1 + grade_s2) / 2
        out['Grade_Trend'] = grade_s2 - grade_s1

        # Approval rates; the 1e-9 term keeps the division defined when a
        # student is enrolled in zero units.
        rate_s1 = approved_s1 / (enrolled_s1 + 1e-9)
        rate_s2 = approved_s2 / (enrolled_s2 + 1e-9)
        rate_mean = (rate_s1 + rate_s2) / 2
        out['Approval_Rate_S1'] = rate_s1
        out['Approval_Rate_S2'] = rate_s2
        out['Approval_Rate_Mean'] = rate_mean

        # Risk indicator: units enrolled but not approved, summed over both
        # semesters.
        out['Failures_Total'] = (enrolled_s1 - approved_s1) + (enrolled_s2 - approved_s2)

        # Participation: evaluations taken per enrolled unit.
        out['Eval_Participation'] = (evals_s1 + evals_s2) / (enrolled_s1 + enrolled_s2 + 1e-9)

        # --- 2. Financial & demographic --------------------------------------
        # Financial pressure score. Weights: no scholarship = 2, debtor = 3,
        # tuition fees not up to date = 4.
        no_scholarship = (~out['Scholarship holder'].astype(bool)).astype(int)
        fees_overdue = (~out['Tuition fees up to date'].astype(bool)).astype(int)
        financial_risk = no_scholarship * 2 + out['Debtor'] * 3 + fees_overdue * 4
        out['Financial_Risk'] = financial_risk

        # Simple age flag: 1 when the student enrolled older than 25.
        out['Age_Risk'] = (out['Age at enrollment'] > 25).astype(int)

        # --- 3. Interaction feature ------------------------------------------
        # Financial risk scaled by the share of units NOT approved.
        out['Fin_Performance'] = financial_risk * (1 - rate_mean)

        return out

# Read the raw train/test tables and keep the test identifiers for the
# submission file.
print("\nLoading & Engineering Data...")
df_train = pd.read_csv('dataset/train.csv')
df_test = pd.read_csv('dataset/test.csv')
test_ids = df_test['Student_ID']

# Split label from predictors; the identifier column is not a feature.
y_train_raw = df_train['Target']
X_train_raw = df_train.drop(columns=['Target', 'Student_ID'])
X_test_raw = df_test.drop(columns='Student_ID')

# Median imputation: medians come from the training predictors only and are
# applied to both the training and the test table.
num_cols = X_train_raw.select_dtypes(include=np.number).columns
train_medians = {name: X_train_raw[name].median() for name in num_cols}
for col, med in train_medians.items():
    X_train_raw[col] = X_train_raw[col].fillna(med)
    X_test_raw[col] = X_test_raw[col].fillna(med)

# Derive the engineered columns for both tables with the same transformer.
fe = FeatureEngineer()
X_train_eng, X_test_eng = fe.transform(X_train_raw), fe.transform(X_test_raw)

# Map the string labels to integer class ids; `le` is reused later to map
# predictions back to label strings.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train_raw)

print(f"Total Fitur Awal: {X_train_eng.shape[1]}")

def reduce_cardinality(df_train, df_test, col, top_k=30):
    """Keep the ``top_k`` most frequent levels of ``col``; relabel the rest 'Other'.

    The frequent levels are determined from ``df_train`` only. Kept values are
    converted to their string form, every other value (in either frame)
    becomes the literal ``'Other'``. Both frames are modified in place and
    returned as ``(df_train, df_test)``.
    """
    # Must be computed before the training column is overwritten below.
    kept_levels = df_train[col].value_counts().nlargest(top_k).index

    def _relabel(value):
        if value in kept_levels:
            return str(value)
        return 'Other'

    for frame in (df_train, df_test):
        frame[col] = frame[col].apply(_relabel)

    return df_train, df_test

# Categorical columns whose rare levels are collapsed to 'Other' before
# one-hot encoding.
target_cats = ['Course', 'Application mode', "Mother's occupation", "Father's occupation"]

# Number of most frequent levels kept per column; used for both the progress
# message and the reduction call so the two cannot drift apart.
top_k_levels = 30

print(f"\nMereduksi kardinalitas kategori (Top {top_k_levels})...")
for col in target_cats:
    X_train_eng, X_test_eng = reduce_cardinality(X_train_eng, X_test_eng, col, top_k=top_k_levels)

# Every object/category column plus the columns in target_cats is one-hot
# encoded. dict.fromkeys() removes duplicates while keeping first-seen order;
# going through a set instead would yield an order that changes between
# interpreter runs (string hash randomisation) and therefore a different
# column order in the encoded matrix on every run.
cat_features = X_train_eng.select_dtypes(include=['object', 'category']).columns.tolist()
cat_features = list(dict.fromkeys(cat_features + target_cats))

# Everything that is not categorical is treated as numeric.
num_features = [c for c in X_train_eng.columns if c not in cat_features]

print(f"Numeric Features: {len(num_features)}")
print(f"Categorical Features: {len(cat_features)}")

# --- PIPELINE: scaling for numeric columns, one-hot for categorical ones ---
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        # handle_unknown='ignore': levels unseen during fit encode as all zeros
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

print("\nTransforming Data to One-Hot Representation...")
# Fit on the training table only, then apply the fitted transform to test.
X_train_prep = preprocessor.fit_transform(X_train_eng)
X_test_prep = preprocessor.transform(X_test_eng)

print(f"Dimensi Data One-Hot: {X_train_prep.shape}")

# ============================================================================
# 4. RFE & SMOTE HELPER
# ============================================================================

class RFESmoteRunner:
    """Feature selection with RFE followed by BorderlineSMOTE oversampling."""

    def __init__(self, n_features_to_select=40):
        """Build the selector and the oversampler.

        Args:
            n_features_to_select: number of features RFE should keep.
        """
        self.n_features = n_features_to_select

        # Logistic regression acts as the ranking estimator ("judge") for RFE.
        judge = LogisticRegression(max_iter=1000)
        self.selector = RFE(
            estimator=judge,
            n_features_to_select=n_features_to_select,
            step=2,
        )
        self.smote = BorderlineSMOTE(random_state=42, k_neighbors=3)

    def fit_resample(self, X, y):
        """Fit RFE on ``(X, y)``, then oversample the selected features.

        Returns:
            ``(X_resampled, y_resampled)`` as produced by the oversampler.
        """
        # Step 1: fit the selector and keep only the chosen columns.
        print(f"   Running RFE (Selecting Top {self.n_features})...", end="")
        selected = self.selector.fit_transform(X, y)
        print(" Done.")

        # Step 2: oversample on the reduced feature set.
        resampled_X, resampled_y = self.smote.fit_resample(selected, y)
        return resampled_X, resampled_y

    def transform(self, X):
        """Apply the already-fitted selector to ``X`` (no resampling)."""
        return self.selector.transform(X)

# ============================================================================
# 5. TUNING LOOP
# ============================================================================

banner = "=" * 50
print("\n" + banner)
print("TUNING SVM (Pada 40 Fitur Terbaik)")
print(banner)

# Select the 40 best columns out of the one-hot feature matrix
# (open question from the author: could this step be the bottleneck?).
prep_rfe = RFESmoteRunner(n_features_to_select=40)

# Stratified 80/20 hold-out split for validation.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_prep,
    y_train_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_train_enc,
)

# RFE + SMOTE are fitted once on the training part and reused for every
# configuration below; the validation part is only column-selected.
X_tr_ready, y_tr_ready = prep_rfe.fit_resample(X_tr, y_tr)
X_val_ready = prep_rfe.transform(X_val)

# Candidate (C, gamma) pairs for the RBF kernel.
configs = [
    {'C': c_value, 'gamma': gamma_value}
    for c_value, gamma_value in (
        (1.0, 0.01),
        (3.0, 0.01),
        (5.0, 0.01),
        (10.0, 0.005),
        (10.0, 0.01),  # the previous high score
    )
]

best_score, best_config = 0, {}

for conf in configs:
    print(f"Testing SVM: {conf} ... ", end="")

    # Fit on the resampled training part.
    model = MulticlassSVM(
        C=conf['C'],
        kernel='rbf',
        kernel_param=conf['gamma'],
        max_iter=500,
    )
    model.fit(X_tr_ready, y_tr_ready)

    # Score on the untouched validation part with macro-averaged F1.
    preds = model.predict(X_val_ready)
    score = f1_score(y_val, preds, average='macro')

    print(f"F1: {score:.4f}")

    # Strict comparison: on ties the earlier configuration is kept.
    if score > best_score:
        best_score, best_config = score, conf

print(f"\nKONFIGURASI JUARA: {best_config}")
print(f"Best Validation Score: {best_score:.4f}")

# ============================================================================
# 6. FINAL TRAINING & SUBMISSION
# ============================================================================

print("\n" + "="*50)
print("TRAINING FINAL MODEL (FULL DATA)")
print("="*50)

# Train with the configuration that won the validation sweep instead of
# discarding it; fall back to the earlier defaults (C=10.0, gamma=0.01) when
# the sweep left best_config empty (no configuration scored above 0).
final_C = best_config.get('C', 10.0)
final_G = best_config.get('gamma', 0.01)

# One value drives both the RFE feature budget and the submission file name.
n_final_features = 40

# 1. Refit RFE + SMOTE on the FULL training data; the test matrix is only
#    column-selected, never resampled.
final_rfe = RFESmoteRunner(n_features_to_select=n_final_features)
print("Processing Full Dataset (RFE + SMOTE)...")
X_train_final, y_train_final = final_rfe.fit_resample(X_train_prep, y_train_enc)
X_test_final = final_rfe.transform(X_test_prep)

# 2. Train the final model (higher iteration cap than during tuning).
print(f"Training SVM (C={final_C}, Gamma={final_G})...")
final_model = MulticlassSVM(C=final_C, kernel='rbf', kernel_param=final_G, max_iter=2000)
final_model.fit(X_train_final, y_train_final)

# 3. Predict and map the integer class ids back to the original label strings.
print("Predicting test set...")
y_test_pred = final_model.predict(X_test_final)
# np.asarray keeps the cast working whether predict() returns an array or a list.
y_test_str = le.inverse_transform(np.asarray(y_test_pred).astype(int))

# 4. Write the submission file.
filename = f'submission_OneHot_RFE_{n_final_features}feats.csv'
df_sub = pd.DataFrame({'Student_ID': test_ids, 'Target': y_test_str})
df_sub.to_csv(filename, index=False)

print(f"Submission Ready: {filename}")
print("\nDistribusi Prediksi:")
print(df_sub['Target'].value_counts())