# Student Performance Analysis
### Professional Machine Learning Pipeline with XAI and Cross-Validation

This notebook provides a comprehensive analysis of student performance using the UCI Student Performance dataset. It implements:
1. **Advanced Feature Engineering**
2. **Imbalance Handling (SMOTE)**
3. **Model Comparison (LR, RF, XGB, SVM, k-NN, LGBM)**
4. **Stratified 5-Fold Cross-Validation**
5. **Explainable AI (SHAP, LIME, PDP)**

In [None]:
import logging
import pandas as pd
import numpy as np
import os
import sys
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Add the project root (the parent of the working directory) to the import
# path so the `src` package can be imported.
# The membership check keeps the cell idempotent: re-running it does not
# keep appending duplicate entries to sys.path.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.config import Config
from src.data_loader import StudentDataLoader
from src.preprocessor import StudentPreprocessor
from src.models import ModelEvaluator
from src.visualizer import Visualizer

# Logging setup: create this notebook's named logger, then request INFO as
# the level for the root logging configuration.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


## 1. Data Loading and Merging

In [None]:
# Build the project's StudentDataLoader with its default arguments and call
# its `load()` method; the result is kept as `raw_df` for the later cells.
# NOTE(review): presumably a pandas DataFrame (it is used via `.shape` and
# `.head()`) — confirm in src.data_loader.
loader = StudentDataLoader()
raw_df = loader.load()
# Quick sanity check: print the dimensions reported by `.shape`.
print(f"Initial Dataset Shape: {raw_df.shape}")
# Last expression of the cell, so the notebook renders the `.head()` preview.
raw_df.head()

## 2. Preprocessing & Feature Engineering

In [None]:
# Build the project's StudentPreprocessor with its default arguments.
preprocessor = StudentPreprocessor()
# `transform` returns a pair that is unpacked into `X` and `y`.
# NOTE(review): presumably the feature frame and the target labels — `X` is
# later used via `.columns`, and `y` is passed as the stratification target.
# Confirm the exact contents in src.preprocessor.
X, y = preprocessor.transform(raw_df)
# Quick sanity check: print the dimensions reported by `.shape`.
print(f"Features Shape: {X.shape}")
# Last expression of the cell, so the notebook renders the `.head()` preview.
X.head()

## 3. Train/Test Split, Imbalance Handling & Feature Scaling

In [None]:
# Hold out a test partition, stratified on the target, using the split size
# and seed defined in the project Config.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=Config.TEST_SIZE,
    random_state=Config.RANDOM_STATE,
)

# Oversample the training partition only; the test partition is left as-is.
# NOTE(review): SMOTE runs here before any scaling is applied — confirm that
# resampling on unscaled features is intended.
smote = SMOTE(random_state=Config.RANDOM_STATE)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training data, then apply that single
# fitted transform to both partitions.
# NOTE(review): the frames below are rebuilt with column labels only, so they
# presumably get a fresh default row index rather than X_test's row labels;
# compare them with y_test positionally, not by index label.
scaler = StandardScaler().fit(X_train_res)
X_train_scaled = pd.DataFrame(scaler.transform(X_train_res), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

## 4. Model Evaluation & Cross-Validation

In [None]:
# Candidate classifiers keyed by display name. The pairs are listed in the
# order they should appear in the registry; every estimator that accepts a
# seed receives Config.RANDOM_STATE.
models = dict([
    (
        'Logistic Regression',
        LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            random_state=Config.RANDOM_STATE,
        ),
    ),
    (
        'Random Forest',
        RandomForestClassifier(
            n_estimators=200,
            class_weight='balanced',
            random_state=Config.RANDOM_STATE,
        ),
    ),
    (
        'XGBoost',
        XGBClassifier(
            eval_metric='logloss',
            random_state=Config.RANDOM_STATE,
        ),
    ),
    (
        'SVM',
        SVC(
            probability=True,
            class_weight='balanced',
            random_state=Config.RANDOM_STATE,
        ),
    ),
    (
        'k-NN',
        KNeighborsClassifier(n_neighbors=5),
    ),
    (
        'LightGBM',
        LGBMClassifier(
            random_state=Config.RANDOM_STATE,
            verbose=-1,
        ),
    ),
])

# Hand the registry to the project evaluator together with the scaled,
# resampled training data and the scaled test data.
# NOTE(review): the training inputs passed here are already SMOTE-resampled;
# if run_eval cross-validates on them, synthetic rows may sit in validation
# folds — confirm in src.models.
evaluator = ModelEvaluator(models)
metrics = evaluator.run_eval(X_train_scaled, X_test_scaled, y_train_res, y_test)

# Display results: one row per model, restricted to the headline metrics.
results_df = pd.DataFrame(metrics).transpose()
results_df.loc[:, ['accuracy', 'cv_accuracy_mean', 'precision', 'recall', 'f1', 'roc_auc']]

## 5. Visualizations & Model Interpretation

In [None]:
Visualizer.setup_style()

# Pick the model whose metrics entry has the highest 'f1' value.
# NOTE(review): run_eval receives the test split, so 'f1' is presumably a
# test-set score; selecting on it makes the reported numbers optimistic.
# Consider selecting on 'cv_accuracy_mean' instead — confirm in src.models.
best_model_name = max(metrics, key=lambda x: metrics[x]['f1'])
print(f"Best Model: {best_model_name}")

model = metrics[best_model_name]['model']
y_pred = model.predict(X_test_scaled)

# Continuous scores for the ROC curve, in order of preference:
#   1. positive-class probability (column 1 of predict_proba),
#   2. decision_function margins, which are also valid ranking scores,
#   3. hard labels as a last resort (this yields a single-point ROC curve).
if hasattr(model, 'predict_proba'):
    y_probs = model.predict_proba(X_test_scaled)[:, 1]
elif hasattr(model, 'decision_function'):
    y_probs = model.decision_function(X_test_scaled)
else:
    y_probs = y_pred

# 1. Confusion Matrix
Visualizer.save_confusion_matrix(y_test, y_pred, best_model_name)

# 2. ROC Curve
Visualizer.save_roc_curve(y_test, y_probs, best_model_name)

# 3. Feature Importance
Visualizer.save_feature_importance(model, list(X.columns), best_model_name)

# 4. SHAP Global Plot
Visualizer.save_shap_plots(model, X_test_scaled, list(X.columns))

# 5. LIME Local Plot
Visualizer.save_lime_explanation(model, X_train_scaled, X_test_scaled, list(X.columns))