# In [None]:
# ======================================================
# Student Academic Success Prediction
# ======================================================
# Goal: Predict final math grades (G3) for secondary students and
# understand how demographics, family context, and school-related factors
# contribute to academic performance.
# Key Tasks:
#   1. Explore and visualize the student performance data.
#   2. Handle missing values and encode categorical/ordinal variables.
#   3. Use statistical tests (correlation + ANOVA) to select relevant features.
#   4. Engineer an absences_sum feature and build pipelines with and without
#      prior term grades (G1, G2).
#   5. Train and compare multiple models (Linear, Lasso, SVR).
#   6. Fine-tune SVR using GridSearchCV.
#   7. Evaluate the best model on a held-out test set (RMSE, MAE, R²).
#   8. Derive an “at-risk student” classification view from the regression model.
# Tools: Python, pandas, numpy, matplotlib, seaborn, scikit-learn, joblib
# Dataset: UCI Machine Learning Repository (Student Performance dataset)

# -------------------- IMPORT LIBRARIES --------------------
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import f_oneway
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.svm import SVR
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    confusion_matrix,
    classification_report,
)
from pathlib import Path

# ---------- PROJECT SETUP ----------
# Every artifact is written beneath the directory the script is launched from.
project_dir = Path.cwd()

# One sub-folder per artifact type: plots, serialized models, tabular outputs.
fig_dir, models_dir, outputs_dir = (
    project_dir.joinpath(folder) for folder in ("figures", "models", "outputs")
)

# Create whichever folders are missing; existing ones are left untouched.
for d in (fig_dir, models_dir, outputs_dir):
    d.mkdir(exist_ok=True)

def save_plot(filename, width=8, height=5, dpi=300):
    """
    Resize the active Matplotlib figure and write it to figures/<filename>.

    width and height are given in inches and dpi is the output resolution;
    the image is saved with a tight bounding box.
    """
    active_fig = plt.gcf()
    active_fig.set_size_inches(width, height)

    destination = fig_dir / filename
    plt.savefig(destination, dpi=dpi, bbox_inches="tight")

def save_output(df, filename):
    """
    Write a pandas DataFrame to outputs/<filename> as CSV, without the index.

    Prints the destination and returns it as a path object.
    """
    target = outputs_dir.joinpath(filename)
    df.to_csv(target, index=False)
    print(f"Saved output: {target}")
    return target

# ---------- LABELS / UNITS FOR NUMERIC FEATURES ----------
# Human-readable axis labels keyed by numeric column name. The histogram loop
# looks labels up with .get(col, col), so a column absent from this dict is
# labelled with its raw name.
numeric_feature_labels = {
    "age": "Age (years)",
    "absences_G1": "Absences in Period 1",
    "absences_G2": "Absences in Period 2",
    "absences_G3": "Absences in Period 3",
    "G1": "Grade Period 1 (0–20)",
    "G2": "Grade Period 2 (0–20)",
    "G3": "Final Grade G3 (0–20)",
}

# -------------------- PLOT STYLE --------------------
# Global seaborn/matplotlib defaults: grid style plus font sizes for text,
# axis labels/titles, legends and tick labels.
sns.set(style="whitegrid")
plt.rc("font", size=14)
plt.rc("axes", labelsize=14, titlesize=14)
plt.rc("legend", fontsize=12)
plt.rc("xtick", labelsize=10)
plt.rc("ytick", labelsize=10)

# -------------------- LOAD DATA --------------------
# The CSV is expected directly inside project_dir.
data_path = project_dir / "student_academic_success_data.csv"

student_df = pd.read_csv(data_path)

# Console overview of the raw data: shape, first rows, dtypes/non-null counts
# and the number of missing values per column.
print("Dataset shape:", student_df.shape, "\n")
print("First 5 rows:\n", student_df.head(), "\n")
print("Info:\n")
student_df.info()
print("\nMissing values per column:\n", student_df.isna().sum(), "\n")

# Copy for EDA + imputation
# student_df itself is left unimputed; the modeling section builds X and y
# from it and imputes inside its own pipelines.
student_df_copy = student_df.copy()

# -------------------- COLUMN GROUPING --------------------
# Three column groups that are summarized, imputed and tested separately below.
# Note that the numeric group includes the target G3.
numeric_columns_full = ["age", "absences_G1", "absences_G2", "absences_G3", "G1", "G2", "G3"]

# Categorical features (imputed with the most frequent value below).
categorical_columns_full = [
    "school", "sex", "address", "famsize", "Pstatus",
    "Mjob", "Fjob", "reason", "guardian",
    "schoolsup", "famsup", "paid", "activities",
    "nursery", "higher", "internet", "romantic",
]

# Ordinal features (their levels are sorted before the ANOVA grouping).
ordinal_columns_full = [
    "Medu", "Fedu", "traveltime", "studytime", "failures",
    "famrel", "freetime", "goout", "Dalc", "Walc", "health",
]

# Impute missing values for EDA only
# These imputers are fit on (and applied to) the EDA copy alone. Because G3 is
# part of the numeric group, missing targets are replaced by the column mean
# in this copy.
num_imputer_eda = SimpleImputer(strategy="mean")
student_df_copy[numeric_columns_full] = num_imputer_eda.fit_transform(
    student_df_copy[numeric_columns_full]
)

# Categorical and ordinal gaps are filled with each column's most frequent value.
cat_imputer_eda = SimpleImputer(strategy="most_frequent")
student_df_copy[categorical_columns_full] = cat_imputer_eda.fit_transform(
    student_df_copy[categorical_columns_full]
)

ord_imputer_eda = SimpleImputer(strategy="most_frequent")
student_df_copy[ordinal_columns_full] = ord_imputer_eda.fit_transform(
    student_df_copy[ordinal_columns_full]
)

# Reports the per-column NaN counts that are left after the three imputations.
print("Remaining missing values after EDA imputations:\n",
      student_df_copy.isna().sum(), "\n")

# -------------------- EXPLORATORY DATA ANALYSIS (EDA) --------------------
# Numeric stats
# reset_index() moves the statistic names (count, mean, ...) into a regular
# column, because save_output writes the CSV without the index.
numeric_stats = student_df_copy[numeric_columns_full].describe()
print("\nNumeric summary statistics:\n", numeric_stats, "\n")
save_output(numeric_stats.reset_index(), "numeric_summary_stats.csv")

# ---- Histograms for numeric columns ----
# A 3x3 grid gives nine axes for the seven numeric columns.
fig, axes = plt.subplots(3, 3, figsize=(14, 9))
axes = axes.flatten()

# zip() stops after the last numeric column, leaving the spare axes untouched.
for ax, col in zip(axes, numeric_columns_full):
    ax.hist(student_df_copy[col], bins=20)
    ax.set_title(col)
    ax.set_xlabel(numeric_feature_labels.get(col, col))
    ax.set_ylabel("Frequency")

# Remove any unused subplots
for j in range(len(numeric_columns_full), len(axes)):
    fig.delaxes(axes[j])

fig.suptitle("Numeric Feature Distributions", y=1.02)
plt.tight_layout()
# Save before show().
save_plot("hist_numeric.png", width=14, height=9, dpi=300)
plt.show()

# Categorical stats
# describe(include="object") summarizes the object-dtype columns of the
# categorical group.
categorical_stats = student_df_copy[categorical_columns_full].describe(include="object")
print("\nCategorical summary statistics:\n", categorical_stats, "\n")
save_output(categorical_stats.reset_index(), "categorical_summary_stats.csv")

# Ordinal stats
ordinal_stats = student_df_copy[ordinal_columns_full].describe()
print("\nOrdinal summary statistics:\n", ordinal_stats, "\n")
save_output(ordinal_stats.reset_index(), "ordinal_summary_stats.csv")

# -------------------- CORRELATION ANALYSIS (NUMERIC) --------------------
# Pairwise correlations between the numeric columns of the imputed EDA copy
# (G3 included), followed by each column's correlation with G3, highest first.
corr_matrix = student_df_copy[numeric_columns_full].corr()
print("\nCorrelation matrix (numeric features):\n", corr_matrix, "\n")
print("\nCorrelation of numeric features with G3:\n",
      corr_matrix["G3"].sort_values(ascending=False), "\n")

# Annotated heatmap of the same matrix, values shown with two decimals.
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True)
plt.title("Correlation Heatmap — Numeric Features")
plt.tight_layout()
save_plot("correlation_heatmap_numeric.png", width=8, height=6, dpi=300)
plt.show()

# reset_index() keeps the row labels as a column in the CSV.
save_output(corr_matrix.reset_index(), "correlation_matrix_numeric.csv")

# -------------------- ANOVA FOR CATEGORICAL & ORDINAL FEATURES --------------------
# For every categorical/ordinal feature, run a one-way ANOVA of G3 across the
# feature's levels and keep the F-statistic and p-value per feature.
anova_results = {}

for col in categorical_columns_full + ordinal_columns_full:
    # Ordinal levels are sorted; categorical levels keep their order of appearance.
    if col in ordinal_columns_full:
        levels = sorted(student_df_copy[col].unique())
    else:
        levels = student_df_copy[col].unique()

    # One G3 sample per level, passed to f_oneway as separate groups.
    grouped = [student_df_copy[student_df_copy[col] == level]["G3"] for level in levels]
    f_stat, p_value = f_oneway(*grouped)
    anova_results[col] = {"F-statistic": f_stat, "p-value": p_value}

    # Plot only statistically significant features
    if p_value < 0.05:
        plt.figure(figsize=(8, 6))
        sns.boxplot(x=col, y="G3", data=student_df_copy, palette="viridis")
        plt.title(f"G3 by {col} (ANOVA Significant)")
        f_text = f"F-stat: {f_stat:.2f}"
        # Three decimals: a p-value below 0.0005 is displayed as 0.000.
        p_text = f"p-value: {p_value:.3f}"
        # Test statistics go in a semi-transparent box near the top centre of
        # the axes (coordinates are axes fractions via transAxes).
        plt.text(
            0.5,
            0.9,
            f_text + "\n" + p_text,
            ha="center",
            va="center",
            transform=plt.gca().transAxes,
            fontsize=12,
            color="black",
            bbox=dict(facecolor="white", alpha=0.7),
        )
        plt.xlabel(col)
        plt.ylabel("Final Grade (G3, 0–20)")
        plt.xticks(rotation=45)
        plt.tight_layout()
        # One PNG per significant feature; save_plot resizes the figure to 10x6.
        save_plot(f"boxplot_G3_by_{col}.png", width=10, height=6, dpi=300)
        plt.show()

# Only the first five entries are echoed to the console; the CSV has all of them.
print("\nANOVA results (F-statistic, p-value) — sample:\n")
for k, v in list(anova_results.items())[:5]:
    print(k, "->", v)

# One row per feature with columns: feature, F-statistic, p-value.
anova_df = (
    pd.DataFrame.from_dict(anova_results, orient="index")
    .reset_index()
    .rename(columns={"index": "feature"})
)
save_output(anova_df, "anova_results_G3.csv")

# -------------------- DEFINE FEATURES FOR MODELING --------------------
# Based on significance & domain logic
# Only these columns are routed into the preprocessing pipelines.
numeric_model_columns = ["age", "absences_G1", "absences_G2", "absences_G3", "G1", "G2"]
categorical_model_columns = ["sex", "address", "Mjob", "paid", "higher", "romantic"]
ordinal_model_columns = ["Medu", "Fedu", "failures", "goout"]

# Target and features
# The pipelines impute missing *features*, but nothing imputes the target: a
# NaN in G3 would be handed straight to the regressors. Rows without a target
# are therefore excluded before splitting (a no-op when G3 is complete).
modeling_df = student_df.dropna(subset=["G3"])
n_missing_target = len(student_df) - len(modeling_df)
if n_missing_target:
    print(f"Dropped {n_missing_target} rows with missing G3 before modeling.\n")

X = modeling_df.drop("G3", axis=1)
y = modeling_df["G3"].copy()

# Plain random 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\nX_train missing values:\n", X_train.isna().sum(), "\n")

# -------------------- CUSTOM TRANSFORMER - ABSENCE FEATURE ENGINEERING --------------------
class FinalProjectTransformer(BaseEstimator, TransformerMixin):
    """
    Collapse the three per-period absence counts into a single total.

    transform() appends absences_sum (absences_G1 + absences_G2 + absences_G3),
    removes the three source columns and, when drop_grades is True, also
    removes G1 and G2.
    """

    def __init__(self, drop_grades=False):
        self.drop_grades = drop_grades

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        # Work on a copy so the caller's frame is left untouched.
        result = X.copy()

        total_absences = (
            result["absences_G1"] + result["absences_G2"] + result["absences_G3"]
        )
        result["absences_sum"] = total_absences

        result = result.drop(columns=["absences_G1", "absences_G2", "absences_G3"])
        if self.drop_grades:
            result = result.drop(columns=["G1", "G2"])
        return result

# -------------------- PIPELINES & COLUMN TRANSFORMERS --------------------
# Imputer that returns DataFrame
# (FinalProjectTransformer selects columns by name, so the step feeding it
# must emit a DataFrame rather than a NumPy array.)
imputer = SimpleImputer(strategy="mean")
imputer.set_output(transform="pandas")

# Numeric pipeline WITH grades (G1, G2 kept)
# mean imputation -> absences_sum engineering -> standardization
numeric_pipeline_with_grades = Pipeline(
    steps=[
        ("imputer", imputer),
        ("feature_engineering", FinalProjectTransformer(drop_grades=False)),
        ("scaler", StandardScaler()),
    ]
)

# Numeric pipeline WITHOUT grades (G1, G2 removed)
# Same steps; the feature-engineering step additionally drops G1 and G2.
numeric_pipeline_without_grades = Pipeline(
    steps=[
        ("imputer", imputer),
        ("feature_engineering", FinalProjectTransformer(drop_grades=True)),
        ("scaler", StandardScaler()),
    ]
)

# Categorical pipeline
# Most-frequent imputation, then one-hot encoding that ignores categories not
# seen during fit instead of raising.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Ordinal pipeline
# Most-frequent imputation, then integer codes. A level that never occurs in
# the fitted rows (a rare level can land only in the test split) is encoded as
# -1 rather than making transform() raise, mirroring the tolerant handling of
# unknown categories in the one-hot encoder.
ordinal_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
)

# Column transformer WITH grades
# Routes each column group to its pipeline; only the listed columns are used.
column_transformer_with_grades = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline_with_grades, numeric_model_columns),
        ("cat", categorical_pipeline, categorical_model_columns),
        ("ord", ordinal_pipeline, ordinal_model_columns),
    ]
)

# Column transformer WITHOUT grades
# Identical routing; only the numeric pipeline differs (it drops G1 and G2).
column_transformer_without_grades = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline_without_grades, numeric_model_columns),
        ("cat", categorical_pipeline, categorical_model_columns),
        ("ord", ordinal_pipeline, ordinal_model_columns),
    ]
)

# Fit-transform on train
# NOTE(review): the transformers are fit on all of X_train, and the
# cross-validation further down runs on these already-transformed matrices, so
# imputation/scaling statistics include each validation fold's rows.
X_train_transformed_with = column_transformer_with_grades.fit_transform(X_train)
X_train_transformed_without = column_transformer_without_grades.fit_transform(X_train)

print("Transformed shape WITH grades:", X_train_transformed_with.shape)
print("Transformed shape WITHOUT grades:", X_train_transformed_without.shape)

# -------------------- BASELINE MODEL COMPARISON (CV) --------------------
# Baseline estimators, all constructed with their default hyperparameters.
lin_reg = LinearRegression()
svm_reg = SVR()
lasso_reg = Lasso()

# Collects one {"model", "cv_rmse"} record per model / feature-set combination.
results = []

def cv_rmse(model, X, y, label):
    """
    Print and return the mean 3-fold cross-validated RMSE of model on (X, y).

    label is used only to tag the printed line.
    """
    fold_scores = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring="neg_root_mean_squared_error",
        cv=3,
    )
    # The scorer yields negated RMSE values, so flip the sign of the average.
    mean_rmse = -fold_scores.mean()
    print(f"{label} — CV RMSE: {mean_rmse:.3f}")
    return mean_rmse

# Linear Regression
rmse_lin_with = cv_rmse(
    model=lin_reg, X=X_train_transformed_with, y=y_train,
    label="Linear Regression (with grades)",
)
rmse_lin_without = cv_rmse(
    model=lin_reg, X=X_train_transformed_without, y=y_train,
    label="Linear Regression (without grades)",
)
results += [
    {"model": "Linear (with grades)", "cv_rmse": rmse_lin_with},
    {"model": "Linear (without grades)", "cv_rmse": rmse_lin_without},
]

# SVR (default hyperparameters)
rmse_svm_with = cv_rmse(
    model=svm_reg, X=X_train_transformed_with, y=y_train,
    label="SVR (with grades)",
)
rmse_svm_without = cv_rmse(
    model=svm_reg, X=X_train_transformed_without, y=y_train,
    label="SVR (without grades)",
)
results += [
    {"model": "SVR (with grades)", "cv_rmse": rmse_svm_with},
    {"model": "SVR (without grades)", "cv_rmse": rmse_svm_without},
]

# Lasso
rmse_lasso_with = cv_rmse(
    model=lasso_reg, X=X_train_transformed_with, y=y_train,
    label="Lasso (with grades)",
)
rmse_lasso_without = cv_rmse(
    model=lasso_reg, X=X_train_transformed_without, y=y_train,
    label="Lasso (without grades)",
)
results += [
    {"model": "Lasso (with grades)", "cv_rmse": rmse_lasso_with},
    {"model": "Lasso (without grades)", "cv_rmse": rmse_lasso_without},
]

# Tabulate the six scores (two feature sets per model), print and persist them.
results_df = pd.DataFrame(results)
print("\nCross-validated RMSE comparison:\n", results_df, "\n")
save_output(results_df, "cv_rmse_summary_students.csv")

# -------------------- HYPERPARAMETER TUNING - SVR --------------------
# Full grid: 3 (C) x 2 (kernel) x 2 (gamma) x 3 (epsilon) = 36 candidates.
# NOTE(review): gamma is documented as a coefficient for the rbf/poly/sigmoid
# kernels, so the linear-kernel candidates presumably differ only in C and
# epsilon — confirm before trimming the grid.
param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"],
    "epsilon": [0.1, 0.2, 0.3],
}

# Fresh default SVR used as the base estimator for both searches.
svm_reg = SVR()

# WITH grades
# 5-fold CV per candidate, scored by negated RMSE, run on all available cores.
grid_search_with = GridSearchCV(
    svm_reg,
    param_grid=param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)
grid_search_with.fit(X_train_transformed_with, y_train)

best_params_with = grid_search_with.best_params_
best_score_with = -grid_search_with.best_score_  # positive RMSE

print("Best SVR params (WITH grades):", best_params_with)
print(f"Best CV RMSE (WITH grades): {best_score_with:.3f}\n")

# WITHOUT grades
# Same search settings, applied to the feature matrix that excludes G1 and G2.
grid_search_without = GridSearchCV(
    svm_reg,
    param_grid=param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)
grid_search_without.fit(X_train_transformed_without, y_train)

best_params_without = grid_search_without.best_params_
# Sign flipped for the same reason as above: the scorer is a negated RMSE.
best_score_without = -grid_search_without.best_score_

print("Best SVR params (WITHOUT grades):", best_params_without)
print(f"Best CV RMSE (WITHOUT grades): {best_score_without:.3f}\n")

# -------------------- FINAL MODELS & TEST EVALUATION --------------------
# Build final models using best params from GridSearch
# NOTE(review): new SVR instances are created and refit here; the grid-search
# objects presumably already hold an equivalent refit estimator
# (best_estimator_) — confirm whether this second fit is needed.
best_svm_with = SVR(**best_params_with)
best_svm_without = SVR(**best_params_without)

# Fit each tuned model on the full, already-transformed training matrix.
best_svm_with.fit(X_train_transformed_with, y_train)
best_svm_without.fit(X_train_transformed_without, y_train)

# Transform test set
# transform() only: the test rows reuse the preprocessing fitted on X_train.
X_test_transformed_with = column_transformer_with_grades.transform(X_test)
X_test_transformed_without = column_transformer_without_grades.transform(X_test)

# Predictions
y_pred_with = best_svm_with.predict(X_test_transformed_with)
y_pred_without = best_svm_without.predict(X_test_transformed_without)

# Metrics
# RMSE is obtained as the square root of the mean squared error.
rmse_with = np.sqrt(mean_squared_error(y_test, y_pred_with))
rmse_without = np.sqrt(mean_squared_error(y_test, y_pred_without))

r2_with = r2_score(y_test, y_pred_with)
r2_without = r2_score(y_test, y_pred_without)

mae_with = mean_absolute_error(y_test, y_pred_with)
mae_without = mean_absolute_error(y_test, y_pred_without)

print("=== Final SVR Model Performance (Test Set) ===")
print(f"WITH grades:    RMSE = {rmse_with:.2f}, MAE = {mae_with:.2f}, R² = {r2_with:.3f}")
print(f"WITHOUT grades: RMSE = {rmse_without:.2f}, MAE = {mae_without:.2f}, R² = {r2_without:.3f}\n")

# Comparison DataFrame
# One row per feature setting; also the data source for the bar charts below.
metrics_df = pd.DataFrame(
    {
        "setting": ["With grades", "Without grades"],
        "rmse": [rmse_with, rmse_without],
        "mae": [mae_with, mae_without],
        "r2": [r2_with, r2_without],
    }
)
print("Metrics comparison:\n", metrics_df, "\n")
save_output(metrics_df, "test_metrics_with_without_grades.csv")

# -------------------- VISUALIZATION - RMSE & R² COMPARISON --------------------
# Side-by-side bar charts comparing the two feature settings on the test set.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# RMSE barplot
sns.barplot(
    x="setting",
    y="rmse",
    data=metrics_df,
    palette="Blues",
    ax=axes[0],
)
axes[0].set_title("Root Mean Squared Error (RMSE)")
axes[0].set_ylabel("RMSE")
axes[0].set_xlabel("Model Setting")

# R² barplot
sns.barplot(
    x="setting",
    y="r2",
    data=metrics_df,
    palette="Greens",
    ax=axes[1],
)
axes[1].set_title("R² (Coefficient of Determination)")
axes[1].set_ylabel("R²")
axes[1].set_xlabel("Model Setting")
# R² is negative whenever a model does worse than predicting the mean, so the
# lower limit follows the data (with 10% headroom) instead of clipping such a
# bar at zero. With non-negative scores the range is still 0 to 1.
r2_lower = min(0.0, float(metrics_df["r2"].min()) * 1.1)
axes[1].set_ylim(r2_lower, 1)

plt.tight_layout()
save_plot("rmse_r2_with_vs_without_grades.png", width=14, height=6, dpi=300)
plt.show()

# -------------------- AT-RISK STUDENT ANALYSIS --------------------
# Define at-risk: final grade < 10
# The same threshold is applied to the actual grades and to the with-grades
# model's predictions, turning the regression output into a binary flag.
y_true_at_risk = (y_test < 10).astype(int)
y_pred_at_risk = (y_pred_with < 10).astype(int)

# Both classes are listed explicitly so the matrix is always 2x2 and the report
# always has both rows, even when one class is absent from this test split.
# Without `labels`, two target_names against a single observed class is
# rejected by classification_report.
at_risk_labels = [0, 1]

cm = confusion_matrix(y_true_at_risk, y_pred_at_risk, labels=at_risk_labels)
print("=== At-Risk Classification (Using Best SVR Regression) ===")
print("Confusion matrix (0 = not at risk, 1 = at risk):\n", cm, "\n")

print("Classification report:\n")
print(
    classification_report(
        y_true_at_risk,
        y_pred_at_risk,
        labels=at_risk_labels,
        target_names=["Not at risk", "At risk"],
        # An undefined precision/recall is reported as 0 without a warning.
        zero_division=0,
    )
)

# -------------------- SAVE BEST MODEL (WITH GRADES) --------------------
# NOTE(review): only the fitted SVR is serialized. It was trained on the output
# of column_transformer_with_grades, so whoever loads this file presumably
# needs that fitted transformer as well to score raw rows — confirm whether it
# should be persisted alongside the model.
model_path = models_dir / "school_best_svm_with_grades.pkl"
joblib.dump(best_svm_with, model_path)
print(f"\nSaved best model to: {model_path}")

# ---------- DEMO PREDICTIONS ----------
# Predict the first five (already transformed) test rows and print them next
# to the actual targets.
demo_preds = best_svm_with.predict(X_test_transformed_with[:5])
print("\nDemo predictions (first 5 test rows):")
print("Predicted G3:", np.round(demo_preds, 2))
print("Actual   G3:", y_test.iloc[:5].values)

# -------------------- PIPELINE COMPLETE --------------------
# Closing recap of what the script did; each line mirrors one stage above.
print("\nPipeline complete:")
print("- Project setup (figures/, outputs/, models/ directories)")
print("- Data loading & validation (shape, dtypes, missingness)")
print("- EDA outputs saved (numeric/categorical/ordinal summaries)")
print("- Correlation analysis for numeric features + heatmap")
print("- ANOVA feature screening + significant boxplots")
# The split is a plain random one (test_size=0.2, random_state=42, no
# `stratify` argument), so it must not be reported as stratified.
print("- Random train/test split (80/20, random_state=42)")
print("- Feature engineering: absences_sum and optional removal of G1/G2")
print("- Preprocessing pipelines: imputation + scaling + one-hot + ordinal encoding")
print("- Baseline model comparison (Linear, Lasso, SVR) using CV RMSE")
print("- Hyperparameter tuning: GridSearchCV for SVR (with vs without grades)")
print("- Final evaluation on held-out test set (RMSE, MAE, R²)")
print("- At-risk view derived from regression (G3 < 10) + confusion matrix/report")
print("- Artifacts saved to /figures, /outputs, and /models")
print(f"- Best model saved as {model_path.name}")
