# 🎓 Student Academic Performance Prediction — End-to-End ML
**Author:** Safia Alam H B  
**Goal:** Predict students' average exam score from demographic and academic features.

## What you'll learn
- Data loading with flexible column mapping
- EDA (quick, practical)
- Feature engineering (`average_score`, `pass_fail`)
- Model comparison: Linear Regression, Random Forest, Gradient Boosting
- Hyperparameter tuning with RandomizedSearchCV
- Persisting the best pipeline to `models/best_model.pkl`


In [None]:
# Imports
import os, json, math, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import joblib

# Input CSV and model output folder, both addressed relative to the parent
# of the current working directory.
DATA_PATH = os.path.join(os.pardir, "data", "StudentsPerformance.csv")
MODEL_DIR = os.path.join(os.pardir, "models")

# Create the output folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(MODEL_DIR, exist_ok=True)

print(f"Using data path: {DATA_PATH}")
print(f"Model directory: {MODEL_DIR}")


In [None]:
# Load dataset with flexible column handling
def load_data(path):
    """Load the student-performance CSV and standardize its column names.

    Headers are matched against a table of known spelling variants. The
    comparison ignores letter case and leading/trailing whitespace, so a
    header such as ``" Math Score "`` is recognized. Every recognized column
    is renamed to its standard name; all other columns are left untouched.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame that contains at least the eight standard columns
        listed in ``required`` below.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
        ValueError: If a required column is still absent after renaming.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Dataset not found at {path}. Please place 'StudentsPerformance.csv' in the data/ directory."
        )
    df = pd.read_csv(path)

    # Standard column name -> spellings seen across dataset variants.
    col_map = {
        "math score": ["math score","math_score","math","maths","MathScore","mathscore"],
        "reading score": ["reading score","reading_score","reading","ReadingScore","readingscore"],
        "writing score": ["writing score","writing_score","writing","WritingScore","writingscore"],
        "gender": ["gender","Gender"],
        "race/ethnicity": ["race/ethnicity","race","ethnicity","RaceEthnicity"],
        "parental level of education": ["parental level of education","parental_level_of_education","parental education","parental_education"],
        "lunch": ["lunch","Lunch"],
        "test preparation course": ["test preparation course","test_prep","test preparation","test_preparation_course"],
    }

    # Normalized header -> label as it appears in the file. str() guards
    # against non-string labels; strip() tolerates padded headers.
    normalized = {str(c).strip().lower(): c for c in df.columns}

    rename_map = {}
    for std, variants in col_map.items():
        # The first listed variant that is present in the file wins.
        for v in variants:
            key = v.lower()
            if key in normalized:
                rename_map[normalized[key]] = std
                break
    df = df.rename(columns=rename_map)

    required = ["math score","reading score","writing score","gender","race/ethnicity","parental level of education","lunch","test preparation course"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns after mapping: {missing}. Please update the mapping above to match your dataset.")
    return df

# Read the CSV into the working DataFrame, then preview its first five rows.
df = load_data(path=DATA_PATH)
df.head(n=5)


## Quick EDA


In [None]:
# Dataset dimensions as a (rows, columns) tuple.
print(df.shape)
# Summary statistics for every column, transposed so each dataset column
# becomes one row; at most the first 20 rows are shown.
df.describe(include="all").transpose().head(n=20)


In [None]:
# Distribution plots for scores: one 20-bin histogram per exam subject,
# each drawn on its own figure with identical settings.
score_plots = (
    ("math score", "Math Score Distribution"),
    ("reading score", "Reading Score Distribution"),
    ("writing score", "Writing Score Distribution"),
)
for score_col, plot_title in score_plots:
    fig = plt.figure(figsize=(6,4))
    # Missing values are dropped before binning.
    plt.hist(df[score_col].dropna(), bins=20)
    plt.title(plot_title)
    plt.xlabel("Score")
    plt.ylabel("Count")
    plt.show()


In [None]:
# Feature engineering
score_cols = ["math score", "reading score", "writing score"]

# Row-wise mean of the three exam scores.
df["average_score"] = df[score_cols].mean(axis="columns")
# 1 when the average is at least 60, otherwise 0 (simple rule; adjust as needed).
df["pass_fail"] = df["average_score"].ge(60).astype(int)

df[score_cols + ["average_score", "pass_fail"]].head()


## Train/Test Split & Preprocessing
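We'll predict **average_score** (regression). Note that the three subject scores are kept as input features and `average_score` is exactly their mean, so the target is fully determined by the inputs: expect near-perfect metrics, which say little about how well the demographic features alone predict performance.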

In [None]:
# Regression target and the input columns, grouped by how they are preprocessed.
target = "average_score"
num_features = ["math score","reading score","writing score"]
cat_features = ["gender","race/ethnicity","parental level of education","lunch","test preparation course"]

# NOTE(review): the target is the row-wise mean of exactly the three columns in
# num_features, so the inputs determine the target. The metrics below will be
# close to perfect for that reason. Dropping num_features would give an honest
# estimate, but it also changes the input schema of the saved pipeline, so
# confirm with whatever consumes the model before changing it.
X = df[cat_features + num_features].copy()
y = df[target].copy()

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the categorical columns (categories unseen during fit are
# encoded as all zeros) and pass the numeric score columns through unchanged.
pre = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("num", "passthrough", num_features),
    ]
)

models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}

# Fit every candidate on the training split and score it on the held-out split.
results = {}
for name, model in models.items():
    pipe = Pipeline([("pre", pre), ("model", model)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    r2 = r2_score(y_test, pred)
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword was deprecated
    # in scikit-learn 1.4 and removed in 1.6, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    mae = mean_absolute_error(y_test, pred)
    results[name] = {"R2": r2, "RMSE": rmse, "MAE": mae}

# One row per model, best R2 first.
pd.DataFrame(results).T.sort_values("R2", ascending=False)


## Hyperparameter Tuning
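We'll tune the two tree-based models (Random Forest and Gradient Boosting) with `RandomizedSearchCV`, which samples a fixed number of configurations instead of searching an exhaustive grid, and keep whichever achieves the higher cross-validated R².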

In [None]:
from scipy.stats import randint, uniform

# Running best across the searches below; -inf lets any finite score win.
best_pipe = None
best_score = -np.inf
best_name = None

# RandomForest tuning
# randint(low, high) samples integers from low up to, but excluding, high.
rf = Pipeline([("pre", pre), ("model", RandomForestRegressor(random_state=42))])
rf_params = {
    "model__n_estimators": randint(100, 600),
    "model__max_depth": randint(3, 20),
    "model__min_samples_split": randint(2, 20),
    "model__min_samples_leaf": randint(1, 10),
}
rf_search = RandomizedSearchCV(rf, rf_params, n_iter=25, scoring="r2", cv=5, random_state=42, n_jobs=-1)
rf_search.fit(X_train, y_train)
rf_r2 = rf_search.best_score_
if rf_r2 > best_score:
    best_score = rf_r2
    best_pipe = rf_search.best_estimator_
    best_name = "RandomForest"

# GradientBoosting tuning
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so the learning
# rate is drawn from [0.01, 0.31] and the subsample fraction from [0.6, 1.0].
gb = Pipeline([("pre", pre), ("model", GradientBoostingRegressor(random_state=42))])
gb_params = {
    "model__n_estimators": randint(100, 500),
    "model__learning_rate": uniform(0.01, 0.3),
    "model__max_depth": randint(2, 6),
    "model__subsample": uniform(0.6, 0.4),
}
gb_search = RandomizedSearchCV(gb, gb_params, n_iter=25, scoring="r2", cv=5, random_state=42, n_jobs=-1)
gb_search.fit(X_train, y_train)
gb_r2 = gb_search.best_score_
# Strict comparison: on an exact tie the RandomForest pipeline is kept.
if gb_r2 > best_score:
    best_score = gb_r2
    best_pipe = gb_search.best_estimator_
    best_name = "GradientBoosting"

# Neither comparison succeeds if both CV scores are NaN; fail with a clear
# message instead of an AttributeError on None further down.
if best_pipe is None:
    raise RuntimeError("Hyperparameter search did not produce a usable model (no finite CV R2 score).")

print("Best CV R2:", round(best_score, 4), "Best Model:", best_name)

# Evaluate on test set
y_pred = best_pipe.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
# sqrt(MSE) instead of `squared=False`, which was deprecated in scikit-learn 1.4
# and removed in 1.6.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_mae = mean_absolute_error(y_test, y_pred)

print("Test R2:", round(test_r2, 4))
print("Test RMSE:", round(test_rmse, 4))
print("Test MAE:", round(test_mae, 4))


## Feature Importance (Tree Models)
We can inspect feature importances for interpretability.


In [None]:
# Rebuild the column names of the preprocessed matrix: the one-hot columns
# come first, followed by the passthrough numeric columns, matching the
# transformer order declared in the ColumnTransformer.
encoder = best_pipe.named_steps["pre"].named_transformers_["cat"]
encoded_names = list(encoder.get_feature_names_out(cat_features))
all_features = [*encoded_names, *num_features]

estimator = best_pipe.named_steps["model"]
if not hasattr(estimator, "feature_importances_"):
    print("Selected best model does not expose feature_importances_.")
else:
    importances = pd.Series(estimator.feature_importances_, index=all_features)
    importances = importances.sort_values(ascending=False)
    top15 = importances.head(15)
    print(top15)

    # Plot top 15; the order is reversed so the largest bar is drawn at the top.
    fig = plt.figure(figsize=(8,5))
    top15.iloc[::-1].plot.barh()
    plt.title("Top 15 Feature Importances")
    plt.xlabel("Importance")
    plt.tight_layout()
    plt.show()


## Persist Model
Save the trained pipeline for use in the Streamlit app.


In [None]:
# Serialize the whole fitted pipeline (preprocessing + estimator) into the
# model directory, then echo the path as the cell output.
model_path = os.path.join(MODEL_DIR, "best_model.pkl")
joblib.dump(value=best_pipe, filename=model_path)
model_path


## Conclusions
- Created `average_score` target and trained multiple models.
- Tuned tree-based models with randomized search.
- Saved the best pipeline to `models/best_model.pkl` for use in the Streamlit app.
