# Student Exam Score Prediction - Full Modeling Variations
This notebook evaluates 12 variations across 4 model types and compares them to a baseline.


Project Objective and Overarching Question
The central question driving this project is: To what extent can student exam scores be predicted from lifestyle habits, wellness factors, and socioeconomic background?

We aim to identify which features contribute most to academic performance and explore predictive models that can help estimate student outcomes.

## Baseline Performance
The baseline model uses the mean of the target variable (exam score) to predict all outcomes. It serves as a naive benchmark.

In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Load data
df = pd.read_csv("student_habits_performance.csv")
X = df.drop(columns=['student_id', 'exam_score'])
y = df['exam_score']

# Separate features
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('category', OneHotEncoder(drop='first'), cat_cols),
    ('num', StandardScaler(), num_cols)
])

# Baseline
baseline_pred = np.full_like(y_test, y_train.mean())
baseline_r2 = r2_score(y_test, baseline_pred)
baseline_rmse = mean_squared_error(y_test, baseline_pred, )
baseline_mae = mean_absolute_error(y_test, baseline_pred)

print(f"Baseline R²: {baseline_r2:.4f}, RMSE: {baseline_rmse:.2f}, MAE: {baseline_mae:.2f}")


Baseline R²: -0.0050, RMSE: 257.70, MAE: 12.38


## Model Variations Evaluation
The following sections contain code and results for 12 model variations (3 per model type).

In [3]:
# Linear Regression - Basic
lr_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', LinearRegression())
])
lr_pipe.fit(X_train, y_train)
y_pred = lr_pipe.predict(X_test)
print("Linear Regression", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred, ), mean_absolute_error(y_test, y_pred))

Linear Regression 0.8967502640380202 26.476236454969175 4.1893110546275985


In [4]:
# Linear Regression with Polynomial Features
degree_2_pipe = Pipeline([
    ('pre', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('model', LinearRegression())
])
degree_2_pipe.fit(X_train, y_train)
y_pred_poly = degree_2_pipe.predict(X_test)
print("Linear Regression (Poly)", r2_score(y_test, y_pred_poly), mean_squared_error(y_test, y_pred_poly, ), mean_absolute_error(y_test, y_pred_poly))

Linear Regression (Poly) 0.8889571417059269 28.474619769605262 4.298645336992828


In [5]:
# Linear Regression with Feature Selection
selector = SelectKBest(score_func=mutual_info_regression, k=10)
X_new = selector.fit_transform(preprocessor.fit_transform(X), y)
X_train_fs, X_test_fs, y_train_fs, y_test_fs = train_test_split(X_new, y, test_size=0.2, random_state=42)
fs_pipe = Pipeline([
    ('model', LinearRegression())
])
fs_pipe.fit(X_train_fs, y_train_fs)
y_pred_fs = fs_pipe.predict(X_test_fs)
print("Linear Regression (Feature Selection)", r2_score(y_test_fs, y_pred_fs), mean_squared_error(y_test_fs, y_pred_fs, ), mean_absolute_error(y_test_fs, y_pred_fs))

Linear Regression (Feature Selection) 0.8661153827841181 34.33191136097462 4.746731429558739


In [6]:
# Random Forest - Default
rf_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])
rf_pipe.fit(X_train, y_train)
y_pred_rf = rf_pipe.predict(X_test)
print("Random Forest (Default)", r2_score(y_test, y_pred_rf), mean_squared_error(y_test, y_pred_rf, ), mean_absolute_error(y_test, y_pred_rf))

Random Forest (Default) 0.8501874788421045 38.41628937 4.946330000000001


In [7]:
# Random Forest - Shallow Trees
rf_shallow = Pipeline([
    ('pre', preprocessor),
    ('model', RandomForestRegressor(max_depth=5, random_state=42))
])
rf_shallow.fit(X_train, y_train)
y_pred_shallow = rf_shallow.predict(X_test)
print("Random Forest (Shallow)", r2_score(y_test, y_pred_shallow), mean_squared_error(y_test, y_pred_shallow, ), mean_absolute_error(y_test, y_pred_shallow))

Random Forest (Shallow) 0.8115565971571697 48.3223714381508 5.631418420835278


In [8]:
# Random Forest - Deep Trees
rf_deep = Pipeline([
    ('pre', preprocessor),
    ('model', RandomForestRegressor(max_depth=None, min_samples_split=2, random_state=42))
])
rf_deep.fit(X_train, y_train)
y_pred_deep = rf_deep.predict(X_test)
print("Random Forest (Deep)", r2_score(y_test, y_pred_deep), mean_squared_error(y_test, y_pred_deep, ), mean_absolute_error(y_test, y_pred_deep))

Random Forest (Deep) 0.8501874788421045 38.41628937 4.946330000000001


In [9]:
# Gradient Boosting - Default
gb_pipe = Pipeline([
    ('pre', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])
gb_pipe.fit(X_train, y_train)
y_pred_gb = gb_pipe.predict(X_test)
print("Gradient Boosting (Default)", r2_score(y_test, y_pred_gb), mean_squared_error(y_test, y_pred_gb, ), mean_absolute_error(y_test, y_pred_gb))

Gradient Boosting (Default) 0.8805908844884767 30.6199715447795 4.619510553803127


In [10]:
# Gradient Boosting - Tuned
gb_tuned = Pipeline([
    ('pre', preprocessor),
    ('model', GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, random_state=42))
])
gb_tuned.fit(X_train, y_train)
y_pred_gb_tuned = gb_tuned.predict(X_test)
print("Gradient Boosting (Tuned)", r2_score(y_test, y_pred_gb_tuned), mean_squared_error(y_test, y_pred_gb_tuned, ), mean_absolute_error(y_test, y_pred_gb_tuned))

Gradient Boosting (Tuned) 0.8811845596710132 30.467735954404024 4.574196120088078


In [11]:
# Gradient Boosting - Shallow Trees
gb_shallow = Pipeline([
    ('pre', preprocessor),
    ('model', GradientBoostingRegressor(max_depth=2, random_state=42))
])
gb_shallow.fit(X_train, y_train)
y_pred_gb_shallow = gb_shallow.predict(X_test)
print("Gradient Boosting (Shallow)", r2_score(y_test, y_pred_gb_shallow), mean_squared_error(y_test, y_pred_gb_shallow, ), mean_absolute_error(y_test, y_pred_gb_shallow))

Gradient Boosting (Shallow) 0.878161050628305 31.24305164497352 4.670542360896657


In [12]:
# KNN - K=5
knn5 = Pipeline([
    ('pre', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors=5))
])
knn5.fit(X_train, y_train)
y_pred_knn5 = knn5.predict(X_test)
print("KNN (K=5)", r2_score(y_test, y_pred_knn5), mean_squared_error(y_test, y_pred_knn5, ), mean_absolute_error(y_test, y_pred_knn5))

KNN (K=5) 0.6921676469974374 78.937172 7.064000000000001


In [13]:
# KNN - K=15
knn15 = Pipeline([
    ('pre', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors=15))
])
knn15.fit(X_train, y_train)
y_pred_knn15 = knn15.predict(X_test)
print("KNN (K=15)", r2_score(y_test, y_pred_knn15), mean_squared_error(y_test, y_pred_knn15, ), mean_absolute_error(y_test, y_pred_knn15))

KNN (K=15) 0.7132961307947675 73.519214 6.769366666666667


In [14]:
# KNN - K=50
knn50 = Pipeline([
    ('pre', preprocessor),
    ('model', KNeighborsRegressor(n_neighbors=50))
])
knn50.fit(X_train, y_train)
y_pred_knn50 = knn50.predict(X_test)
print("KNN (K=50)", r2_score(y_test, y_pred_knn50), mean_squared_error(y_test, y_pred_knn50, ), mean_absolute_error(y_test, y_pred_knn50))

KNN (K=50) 0.6777729029626185 82.62840320000001 7.000000000000001
