In [1]:
# --- Data loading, train/test split and scaling (Regression Version) ---
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Path configuration.
# The project root is taken to be the parent of the current working directory;
# the input CSV and the model-results folder both live under <root>/Results/Output.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
OUTPUT_ROOT = os.path.join(BASE_DIR, 'Results', 'Output')
DATA_PATH = os.path.join(OUTPUT_ROOT, 'cleaned_student_data.csv')
OUT_DIR = os.path.join(OUTPUT_ROOT, 'ModelResults')

# Create the results folder up front (no error if it is already there).
os.makedirs(OUT_DIR, exist_ok=True)

# Load data
# Read the cleaned dataset from the project-relative DATA_PATH configured above,
# so the notebook does not depend on a machine-specific absolute path.
# The previously hard-coded location is kept as a fallback for environments where
# the CSV only exists there (presumably a Colab upload directory — confirm).
# If neither file exists, pd.read_csv raises FileNotFoundError as before.
FALLBACK_DATA_PATH = "/content/cleaned_student_data.csv"
data = pd.read_csv(DATA_PATH if os.path.exists(DATA_PATH) else FALLBACK_DATA_PATH)

# Separate the regression target column from the feature columns.
TARGET = 'G3'
X = data.drop(columns=[TARGET])
y = data[TARGET]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Standardized copies of the features for models that need normalization.
# The scaler is fitted on the training rows only and then applied to both splits,
# so no statistics from the test rows are used when fitting it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [3]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor: hyperparameter search, hold-out evaluation, persistence.

# Candidate settings explored by the grid search (2 x 3 = 6 combinations).
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[5, 10, None],
)

# 5-fold cross-validated search scored by R², fitted on the unscaled training features.
grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out test split.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Params:", grid.best_params_)
print(f"MAE:{mae:.3f}, RMSE:{rmse:.3f}, R²:{r2:.3f}")

# Write a one-row metrics table for this model into the results folder.
rf_summary = {'Model': 'RandomForest', 'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}
rf_summary_path = os.path.join(OUT_DIR, 'randomforest_results.csv')
pd.DataFrame([rf_summary]).to_csv(rf_summary_path, index=False)

# Serialize the fitted best estimator next to the metrics file.
# Left as the final expression so the cell still displays the written path.
rf_model_path = os.path.join(OUT_DIR, 'randomforest_best.joblib')
joblib.dump(best_model, rf_model_path)


Best Params: {'max_depth': 5, 'n_estimators': 100}
MAE:4.009, RMSE:5.072, R²:0.052


['/Results/Output/ModelResults/randomforest_best.joblib']