In [None]:
# --- Tuned Linear Regression (Regression Version) ---
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline  # Added import for Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# --- Paths ---
# BASE_DIR is the parent of the current working directory; the data and
# output locations are derived from it.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
DATA_PATH = os.path.join(BASE_DIR, 'Results', 'Output', 'cleaned_student_data.csv')
OUT_DIR = os.path.join(BASE_DIR, 'Results', 'Output', 'ModelResults')
os.makedirs(OUT_DIR, exist_ok=True)

# Load data
# DATA_PATH used to be defined but never read: the CSV was always loaded from
# a hard-coded absolute path. Prefer the project-relative DATA_PATH and fall
# back to the previous location so environments that keep the file there
# continue to work unchanged.
FALLBACK_DATA_PATH = "/content/cleaned_student_data.csv"
data_path = DATA_PATH if os.path.exists(DATA_PATH) else FALLBACK_DATA_PATH
data = pd.read_csv(data_path)

# Split features and target
TARGET = 'G3'
X = data.drop(columns=[TARGET])  # every column except the target is a feature
y = data[TARGET]

# Train/test split
# Hold out 20% of the rows; the fixed seed makes the split reproducible.
TEST_SIZE = 0.2
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)


In [None]:
# Tune, evaluate and persist a polynomial linear-regression pipeline.
# All estimator classes used here (Pipeline, PolynomialFeatures,
# StandardScaler, LinearRegression, GridSearchCV) are imported in the
# notebook's first cell, so they are not re-imported in this cell.

# Pipeline stages: polynomial feature expansion -> standardisation -> OLS fit.
pipeline = Pipeline([
    ('poly', PolynomialFeatures()),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

# Hyperparameter grid; keys follow the "<step name>__<parameter>" convention.
param_grid = {
    'poly__degree': [1, 2, 3],  # degree 1 is a plain linear model
    'model__fit_intercept': [True, False]  # whether to fit an intercept
}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split only. Scoring is negative MSE (higher is better).
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1  # use all available cores
)
grid_search.fit(X_train, y_train)

# Best configuration found by the search, evaluated on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Test-set metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

# Output locations, built once so the path written to and the path reported
# below cannot drift apart.
results_path = os.path.join(OUT_DIR, 'linear_tuned_results.csv')
model_path = os.path.join(OUT_DIR, 'linear_tuned_best.joblib')

# Save the metrics as a one-row CSV
results = pd.DataFrame([{
    'Model': 'LinearRegression_Tuned',
    'MAE': mae,
    'MSE': mse,
    'RMSE': rmse,
    'R2': r2,
    'Best_Parameters': str(grid_search.best_params_)
}])
results.to_csv(results_path, index=False)

# Save the best model (the whole fitted pipeline, preprocessing included)
joblib.dump(best_model, model_path)

# Output the saved model path
print(f"Model saved to: {model_path}")

Best Parameters: {'model__fit_intercept': True, 'poly__degree': 1}
MAE: 4.108
MSE: 25.813
RMSE: 5.081
R²: 0.049
Model saved to: /Results/Output/ModelResults/linear_tuned_best.joblib
