In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score
import joblib

In [2]:
# Read the preprocessed dataset from its CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='../data/updated_preprocessed_dataset1.csv')

In [3]:
# Discard every row that has at least one missing value
dataset = dataset.dropna()

In [4]:
# Split the frame into the prediction target (y) and the predictor columns (X)
y = dataset['reading_score']
X = dataset.drop(columns=['reading_score'])  # every column except the target


In [5]:
# Candidate regressors, each paired with the hyperparameter grid to search.
# An empty grid means the estimator is only evaluated with its default settings.
# The tree-based estimators are randomized, so they get a fixed random_state;
# without it the model comparison and the saved model change from run to run.
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Ridge Regression': (Ridge(), {'alpha': [0.01, 0.1, 1.0, 10.0]}),
    'Lasso Regression': (Lasso(), {'alpha': [0.01, 0.1, 1.0, 10.0]}),
    'Decision Tree Regressor': (
        DecisionTreeRegressor(random_state=42),
        {'max_depth': [None, 10, 20, 30]},
    ),
    'Random Forest Regressor': (
        RandomForestRegressor(random_state=42),
        {'n_estimators': [50, 100, 200]},
    ),
    'Support Vector Regression': (SVR(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']})
}

In [6]:
# Start with no winner; an infinite MSE guarantees that the first evaluated
# model replaces these placeholders.
best_model, best_model_name, best_mse = None, None, float('inf')

In [7]:
# Tune every candidate with a grid search and keep the one with the lowest
# cross-validated mean squared error.

# The splitter and the scorer do not depend on the model, so build them once
# instead of on every iteration.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# greater_is_better=False makes the scorer report negated MSE, so that
# "higher is better" holds for GridSearchCV.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

for model_name, (model, param_grid) in models.items():
    # Hyperparameter tuning using GridSearchCV. With the default refit=True the
    # winning configuration is refit on all of X, y and exposed as best_estimator_.
    grid_search = GridSearchCV(model, param_grid, cv=kf, scoring=mse_scorer)
    grid_search.fit(X, y)

    # best_score_ is the mean score of the winning configuration over the kf
    # folds, so running cross_val_score again on the same folds would only
    # repeat that work. Flip the sign to get back a plain MSE.
    mse = -grid_search.best_score_

    # Check if this is the best model so far
    if mse < best_mse:
        best_mse = mse
        best_model = grid_search.best_estimator_
        best_model_name = model_name


In [8]:
# Persist the selected estimator to disk with joblib
joblib.dump(value=best_model, filename='../ml_models/best_student_performance_model3.pkl')

['../ml_models/best_student_performance_model3.pkl']

In [9]:
# Report which model was selected and its cross-validated error, one per line
print(
    f'Best Model: {best_model_name}',
    f'Mean Squared Error: {best_mse}',
    sep='\n',
)

Best Model: Ridge Regression
Mean Squared Error: 16.722489673733833
