In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score
import joblib

In [2]:
# Read the preprocessed dataset from its CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='../data/updated_preprocessed_dataset2.csv')

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded
dataset.head(n=5)

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0


In [4]:
# Split the table into the regression target (y) and the predictors (X):
# every column except 'Performance Index' is kept as a feature.
y = dataset['Performance Index']
X = dataset.drop(columns=['Performance Index'])

In [5]:
# Display the feature matrix; 'Performance Index' should no longer be among its columns
X

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
0,7,99,1,9,1
1,4,82,0,4,2
2,8,51,1,7,2
3,5,52,1,5,2
4,7,75,0,8,5
...,...,...,...,...,...
9868,1,49,1,4,2
9869,7,64,1,8,5
9870,6,83,1,8,5
9871,9,97,1,7,0


In [6]:
# Display the target series (the 'Performance Index' column of the dataset)
y

0       91.0
1       65.0
2       45.0
3       36.0
4       66.0
        ... 
9868    23.0
9869    58.0
9870    74.0
9871    95.0
9872    64.0
Name: Performance Index, Length: 9873, dtype: float64

In [7]:
# Candidate regressors, each paired with the hyperparameter grid that
# GridSearchCV will search. An empty grid means "evaluate the estimator's
# default settings only".
#
# The tree-based estimators are given a fixed random_state: both shuffle
# features (and the forest also bootstraps rows) while fitting, so without a
# seed their scores -- and potentially the selected hyperparameters -- change
# from one run of the notebook to the next.
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Ridge Regression': (Ridge(), {'alpha': [0.01, 0.1, 1.0, 10.0]}),
    'Lasso Regression': (Lasso(), {'alpha': [0.01, 0.1, 1.0, 10.0]}),
    'Decision Tree Regressor': (
        DecisionTreeRegressor(random_state=42),
        {'max_depth': [None, 10, 20, 30]},
    ),
    'Random Forest Regressor': (
        RandomForestRegressor(random_state=42),
        {'n_estimators': [50, 100, 200]},
    ),
    # NOTE(review): SVR is sensitive to feature scale and the inputs are not
    # standardised here -- consider a scaling step if SVR should be competitive.
    'Support Vector Regression': (SVR(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly']})
}

In [8]:
# Running "best so far" trackers. The error starts at +infinity so that the
# first finite score compared against it is always smaller.
best_model = best_model_name = None
best_mse = float('inf')

In [9]:
# Tune every candidate model with a grid search and keep the one with the
# lowest cross-validated mean squared error (MSE).

# Reset the trackers here as well, so that re-running this cell on its own
# (e.g. after editing `models`) never compares against a stale result left
# over from a previous run.
best_model = None
best_model_name = None
best_mse = float('inf')

# Built once, outside the loop: with shuffle=True and a fixed random_state the
# splitter yields the same five folds for every model, which keeps the scores
# directly comparable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model_name, (model, param_grid) in models.items():
    # Exhaustive search over the grid. scikit-learn's built-in
    # 'neg_mean_squared_error' scorer reports the *negated* MSE so that
    # "greater is better" holds, exactly like the hand-built scorer it replaces.
    grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)

    # best_score_ is already the mean score of the winning parameter set over
    # these same folds, so a separate cross_val_score pass would only repeat
    # fits that the search has done. Flip the sign to get a positive MSE.
    mse = -grid_search.best_score_

    # Keep this model if it beats the best seen so far. best_estimator_ has
    # been refit on all of X, y by GridSearchCV (refit=True is the default).
    if mse < best_mse:
        best_mse = mse
        best_model = grid_search.best_estimator_
        best_model_name = model_name

In [11]:
# Persist the selected estimator to disk. joblib.dump returns the list of file
# names it wrote, which the notebook echoes as this cell's output.
joblib.dump(value=best_model, filename='../ml_models/best_student_performance_model4.pkl')

['../ml_models/best_student_performance_model4.pkl']

In [12]:
# Report which model was selected and the cross-validated MSE it achieved
print(
    f'Best Model: {best_model_name}',
    f'Mean Squared Error: {best_mse}',
    sep='\n',
)

Best Model: Ridge Regression
Mean Squared Error: 4.182560269357039
