In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score

In [2]:
data = pd.read_csv(r"E:\project\Student-Performance-System-Using-Mlops\notebooks\archive\Student_performance_data _.csv")
data.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [4]:
X, y = data.drop(columns=['StudentID','GPA', 'GradeClass']), data['GPA']
X.shape, y.shape

((2392, 12), (2392,))

In [5]:
X = X.values
y = y.values
print(f"data type: X: {X.dtype}, y: {y.dtype}")

data type: X: float64, y: float64


In [8]:
model_0 = LinearRegression()

model_0.fit(X, y)
train_pred = model_0.predict(X)
training_acc = mean_absolute_error(train_pred, y)

cv = KFold(n_splits=3, shuffle=True, random_state=42)

scores = cross_val_score(model_0, X, y, cv=cv, scoring="neg_mean_absolute_error")

print(f"Model: {model_0.__class__.__name__}")
print(f"Training MAE: {training_acc:.6f}")
print(f"Cross-validation MAE: {-scores.mean():.6f}")

Model: LinearRegression
Training MAE: 0.158227
Cross-validation MAE: 0.159254


In [7]:
X.shape

(2392, 12)

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error

model_1 = GradientBoostingRegressor()

model_1.fit(X, y)
train_pred = model_1.predict(X)
train_acc = mean_absolute_error(y, train_pred)

cv = KFold(n_splits=3, shuffle=True, random_state=42)

scores = cross_val_score(model_1, X, y, cv=cv, scoring="neg_mean_absolute_error")


print(f"Model: {model_1.__class__.__name__}")
print(f"Training MAE: {training_acc:.3f}")
print(f"Cross-validation MAE: {-scores.mean():.3f}")

Model: GradientBoostingRegressor
Training MAE: 0.158
Cross-validation MAE: 0.168


In [10]:
import optuna
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

def objective(trial):

    params = {
          "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.4, log=True),
          "n_estimators": trial.suggest_int("n_estimators", 100, 300, step=50),
          "max_depth": trial.suggest_int("max_depth", 1, 5),
          "subsample": trial.suggest_float("subsample", 0.3, 1.0)
    }

    model = GradientBoostingRegressor(**params)

    model.fit(X, y)
    y_pred = model.predict(X)

    acc = mean_absolute_error(y, y_pred)

    return acc

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2025-10-13 22:10:36,457] A new study created in memory with name: no-name-293d5337-3826-418c-9db2-8a6f00c3b3cb
[I 2025-10-13 22:10:37,082] Trial 0 finished with value: 0.06643631363996617 and parameters: {'learning_rate': 0.1605192295064776, 'n_estimators': 250, 'max_depth': 5, 'subsample': 0.502371441006814}. Best is trial 0 with value: 0.06643631363996617.
[I 2025-10-13 22:10:37,204] Trial 1 finished with value: 0.1606501613004702 and parameters: {'learning_rate': 0.2151533551292026, 'n_estimators': 100, 'max_depth': 1, 'subsample': 0.8780643533866046}. Best is trial 0 with value: 0.06643631363996617.
[I 2025-10-13 22:10:37,625] Trial 2 finished with value: 0.16591971808828915 and parameters: {'learning_rate': 0.030338008883495058, 'n_estimators': 250, 'max_depth': 2, 'subsample': 0.6083788611025122}. Best is trial 0 with value: 0.06643631363996617.
[I 2025-10-13 22:10:38,260] Trial 3 finished with value: 0.06729097290582989 and parameters: {'learning_rate': 0.3571159482728992, 'n

In [12]:
# Best hyperparameters
print("Best MAE:", study.best_value)
print("Best parameters:", study.best_params)

Best MAE: 0.06643631363996617
Best parameters: {'learning_rate': 0.1605192295064776, 'n_estimators': 250, 'max_depth': 5, 'subsample': 0.502371441006814}


In [14]:
best_param = study.best_params
best_param

{'learning_rate': 0.1605192295064776,
 'n_estimators': 250,
 'max_depth': 5,
 'subsample': 0.502371441006814}

In [15]:
import yaml

path = r"E:\project\Student-Performance-System-Using-Mlops\config.yaml"
with open(path, "r") as f:
    config = yaml.safe_load(f)

config["parameter"] = best_param

with open(path, "w") as f:
    yaml.dump(config, f)