In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import optuna

In [4]:
data = pd.read_csv(r"E:\project\Student-Performance-System-Using-Mlops\notebooks\archive\Student_performance_data _.csv")
data.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [5]:
# split into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test =  train_test_split(data.drop(columns=['GPA', 'GradeClass']), 
                                                      data['GPA'], test_size=0.3, random_state = 42)

print(f"After data split into training and testing.\nx_train: {x_train.shape}, x_test: {x_test.shape}\ny_train: {y_train.shape}, y_test: {y_test.shape}")

After data split into training and testing.
x_train: (1674, 13), x_test: (718, 13)
y_train: (1674,), y_test: (718,)


In [6]:
from typing import List

def drop_col(data: pd.DataFrame, col_to_drop: List) -> pd.DataFrame:
    """
    this function drops the columns from data.

    args:
    data = pandas dataframe.
    col_to_drop = list of columns to drop.

    return:
    return thedataframe .
    """
    data = data.drop(columns=col_to_drop)

    return data

In [7]:

col = ['StudentID', 'ParentalEducation', 'Ethnicity', 'Gender', 'Volunteering']
x_train = drop_col(x_train, col)
x_test = drop_col(x_test, col)

print(f"useless columns removed from test and train.")
print(f"x_train: {x_train.shape}, x_test: {x_test.shape}")

useless columns removed from test and train.
x_train: (1674, 8), x_test: (718, 8)


In [8]:
x_train.head()

Unnamed: 0,Age,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music
380,18,3.772645,7,0,2,1,1,0
226,15,11.448155,19,0,2,0,0,0
1075,15,19.570119,13,0,1,0,0,0
715,18,10.071401,20,0,4,0,0,0
1822,18,2.158491,29,1,3,1,1,1


In [9]:
y_train.head()

380     2.525116
226     0.881866
1075    2.096645
715     1.321427
1822    0.992297
Name: GPA, dtype: float64

In [10]:
# convert into numpy array.
def to_numpy(data: pd.DataFrame) -> np.ndarray:
    """
    this method convert pandas dataframe to numpy array.

    args:
    data = dataframe

    returns:
    numpy array of data.
    """

    data = data.values

    return data

In [11]:
x_train = to_numpy(x_train)
x_test = to_numpy(x_test)
y_train = to_numpy(y_train)
y_test = to_numpy(y_test)
print(f"converted to numpy array")

converted to numpy array


In [13]:
X, y = data.drop(columns=['GPA', 'GradeClass']), data['GPA']
print(f"data split into features and labels:\nX: {X.shape}, y: {y.shape}")

data split into features and labels:
X: (2392, 13), y: (2392,)


In [14]:
X = X.values
y = y.values
print(f"data type: X: {X.dtype}, y: {y.dtype}")

data type: X: float64, y: float64


In [16]:
y

array([2.92919559, 3.04291483, 0.11260225, ..., 1.14233288, 1.80329676,
       2.14001388])

SIMPLE LINEAR REGRESSION MODEL.

In [20]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score

model_0 = LinearRegression()

model_0.fit(X, y)
train_pred = model_0.predict(X)
training_acc = mean_absolute_error(train_pred, y)

cv = KFold(n_splits=3, shuffle=True, random_state=42)

scores = cross_val_score(model_0, X, y, cv=cv, scoring="neg_mean_absolute_error")

print(f"Model: {model_0.__class__.__name__}")
print(f"Training MAE: {training_acc:.3f}")
print(f"Cross-validation MAE: {-scores.mean():.3f}")

Model: LinearRegression
Training MAE: 0.158
Cross-validation MAE: 0.159


In [24]:
results = []
results.append({"Model": model_0.__class__.__name__, "Training MAE": training_acc, "Cross-validation MAE": -scores.mean().item()}
)
results

[{'Model': 'LinearRegression',
  'Training MAE': 0.15807381305079082,
  'Cross-validation MAE': 0.1676246868000685}]

In [23]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error

model_1 = GradientBoostingRegressor()

model_1.fit(X, y)
train_pred = model_1.predict(X)
train_acc = mean_absolute_error(y, train_pred)

cv = KFold(n_splits=3, shuffle=True, random_state=42)

scores = cross_val_score(model_1, X, y, cv=cv, scoring="neg_mean_absolute_error")


print(f"Model: {model_1.__class__.__name__}")
print(f"Training MAE: {training_acc:.3f}")
print(f"Cross-validation MAE: {-scores.mean():.3f}")

Model: GradientBoostingRegressor
Training MAE: 0.158
Cross-validation MAE: 0.168


In [25]:
results.append({"Model": model_1.__class__.__name__, 
                "Training MAE": train_acc,
                 "Cross-validation MAE": -scores.mean().item()})

results

[{'Model': 'LinearRegression',
  'Training MAE': 0.15807381305079082,
  'Cross-validation MAE': 0.1676246868000685},
 {'Model': 'GradientBoostingRegressor',
  'Training MAE': 0.1434465870288238,
  'Cross-validation MAE': 0.1676246868000685}]

In [26]:
import optuna
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

def objective(trial):

    params = {
          "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.4, log=True),
          "n_estimators": trial.suggest_int("n_estimators", 100, 300, step=50),
          "max_depth": trial.suggest_int("max_depth", 1, 5),
          "subsample": trial.suggest_float("subsample", 0.3, 1.0)
    }

    model = GradientBoostingRegressor(**params)

    model.fit(X, y)
    y_pred = model.predict(X)

    acc = mean_absolute_error(y, y_pred)

    return acc

In [27]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2025-10-10 17:59:08,230] A new study created in memory with name: no-name-53c89655-c6cb-4a93-9b8d-899034646adb
[I 2025-10-10 17:59:09,245] Trial 0 finished with value: 0.157348694895044 and parameters: {'learning_rate': 0.09087020253619815, 'n_estimators': 300, 'max_depth': 1, 'subsample': 0.4721218730101114}. Best is trial 0 with value: 0.157348694895044.
[I 2025-10-10 17:59:09,906] Trial 1 finished with value: 0.1925613918948993 and parameters: {'learning_rate': 0.01598062512496541, 'n_estimators': 300, 'max_depth': 2, 'subsample': 0.31614886148066534}. Best is trial 0 with value: 0.157348694895044.
[I 2025-10-10 17:59:10,380] Trial 2 finished with value: 0.14732126059032466 and parameters: {'learning_rate': 0.17104890879633916, 'n_estimators': 150, 'max_depth': 2, 'subsample': 0.4910650363617779}. Best is trial 2 with value: 0.14732126059032466.
[I 2025-10-10 17:59:10,990] Trial 3 finished with value: 0.21470280206259135 and parameters: {'learning_rate': 0.030885561990080003, 'n_

In [28]:
# Best hyperparameters
print("Best MAE:", study.best_value)
print("Best parameters:", study.best_params)

Best MAE: 0.14732126059032466
Best parameters: {'learning_rate': 0.17104890879633916, 'n_estimators': 150, 'max_depth': 2, 'subsample': 0.4910650363617779}


In [29]:
best_param = study.best_params
best_param

{'learning_rate': 0.17104890879633916,
 'n_estimators': 150,
 'max_depth': 2,
 'subsample': 0.4910650363617779}

In [32]:
import yaml

path = r"E:\project\Student-Performance-System-Using-Mlops\config.yaml"
with open(path, "r") as f:
    config = yaml.safe_load(f)

config["parameter"] = best_param

with open(path, "w") as f:
    yaml.dump(config, f)