In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import tensorflow as tf
import keras
from keras import layers

In [4]:
df = pd.read_csv("/content/drive/MyDrive/Advancing Machine Learning/exercise_1/preprocessed/processed_regression.csv")
df.tail()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
5466,25,69,2,0,0,7,76,0,1,1,2,0,1,0,2,0,0,0,0,68
5467,23,76,2,0,0,8,81,0,1,3,1,2,1,0,2,0,0,0,0,69
5468,20,90,0,1,1,6,65,1,1,3,1,0,1,1,2,0,2,0,0,68
5469,10,86,2,2,1,6,91,2,1,2,1,0,0,0,3,0,0,1,0,68
5470,15,67,0,1,1,9,94,0,1,0,0,0,1,0,4,0,2,0,1,64


In [6]:
 # CatBoost requires we save our continuous and categorical variables separately into a list
categorical_variables = ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender']

# continous variables also into a list
continuous_variables = ['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores', 'Tutoring_Sessions', 'Physical_Activity', 'Exam_Score']

# X/y
X = df.drop("Exam_Score", axis=1)
y = df['Exam_Score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SCALING => some of the algorithms require this
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
from sklearn.ensemble import StackingRegressor



# meta model is the judge or optimizer of the stacking algorithm
# => it assigns weights to the models above based on their performance
meta_model = LinearRegression()

# stacking regression algorithm
stacking_model = StackingRegressor(
    estimators=[('svm', SVR()),
                ('catboost', cb.CatBoostRegressor(iterations=500, learning_rate=0.05, depth=6, verbose=0)),
                ('lgbm', lgb.LGBMRegressor(objective='regression'))
                ],
    final_estimator = meta_model,
    cv=5
)

# fit the model
stacking_model.fit(X_train, y_train)

# test predictions and metrics
predictions = stacking_model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print()
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")


MAE: 0.2907886251465975
MSE: 0.12486017626882753
RMSE: 0.35335559464769695
R2: 0.9852937158313292
