In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import tensorflow as tf
import keras
from keras import layers

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Advancing Machine Learning/exercise_1/preprocessed/processed_regression.csv")
df.tail()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
5466,25,69,2,0,0,7,76,0,1,1,2,0,1,0,2,0,0,0,0,68
5467,23,76,2,0,0,8,81,0,1,3,1,2,1,0,2,0,0,0,0,69
5468,20,90,0,1,1,6,65,1,1,3,1,0,1,1,2,0,2,0,0,68
5469,10,86,2,2,1,6,91,2,1,2,1,0,0,0,3,0,0,1,0,68
5470,15,67,0,1,1,9,94,0,1,0,0,0,1,0,4,0,2,0,1,64


In [4]:
 # CatBoost requires we save our continuous and categorical variables separately into a list
categorical_variables = ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender']

# continous variables also into a list
continuous_variables = ['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores', 'Tutoring_Sessions', 'Physical_Activity', 'Exam_Score']

# X/y
X = df.drop("Exam_Score", axis=1)
y = df['Exam_Score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SCALING => some of the algorithms require this
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
 # define our model dictionary


# => it's borderline if this is too heavy for KNN (due to computational heaviness)

# Randomized search gave us this result for LightGBM:
# Best parameters from RandomizedSearchCV: {'learning_rate': 0.11224137931034484, 'max_depth': 32, 'num_leaves': 49}

models = {
    'Linear Regression': LinearRegression(),
    'SVM': SVR(),
    'KNN': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': xgb.XGBRegressor(enable_categorical=True, objective='reg:squarederror'),
    'CatBoost': cb.CatBoostRegressor(verbose=0),
    'LightGBM-default': lgb.LGBMRegressor(objective='regression'),
    'LightGBM-optimized': lgb.LGBMRegressor(objective='regression', learning_rate=0.11224137931034484, max_depth=32, num_leaves=49)

}

In [6]:
# our benchmarking code!

# this will contain later our results of the benchmark
results = []

# loop through our models above
for name, model in models.items():

    # we have to react to certain algorithms
    # because they have requirements for fit()

    print("Starting ... " + name)

    # SVM / KNN require scaled data
    if name in ['SVM', 'KNN']:
        model.fit(X_train_scaled, y_train)
        predictions = model.predict(X_test_scaled)
    elif name == 'CatBoost':
        # CatBoost requires
        # we specify which columns are categories, INCLUDING ORDINALS
        model.fit(X_train, y_train, cat_features=categorical_variables)
        predictions = model.predict(X_test)
    else:
        # everything else follows the same logic!
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

    # Calculate metrics for current model in training
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    # save the metrics for this model into results
    results.append([name, mae, mse, rmse, r2])

Starting ... Linear Regression
Starting ... SVM
Starting ... KNN
Starting ... Random Forest
Starting ... XGBoost
Starting ... CatBoost
Starting ... LightGBM-default
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000755 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 190
[LightGBM] [Info] Number of data points in the train set: 4376, number of used features: 19
[LightGBM] [Info] Start training from score 67.045704
Starting ... LightGBM-optimized
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000515 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 190
[LightGBM] [Info] Number of data points in the train set: 4376, number of used features: 19
[LightGBM] [Info] Start training from score 67.

In [7]:
metrics_df = pd.DataFrame(results, columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2'])

light_theme = 'background: yellow'
dark_theme = 'background: goldenrod'

# you can alternate the highlight styling based on your theme
default_highlight_style = dark_theme

# helper function that highlights the best model of each metric
def highlight_best_metrics(row):
    # default styles for everything is empty in the beginning
    styles = ['' for _ in row]

    # index 1 => MAE (index 0 => model name)
    if row['MAE'] == metrics_df['MAE'].min():
        styles[1] = default_highlight_style

    # index 2 = > MSE
    if row['MSE'] == metrics_df['MSE'].min():
        styles[2] = default_highlight_style

    # index 3 = > RMSE
    if row['RMSE'] == metrics_df['RMSE'].min():
        styles[3] = default_highlight_style

    # index 4 = > R-squared (R2)
    if row['R2'] == metrics_df['R2'].max():
        styles[4] = default_highlight_style

    return styles


# apply the custom styles based on the min/max metrics
highlight_df = metrics_df.style.apply(highlight_best_metrics, axis=1)


highlight_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2
0,Linear Regression,0.888674,1.217606,1.103452,0.86127
1,SVM,0.481405,0.383976,0.619658,0.956251
2,KNN,1.52347,3.582429,1.892731,0.591831
3,Random Forest,0.922365,1.308862,1.144055,0.850873
4,XGBoost,0.523186,0.429534,0.655389,0.95106
5,CatBoost,0.289356,0.126062,0.355053,0.985637
6,LightGBM-default,0.489816,0.378041,0.61485,0.956927
7,LightGBM-optimized,0.474972,0.358803,0.599002,0.959119


In [None]:
 # better model CatBoost, LightGBM-optimized, SVM because for stacking LightGBM-default is almos same
