**1.1 Import Data and required packages**
Importing the pandas, NumPy, Matplotlib, seaborn and warnings libraries, together with the modelling tools used below.

In [4]:
# standard library
import warnings

# basic input
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

# preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [5]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv('data/stud.csv')

In [None]:
# Split the frame into predictors (X) and the regression target (y).
# `columns=` already selects the axis, so the redundant `axis=1` is dropped.
X = df.drop(columns=['math_score'])
y = df['math_score']

# Preview the target; this is the output recorded under the cell.
y.head()


0    72
1    69
2    90
3    47
4    76
Name: math_score, dtype: int64

In [15]:
# Partition the predictor columns by dtype: object (string) columns are
# one-hot encoded, every other column is standardised.
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

# OneHotEncoder, StandardScaler and ColumnTransformer are imported in the
# import cell at the top of the notebook.
numeric_transformer = StandardScaler()
cat_transformer = OneHotEncoder()

# Each (name, transformer, columns) entry is applied to its own column subset;
# the transformed output lists the encoded categoricals first and the scaled
# numeric columns last, following the order given here.
preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder', cat_transformer, cat_features),
        ('StandardScaler', numeric_transformer, num_features)
    ]
)

In [16]:
# Fit the preprocessor and encode/scale the predictors.
# The input is rebuilt from `df` instead of being read from `X`: this cell
# rebinds `X` to the transformed array, so feeding `X` back in made a second
# run of the cell fail (the column names the ColumnTransformer selects by are
# gone after the first run).
# NOTE(review): the preprocessor is fit on every row before the train/test
# split performed later, so test rows contribute to the scaling statistics.
# Fit on the training split only if a leakage-free estimate is required.
X = preprocessor.fit_transform(df.drop(columns=['math_score']))

In [20]:
# A bare `X.shape` in the middle of a cell is evaluated but never displayed,
# so print it explicitly alongside the first transformed row.
print(X.shape)

print(X[0])

[1.         0.         0.         1.         0.         0.
 0.         0.         1.         0.         0.         0.
 0.         0.         1.         0.         1.         0.19399858
 0.39149181]


In [22]:

# separate dataset into train and test (80/20 split, fixed seed so the split is reproducible)
# `train_test_split` is imported in the import cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

**MAE, RMSE, R2**
- MAE: (mean absolute error) - có cùng đơn vị với y
![image.png](attachment:image.png)
 ít nhạy cảm với outliers, nên hay được sử dụng khi ta không muốn outliers ảnh hưởng nhiều đến đánh giá. Sai số nhỏ hay lớn đều được xem xét công bằng

- RMSE: (root mean squared error) - có cùng đơn vị với y
![image-2.png](attachment:image-2.png)
 rất nhạy cảm với outliers, hay được dùng khi ta muốn trừng phạt những lỗi dự đoán lớn (sai số nhỏ và sai số lớn được xem xét khác nhau)

- R2 (%): cho biết phần trăm biến động của biến mục tiêu được giải thích bởi mô hình

- **R2 cao, RMSE/MAE thấp** : mô hình tốt

- **R2 cao, RMSE/MAE cao**: mô hình có thể overfitting hoặc dữ liệu nhiễu
- **R2 thấp, RMSE/MAE thấp**: mô hình kém, nên loại bỏ


In [23]:
def evaluate_model(true, predicted):
    """Score regression predictions against the observed values.

    Args:
        true: Observed target values.
        predicted: Model predictions, aligned element-wise with ``true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error and the R2 score.
    """
    mae = mean_absolute_error(true, predicted)
    # RMSE is the square root of the MSE; the MSE is computed once, here.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [24]:
# Seed passed to every estimator that draws random numbers, so repeated runs
# of this cell report the same scores.
RANDOM_STATE = 42

# Candidate regressors, all with default hyperparameters apart from the seed.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest Regressor": RandomForestRegressor(random_state=RANDOM_STATE),
    "XGBRegressor": XGBRegressor(random_state=RANDOM_STATE),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, random_state=RANDOM_STATE),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=RANDOM_STATE)
}

# Parallel lists consumed by the ranking table in the next cell:
# model_list[k] is the name whose test-set R2 is r2_list[k].
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    # Only the held-out R2 is kept for the ranking.
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 5.3293
- Mean Absolute Error: 4.2715
- R2 Score: 0.8740
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.4252
- Mean Absolute Error: 4.2222
- R2 Score: 0.8790


Lasso
Model performance for Training set
- Root Mean Squared Error: 6.5938
- Mean Absolute Error: 5.2063
- R2 Score: 0.8071
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.5197
- Mean Absolute Error: 5.1579
- R2 Score: 0.8253


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.3233
- Mean Absolute Error: 4.2650
- R2 Score: 0.8743
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3904
- Mean Absolute Error: 4.2111
- R2 Score: 0.8806


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 5.7079
- Mean Absolute Error: 4.5168
- R2 Score: 0.8555
-----------------------

In [25]:
# Rank the fitted models by their test-set R2, best first.
# The trailing expression is left unassigned so the notebook renders the table.
(
    pd.DataFrame(
        list(zip(model_list, r2_list)),
        columns=['Model Name', 'R2_Score'],
    )
    .sort_values(by='R2_Score', ascending=False)
)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.880593
0,Linear Regression,0.879046
7,CatBoosting Regressor,0.851632
5,Random Forest Regressor,0.850176
8,AdaBoost Regressor,0.845371
6,XGBRegressor,0.827797
1,Lasso,0.82532
3,K-Neighbors Regressor,0.783813
4,Decision Tree,0.726389
