In [None]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [5]:

# Location of the raw student-performance data, relative to the notebook.
data_path = 'data/student.csv'
df = pd.read_csv(data_path)


In [6]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


### Preparing X (features) and y (target: `math_score`)


In [19]:
# Target: the math score column.
y = df['math_score']
# Features: every remaining column of the frame.
X = df.drop(columns='math_score')



In [20]:
# Partition the feature columns by dtype: object columns are handled as
# categorical, everything else as numeric.
cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns

cat_transformer = OneHotEncoder()
num_transformer = StandardScaler()

# Standardise the numeric columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('standard_scaler', num_transformer, num_features),
        ('onehotencoder', cat_transformer, cat_features),
    ],
)



In [21]:
# Build the feature matrix from `df` rather than from `X` itself, so that
# re-running this cell does not feed the already-transformed array back into
# the column-name-based ColumnTransformer (which would raise).
# NOTE(review): the scaler and encoder are fit on every row before the
# train/test split further down, so test-set statistics leak into the
# features. Consider splitting first and fitting on the training rows only.
X = preprocessor.fit_transform(df.drop('math_score', axis=1))



In [23]:
# Inspect the transformed feature matrix produced by the preprocessor.
X

array([[ 0.19399858,  0.39149181,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.42747598,  1.31326868,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.77010859,  1.64247471,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.12547206, -0.20107904,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.60515772,  0.58901542,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.15336989,  1.18158627,  1.        , ...,  0.        ,
         0.        ,  1.        ]], shape=(1000, 19))

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=99,
)
# Show the shape of each partition as the cell output.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((800, 19), (200, 19), (800,), (200,))

### Create an evaluation function that returns all metrics after model training


In [25]:
def evaluate_model(true, predicted):
    """Score a set of predictions against the ground truth.

    Args:
        true: Observed target values.
        predicted: Model predictions, aligned with ``true``.

    Returns:
        A 4-tuple ``(mae, mse, rmse, r2)`` where ``rmse`` is the square
        root of ``mse``.
    """
    squared_error = mean_squared_error(true, predicted)
    return (
        mean_absolute_error(true, predicted),
        squared_error,
        np.sqrt(squared_error),
        r2_score(true, predicted),
    )

In [45]:
# Seed shared by every stochastic estimator so that a fresh "Run All"
# reproduces the same metrics. KNN, SVR and the linear models are left at
# their defaults and receive no seed.
MODEL_RANDOM_STATE = 99

# Candidate regressors, keyed by the name used in the result reports.
models = {
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=MODEL_RANDOM_STATE),
    'RandomForestRegressor': RandomForestRegressor(random_state=MODEL_RANDOM_STATE),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=MODEL_RANDOM_STATE),
    'SVR': SVR(),
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'CatBoostRegressor': CatBoostRegressor(verbose=False, random_seed=MODEL_RANDOM_STATE),
    'XGBRegressor': XGBRegressor(random_state=MODEL_RANDOM_STATE),
}



def model_training_testing_evaluation(models: dict,
                                      X_train, y_train,
                                      X_test, y_test):
    """Fit every model and score it on both the training and the test split.

    Each estimator in ``models`` is fitted in place on the training data,
    then scored with ``evaluate_model`` on the training and test
    predictions.

    Args:
        models: Mapping of display name -> unfitted estimator exposing
            ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.

    Returns:
        ``(model_train_report, model_test_report)``: two dicts keyed by
        model name. Each value is the list
        ``[fitted_model, mae, mse, rmse, r2]`` for the respective split.
        Both reports reference the same fitted estimator object.
    """
    model_train_report = {}
    model_test_report = {}

    for model_name, model in models.items():
        model.fit(X_train, y_train)

        train_metrics = evaluate_model(y_train, model.predict(X_train))
        test_metrics = evaluate_model(y_test, model.predict(X_test))

        model_train_report[model_name] = [model, *train_metrics]
        model_test_report[model_name] = [model, *test_metrics]

    return model_train_report, model_test_report


In [46]:
# Fit every candidate model and collect its train/test metrics.
model_train_report, model_test_report = model_training_testing_evaluation(
    models=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Print the train report, a blank gap, a dashed rule, then the test report.
report_separator = "-" * 30
for chunk in (model_train_report, "\n", report_separator, model_test_report):
    print(chunk)

{'KNeighborsRegressor': [KNeighborsRegressor(), 4.67975, 34.34485000000001, np.float64(5.8604479350984775), 0.8524362825271323], 'DecisionTreeRegressor': [DecisionTreeRegressor(), 0.015, 0.065, np.float64(0.25495097567963926), 0.9997207254759961], 'RandomForestRegressor': [RandomForestRegressor(), 1.8122208333333334, 5.264151503888887, np.float64(2.2943738805802525), 0.9773824091456439], 'AdaBoostRegressor': [AdaBoostRegressor(), 4.791905576651994, 34.55734714019221, np.float64(5.878549748040941), 0.8515232819474496], 'SVR': [SVR(), 4.97294529176065, 48.34051601488965, np.float64(6.952734427179687), 0.7923034676898046], 'LinearRegression': [LinearRegression(), 4.252013204488637, 28.364789437184317, np.float64(5.325860441016486), 0.8781297989455173], 'Ridge': [Ridge(), 4.25077839804168, 28.36748758361712, np.float64(5.326113741145331), 0.8781182062753525], 'Lasso': [Lasso(), 5.208398131423758, 43.66366022487953, np.float64(6.607848380893703), 0.8123977242219601], 'CatBoostRegressor': [<

In [37]:
# One row per model (after the transpose), ranked from best to worst test R2.
metric_labels = ['Model', 'MAE', 'MSE', 'RMSE', 'R2_Score']
test_summary = pd.DataFrame(model_test_report, index=metric_labels).T
test_summary.sort_values(by='R2_Score', ascending=False)


Unnamed: 0,Model,MAE,MSE,RMSE,R2_Score
Ridge,Ridge(),4.24927,28.549283,5.343153,0.867828
LinearRegression,LinearRegression(),4.253413,28.580976,5.346118,0.867681
CatBoostRegressor,<catboost.core.CatBoostRegressor object at 0x1...,4.485776,31.869691,5.645325,0.852456
AdaBoostRegressor,"(DecisionTreeRegressor(max_depth=3, random_sta...",4.629488,32.649356,5.713961,0.848846
RandomForestRegressor,"(DecisionTreeRegressor(max_features=1.0, rando...",4.662358,33.371924,5.776844,0.845501
Lasso,Lasso(),4.841101,37.043639,6.086349,0.828503
XGBRegressor,"XGBRegressor(base_score=None, booster=None, ca...",4.854394,39.690552,6.300044,0.816248
SVR,SVR(),4.89504,40.258557,6.344963,0.813619
KNeighborsRegressor,KNeighborsRegressor(),5.109,43.0318,6.559863,0.80078
DecisionTreeRegressor,DecisionTreeRegressor(),6.395,65.015,8.063188,0.699006


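We choose Linear Regression: its test R² (0.8677) is practically identical to that of Ridge (0.8678), the top scorer, so there is hardly any difference between them, and Linear Regression has no regularization hyperparameter to tune.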

In [None]:
# Reuse the Linear Regression instance that was already fitted during the
# model comparison (first element of its report entry) instead of retraining.
linear_entry = model_test_report['LinearRegression']
lin = linear_entry[0]

y_test_pred = lin.predict(X_test)
# Express the test R2 as a percentage.
score = r2_score(y_test, y_test_pred)*100
print(f"R2 Score of Linear Regression on test data: {score}")

R2 Score of Linear Regression on test data: 86.76813385550714
