#   Model Training

### 1.1 Importing Packages

In [83]:
#   importing basic libraries

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [84]:
#   importing modelling libraries

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

from catboost import CatBoostRegressor
from xgboost import XGBRegressor

import warnings

### 1.2 Importing the CSV file as Pandas Dataframe

In [85]:
#   load the raw student-performance dataset from the relative data/ directory
df = pd.read_csv("data/StudentsPerformance.csv")

In [86]:
#   dataset dimensions first, then (after two blank lines) the first five rows
print(df.shape, df.head(), sep="\n\n\n")

(1000, 8)


   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  


### 1.3 Splitting independent features and dependent feature

In [87]:
#   feature matrix: every column except the regression target
x = df.drop(columns=["math score"])
x

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75
...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,99,95
996,male,group C,high school,free/reduced,none,55,55
997,female,group C,high school,free/reduced,completed,71,65
998,female,group D,some college,standard,completed,78,77


In [88]:
#   target vector: the math score column
y = df.loc[:, "math score"]
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

### 1.4 Creating Column Transformer with 2 types of Transformers

In [89]:
#   partitioning the feature columns by dtype

#   categorical features: columns stored with the object dtype
cat_features = x.select_dtypes(include="O").columns

#   numerical features: every remaining (non-object) column
num_features = x.select_dtypes(exclude="O").columns

#### 1.4.1 creating a pipeline that one-hot encodes the categorical features and standard-scales the numerical features.

In [90]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [91]:
#   one scaler for the numerical columns and one encoder for the
#   categorical columns, both created with their default settings

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

In [92]:
#   building the column-wise preprocessing step; each entry is
#   (step name, transformer, columns that transformer is applied to)

preprocessing_steps = [
    ("OneHotEncoder", oh_transformer, cat_features),
    ("StandardScaler", numeric_transformer, num_features),
]
preprocessor = ColumnTransformer(transformers=preprocessing_steps)

#   note: this preprocessor object is reused later for every
#   fit or transform that is performed on the data

In [93]:
#   Encode / scale the features. The transformer is always fed the raw feature
#   columns taken from `df` (not the previous value of `x`), so re-running this
#   cell never tries to transform an already-transformed array.
#   NOTE(review): the preprocessor is fitted on the full dataset before the
#   train/test split below, so the scaler statistics include the test rows;
#   consider fitting on the training split only.
x = preprocessor.fit_transform(df.drop(columns=["math score"]))

In [94]:
#   inspect the transformed feature matrix
x

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [95]:
#   (rows, columns) of the feature matrix after preprocessing
x.shape

(1000, 19)

### 1.5 Splitting train and test sets

In [96]:
from sklearn.model_selection import train_test_split

In [97]:
#   hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [98]:
#   report the shape of every split, in the order train (x, y) then test (x, y)
split_shapes = [
    ("x_train size= ", x_train.shape),
    ("y_train size= ", y_train.shape),
    ("x_test size= ", x_test.shape),
    ("y_test size= ", y_test.shape),
]

for label, shape in split_shapes:
    print(label, shape)

x_train size=  (800, 19)
y_train size=  (800,)
x_test size=  (200, 19)
y_test size=  (200,)


### 1.6 Creating an Evaluation Function to evaluate the model on various metrics after training

In [99]:
def evaluate_model(true, predicted):
    '''Score regression predictions against the ground-truth values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2_square)``: mean absolute error, mean squared
        error, root mean squared error and the R2 score.
    '''
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    #   RMSE is the square root of the MSE computed above; no second pass needed
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)

    return mae, mse, rmse, r2_square

### 1.7 Model Training Process

#### 1.7.1 evaluating various predictive models on the same data

In [100]:
#   candidate regressors, keyed by the display name used in the reports.
#   The scikit-learn estimators that involve randomness get a fixed
#   random_state (the same seed value as the train/test split) so that a
#   fresh run of the notebook reproduces the same scores.
models = {
    "LinearRegression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Neighbors Regressor" : KNeighborsRegressor(),
    "Decision Tree Regressor" : DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor" : RandomForestRegressor(random_state=42),
    "XGB Regressor" : XGBRegressor(),
    "CatBoosting Regressor" : CatBoostRegressor(verbose=False),
    "AdaBoost Regressor" : AdaBoostRegressor(random_state=42)
}

In [101]:
#   names of the evaluated models and their test-set R2 scores, in matching order
model_list = []
r2_list = []

#   iterate over (name, estimator) pairs directly instead of rebuilding
#   list(models.keys()) / list(models.values()) on every pass
for model_name, model in models.items():

    model.fit(x_train, y_train)

    #   making predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    #   evaluate train and test dataset
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2score = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2score = evaluate_model(y_test, y_test_pred)

    print(model_name, end="\n\n")
    model_list.append(model_name)

    #   one report per split; each report is followed by a blank line
    reports = (
        ('Model Performance for Training set', model_train_mae, model_train_mse, model_train_rmse, model_train_r2score),
        ('Model Performance for Test set', model_test_mae, model_test_mse, model_test_rmse, model_test_r2score),
    )
    for heading, mae, mse, rmse, r2 in reports:
        print(heading)
        print("Mean Absolute Error : {:.4f}".format(mae))
        print("Mean Squared Error : {:.4f}".format(mse))
        print("Root Mean Squared Error : {:.4f}".format(rmse))
        print("R2 score : {:.4f}".format(r2))
        print()

    #   only the test-set R2 is kept for the final ranking
    r2_list.append(model_test_r2score)

    print('------------------------------------------')
    print()

LinearRegression

Model Performance for Training set
Mean Absolute Error : 4.2671
Mean Squared Error : 28.3487
Root Mean Squared Error : 5.3244
R2 score : 0.8743

Model Performance for Test set
Mean Absolute Error : 4.2158
Mean Squared Error : 29.1167
Root Mean Squared Error : 5.3960
R2 score : 0.8803

------------------------------------------

Lasso

Model Performance for Training set
Mean Absolute Error : 5.2063
Mean Squared Error : 43.4784
Root Mean Squared Error : 6.5938
R2 score : 0.8071

Model Performance for Test set
Mean Absolute Error : 5.1579
Mean Squared Error : 42.5064
Root Mean Squared Error : 6.5197
R2 score : 0.8253

------------------------------------------

Ridge

Model Performance for Training set
Mean Absolute Error : 4.2650
Mean Squared Error : 28.3378
Root Mean Squared Error : 5.3233
R2 score : 0.8743

Model Performance for Test set
Mean Absolute Error : 4.2111
Mean Squared Error : 29.0563
Root Mean Squared Error : 5.3904
R2 score : 0.8806

----------------------

Random Forest Regressor

Model Performance for Training set
Mean Absolute Error : 1.8450
Mean Squared Error : 5.4033
Root Mean Squared Error : 2.3245
R2 score : 0.9760

Model Performance for Test set
Mean Absolute Error : 4.6890
Mean Squared Error : 36.7697
Root Mean Squared Error : 6.0638
R2 score : 0.8489

------------------------------------------

XGB Regressor

Model Performance for Training set
Mean Absolute Error : 0.6875
Mean Squared Error : 1.0146
Root Mean Squared Error : 1.0073
R2 score : 0.9955

Model Performance for Test set
Mean Absolute Error : 5.0577
Mean Squared Error : 41.9037
Root Mean Squared Error : 6.4733
R2 score : 0.8278

------------------------------------------

CatBoosting Regressor

Model Performance for Training set
Mean Absolute Error : 2.4054
Mean Squared Error : 9.2578
Root Mean Squared Error : 3.0427
R2 score : 0.9589

Model Performance for Test set
Mean Absolute Error : 4.6125
Mean Squared Error : 36.1037
Root Mean Squared Error : 6.0086
R2 score : 0.8516

#### Results:

In [102]:
#   pair each model name with its test-set R2 score, best score first
model_scores = list(zip(model_list, r2_list))
lis = pd.DataFrame(model_scores, columns=["Model Name", "R2_score"])
lis = lis.sort_values(by=["R2_score"], ascending=False)

In [103]:
#   plain-text leaderboard of the models, highest R2 score at the top
print(lis)

                Model Name  R2_score
2                    Ridge  0.880593
0         LinearRegression  0.880345
8       AdaBoost Regressor  0.852055
7    CatBoosting Regressor  0.851632
5  Random Forest Regressor  0.848895
6            XGB Regressor  0.827797
1                    Lasso  0.825320
3    K-Neighbors Regressor  0.783813
4  Decision Tree Regressor  0.727560


##### Result:    
1.  Ridge and Linear Regression performed the best.
2.  The difference in performance score of Ridge and Linear Regression is very small.

#### 1.7.2 Training the model on Linear Regression

In [9]:
#   renaming the column names (spaces and "/" become underscores)
#   and saving the renamed dataset as a new CSV file

import pandas as pd

column_renames = {
    'race/ethnicity': 'race_ethnicity',
    'parental level of education': 'parental_level_of_education',
    'test preparation course': 'test_preparation_course',
    'reading score': 'reading_score',
    'writing score': 'writing_score',
    'math score': 'math_score',
}

df = pd.read_csv('data/StudentsPerformance.csv').rename(columns=column_renames)
df.to_csv('StudentPerformances.csv', index=False)