# Model Training

## 1.1 Import Data and required packages

In [88]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor

import warnings

Import the csv data as Pandas Data Frame

In [2]:
df= pd.read_csv(r'C:\Users\win10\Desktop\Upgrad\ML WebApps\notebook\data\data.csv')

Top 5 Records

In [3]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


### Creating Independent and Dependent feature from existing Data Frame

In [4]:
X=df.drop('math_score',axis=1)
y=df['math_score']

Shape of X and y

In [5]:
print("Shape of X variable : {}".format(X.shape))
print("Shape of y variable : {}".format(y.shape))

Shape of X variable : (1000, 7)
Shape of y variable : (1000,)


In [6]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [8]:
y.head()

0    72
1    69
2    90
3    47
4    76
Name: math_score, dtype: int64

Creating preprocessor using ColumnTranformer to tranform categorical and numerical features

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [18]:
# creating 2 list for numerical feature and categorical feature 
num_features=X.select_dtypes(exclude='object').columns
cat_features=X.select_dtypes(include='object').columns

num_tranformer=StandardScaler()
cat_tranformer=OneHotEncoder()

preprocessor=ColumnTransformer(
                                transformers=[
                                    ('numer_tranform',num_tranformer,num_features),
                                    ('cat_tranformer',cat_tranformer,cat_features)
                                ],
                                remainder='passthrough'
)

Applying the preprocess to tranform the features

In [19]:
X= preprocessor.fit_transform(X)

In [21]:
X.shape

(1000, 19)

Creating Train and Test Split

In [24]:
X_train,X_test, y_train,y_test= train_test_split(X,y, test_size=0.25,random_state=42)

Shape of train and test data

In [26]:
print("X train Shape : {}".format(X_train.shape))
print("X test Shape : {}".format(X_test.shape))
print("y train Shape : {}".format(y_train.shape))
print("y test Shape : {}".format(y_test.shape))

X train Shape : (750, 19)
X test Shape : (250, 19)
y train Shape : (750,)
y test Shape : (250,)


Creating function for model evaluation 

In [28]:
def performance(true, predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true, predicted)
    r2=r2_score(true,predicted)
    
    return mae,mse,r2

Creating Dictonary of models we want to train our dataset

In [89]:
models={
    "Linear Regression" : LinearRegression(),
    "Lasso Resgression" : Lasso(),
    "Ridge Regression"  : Ridge(),
    "K-Neighbour Regressor" : KNeighborsRegressor(),
    "Decision Tree Regressor" : DecisionTreeRegressor(),
    "Random Forest Regressor" : RandomForestRegressor(),
    "AdaBoost Regressor" : AdaBoostRegressor()
 }

Reshaping Our Y Train 

In [78]:
y_train=y_train.values.reshape(-1,1)

Training Dataset on these Models.

In [90]:
model_list=[]
r2_list=[]

for i in range(len(models.values())):
    model=list(models.values())[i]
    model.fit(X_train,y_train)  #Model Training

    #Make Prediction
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    #Evaluating model performance for training and test set
    model_train_mae,model_train_mse,model_train_r2= performance(y_train,y_train_pred)
    model_test_mae,model_test_mse,model_test_r2= performance(y_test,y_test_pred)

    print(list(models.keys())[i])
    print("Model Performance for Training set")
    print("- Root Absolute Error : {}".format(model_train_mae))
    print("- Root Squared Error : {}".format(model_train_mse))
    print("- R Squared Error : {}".format(model_train_r2))

    print("----------------------------")

    print("Model Performance for Test set")
    print("- Root Absolute Error : {}".format(model_test_mae))
    print("- Root Squared Error : {}".format(model_test_mse))
    print("- R Squared Error : {}".format(model_test_r2))

    print('='*35,'\n')

    model_list.append(list(models.keys())[i])
    r2_list.append(model_test_r2)

Linear Regression
Model Performance for Training set
- Root Absolute Error : 4.238267465725305
- Root Squared Error : 28.06080423568467
- R Squared Error : 0.8742732380399838
----------------------------
Model Performance for Test set
- Root Absolute Error : 4.337930600167401
- Root Squared Error : 30.05811462672032
- R Squared Error : 0.8778243107659015

Lasso Resgression
Model Performance for Training set
- Root Absolute Error : 5.183725467135435
- Root Squared Error : 42.92215298860819
- R Squared Error : 0.8076867909314052
----------------------------
Model Performance for Test set
- Root Absolute Error : 5.2217044431408555
- Root Squared Error : 44.27742512370272
- R Squared Error : 0.8200278028352899

Ridge Regression
Model Performance for Training set
- Root Absolute Error : 4.236822709293622
- Root Squared Error : 28.06418045343843
- R Squared Error : 0.8742581108568038
----------------------------
Model Performance for Test set
- Root Absolute Error : 4.33537241938823
- Root S

  model.fit(X_train,y_train)  #Model Training


Random Forest Regressor
Model Performance for Training set
- Root Absolute Error : 1.8328477777777776
- Root Squared Error : 5.228725204629629
- R Squared Error : 0.9765726354941456
----------------------------
Model Performance for Test set
- Root Absolute Error : 4.7613433333333335
- Root Squared Error : 37.847501986111105
- R Squared Error : 0.84616318427266

AdaBoost Regressor
Model Performance for Training set
- Root Absolute Error : 4.698475600541065
- Root Squared Error : 33.33986949119867
- R Squared Error : 0.850620324346678
----------------------------
Model Performance for Test set
- Root Absolute Error : 4.823085395241953
- Root Squared Error : 38.322555925826954
- R Squared Error : 0.8442322566935708



  y = column_or_1d(y, warn=True)


In [99]:
best_model=pd.DataFrame(list(zip(model_list,r2_list)),columns=['Regressor','R2_score']).sort_values(by='R2_score',ascending=False)
best_model

Unnamed: 0,Regressor,R2_score
2,Ridge Regression,0.87799
0,Linear Regression,0.877824
5,Random Forest Regressor,0.846163
6,AdaBoost Regressor,0.844232
1,Lasso Resgression,0.820028
3,K-Neighbour Regressor,0.794216
4,Decision Tree Regressor,0.764852


As we can we got a good performance from Ridge and Linear Regression, lets go ahead and chose our model as linear regression.

### Linear Regression

In [101]:
linear_model=LinearRegression(fit_intercept=True)
linear_model.fit(X_train,y_train)

y_pred=linear_model.predict(X_test)

In [116]:
print("Intercept for our linear Model : ",linear_model.intercept_[0],'\n')
print("Coefficents for our linear Model are :",linear_model.coef_)
print('\n','Accuracy of the Model is',f'{r2_score(y_test,y_pred)*100:.2f}')

Intercept for our linear Model :  65.57334231389379 

Coefficents for our linear Model are : [[ 3.35643752e+00  1.11222532e+01 -6.55315835e+00  6.55315835e+00
  -2.90315752e-01 -5.94694341e-01 -1.46782950e+00 -1.53748986e+00
   3.89032945e+00  6.11447676e-03 -7.61031598e-01  8.22351574e-01
  -1.49308802e+00  8.25360412e-01  6.00293154e-01 -1.72785641e+00
   1.72785641e+00 -1.58071976e+00  1.58071976e+00]]

 Accuracy of the Model is 87.78
