In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input mount,
# then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

# Loading dataset

In [None]:
# Read the student exam-performance CSV from the Kaggle input directory.
csv_path='../input/students-performance-in-exams/StudentsPerformance.csv'
data=pd.read_csv(csv_path)

# Preview the first rows of the raw frame.
data.head()

**Objective: To build a linear regression model to predict math score of a student**

# Data cleaning

In [None]:
# Work on a copy so the raw frame loaded above stays untouched.
clean_data=data.copy()

# Tukey fences on the math score: values further than 1.5 * IQR below the
# first quartile or above the third quartile are treated as outliers.
math_scores=clean_data['math score'].to_numpy()
q1,q3=np.quantile(math_scores,[0.25,0.75])
iqr=q3-q1
lower_fence=q1-1.5*iqr
upper_fence=q3+1.5*iqr

# Show the rows (if any) that fall outside each fence.
print(clean_data[clean_data['math score']<lower_fence])
print(clean_data[clean_data['math score']>upper_fence])

There are no outliers in the data. Next, we will check for missing values in the data set.


In [None]:
# Flag every missing cell, then count the flags per column.
missing_mask=clean_data.isna()
missing_mask.sum()

We observe that there are no missing values in the data set. Now we will scale the data.

# Data scaling

In [None]:
# The three exam-score columns that are standardized together.
score_columns=['math score','reading score','writing score']

# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so the test rows contribute to the fitted mean and
# standard deviation. Fitting on the training rows only would avoid that leak.
standard=preprocessing.StandardScaler()
scaled=standard.fit_transform(clean_data[score_columns])

# Write the scaled array back positionally. Wrapping it in a new DataFrame first
# gives it a fresh RangeIndex, and the assignment would then align on index
# labels and silently insert NaN if clean_data's rows were ever filtered.
clean_data[score_columns]=scaled
clean_data.head()

The scores have been scaled using the standard scaler. The standard scaler was chosen because the scores follow a normal distribution and have no outliers. Next, we encode the categorical columns, because the ML model cannot work with string values. We use one-hot encoding because the values are nominal categories and the number of distinct categories is relatively low.

In [None]:
# One-hot encode every remaining string column.
# dtype=float keeps the indicator columns numeric: recent pandas versions emit
# bool indicators by default, and a frame mixing bool and float columns is cast
# to object dtype when converted to a NumPy array, which statsmodels' OLS
# (used in the quality-check section) refuses to fit.
clean_data=pd.get_dummies(clean_data,dtype=float)
clean_data.head()

# Feature selection

In [None]:
import seaborn as sns

# Pairwise correlations between all encoded columns, drawn as an annotated
# heat map on a 12 x 12 inch canvas with the colour scale centred on zero.
corr_matrix=clean_data.corr()
fig,ax=plt.subplots(figsize=(12,12))
sns.heatmap(corr_matrix,center=0,cmap='inferno',annot=True,ax=ax)

Based on the heat map, we decided that the math score depends on the reading score, the writing score, gender, the type of lunch, and whether the test preparation course was completed. The criterion adopted is a correlation of more than 0.1. However, we observed that the model performs better when the test-preparation factor is excluded, so it is left out of the final feature set.

# Test and train split

In [None]:
from sklearn.model_selection import train_test_split

# Predictors used by the model: both standardized exam scores plus the lunch
# and gender indicator columns. An alternative set that swapped the gender
# indicators for the 'test preparation course_*' indicators was tried and dropped.
# NOTE(review): both indicator columns of each category are included; if lunch
# and gender each have exactly two levels, these columns are linearly
# dependent -- confirm before interpreting individual coefficients.
feature_columns=[
    'reading score',
    'writing score',
    'lunch_free/reduced',
    'lunch_standard',
    'gender_female',
    'gender_male',
]
X=clean_data[feature_columns]
Y=clean_data['math score']

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts=train_test_split(X,Y,test_size=0.1,random_state=1)
X_tr,X_te,Y_tr,Y_te=split_parts


Through some trial and error, we found that whether the test preparation course was completed does not add much explanatory power to the model (the adjusted R2 stays low, at about 0.7), whereas removing those variables increases both R2 and adjusted R2 to about 0.85. This is a vast improvement!

# Building Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split.
reg=LinearRegression()
reg.fit(X_tr,Y_tr)
coef=reg.coef_

# Predict through the estimator so the fitted intercept is included. Summing
# coefficient * feature by hand left reg.intercept_ out of every prediction.
# The result is wrapped in a Series on X_te's index so y_pred stays a Series
# aligned with Y_te, as it was before.
y_pred=pd.Series(reg.predict(X_te),index=X_te.index)

# Predicted vs. actual math score on the held-out rows, against reading score.
plt.scatter(X_te['reading score'],y_pred,color='k',label='predicted')
plt.scatter(X_te['reading score'],Y_te,color='b',label='actual')
plt.xlabel('reading score (standardized)')
plt.ylabel('math score (standardized)')
plt.legend()

# Quality check of the model

In [None]:
import statsmodels.api as sm

# Build the training design matrix with an explicit intercept column and fit
# OLS on it. It gets its own name: the previous code assigned it to X_te, which
# overwrote the held-out test features with training rows and then fitted the
# model without the constant it had just added.
# astype(float) guards against bool indicator columns, which would make the
# frame convert to an object-dtype array.
X_tr_const=sm.add_constant(X_tr.astype(float))
model=sm.OLS(Y_tr,X_tr_const).fit()
model.summary()

The R2 and adjusted R2 values are nearly equal to 0.85. This indicates absence of non-essential independent variables in the model. The model explains 85% of the actual variance in the math score.  

P-value of F-test is 0 and this indicates that the multi linear regression model predicts better than intercept only model. 

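The Durbin-Watson statistic is around 2, which is acceptable: it indicates negligible autocorrelation in the residuals. Note that Durbin-Watson does not measure multi-collinearity; that should be checked separately, for example with the condition number reported in the summary or with variance inflation factors.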

Skewness is negative, which indicates that the outliers in the predicted dataset are on the lower side. Its absolute value of around 0 indicates a near-normal distribution with slight skewness. This is further supported by the kurtosis value of around 3.

In [None]:
from sklearn.metrics import r2_score

# r2_score expects (y_true, y_pred). R2 is not symmetric in its arguments, so
# the observed test scores go first and the model's predictions second.
print("R2 score without regularization - test data= ",r2_score(Y_te,y_pred))

In [None]:
from sklearn.linear_model import Ridge,Lasso, ElasticNet

print("After reqularization - train data")

# The three penalised linear models, all with the same penalty strength.
# Each is fitted on the training split and scored on that same split.
regularized_models=[
    ("Ridge",Ridge(alpha=0.05)),
    ("Lasso",Lasso(alpha=0.05)),
    ("Elastic Net",ElasticNet(alpha=0.05)),
]
for label,estimator in regularized_models:
    estimator.fit(X_tr,Y_tr)
    y_reg=estimator.predict(X_tr)
    # r2_score expects (y_true, y_pred): observed values first, predictions
    # second. The arguments were previously passed the other way round.
    print("R2 score with "+label+" regression=",r2_score(Y_tr,y_reg))

Lasso regression gives the lowest R2 value because it shrinks the coefficients of a few independent variables to zero, which amounts to feature selection. However, as we observed earlier, R2 and adjusted R2 are both nearly equal to 0.85 in this case, so there are hardly any unnecessary features that need to be selected and removed. Thus, out of the three techniques, Ridge regression would be the best choice if regularization is to be applied.

We note that the model gives an R2 value of 0.85 with test data and 0.83 with train data. The reduction is minimal. So, overall, regularization would not be of much benefit in this case.

# Results

**For male students:**

*If free lunch is taken,*
   Math score = 0.4 x RS + 0.5 x WS - 0.3


*If standard lunch is taken,*
   Math score = 0.4 x RS + 0.5 x WS + 0.5


**For female students:**

*If free lunch is taken,*
Math score = 0.4 x RS + 0.5 x WS - 0.5


*If standard lunch is taken,*
Math score = 0.4 x RS + 0.5 x WS - 0.3

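where

- **RS:** standardized reading score (Z-score)
- **WS:** standardized writing score (Z-score)
- **Math score:** also a standardized value (Z-score), because the math score was scaled together with the reading and writing scores before the model was fitted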
