## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Dataset

In [20]:
data_set = pd.read_csv('student-mat.csv', sep=';')

In [23]:
data_set[['G1', 'G2', 'G3', 'studytime', 'failures', 'activities', 'internet', 'health', 'absences']].head()

Unnamed: 0,G1,G2,G3,studytime,failures,activities,internet,health,absences
0,5,6,6,2,0,no,no,3,6
1,5,5,6,2,0,no,yes,3,4
2,7,8,10,2,3,no,yes,3,10
3,15,14,15,3,0,yes,yes,5,2
4,6,10,10,2,0,no,no,5,4


## Divide into dependent and independent variables

In [32]:
X = data_set.loc[:, ['G1', 'G2', 'studytime', 'failures', 'activities', 'internet', 'health', 'absences']].values
Y = data_set['G3'].values.reshape(-1, 1)

## Encode categorical features

In [42]:
from sklearn.preprocessing import LabelEncoder
l_e = LabelEncoder()
X[:, 4] = l_e.fit_transform(X[:, 4])  # encoded activities
X[:, 5] = l_e.fit_transform(X[:, 5])  # encoded internet

In [46]:
X = np.append(arr=np.ones((len(X), 1)).astype(int), values=X, axis=1)  # appended ones at front of the matrix

## Apply backward elemination algorithm

In [48]:
from statsmodels.regression.linear_model import OLS
X = np.array(X, dtype=float)
x_temp = X[:, [0,1,2,3,4,5,6,7]]
ols = OLS(endog=Y, exog=x_temp)
l_r = ols.fit()
l_r.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.826
Model:,OLS,Adj. R-squared:,0.823
Method:,Least Squares,F-statistic:,263.0
Date:,"Mon, 15 Jun 2020",Prob (F-statistic):,8.15e-143
Time:,18:28:19,Log-Likelihood:,-815.49
No. Observations:,395,AIC:,1647.0
Df Residuals:,387,BIC:,1679.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.4122,0.556,-2.538,0.012,-2.506,-0.318
x1,0.1492,0.056,2.642,0.009,0.038,0.260
x2,0.9811,0.050,19.626,0.000,0.883,1.079
x3,-0.1737,0.119,-1.462,0.145,-0.407,0.060
x4,-0.2772,0.142,-1.955,0.051,-0.556,0.002
x5,-0.2887,0.195,-1.478,0.140,-0.673,0.095
x6,-0.0342,0.263,-0.130,0.897,-0.552,0.484
x7,0.0869,0.071,1.232,0.219,-0.052,0.226

0,1,2,3
Omnibus:,227.185,Durbin-Watson:,1.872
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1450.963
Skew:,-2.475,Prob(JB):,0.0
Kurtosis:,10.979,Cond. No.,97.7


In [49]:
from statsmodels.regression.linear_model import OLS
X = np.array(X, dtype=float)
x_temp = X[:, [0,1,2,3,4,5,7]]
ols = OLS(endog=Y, exog=x_temp)
l_r = ols.fit()
l_r.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.826
Model:,OLS,Adj. R-squared:,0.824
Method:,Least Squares,F-statistic:,307.6
Date:,"Mon, 15 Jun 2020",Prob (F-statistic):,4.4900000000000006e-144
Time:,18:31:57,Log-Likelihood:,-815.5
No. Observations:,395,AIC:,1645.0
Df Residuals:,388,BIC:,1673.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.4390,0.516,-2.788,0.006,-2.454,-0.424
x1,0.1497,0.056,2.659,0.008,0.039,0.260
x2,0.9805,0.050,19.746,0.000,0.883,1.078
x3,-0.1743,0.119,-1.470,0.142,-0.407,0.059
x4,-0.2768,0.142,-1.955,0.051,-0.555,0.002
x5,-0.2898,0.195,-1.486,0.138,-0.673,0.094
x6,0.0875,0.070,1.245,0.214,-0.051,0.226

0,1,2,3
Omnibus:,227.129,Durbin-Watson:,1.871
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1449.704
Skew:,-2.474,Prob(JB):,0.0
Kurtosis:,10.975,Cond. No.,89.5


In [50]:
from statsmodels.regression.linear_model import OLS
X = np.array(X, dtype=float)
x_temp = X[:, [0,1,2,3,4,5]]
ols = OLS(endog=Y, exog=x_temp)
l_r = ols.fit()
l_r.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.826
Model:,OLS,Adj. R-squared:,0.823
Method:,Least Squares,F-statistic:,368.2
Date:,"Mon, 15 Jun 2020",Prob (F-statistic):,4.81e-145
Time:,18:32:20,Log-Likelihood:,-816.29
No. Observations:,395,AIC:,1645.0
Df Residuals:,389,BIC:,1668.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.0926,0.435,-2.511,0.012,-1.948,-0.237
x1,0.1516,0.056,2.692,0.007,0.041,0.262
x2,0.9765,0.050,19.693,0.000,0.879,1.074
x3,-0.1837,0.118,-1.551,0.122,-0.417,0.049
x4,-0.2717,0.142,-1.918,0.056,-0.550,0.007
x5,-0.2812,0.195,-1.442,0.150,-0.665,0.102

0,1,2,3
Omnibus:,232.73,Durbin-Watson:,1.88
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1548.203
Skew:,-2.534,Prob(JB):,0.0
Kurtosis:,11.269,Cond. No.,74.5


In [51]:
from statsmodels.regression.linear_model import OLS
X = np.array(X, dtype=float)
x_temp = X[:, [0,1,2,3,4]]
ols = OLS(endog=Y, exog=x_temp)
l_r = ols.fit()
l_r.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.825
Model:,OLS,Adj. R-squared:,0.823
Method:,Least Squares,F-statistic:,458.5
Date:,"Mon, 15 Jun 2020",Prob (F-statistic):,5.9499999999999994e-146
Time:,18:34:18,Log-Likelihood:,-817.34
No. Observations:,395,AIC:,1645.0
Df Residuals:,390,BIC:,1665.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.1991,0.429,-2.792,0.005,-2.043,-0.355
x1,0.1503,0.056,2.666,0.008,0.039,0.261
x2,0.9765,0.050,19.666,0.000,0.879,1.074
x3,-0.1967,0.118,-1.663,0.097,-0.429,0.036
x4,-0.2630,0.142,-1.856,0.064,-0.542,0.016

0,1,2,3
Omnibus:,235.539,Durbin-Watson:,1.875
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1601.529
Skew:,-2.564,Prob(JB):,0.0
Kurtosis:,11.427,Cond. No.,73.2


## Divide the dependent and independent features we get from backward elemination algorithm

In [74]:
new_x = data_set[['G1', 'G2', 'studytime', 'failures']].values
new_y = data_set['G3'].values.reshape(-1, 1)

## Divide data into train and test set

In [75]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(new_x, new_y, train_size=0.8, test_size=0.2)

## Train model

In [78]:
from sklearn.linear_model import LinearRegression
l_r = LinearRegression()
l_r.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [79]:
l_r.coef_  # equation coefficients

array([[ 0.14264609,  0.9817479 , -0.04227558, -0.30802452]])

In [80]:
l_r.intercept_   # equation intersept

array([-1.50115851])

## Predict new test values

In [81]:
predicted_y = l_r.predict(x_test)

## Analyse results Test values v/s Predicted values

In [101]:
print('Test values    Predicted values')
for ry, py in zip(y_test ,predicted_y):
    print('   ', ry[0], '            ',round(py[0], 1))

Test values    Predicted values
    17              15.3
    12              11.9
    10              8.4
    11              10.8
    10              8.6
    0              7.4
    14              14.2
    11              11.8
    6              6.5
    14              14.3
    15              14.3
    18              18.6
    10              8.6
    10              8.5
    10              8.5
    10              9.7
    14              14.2
    11              9.8
    12              10.9
    0              -1.8
    10              9.1
    9              9.4
    0              8.6
    0              8.6
    5              3.9
    10              10.7
    13              13.0
    0              7.3
    17              16.5
    15              15.4
    13              12.6
    0              -1.8
    11              8.9
    11              8.7
    11              10.7
    13              11.9
    15              15.3
    14              11.9
    11              10.8
    10             

## Predict new value (other than test set)

In [142]:
G1 = 16
G2 = 14
studytime = 2
failure = 0
prediction = l_r.predict([[G1, G2, studytime, failure]])
prediction = round(prediction[0][0], 1)

In [143]:
if prediction > 20:
    prediction = 20
elif prediction < 0:
    prediction = 0
print('G3: ', prediction)

G3:  14.4
