In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [2]:
from sklearn.linear_model import LinearRegression

In [3]:
# Load the dataset from a CSV file; the path is relative, so the file must sit
# in the notebook's working directory.
data = pd.read_csv('1.02. Multiple linear regression.csv')

In [4]:
data.head()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
0,1714,1,2.4
1,1664,3,2.52
2,1760,3,2.54
3,1685,3,2.74
4,1693,2,2.83


In [5]:
data.describe()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
count,84.0,84.0,84.0
mean,1845.27381,2.059524,3.330238
std,104.530661,0.855192,0.271617
min,1634.0,1.0,2.4
25%,1772.0,1.0,3.19
50%,1846.0,2.0,3.38
75%,1934.0,3.0,3.5025
max,2050.0,3.0,3.81


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 3 columns):
SAT           84 non-null int64
Rand 1,2,3    84 non-null int64
GPA           84 non-null float64
dtypes: float64(1), int64(2)
memory usage: 2.0 KB


In [7]:
# Predictors (independent variables): the 'SAT' and 'Rand 1,2,3' columns.
# Selecting with a list keeps the result a two-column DataFrame.
x = data[['SAT', 'Rand 1,2,3']]

In [8]:
# Target (dependent variable): the 'GPA' column as a one-dimensional Series.
y = data['GPA']

In [9]:
x.shape

(84, 2)

In [10]:
y.shape

(84,)

In [11]:
# Create an (as yet unfitted) linear regression estimator with default settings.
reg = LinearRegression()

In [12]:
# Fit the multiple linear regression of y (GPA) on the two predictor columns in x.
reg.fit(x, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# Fitted coefficients, one per column of x (here: 'SAT', then 'Rand 1,2,3')
reg.coef_

array([ 0.00165354, -0.00826982])

In [14]:
# Fitted intercept (constant term) of the regression
reg.intercept_

0.29603261264909353

In [15]:
# R-squared: a common measure of goodness of fit.
# Note: it is evaluated here on the same x and y the model was fitted on.
reg.score(x,y)

0.40668119528142815

In [16]:
#Adjusted R-squared: sklearn has no built-in method
#to compute the adjusted R-squared
#Formula for adjusted R-squared (given in the next cell)

In [17]:
### Formula for Adjusted R^2
# $R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

### Formula for Adjusted R^2
$R^2_{adj.} = 1 - (1-R^2)*\frac{n-1}{n-p-1}$

In [18]:
#n = number of observations (84 here)
#p = number of predictors/independent variables (2 here)

In [19]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)

# R^2 of the fitted model, evaluated on the data it was fitted on
r2 = reg.score(x,y)

# x has one row per observation and one column per predictor,
# so its shape gives n (observations) and p (predictors) directly
n, p = x.shape

# Apply the adjustment for the number of predictors
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

In [20]:
adjusted_r2

0.39203134825134

In [26]:
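#Conclusion: Adjusted R-square (0.392) < R-square (0.407).
#Adjusted R-square penalizes every added predictor, so it is lower than
#R-square whenever there is at least one predictor; on its own the gap
#only hints that one or more of the predictors may have little or no
#explanatory power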

### Feature Selection

In [27]:
#Identifying the non-necessary predictors: Feature Selection
#This is something we already did. 
#Remember the statsmodels regression table: it gives
#a p-value for each feature. A p-value < 0.05 shows
#the feature is important. 
#We will do the same here and find the p-values with sklearn
#There is no built-in package/method for that in sklearn as 
#sklearn has evolved more as a machine learning package
#rather than a statistical package

In [28]:
#A closely related tool is available in sklearn's feature selection
#module: feature_selection.f_regression
#f_regression: fits a simple linear regression of the dependent
#variable on each feature separately

#For our GPA SAT Multiple regression, it will convert to 2 regressions
#1. GPA <- SAT
#2. GPA <- Rand 1,2,3
#Then method will calculate the f-statistics for each and return
#the p-values
#If there were 50 features, 50 simple regression will be created

#Note: in a simple linear regression the p-value of the F-statistic coincides with
#the p-value of the single independent variable, hence this method is precisely what we need

In [29]:
from sklearn.feature_selection import f_regression

In [30]:
f_regression(x,y)

(array([56.04804786,  0.17558437]), array([7.19951844e-11, 6.76291372e-01]))

In [31]:
#Two output arrays: 
#First array contains: F-statistics for each regression
#Second array contains: Corresponding p-values

In [33]:
# f_regression returns a pair (F-statistics, p-values), one entry per feature;
# unpack both and use the p-values from here on
f_statistics, p_values = f_regression(x,y)

In [34]:
p_values

array([7.19951844e-11, 6.76291372e-01])

In [35]:
#7.19951844e-11 and 6.76291372e-01 are written in scientific notation
#e-11 means "times 10 to the power -11" = 1/(10^11), so 7.19951844e-11 = 0.0000000000719951844
#e-01 means "times 10 to the power -1" = 1/10, so 6.76291372e-01 = 0.676291372

In [39]:
# Round the p-values to 3 decimal places for readability
# (round returns a new array; p_values itself keeps full precision)
p_values.round(3)

array([0.   , 0.676])

In [41]:
#p-value of SAT = 0.000 (useful feature)
#p-value of Rand 1,2,3 = 0.676 (not a useful feature)

In [42]:
#Note that these are univariate p-values (i.e. one feature at a time) from 
#simple linear models. They do not reflect the interconnection
#of the features in our multiple linear regression.
#Therefore, f_regression must be used with caution

### Create a Summary Table

In [43]:
# Start the summary table with one row per predictor, named after the columns of x
reg_summary = pd.DataFrame({'Features': x.columns.values})

In [44]:
reg_summary

Unnamed: 0,Features
0,SAT
1,"Rand 1,2,3"


In [45]:
# Attach the fitted coefficients and the f_regression p-values (rounded to
# 3 decimal places) to the summary table, one value per feature row.
# The dict is unpacked because 'p-values' is not a valid keyword name.
reg_summary = reg_summary.assign(**{
    'Coefficients': reg.coef_,
    'p-values': p_values.round(3),
})

In [46]:
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,SAT,0.001654,0.0
1,"Rand 1,2,3",-0.00827,0.676


In [49]:
# The model has a single intercept, but assigning a scalar to a column
# repeats that same value in every row of the table
reg_summary['Intercept'] = reg.intercept_

In [50]:
reg_summary

Unnamed: 0,Features,Coefficients,p-values,Intercept
0,SAT,0.001654,0.0,0.296033
1,"Rand 1,2,3",-0.00827,0.676,0.296033
