# ISLR chapter 3 Linear Regression

## Load data

In [1]:
# Setup Matplotlib to display in page instead of new window
%matplotlib inline

In [6]:
# Install library
!pip install statsmodels

Looking in indexes: http://mirrors.aliyun.com/pypi/simple/
Collecting statsmodels
[?25l  Downloading http://mirrors.aliyun.com/pypi/packages/22/34/f32c5812145d80bdb5e92af73e2173d08012379d52a30e87bef5a8b1b6e1/statsmodels-0.9.0-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (9.6MB)
[K    100% |████████████████████████████████| 9.6MB 1.4MB/s ta 0:00:01    40% |█████████████                   | 3.9MB 1.8MB/s eta 0:00:04    68% |██████████████████████          | 6.6MB 1.1MB/s eta 0:00:03
[?25hCollecting patsy (from statsmodels)
[?25l  Downloading http://mirrors.aliyun.com/pypi/packages/5d/eb/92c4b45ca47a2dd1339c958636e083b50ffadb5162a599a1cbbe92f89832/patsy-0.5.0-py2.py3-none-any.whl (232kB)
[K    100% |████████████████████████████████| 235kB 1.8MB/s ta 0:00:01
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.0 statsmodels-0.9.0


In [17]:
import numpy as np
import pandas as pd

In [51]:
# Read Boston House Pricing data
data_train = pd.read_csv('./data/boston.csv')
data_train.head()

# We will seek to predict medv using 13 predictors such as 
    # rm (average number of Rooms per house), 
    # age (average Age of houses), and 
    # lstat (percent of households with Low Socioeconomic Status).

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## Fit data to Linear Model

In [34]:
# Create Model: Simple Linear Regression
#    OLS: Ordinary Least Squares
#    variable x: lstat
#    result y: medv

from statsmodels.api import OLS as ols

linear_model = ols.from_formula('medv ~ lstat', data=data_train)
fit = linear_model.fit()

# Display prediction
fit.summary()

0,1,2,3
Dep. Variable:,medv,R-squared:,0.544
Model:,OLS,Adj. R-squared:,0.543
Method:,Least Squares,F-statistic:,601.6
Date:,"Mon, 22 Oct 2018",Prob (F-statistic):,5.08e-88
Time:,23:16:48,Log-Likelihood:,-1641.5
No. Observations:,506,AIC:,3287.0
Df Residuals:,504,BIC:,3295.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,34.5538,0.563,61.415,0.000,33.448,35.659
lstat,-0.9500,0.039,-24.528,0.000,-1.026,-0.874

0,1,2,3
Omnibus:,137.043,Durbin-Watson:,0.892
Prob(Omnibus):,0.0,Jarque-Bera (JB):,291.373
Skew:,1.453,Prob(JB):,5.36e-64
Kurtosis:,5.319,Cond. No.,29.7


In [36]:
# Try to get some values from the Fit informations
# rsquared -> R², fvalue -> F-statistic, Intercept, variable -> lstat
fit.rsquared, fit.fvalue, fit.params.Intercept, fit.params.lstat

(0.5441462975864799, 601.6178711098955, 34.55384087938308, -0.9500493537579922)

In [49]:
# Get basic statistics from the model
fit.resid.describe()

count    5.060000e+02
mean     3.521821e-14
std      6.209603e+00
min     -1.516745e+01
25%     -3.989612e+00
50%     -1.318186e+00
75%      2.033701e+00
max      2.450013e+01
dtype: float64

## Make predictions

In [37]:
# Values [Intercept, lstat]
values_test = [[1, 5], [1, 10], [1, 15]]
data_test = pd.DataFrame(values_test, columns=['Intercept', 'lstat'])

In [44]:
# Prediction on "lstat", based on "Intercept"
fit.predict(data_test)

0    29.803594
1    25.053347
2    20.303101
dtype: float64

## Get Confidence Interval of Prediction

In [45]:
# Setup Confidence Level 𝞪=95% 
fit.conf_int(alpha=0.95)

Unnamed: 0,0,1
Intercept,34.518543,34.589139
lstat,-0.952479,-0.947619
