## Multiple Linear Regression with statsmodels

In [11]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd

from sklearn import datasets, model_selection


In [12]:
# Load the Boston housing data set that ships with scikit-learn.
# NOTE(review): load_boston was deprecated and later removed from newer
# scikit-learn releases — confirm the environment pins a version that has it.
boston = datasets.load_boston()

# Target values as returned by the loader; features wrapped in a DataFrame so
# that every column carries its feature name.
y = boston.target
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)


In [13]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.7)

In [14]:
# Ordinary least squares on the training split, using all feature columns.
# add_constant supplies the intercept column, which the summary reports as
# `const`.
model = sm.OLS(endog=y_train, exog=sm.add_constant(X_train))
res = model.fit()

# Plain-text regression table (coefficients, standard errors, fit statistics).
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.718
Model:                            OLS   Adj. R-squared:                  0.708
Method:                 Least Squares   F-statistic:                     66.71
Date:                Fri, 18 Aug 2017   Prob (F-statistic):           3.30e-85
Time:                        17:35:58   Log-Likelihood:                -1050.6
No. Observations:                 354   AIC:                             2129.
Df Residuals:                     340   BIC:                             2183.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         42.8182      5.999      7.138      0.0

In [15]:
res.pvalues

const      5.774031e-12
CRIM       1.277064e-01
ZN         5.644465e-02
INDUS      5.833678e-01
CHAS       6.790150e-02
NOX        3.373860e-04
RM         4.046170e-10
AGE        5.111047e-01
DIS        5.447579e-09
RAD        1.312844e-03
TAX        1.641811e-02
PTRATIO    5.109053e-10
B          1.501062e-02
LSTAT      2.456468e-16
dtype: float64

In [16]:
res.pvalues < 0.05

const       True
CRIM       False
ZN         False
INDUS      False
CHAS       False
NOX         True
RM          True
AGE        False
DIS         True
RAD         True
TAX         True
PTRATIO     True
B           True
LSTAT       True
dtype: bool

In [17]:
# Build one frame holding the training features plus the training target as a
# PRICE column; the formula-based fits below read both from `dat`.
# assign() works on a copy, so X_train itself is left untouched.
dat = X_train.assign(PRICE=y_train)
dat.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
184,0.08308,0.0,2.46,0.0,0.488,5.604,89.8,2.9879,3.0,193.0,17.8,391.0,13.98,26.4
307,0.04932,33.0,2.18,0.0,0.472,6.849,70.3,3.1827,7.0,222.0,18.4,396.9,7.53,28.2
288,0.0459,52.5,5.32,0.0,0.405,6.315,45.6,7.3172,6.0,293.0,16.6,396.9,7.6,22.3
137,0.35233,0.0,21.89,0.0,0.624,6.454,98.4,1.8498,4.0,437.0,21.2,394.08,14.59,17.1
292,0.03615,80.0,4.95,0.0,0.411,6.63,23.4,5.1167,4.0,245.0,19.2,396.9,4.7,27.9


In [18]:
# Refit through the formula interface on a hand-picked subset of predictors
# (AGE, TAX and LSTAT are left out). The formula adds the intercept itself,
# reported as `Intercept` in the summary.
# NOTE(review): in the stored p-value output above, TAX and LSTAT are below
# 0.05 while CRIM, ZN, INDUS and CHAS are not — confirm this selection is
# intentional.
results = smf.ols(
    formula='PRICE ~ CRIM + ZN + INDUS + CHAS + NOX + RM + DIS + RAD + PTRATIO + B',
    data=dat,
).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.642
Model:                            OLS   Adj. R-squared:                  0.632
Method:                 Least Squares   F-statistic:                     61.60
Date:                Fri, 18 Aug 2017   Prob (F-statistic):           1.73e-70
Time:                        17:36:25   Log-Likelihood:                -1092.9
No. Observations:                 354   AIC:                             2208.
Df Residuals:                     343   BIC:                             2250.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     27.4607      6.453      4.256      0.0

In [19]:
# Minimal two-predictor model (crime rate and zoning only) fitted on the same
# training frame. This rebinds `results`, replacing the previous fit.
results = smf.ols(
    formula='PRICE ~ CRIM + ZN',
    data=dat,
).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.231
Model:                            OLS   Adj. R-squared:                  0.227
Method:                 Least Squares   F-statistic:                     52.72
Date:                Fri, 18 Aug 2017   Prob (F-statistic):           9.52e-21
Time:                        17:36:28   Log-Likelihood:                -1228.4
No. Observations:                 354   AIC:                             2463.
Df Residuals:                     351   BIC:                             2474.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     22.4943      0.518     43.419      0.0