## Multiple Linear Regression with statsmodels

In [15]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd

from sklearn import datasets, model_selection


In [16]:
# NOTE(review): `load_boston` was deprecated and later removed from scikit-learn;
# this cell presumably needs an older release to run -- confirm the pinned version.
boston = datasets.load_boston()

# Target is taken as-is from the loaded dataset object; the feature matrix is
# wrapped in a DataFrame labelled with the dataset's own feature names.
y = boston.target
X = pd.DataFrame(data=boston.data, columns=boston.feature_names)


In [17]:
# Hold out part of the data: 70% of the rows go to the training set.
# The seed is fixed so the split -- and every coefficient, p-value and summary
# table derived from the training rows -- is the same on each fresh run.
# Without `random_state` a new random partition is drawn every time.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.7, random_state=SPLIT_SEED
)



In [18]:
# Ordinary least squares on the training rows using every feature column.
# `add_constant` supplies the intercept column reported as `const` in the summary.
model = sm.OLS(endog=y_train, exog=sm.add_constant(X_train))
res = model.fit()

# Plain-text regression table for the full model.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.767
Model:                            OLS   Adj. R-squared:                  0.758
Method:                 Least Squares   F-statistic:                     86.20
Date:                Sat, 30 Sep 2017   Prob (F-statistic):           4.05e-99
Time:                        18:19:02   Log-Likelihood:                -998.34
No. Observations:                 354   AIC:                             2025.
Df Residuals:                     340   BIC:                             2079.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         29.7434      5.252      5.663      0.0

In [19]:
# p-value of each fitted coefficient in `res` (the `const` intercept plus one per feature column).
res.pvalues

const      3.172061e-08
CRIM       2.277476e-04
ZN         4.867809e-03
INDUS      6.008156e-01
CHAS       9.875179e-02
NOX        1.058637e-02
RM         1.024719e-16
AGE        5.710887e-01
DIS        1.308529e-08
RAD        1.452952e-04
TAX        4.738253e-04
PTRATIO    9.851339e-09
B          2.262158e-03
LSTAT      2.526541e-15
dtype: float64

In [20]:
# Boolean mask over the coefficients: True where the p-value is below the 0.05 threshold.
res.pvalues < 0.05

const       True
CRIM        True
ZN          True
INDUS      False
CHAS       False
NOX         True
RM          True
AGE        False
DIS         True
RAD         True
TAX         True
PTRATIO     True
B           True
LSTAT       True
dtype: bool

In [21]:
# Single frame holding the training features plus the training target as a
# PRICE column, so the formula-based models can refer to columns by name.
# `assign` returns a new DataFrame, leaving X_train itself unchanged.
dat = X_train.assign(PRICE=y_train)
dat.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
28,0.77299,0.0,8.14,0.0,0.538,6.495,94.4,4.4547,4.0,307.0,21.0,387.94,12.8,18.4
59,0.10328,25.0,5.13,0.0,0.453,5.927,47.2,6.932,8.0,284.0,19.7,396.9,9.22,19.6
265,0.76162,20.0,3.97,0.0,0.647,5.56,62.8,1.9865,5.0,264.0,13.0,392.4,10.45,22.8
208,0.13587,0.0,10.59,1.0,0.489,6.064,59.1,4.2392,4.0,277.0,18.6,381.32,14.66,24.4
483,2.81838,0.0,18.1,0.0,0.532,5.762,40.3,4.0983,24.0,666.0,20.2,392.92,10.42,21.8


In [22]:
# Ten-predictor model specified through the formula interface on `dat`.
# NOTE(review): this formula keeps INDUS and CHAS, which the p < 0.05 mask marks
# as not significant, and leaves out TAX and LSTAT, which it marks as significant.
# Confirm the predictor list is the intended one.
reduced_formula = 'PRICE ~ CRIM + ZN + INDUS + CHAS + NOX + RM + DIS + RAD + PTRATIO + B'
reduced_model = smf.ols(formula=reduced_formula, data=dat)
results = reduced_model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.699
Model:                            OLS   Adj. R-squared:                  0.690
Method:                 Least Squares   F-statistic:                     79.49
Date:                Sat, 30 Sep 2017   Prob (F-statistic):           4.42e-83
Time:                        18:19:02   Log-Likelihood:                -1044.1
No. Observations:                 354   AIC:                             2110.
Df Residuals:                     343   BIC:                             2153.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     13.4544      5.609      2.399      0.0

In [23]:
# Two-predictor model (CRIM and ZN only) fitted on the same training frame.
# `results` is rebound here, replacing the fit from the previous formula cell.
two_term_formula = 'PRICE ~ CRIM + ZN'
two_term_model = smf.ols(formula=two_term_formula, data=dat)
results = two_term_model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.280
Model:                            OLS   Adj. R-squared:                  0.276
Method:                 Least Squares   F-statistic:                     68.41
Date:                Sat, 30 Sep 2017   Prob (F-statistic):           8.18e-26
Time:                        18:19:02   Log-Likelihood:                -1198.1
No. Observations:                 354   AIC:                             2402.
Df Residuals:                     351   BIC:                             2414.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     21.7449      0.474     45.844      0.0