## Multiple Linear Regression with statsmodels

In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd

from sklearn import datasets, model_selection


In [2]:
boston = datasets.load_boston()

X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target


In [3]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.7)

In [4]:
model = sm.OLS(y_train, sm.add_constant(X_train))
res = model.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.728
Model:                            OLS   Adj. R-squared:                  0.718
Method:                 Least Squares   F-statistic:                     70.06
Date:                Fri, 30 Oct 2020   Prob (F-statistic):           8.50e-88
Time:                        09:04:53   Log-Likelihood:                -1053.8
No. Observations:                 354   AIC:                             2136.
Df Residuals:                     340   BIC:                             2190.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         44.0878      6.177      7.137      0.0

In [5]:
res.pvalues

const      5.793213e-12
CRIM       8.869828e-04
ZN         1.666095e-04
INDUS      4.426706e-01
CHAS       7.514282e-03
NOX        1.405692e-04
RM         4.045495e-09
AGE        6.779796e-01
DIS        1.009368e-12
RAD        3.110157e-06
TAX        1.750243e-03
PTRATIO    8.928530e-09
B          1.037824e-02
LSTAT      2.781755e-15
dtype: float64

In [6]:
res.pvalues < 0.05

const       True
CRIM        True
ZN          True
INDUS      False
CHAS        True
NOX         True
RM          True
AGE        False
DIS         True
RAD         True
TAX         True
PTRATIO     True
B           True
LSTAT       True
dtype: bool

In [7]:
dat = X_train.copy()
dat['PRICE'] = y_train
dat.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
406,20.7162,0.0,18.1,0.0,0.659,4.138,100.0,1.1781,24.0,666.0,20.2,370.22,23.34,11.9
257,0.61154,20.0,3.97,0.0,0.647,8.704,86.9,1.801,5.0,264.0,13.0,389.7,5.12,50.0
219,0.11425,0.0,13.89,1.0,0.55,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.5,23.0
169,2.44953,0.0,19.58,0.0,0.605,6.402,95.2,2.2625,5.0,403.0,14.7,330.04,11.32,22.3
132,0.59005,0.0,21.89,0.0,0.624,6.372,97.9,2.3274,4.0,437.0,21.2,385.76,11.12,23.0


In [8]:
results = smf.ols('PRICE ~ CRIM + ZN + INDUS + CHAS + NOX + RM + DIS + RAD + PTRATIO + B', data=dat).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.658
Model:                            OLS   Adj. R-squared:                  0.648
Method:                 Least Squares   F-statistic:                     66.05
Date:                Fri, 30 Oct 2020   Prob (F-statistic):           8.00e-74
Time:                        09:04:58   Log-Likelihood:                -1094.4
No. Observations:                 354   AIC:                             2211.
Df Residuals:                     343   BIC:                             2253.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     26.6504      6.555      4.065      0.0

In [9]:
results = smf.ols('PRICE ~ CRIM + ZN', data=dat).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  PRICE   R-squared:                       0.237
Model:                            OLS   Adj. R-squared:                  0.232
Method:                 Least Squares   F-statistic:                     54.39
Date:                Fri, 30 Oct 2020   Prob (F-statistic):           2.66e-21
Time:                        09:04:59   Log-Likelihood:                -1236.6
No. Observations:                 354   AIC:                             2479.
Df Residuals:                     351   BIC:                             2491.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     22.0958      0.517     42.717      0.0