In [1]:
import statsmodels.api as sm
from sklearn import datasets
# Load the Boston house prices dataset through scikit-learn's dataset helpers.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on an older release — confirm the pinned version.
data = datasets.load_boston() 

In [3]:
# Show the description text shipped with the dataset (size, attribute list).
print(data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# Wrap the raw feature matrix in a DataFrame, one named column per attribute.
df = pd.DataFrame(data=data.data, columns=data.feature_names)
# Keep the response in its own single-column frame; the dataset description
# lists the median value (MEDV) as the usual target.
target = pd.DataFrame(data=data.target, columns=["MEDV"])


In [7]:
# Single predictor (RM, average number of rooms per dwelling) and the response.
x, y = df["RM"], target["MEDV"]

In [8]:
# Ordinary least squares of MEDV on RM alone.
# Gotcha: sm.OLS does not add an intercept on its own, so this is a fit through
# the origin (the summary shows a single RM coefficient and no const row).
# Use sm.add_constant(x) if an intercept is wanted.
model = sm.OLS(y, x).fit()
# Predict on the same single-feature input the model was fitted on. The
# previous code passed `X`, which is not defined until a later cell and would
# raise NameError when the notebook is run top to bottom on a fresh kernel.
predictions = model.predict(x)

In [10]:
# Render the statsmodels fit report for the current `model` as the cell output.
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.901
Model:,OLS,Adj. R-squared:,0.901
Method:,Least Squares,F-statistic:,4615.0
Date:,"Tue, 23 Jul 2019",Prob (F-statistic):,3.7399999999999996e-256
Time:,12:28:24,Log-Likelihood:,-1747.1
No. Observations:,506,AIC:,3496.0
Df Residuals:,505,BIC:,3500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,3.6534,0.054,67.930,0.000,3.548,3.759

0,1,2,3
Omnibus:,83.295,Durbin-Watson:,0.493
Prob(Omnibus):,0.0,Jarque-Bera (JB):,152.507
Skew:,0.955,Prob(JB):,7.649999999999999e-34
Kurtosis:,4.894,Cond. No.,1.0


In [18]:
# Two-predictor OLS: MEDV regressed on RM and LSTAT.
# No constant column is added, so the model is fitted without an intercept.
X = df[['RM', 'LSTAT']]
y = target['MEDV']
model = sm.OLS(endog=y, exog=X).fit()
# In-sample predictions on the training design matrix.
predictions = model.predict(exog=X)
# Last expression of the cell: display the fit report.
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,4637.0
Date:,"Tue, 23 Jul 2019",Prob (F-statistic):,0.0
Time:,12:42:46,Log-Likelihood:,-1582.9
No. Observations:,506,AIC:,3170.0
Df Residuals:,504,BIC:,3178.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,4.9069,0.070,69.906,0.000,4.769,5.045
LSTAT,-0.6557,0.031,-21.458,0.000,-0.716,-0.596

0,1,2,3
Omnibus:,145.153,Durbin-Watson:,0.834
Prob(Omnibus):,0.0,Jarque-Bera (JB):,442.157
Skew:,1.351,Prob(JB):,9.7e-97
Kurtosis:,6.698,Cond. No.,4.72


In [41]:
# Five-predictor OLS: RM, LSTAT, PTRATIO, INDUS and NOX against MEDV.
# No constant column is added, so the model is fitted without an intercept.
X = df[['RM', 'LSTAT', 'PTRATIO', 'INDUS', 'NOX']]
y = target['MEDV']
model = sm.OLS(endog=y, exog=X).fit()
# In-sample predictions on the training design matrix.
predictions = model.predict(exog=X)
# Last expression of the cell: display the fit report.
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.952
Model:,OLS,Adj. R-squared:,0.952
Method:,Least Squares,F-statistic:,1993.0
Date:,"Tue, 23 Jul 2019",Prob (F-statistic):,0.0
Time:,12:53:20,Log-Likelihood:,-1564.1
No. Observations:,506,AIC:,3138.0
Df Residuals:,501,BIC:,3159.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,6.1961,0.285,21.716,0.000,5.636,6.757
LSTAT,-0.4939,0.050,-9.872,0.000,-0.592,-0.396
PTRATIO,-0.5685,0.094,-6.025,0.000,-0.754,-0.383
INDUS,-0.0159,0.057,-0.279,0.780,-0.128,0.096
NOX,0.8056,3.208,0.251,0.802,-5.497,7.108

0,1,2,3
Omnibus:,196.425,Durbin-Watson:,0.887
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1130.405
Skew:,1.594,Prob(JB):,3.43e-246
Kurtosis:,9.592,Cond. No.,360.0


In [38]:
# Four-predictor OLS: RM, LSTAT, PTRATIO and INDUS against MEDV.
# No constant column is added, so the model is fitted without an intercept.
X = df[['RM', 'LSTAT', 'PTRATIO', 'INDUS']]
y = target['MEDV']
model = sm.OLS(endog=y, exog=X).fit()
# In-sample predictions on the training design matrix.
predictions = model.predict(exog=X)
# Last expression of the cell: display the fit report.
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.952
Model:,OLS,Adj. R-squared:,0.952
Method:,Least Squares,F-statistic:,2496.0
Date:,"Tue, 23 Jul 2019",Prob (F-statistic):,0.0
Time:,12:51:57,Log-Likelihood:,-1564.1
No. Observations:,506,AIC:,3136.0
Df Residuals:,502,BIC:,3153.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,6.2395,0.227,27.484,0.000,5.793,6.685
LSTAT,-0.4885,0.045,-10.820,0.000,-0.577,-0.400
PTRATIO,-0.5682,0.094,-6.028,0.000,-0.753,-0.383
INDUS,-0.0069,0.044,-0.156,0.876,-0.094,0.080

0,1,2,3
Omnibus:,197.876,Durbin-Watson:,0.886
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1151.283
Skew:,1.604,Prob(JB):,9.999999999999999e-251
Kurtosis:,9.657,Cond. No.,27.4


In [42]:
from sklearn import linear_model


In [44]:
# scikit-learn model using every feature column as a predictor.
X = df
y = target['MEDV']
# Default LinearRegression settings; per the scikit-learn docs this fits an
# intercept (fit_intercept=True), unlike the sm.OLS fits in this notebook.
lm = linear_model.LinearRegression()
# fit() returns the estimator itself, so `model` refers to the same object as `lm`.
model = lm.fit(X, y)

In [46]:
# In-sample predictions from the fitted scikit-learn model; preview the first five.
predictions = lm.predict(X)
print(predictions[:5])

[30.00384338 25.02556238 30.56759672 28.60703649 27.94352423]


In [47]:
# R^2 of the scikit-learn model, evaluated on the data it was fitted on.
# NOTE(review): not directly comparable with the R-squared values in the
# statsmodels summaries, which come from fits without an intercept.
lm.score(X, y)


0.7406426641094095