In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import datasets ## sample datasets bundled with scikit-learn

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run as written.
data = datasets.load_boston() ## Boston housing dataset
print(data.DESCR) ## data dictionary

# Predictors live in one DataFrame, labelled with the dataset's own feature names
df = pd.DataFrame(data=data.data, columns=data.feature_names)

# The target (housing value -- MEDV) is kept apart in a single-column DataFrame
target = pd.DataFrame(data=data.target, columns=["MEDV"])

## 1. Linear Regression in statsmodels using OLS, single variable

y = target["MEDV"] ## response
X = df["RM"] ## lone predictor; no constant column is added in this cell

## OLS = Ordinary Least Squares.
## With no constant in X the model has no intercept term, and statsmodels
## reports the uncentered R-squared for such a fit, so the R-squared shown by
## this summary is not comparable with that of a model containing an intercept.
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(exog=X) # in-sample fitted values

# Display the regression statistics (last expression, so the notebook renders it)
model.summary()

  from pandas.core import datetools


Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.901
Model:,OLS,Adj. R-squared:,0.901
Method:,Least Squares,F-statistic:,4615.0
Date:,"Thu, 01 Mar 2018",Prob (F-statistic):,3.7399999999999996e-256
Time:,13:02:48,Log-Likelihood:,-1747.1
No. Observations:,506,AIC:,3496.0
Df Residuals:,505,BIC:,3500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,3.6534,0.054,67.930,0.000,3.548,3.759

0,1,2,3
Omnibus:,83.295,Durbin-Watson:,0.493
Prob(Omnibus):,0.0,Jarque-Bera (JB):,152.507
Skew:,0.955,Prob(JB):,7.649999999999999e-34
Kurtosis:,4.894,Cond. No.,1.0


In [5]:
## 2. Linear Regression in statsmodels using OLS, single variable + constant

y = target["MEDV"] ## y: the output / dependent variable
## X: the input (independent) variable RM, with a constant column prepended
## so that the model also estimates an intercept (beta_0)
X = sm.add_constant(df["RM"])

## Linear regression with constant; argument order is sm.OLS(output, input)
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(exog=X) # in-sample fitted values

# Display the regression statistics
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.484
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,471.8
Date:,"Thu, 01 Mar 2018",Prob (F-statistic):,2.49e-74
Time:,13:53:06,Log-Likelihood:,-1673.1
No. Observations:,506,AIC:,3350.0
Df Residuals:,504,BIC:,3359.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-34.6706,2.650,-13.084,0.000,-39.877,-29.465
RM,9.1021,0.419,21.722,0.000,8.279,9.925

0,1,2,3
Omnibus:,102.585,Durbin-Watson:,0.684
Prob(Omnibus):,0.0,Jarque-Bera (JB):,612.449
Skew:,0.726,Prob(JB):,1.02e-133
Kurtosis:,8.19,Cond. No.,58.4


In [9]:
## 3. Linear Regression in statsmodels using OLS, multiple variables

y = target['MEDV']
# Six predictors taken from the feature frame, plus a constant column for the intercept
X = sm.add_constant(
    df[['RM', 'LSTAT', 'PTRATIO', 'RAD', 'TAX', 'CRIM']]
)

model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(exog=X) # in-sample fitted values

# Display the regression statistics
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.691
Model:,OLS,Adj. R-squared:,0.687
Method:,Least Squares,F-statistic:,185.6
Date:,"Thu, 01 Mar 2018",Prob (F-statistic):,1.18e-123
Time:,14:02:41,Log-Likelihood:,-1543.5
No. Observations:,506,AIC:,3101.0
Df Residuals:,499,BIC:,3131.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,22.3383,4.202,5.316,0.000,14.083,30.594
RM,4.3320,0.431,10.061,0.000,3.486,5.178
LSTAT,-0.5316,0.047,-11.225,0.000,-0.625,-0.439
PTRATIO,-0.9553,0.125,-7.622,0.000,-1.202,-0.709
RAD,0.2605,0.068,3.825,0.000,0.127,0.394
TAX,-0.0118,0.003,-3.435,0.001,-0.019,-0.005
CRIM,-0.0959,0.035,-2.749,0.006,-0.164,-0.027

0,1,2,3
Omnibus:,213.819,Durbin-Watson:,0.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1131.676
Skew:,1.799,Prob(JB):,1.8199999999999998e-246
Kurtosis:,9.382,Cond. No.,8160.0


In [16]:
## 4. Linear Regression in SKLearn using multiple variables

from sklearn import linear_model

# Every column of the feature frame is used as a predictor
X = df
y = target['MEDV']

lm = linear_model.LinearRegression()
model = lm.fit(X, y) # fit() returns the estimator itself, so model is lm

# Everything below is evaluated on the same rows the model was fitted on
score = model.score(X, y) # R2 value
predictions = model.predict(X)
coef_linear = pd.Series(data=lm.coef_, index=X.columns) # one coefficient per feature

print(score)
print(coef_linear)
print(lm.intercept_)

0.740607742865
CRIM       -0.107171
ZN          0.046395
INDUS       0.020860
CHAS        2.688561
NOX       -17.795759
RM          3.804752
AGE         0.000751
DIS        -1.475759
RAD         0.305655
TAX        -0.012329
PTRATIO    -0.953464
B           0.009393
LSTAT      -0.525467
dtype: float64
36.4911032804


In [20]:
## 5. Lasso Regression in SKLearn using multiple variables

# Build the inputs explicitly instead of reusing whatever X and y an earlier
# cell left in the kernel: all feature columns as predictors, MEDV as response.
# (df, target and linear_model must still have been created by earlier cells.)
X = df
y = target['MEDV']

# NOTE(review): the predictors are not standardized before fitting, and the L1
# penalty depends on each feature's scale -- consider scaling the features
# before comparing these coefficients with the unpenalized ones.
lm = linear_model.Lasso(alpha=10, max_iter=10000)
model = lm.fit(X, y) # fit() returns the estimator itself, so model is lm

# Score and predictions are computed on the same rows used for fitting
score = lm.score(X, y)
predictions = lm.predict(X)

print(score) # R2 value
coef_Lasso = pd.Series(lm.coef_, index=X.columns) # one coefficient per feature
print(coef_Lasso)
print(lm.intercept_)

0.523363123072
CRIM      -0.000000
ZN         0.026146
INDUS     -0.000000
CHAS       0.000000
NOX        0.000000
RM         0.000000
AGE        0.000000
DIS       -0.000000
RAD        0.000000
TAX       -0.009282
PTRATIO   -0.000000
B          0.007496
LSTAT     -0.564038
dtype: float64
30.4883598328
