In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn

In [3]:
from sklearn.datasets import load_boston
boston = load_boston()
bos = pd.DataFrame(boston.data)

In [4]:
bos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
print (boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [15]:
#we’ll load the data to Pandas
# define the data/predictors as the pre-set feature names  
df = pd.DataFrame(boston.data, columns=boston.feature_names)

# Put the target (housing value -- MEDV) in another DataFrame
target = pd.DataFrame(boston.target, columns=["MEDV"])

In [37]:
##So now, as before, we have the data frame that contains the independent variables (marked as “df”) 
#and the data frame with the dependent variable (marked as “target”). Let’s fit a regression model using SKLearn. 
#First we’ll define our X and y — this time I’ll use all the variables in the data frame to predict the 
#housing price:
##

In [19]:
from sklearn import linear_model
X = df
y = target ["MEDV"]

In [20]:
# Fit the model
lm = linear_model.LinearRegression()
model = lm.fit(X,y)

In [38]:
##The lm.fit() function fits a linear model. We want to use the model to make predictions 
#(that’s what we’re here for!), so we’ll use lm.predict():

In [35]:
##The print function would print the first 5 predictions for y (I didn’t print the entire list to “save room”. 
#Removing [0:5] would print 
#the entire list):
predictions = lm.predict(X)
print(predictions)

[30.00821269 25.0298606  30.5702317  28.60814055 27.94288232 25.25940048
 23.00433994 19.5347558  11.51696539 18.91981483 18.9958266  21.58970854
 20.90534851 19.55535931 19.2837957  19.30000174 20.52889993 16.9096749
 16.17067411 18.40781636 12.52040454 17.67104565 15.82934891 13.80368317
 15.67708138 13.3791645  15.46258829 14.69863607 19.54518512 20.87309945
 11.44806825 18.05900412  8.78841666 14.27882319 13.69097132 23.81755469
 22.34216285 23.11123204 22.91494157 31.35826216 34.21485385 28.0207132
 25.20646572 24.61192851 22.94438953 22.10150945 20.42467417 18.03614022
  9.10176198 17.20856571 21.28259372 23.97621248 27.65853521 24.0521088
 15.35989132 31.14817003 24.85878746 33.11017111 21.77458036 21.08526739
 17.87203538 18.50881381 23.9879809  22.54944098 23.37068403 30.36557584
 25.53407332 21.11758504 17.42468223 20.7893086  25.20349174 21.74490595
 24.56275612 24.04479519 25.5091157  23.97076758 22.94823519 23.36106095
 21.26432549 22.4345376  28.40699937 26.99734716 26.03

In [24]:
#lm.predict() predicts the y (dependent variable) using the linear model we fitted. 
#You must have noticed that when we run a linear regression with SKLearn, we don’t get a pretty table 
#(okay, it’s not that pretty… but it’s pretty useful) like in Statsmodels. What we can do is use built-in functions 
#to return the score, the coefficients and the estimated intercepts.

In [26]:
lm.score(X,y) #This is the R² score of our model. As you probably remember, this the percentage of explained 
#variance of the predictions. If you’re interested, read more here. Next, let’s check out the coefficients 
#for the predictors:

0.7406077428649428

In [27]:
lm.coef_ # will give coeficient

array([-1.07170557e-01,  4.63952195e-02,  2.08602395e-02,  2.68856140e+00,
       -1.77957587e+01,  3.80475246e+00,  7.51061703e-04, -1.47575880e+00,
        3.05655038e-01, -1.23293463e-02, -9.53463555e-01,  9.39251272e-03,
       -5.25466633e-01])

In [28]:
lm.intercept_ # will give intercept

36.4911032803614

In [29]:
#These are all (estimated/predicted) parts of the multiple regression equation 