In [15]:
from sklearn import linear_model
import pandas as pd
import numpy as np

# Let's use the Boston housing prices dataset for this exercise.

# Import the datasets module from the scikit-learn library.

from sklearn import datasets


def _load_boston_from_statlib():
    """Rebuild the Boston housing data from the original StatLib file.

    Fallback for scikit-learn >= 1.2, where load_boston() no longer exists.
    Follows the recipe scikit-learn published in its deprecation message and
    returns an object exposing the attributes this notebook uses: .data,
    .target, .feature_names and .DESCR. Requires network access.
    """
    from sklearn.utils import Bunch

    source_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw = pd.read_csv(source_url, sep=r"\s+", skiprows=22, header=None)
    # Each record spans two physical lines in the raw file: 11 values on the
    # first line, then 2 more features followed by the target on the second.
    return Bunch(
        data=np.hstack([raw.values[::2, :], raw.values[1::2, :2]]),
        target=raw.values[1::2, 2],
        feature_names=np.array(
            ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
             "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]
        ),
        DESCR=(
            "Boston house prices dataset (506 instances, 13 predictive "
            "attributes, target MEDV), loaded from " + source_url
        ),
    )


# Load the Boston dataset.
#
# NOTE: load_boston() was deprecated in scikit-learn 1.0 and removed in 1.2
# (the scikit-learn maintainers also discourage using this dataset because of
# ethical problems with one of its variables). On older versions the original
# call is used unchanged; on newer versions we fall back to the StatLib file.

try:
    data = datasets.load_boston()
except (ImportError, AttributeError):
    data = _load_boston_from_statlib()

# The description of the dataset can be printed with print(data.DESCR).

print(data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [21]:
# The dataset is loaded; from here on we use the pandas library for the analysis.

# `df` holds the independent variables (the predictors): one column per
# feature, named after data.feature_names.

df = pd.DataFrame(data=data.data, columns=data.feature_names)

# `target` holds the dependent variable -- the quantity we want to predict --
# as a single-column dataframe. Here that is the median house value, MEDV.

target = pd.DataFrame(data=data.target, columns=["MEDV"])

# Recap: the scikit-learn dataset is now split into two pandas dataframes:
#   df     - the predictors (independent variables) supplied by the dataset
#   target - the response (dependent variable) that the model will predict


In [None]:
# Now, we want to fit a linear regression model. For doing that, we need to choose the variables which we think are
# good predictors of our target/dependent variable. 

# This can be done by checking the correlation(s) between the variables. We can plot the data visually or conduct some
# primary research on what variables are good predictors of our target variable. I am skipping this part for later.

# So, now we have the dataframe that contains the independent variables as df and the dependent variable as target.
# Let's fit the linear regression model. We need to define X and y.

In [22]:
# Use every feature column as a predictor and the MEDV column as the response.
X, y = df, target["MEDV"]

In [23]:
# Fit a linear regression of y on all columns of X.
lm = linear_model.LinearRegression()
# The value returned by fit() is kept in `model`.
model = lm.fit(X=X, y=y)

In [26]:
# lm.fit() estimated the linear model; lm.predict() uses it to make predictions.
# Here we predict on X, the same data the model was fitted on.

prediction = lm.predict(X)

# Show only the first 5 predictions.
print(prediction[:5])


[30.00384338 25.02556238 30.56759672 28.60703649 27.94352423]


In [27]:
# The fitted model has built-in members that return the score, the coefficients and the estimated intercept.
# Note: the score below is computed on the same X and y that the model was fitted on.
lm.score(X,y)

0.7406426641094095

In [29]:
# The value above is R^2 (also called the coefficient of determination) of our model. Higher is better; the maximum value is 1.
# Now, let's check the coefficients of the predictors.
lm.coef_

array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
       -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
        3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
       -5.24758378e-01])

In [31]:
# Also, the intercept (the constant term) of the fitted model.
lm.intercept_

36.459488385089855

In [None]:
# These are all parts of the multiple regression equation (a regression with more than one independent variable):
#   MEDV = intercept + coef_1 * x_1 + coef_2 * x_2 + ... + coef_13 * x_13

# In practice, we do not fit and evaluate the model on the entire dataset. Instead, we split our data into training and testing sets, as explained in Chapter 1.