# Solutions to Case Study #1 on Predicting Boston Housing Prices.

## Import required packages.

In [2]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from dmba import regressionSummary
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

import matplotlib.pylab as plt
%matplotlib inline

no display found. Using non-interactive Agg backend


## Load the data set for analysis. Explore, clean, and pre-process the data.

In [3]:
# Read the Boston Housing records from the CSV file into a data frame.
boston_df = pd.read_csv('BostonHousing.csv')

# Report the (rows, columns) shape of the loaded data frame.
print(f'BostonHousing Dimensions: {boston_df.shape}')


BostonHousing Dimensions: (506, 14)


In [4]:
# Display the column names exactly as they appear in the source file;
# some titles contain blanks (e.g. 'CHAR RIV').
print('Original Column Titles')
boston_df.columns

Original Column Titles


Index(['CRIME', 'ZONE', 'INDUST', 'CHAR RIV', 'NIT OXIDE', 'ROOMS', 'AGE',
       'DISTANCE', 'RADIAL', 'TAX', 'ST RATIO', 'LOW STAT', 'MVALUE',
       'C MVALUE'],
      dtype='object')

In [5]:
# Turn every column title into a single word: trim surrounding
# whitespace and replace each inner blank with an underscore.
boston_df.columns = boston_df.columns.map(
    lambda title: title.strip().replace(' ', '_'))
print('Converted One-Word Titles')
boston_df.columns

Converted One-Word Titles


Index(['CRIME', 'ZONE', 'INDUST', 'CHAR_RIV', 'NIT_OXIDE', 'ROOMS', 'AGE',
       'DISTANCE', 'RADIAL', 'TAX', 'ST_RATIO', 'LOW_STAT', 'MVALUE',
       'C_MVALUE'],
      dtype='object')

In [6]:
# Display each column's data type. Columns reported as `object` are
# non-numeric and have to be encoded before they can enter a regression.
boston_df.dtypes

CRIME        float64
ZONE         float64
INDUST       float64
CHAR_RIV      object
NIT_OXIDE    float64
ROOMS        float64
AGE          float64
DISTANCE     float64
RADIAL         int64
TAX            int64
ST_RATIO     float64
LOW_STAT     float64
MVALUE       float64
C_MVALUE      object
dtype: object

In [7]:
# Convert the object (categorical) columns into 0/1 dummy variables.
# drop_first=True drops the dummy for the first level of each
# categorical column, so a column with k levels yields k-1 dummies.
# dtype is pinned to uint8: pandas >= 2.0 creates bool dummies by
# default, and bool columns would be omitted from the numeric summary
# produced by describe() further down.
boston_df = pd.get_dummies(boston_df, prefix_sep='_',
                           drop_first=True, dtype=np.uint8)
# Display updated data types.
boston_df.dtypes

CRIME           float64
ZONE            float64
INDUST          float64
NIT_OXIDE       float64
ROOMS           float64
AGE             float64
DISTANCE        float64
RADIAL            int64
TAX               int64
ST_RATIO        float64
LOW_STAT        float64
MVALUE          float64
CHAR_RIV_Y        uint8
C_MVALUE_Yes      uint8
dtype: object

In [8]:
# Summarize every column (count, mean, std, min, quartiles, max),
# rounded to 2 decimals.
boston_df.describe().round(decimals=2)

Unnamed: 0,CRIME,ZONE,INDUST,NIT_OXIDE,ROOMS,AGE,DISTANCE,RADIAL,TAX,ST_RATIO,LOW_STAT,MVALUE,CHAR_RIV_Y,C_MVALUE_Yes
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.61,11.36,11.14,0.55,6.28,68.57,3.8,9.55,408.24,18.46,12.65,22.53,0.07,0.17
std,8.6,23.32,6.86,0.12,0.7,28.15,2.11,8.71,168.54,2.16,7.14,9.2,0.25,0.37
min,0.01,0.0,0.46,0.38,3.56,2.9,1.13,1.0,187.0,12.6,1.73,5.0,0.0,0.0
25%,0.08,0.0,5.19,0.45,5.89,45.02,2.1,4.0,279.0,17.4,6.95,17.02,0.0,0.0
50%,0.26,0.0,9.69,0.54,6.21,77.5,3.21,5.0,330.0,19.05,11.36,21.2,0.0,0.0
75%,3.68,12.5,18.1,0.62,6.62,94.07,5.19,24.0,666.0,20.2,16.96,25.0,0.0,0.0
max,88.98,100.0,27.74,0.87,8.78,100.0,12.13,24.0,711.0,22.0,37.97,50.0,1.0,1.0


## Develop multiple linear regression model and make predictions. 

In [9]:
# Name of the outcome (response) column.
outcome = 'MVALUE'

# Every other column is a predictor. Compare names for equality:
# `s not in outcome` would be a substring test against the string
# 'MVALUE' and would silently drop any column whose name happens to be
# a substring of it.
# NOTE(review): C_MVALUE_Yes looks like a categorical version of MVALUE;
# if so, keeping it as a predictor leaks the outcome - confirm that the
# case study intends this.
predictors = [s for s in boston_df.columns if s != outcome]

# Identify X and y variables for regression and partition data
# using 80% of records for training and 20% for validation
# (test_size=0.2). random_state=1 makes the split reproducible.
X = boston_df[predictors]
y = boston_df[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(X, y,
                            test_size=0.2, random_state=1)

# Create multiple linear regression model using the training partition
# and the LinearRegression() class from the sklearn (scikit-learn)
# library.
boston_lm = LinearRegression()
boston_lm.fit(train_X, train_y)

# Display intercept and regression coefficients. Round
# them to 2 decimals.
print('Regression Model for Boston Housing Training Set')
print()
print('Intercept ', np.round(boston_lm.intercept_, decimals=2))
print(pd.DataFrame({'Predictor': X.columns, 'Coefficient': np.round(boston_lm.coef_, decimals=2)}))


Regression Model for Boston Housing Training Set

Intercept  46.41
       Predictor  Coefficient
0          CRIME        -0.13
1           ZONE        -0.01
2         INDUST         0.11
3      NIT_OXIDE       -17.12
4          ROOMS         0.64
5            AGE        -0.01
6       DISTANCE        -0.70
7         RADIAL         0.19
8            TAX        -0.01
9       ST_RATIO        -0.60
10      LOW_STAT        -0.47
11    CHAR_RIV_Y         2.17
12  C_MVALUE_Yes        11.66


## Identify and compare performance measures for the training and validation sets.

In [10]:
# Score the training partition with the fitted full model.
pred_y = boston_lm.predict(train_X)

# Training-set r2 and adjusted r2, rounded to 3 decimals.
r2 = round(r2_score(train_y, pred_y), 3)
adj_r2 = round(adjusted_r2_score(train_y, pred_y, boston_lm), 3)

# Report the training-set measures.
print('Prediction Performance Measures for Training Set')
print(f'r2 :  {r2}')
print(f'Adjusted r2 :  {adj_r2}')


# Score the validation partition with the same model.
boston_lm_pred = boston_lm.predict(valid_X)

# Validation-set r2 and adjusted r2, rounded to 3 decimals.
r2 = round(r2_score(valid_y, boston_lm_pred), 3)
adj_r2 = round(adjusted_r2_score(valid_y, boston_lm_pred, boston_lm), 3)

# Report the validation-set measures, separated by a blank line.
print()
print('Prediction Performance Measures for Validation Set')
print(f'r2 :  {r2}')
print(f'adjusted r2 :  {adj_r2}')

Prediction Performance Measures for Training Set
r2 :  0.836
Adjusted r2 :  0.831

Prediction Performance Measures for Validation Set
r2 :  0.851
adjusted r2 :  0.829


In [11]:
# Display common accuracy measures (ME, RMSE, MAE, MPE, MAPE) for the
# training set, based on the full model's training predictions.
print('Accuracy Measures for Training Set - All Variables')
regressionSummary(train_y, pred_y)
print()

# Display the same accuracy measures for the validation set.
print('Accuracy Measures for Validation Set - All Variables')
regressionSummary(valid_y, boston_lm_pred)

Accuracy Measures for Training Set - All Variables

Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 3.6395
            Mean Absolute Error (MAE) : 2.6454
          Mean Percentage Error (MPE) : -2.6958
Mean Absolute Percentage Error (MAPE) : 12.9926

Accuracy Measures for Validation Set - All Variables

Regression statistics

                      Mean Error (ME) : 0.2023
       Root Mean Squared Error (RMSE) : 3.8378
            Mean Absolute Error (MAE) : 2.8230
          Mean Percentage Error (MPE) : -4.5533
Mean Absolute Percentage Error (MAPE) : 14.6529


## Backward Elimination algorithm.

In [12]:
# Model builder handed to backward_elimination().
def train_model(variables):
    """Fit a linear regression on the selected training columns.

    variables: column labels of train_X to use as predictors.
    Returns the fitted LinearRegression model.
    """
    # fit() returns the estimator itself, so it can be returned directly.
    return LinearRegression().fit(train_X[variables], train_y)

# Model scorer handed to backward_elimination().
def score_model(model, variables):
    """Return the AIC_score of `model` on the training partition.

    model: fitted model exposing predict().
    variables: column labels of train_X the model was fitted on.
    """
    train_pred = model.predict(train_X[variables])
    return AIC_score(train_y, train_pred, model)

# Run backward elimination over all training-set columns.
# train_model() fits each candidate model and score_model() rates it
# with AIC_score on the training set; verbose=True prints every step.
# The call returns the selected model and its variables.
best_model_be, best_variables_be = backward_elimination(train_X.columns, 
                        train_model, score_model, verbose=True)

# Display best variables based on Backward Elimination algorithm. 
print()
print('Best Variables from Backward Elimination Algorithm')
print(best_variables_be)

Variables: CRIME, ZONE, INDUST, NIT_OXIDE, ROOMS, AGE, DISTANCE, RADIAL, TAX, ST_RATIO, LOW_STAT, CHAR_RIV_Y, C_MVALUE_Yes
Start: score=2220.32
Step: score=2218.56, remove AGE
Step: score=2216.85, remove ZONE
Step: score=2216.85, remove None

Best Variables from Backward Elimination Algorithm
['CRIME', 'INDUST', 'NIT_OXIDE', 'ROOMS', 'DISTANCE', 'RADIAL', 'TAX', 'ST_RATIO', 'LOW_STAT', 'CHAR_RIV_Y', 'C_MVALUE_Yes']


In [14]:
# Develop the multiple linear regression model based
# on the Backward Elimination results.

# Identify predictors and outcome of the regression model.
# The predictor list is a hand-copied version of the variables printed
# by the Backward Elimination cell; keep the two in sync.
predictors_be = ['CRIME', 'INDUST', 'NIT_OXIDE', 'ROOMS', 'DISTANCE', 
   'RADIAL', 'TAX', 'ST_RATIO', 'LOW_STAT', 'CHAR_RIV_Y', 'C_MVALUE_Yes']
               
outcome = 'MVALUE'

# Identify X and y variables for regression and partition data
# using 80% of records for training and 20% for validation
# (test_size=0.2). The same random_state=1 as in the full-model
# cell is used for the split.
X = boston_df[predictors_be]
y = boston_df[outcome]
train_X_be, valid_X_be, train_y_be, valid_y_be = \
          train_test_split(X, y, test_size=0.2, random_state=1)

# Create multiple linear regression model using the training partition.
boston_be = LinearRegression()
boston_be.fit(train_X_be, train_y_be)

# Display intercept and regression coefficients. Round them
# to 2 decimals.
print('Regression Model for Training Set Using Backward Elimination')
print()
print('Intercept ', round(boston_be.intercept_, 2))
print(pd.DataFrame({'Predictor': X.columns,
            'Coefficient': np.round(boston_be.coef_, decimals=2)}))

Regression Model for Training Set Using Backward Elimination

Intercept  46.55
       Predictor  Coefficient
0          CRIME        -0.13
1         INDUST         0.11
2      NIT_OXIDE       -17.47
3          ROOMS         0.61
4       DISTANCE        -0.73
5         RADIAL         0.20
6            TAX        -0.01
7       ST_RATIO        -0.59
8       LOW_STAT        -0.48
9     CHAR_RIV_Y         2.14
10  C_MVALUE_Yes        11.52


In [16]:
# Score the validation partition with the Backward Elimination model.
boston_be_pred = boston_be.predict(valid_X_be)

# Build a table of actual MVALUE, predicted MVALUE and residual
# (actual minus predicted), rounded to 2 decimals.
result = pd.DataFrame({
    'Actual': valid_y_be,
    'Predicted': boston_be_pred,
    'Residual': valid_y_be - boston_be_pred,
}).round(2)
print()
print('Predictions for Validation Set Using Backward Elimination')
print(result.head(10))

# Report the common accuracy measures for the validation set.
print()
print('Accuracy Measures for Validation Set Using Backward Elimination')
regressionSummary(valid_y_be, boston_be_pred)


Predictions for Validation Set Using Backward Elimination
     Actual  Predicted  Residual
307    28.2      25.57      2.63
343    23.9      22.70      1.20
47     16.6      18.11     -1.51
67     22.0      21.98      0.02
362    20.8      19.19      1.61
132    23.0      19.68      3.32
292    27.9      25.52      2.38
31     14.5      18.25     -3.75
218    21.5      22.62     -1.12
90     22.6      23.57     -0.97

Accuracy Measures for Validation Set Using Backward Elimination

Regression statistics

                      Mean Error (ME) : 0.1904
       Root Mean Squared Error (RMSE) : 3.8356
            Mean Absolute Error (MAE) : 2.8137
          Mean Percentage Error (MPE) : -4.5767
Mean Absolute Percentage Error (MAPE) : 14.5939


## Forward Selection algorithm.

In [17]:
# Model builder handed to forward_selection().
# Forward selection starts from the constant model (no predictors),
# which has no regression to fit and is represented by None.
def train_model(variables):
    """Fit a linear regression on the selected training columns.

    variables: column labels of train_X to use as predictors.
    Returns the fitted LinearRegression model, or None when
    `variables` is empty (the constant model).
    """
    if len(variables) > 0:
        # fit() returns the estimator itself.
        return LinearRegression().fit(train_X[variables], train_y)
    return None

# Model scorer handed to forward_selection().
def score_model(model, variables):
    """Return the AIC_score of a candidate model on the training set.

    model: fitted model exposing predict(), or None for the constant model.
    variables: column labels of train_X the model was fitted on.
    With no variables, the prediction for every record is the mean of
    train_y and AIC_score is called with df=1.
    """
    if len(variables) > 0:
        return AIC_score(train_y, model.predict(train_X[variables]), model)
    baseline_pred = [train_y.mean()] * len(train_y)
    return AIC_score(train_y, baseline_pred, model, df=1)

# Run forward selection over all training-set columns.
# train_model() fits each candidate model and score_model() rates it
# with AIC_score on the training set; verbose=True prints every step.
# The call returns the selected model and its variables.
best_model_fs, best_variables_fs = forward_selection(train_X.columns,
                    train_model, score_model, verbose=True)

# Display best variables based on Forward Selection algorithm.
# print() must be called: a bare `print` only names the function and
# emits nothing, so the blank separator line was never printed.
print()
print('Best Variables')
print(best_variables_fs)

Variables: CRIME, ZONE, INDUST, NIT_OXIDE, ROOMS, AGE, DISTANCE, RADIAL, TAX, ST_RATIO, LOW_STAT, CHAR_RIV_Y, C_MVALUE_Yes
Start: score=2924.77, constant
Step: score=2542.89, add C_MVALUE_Yes
Step: score=2295.94, add LOW_STAT
Step: score=2276.57, add CRIME
Step: score=2265.27, add CHAR_RIV_Y
Step: score=2255.86, add ST_RATIO
Step: score=2247.88, add DISTANCE
Step: score=2227.34, add NIT_OXIDE
Step: score=2220.35, add RADIAL
Step: score=2219.47, add TAX
Step: score=2217.31, add INDUST
Step: score=2216.85, add ROOMS
Step: score=2216.85, add None
Best Variables
['C_MVALUE_Yes', 'LOW_STAT', 'CRIME', 'CHAR_RIV_Y', 'ST_RATIO', 'DISTANCE', 'NIT_OXIDE', 'RADIAL', 'TAX', 'INDUST', 'ROOMS']


In [19]:
# Develop the multiple linear regression model based
# on the Forward Selection results.

# Identify predictors and outcome of the regression model.
# The predictor list is a hand-copied version of the variables printed
# by the Forward Selection cell; keep the two in sync.
predictors_fs = ['C_MVALUE_Yes', 'LOW_STAT', 'CRIME', 'CHAR_RIV_Y', 'ST_RATIO', 
                 'DISTANCE', 'NIT_OXIDE', 'RADIAL', 'TAX', 'INDUST', 'ROOMS']
outcome = 'MVALUE'

# Identify X and y variables for regression and partition data
# using 80% of records for training and 20% for validation
# (test_size=0.2). The same random_state=1 as in the full-model
# cell is used for the split.
X = boston_df[predictors_fs]
y = boston_df[outcome]
train_X_fs, valid_X_fs, train_y_fs, valid_y_fs = \
          train_test_split(X, y, test_size=0.2, random_state=1)

# Create multiple linear regression model using the training partition.
boston_fs = LinearRegression()
boston_fs.fit(train_X_fs, train_y_fs)

# Display intercept and regression coefficients. Round them
# to 2 decimals.
print('Regression Model for Training Set Using Forward Selection')
print()
print('Intercept ', round(boston_fs.intercept_, 2))
print(pd.DataFrame({'Predictor': X.columns,
            'Coefficient': np.round(boston_fs.coef_, decimals=2)}))


Regression Model for Training Set Using Forward Selection

Intercept  46.55
       Predictor  Coefficient
0   C_MVALUE_Yes        11.52
1       LOW_STAT        -0.48
2          CRIME        -0.13
3     CHAR_RIV_Y         2.14
4       ST_RATIO        -0.59
5       DISTANCE        -0.73
6      NIT_OXIDE       -17.47
7         RADIAL         0.20
8            TAX        -0.01
9         INDUST         0.11
10         ROOMS         0.61


In [20]:
# Use predict() to score predictions for the validation set with the
# regression model built from the Forward Selection variables.
best_fs_pred = boston_fs.predict(valid_X_fs)

# Display common accuracy measures (ME, RMSE, MAE, MPE, MAPE) for the
# validation set.
print()
print('Accuracy Measures for Validation Set Using Forward Selection')
regressionSummary(valid_y_fs, best_fs_pred)


Accuracy Measures for Validation Set Using Forward Selection

Regression statistics

                      Mean Error (ME) : 0.1904
       Root Mean Squared Error (RMSE) : 3.8356
            Mean Absolute Error (MAE) : 2.8137
          Mean Percentage Error (MPE) : -4.5767
Mean Absolute Percentage Error (MAPE) : 14.5939


## Stepwise algorithm.

In [21]:
# Model builder handed to stepwise_selection().
# Stepwise selection starts from the constant model (no predictors),
# which has no regression to fit and is represented by None.
def train_model(variables):
    """Fit a linear regression on the selected training columns.

    variables: column labels of train_X to use as predictors.
    Returns the fitted LinearRegression model, or None when
    `variables` is empty (the constant model).
    """
    has_predictors = len(variables) != 0
    if not has_predictors:
        return None
    regression = LinearRegression()
    regression.fit(train_X[variables], train_y)
    return regression

# Model scorer handed to stepwise_selection().
def score_model(model, variables):
    """Return the AIC_score of a candidate model on the training set.

    model: fitted model exposing predict(), or None for the constant model.
    variables: column labels of train_X the model was fitted on.
    With no variables, the prediction for every record is the mean of
    train_y and AIC_score is called with df=1.
    """
    if len(variables) != 0:
        train_pred = model.predict(train_X[variables])
        return AIC_score(train_y, train_pred, model)
    constant_pred = [train_y.mean()] * len(train_y)
    return AIC_score(train_y, constant_pred, model, df=1)

# Use stepwise_selection() function to identify the best_model
# and best_variables with the Stepwise selection algorithm.
# train_model() fits each candidate model and score_model() rates it
# with AIC_score on the training set; verbose=True prints every step.
best_model_st, best_variables_st = stepwise_selection(train_X.columns, 
                    train_model, score_model, verbose=True)

# Display best variables based on Stepwise algorithm.
print()
print('Best Variables from Stepwise Selection Algorithm')
print(best_variables_st)

Variables: CRIME, ZONE, INDUST, NIT_OXIDE, ROOMS, AGE, DISTANCE, RADIAL, TAX, ST_RATIO, LOW_STAT, CHAR_RIV_Y, C_MVALUE_Yes
Start: score=2924.77, constant
Step: score=2542.89, add C_MVALUE_Yes
Step: score=2295.94, add LOW_STAT
Step: score=2276.57, add CRIME
Step: score=2265.27, add CHAR_RIV_Y
Step: score=2255.86, add ST_RATIO
Step: score=2247.88, add DISTANCE
Step: score=2227.34, add NIT_OXIDE
Step: score=2220.35, add RADIAL
Step: score=2219.47, add TAX
Step: score=2217.31, add INDUST
Step: score=2216.85, add ROOMS
Step: score=2216.85, unchanged None

Best Variables from Stepwise Selection Algorithm
['C_MVALUE_Yes', 'LOW_STAT', 'CRIME', 'CHAR_RIV_Y', 'ST_RATIO', 'DISTANCE', 'NIT_OXIDE', 'RADIAL', 'TAX', 'INDUST', 'ROOMS']


In [22]:
# Develop the multiple linear regression model based
# on the Stepwise Selection results.

# Identify predictors and outcome of the regression model.
# The predictor list is a hand-copied version of the variables printed
# by the Stepwise Selection cell; keep the two in sync.
predictors_st = ['C_MVALUE_Yes', 'LOW_STAT', 'CRIME', 'CHAR_RIV_Y',
 'ST_RATIO', 'DISTANCE', 'NIT_OXIDE', 'RADIAL', 'TAX', 'INDUST', 'ROOMS']
outcome = 'MVALUE'

# Identify X and y variables for regression and partition data
# using 80% of records for training and 20% for validation
# (test_size=0.2). The same random_state=1 as in the full-model
# cell is used for the split.
X = boston_df[predictors_st]
y = boston_df[outcome]
train_X_st, valid_X_st, train_y_st, valid_y_st = \
          train_test_split(X, y, test_size=0.2, random_state=1)

# Create multiple linear regression model using the training partition.
boston_st = LinearRegression()
boston_st.fit(train_X_st, train_y_st)

# Display intercept and regression coefficients. Round them
# to 2 decimals. (Heading typo 'Stewise' corrected to 'Stepwise'.)
print('Regression Model for Training Set Using Stepwise Algorithm')
print()
print('Intercept ', np.round(boston_st.intercept_, 2))
print(pd.DataFrame({'Predictor': X.columns,
            'Coefficient': np.round(boston_st.coef_, 2)}))

Regression Model for Training Set Using Stewise Algorithm

Intercept  46.55
       Predictor  Coefficient
0   C_MVALUE_Yes        11.52
1       LOW_STAT        -0.48
2          CRIME        -0.13
3     CHAR_RIV_Y         2.14
4       ST_RATIO        -0.59
5       DISTANCE        -0.73
6      NIT_OXIDE       -17.47
7         RADIAL         0.20
8            TAX        -0.01
9         INDUST         0.11
10         ROOMS         0.61


In [23]:
# Score the validation partition with the Stepwise Selection model.
boston_st_pred = boston_st.predict(valid_X_st)

# Build a table of actual MVALUE, predicted MVALUE and residual
# (actual minus predicted), rounded to 2 decimals.
result = pd.DataFrame({
    'Actual': valid_y_st,
    'Predicted': boston_st_pred,
    'Residual': valid_y_st - boston_st_pred,
}).round(2)
print()
print('Predictions for Validation Set Using Stepwise Selection')
print(result.head(10))

# Report the common accuracy measures for the validation set.
print()
print('Accuracy Measures for Validation Set Using Stepwise Selection')
regressionSummary(valid_y_st, boston_st_pred)


Predictions for Validation Set Using Stepwise Selection
     Actual  Predicted  Residual
307    28.2      25.57      2.63
343    23.9      22.70      1.20
47     16.6      18.11     -1.51
67     22.0      21.98      0.02
362    20.8      19.19      1.61
132    23.0      19.68      3.32
292    27.9      25.52      2.38
31     14.5      18.25     -3.75
218    21.5      22.62     -1.12
90     22.6      23.57     -0.97

Accuracy Measures for Validation Set Using Stepwise Selection

Regression statistics

                      Mean Error (ME) : 0.1904
       Root Mean Squared Error (RMSE) : 3.8356
            Mean Absolute Error (MAE) : 2.8137
          Mean Percentage Error (MPE) : -4.5767
Mean Absolute Percentage Error (MAPE) : 14.5939
