# Exercise - Regression


The data set for this exercise contains information on house sales in King County, WA, between May 2014 and May 2015. Each row in the data set pertains to one house, and there are 21,613 houses in total. Use this data set to predict the sale price of a house (i.e., the `price` column) based on the characteristics of the house. Such a model can be helpful for buyers, sellers, realtors, and lenders.

## Description of Variables

The description and type of each variable are provided in "KC house data - Data Dictionary.docx". Make sure to read this document to learn about the variables.

## Goal

Use the **kc_house_data.csv** data set and build a model to predict **price**. <br>

# Read and Prepare the Data

In [1]:
# Common imports

import pandas as pd
import numpy as np

# Seed NumPy's global random number generator so that the random steps
# below give the same results on a fresh top-to-bottom run.
np.random.seed(42)

# Get the data

In [2]:
# Load the house-sales data. The "price" column is the value we will predict.

housing = pd.read_csv("kc_house_data.csv")
# Preview the first rows to check that the file was read as expected.
housing.head()

Unnamed: 0,price_gte_avg,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,432000,5.0,2.75,2060.0,329903.0,1.5,0,3,5,7.0,2060,0,1989.0,0,zip_98022,47.1776,-121.944,2240,220232.0
1,0,170000,2.0,1.0,810.0,8424.0,1.0,0,0,4,6.0,810,0,1959.0,0,zip_98023,47.3286,-122.346,820,8424.0
2,0,235000,3.0,1.0,960.0,5030.0,1.0,0,0,3,7.0,960,0,1955.0,0,zip_98118,47.5611,-122.28,1460,5400.0
3,0,350000,2.0,1.0,830.0,5100.0,1.0,0,0,4,7.0,830,0,1942.0,0,zip_98126,47.5259,-122.379,1220,5100.0
4,0,397380,2.0,1.0,1030.0,5072.0,1.0,0,0,3,6.0,1030,0,1924.0,1958,zip_98115,47.6962,-122.294,1220,6781.0


# Split data (train/test)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set.
# random_state pins the shuffle, so the split is reproducible even when this
# cell is re-run on its own; without it the split depends on the current state
# of the global NumPy random generator (i.e. on cell execution order).
train, test = train_test_split(housing, test_size=0.3, random_state=42)

# Data Prep

Perform your data prep here. You can use pipelines like we do in the tutorials. Otherwise, feel free to use your own data prep steps. Eventually, you should do the following at a minimum:<br>
- Separate inputs from target<br>
- Impute/remove missing values<br>
- Standardize the continuous variables<br>
- One-hot encode categorical variables<br>

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Separate the target variable 

In [10]:
# Target: the sale price.
train_target, test_target = train['price'], test['price']

# Inputs: everything except the target and 'price_gte_avg'.
# NOTE(review): 'price_gte_avg' is presumably derived from 'price' (it would
# leak the target if kept) - confirm against the data dictionary.
train_inputs = train.drop(columns=['price', 'price_gte_avg'])
test_inputs = test.drop(columns=['price', 'price_gte_avg'])

In [11]:
# Inspect the dtypes: they drive the numeric / categorical column split below.
train_inputs.dtypes

bedrooms         float64
bathrooms        float64
sqft_living      float64
sqft_lot         float64
floors           float64
waterfront         int64
view               int64
condition          int64
grade            float64
sqft_above         int64
sqft_basement      int64
yr_built         float64
yr_renovated       int64
zipcode           object
lat              float64
long             float64
sqft_living15      int64
sqft_lot15       float64
dtype: object

##  Identify the numeric, binary, and categorical columns

In [12]:
# Numeric inputs: every column whose dtype is a NumPy number (int or float)
numeric_columns = list(train_inputs.select_dtypes(include='number').columns)

# Categorical inputs: every column stored with the 'object' dtype
categorical_columns = list(train_inputs.select_dtypes(include='object').columns)

In [13]:
# Identify the binary (0/1) columns by hand. They get their own transformer
# (imputation only), so they are neither standardized nor one-hot encoded.
binary_columns = ['waterfront']

In [14]:
# Be careful: numeric_columns was built from the dtypes, so it still contains
# the binary columns. Filter them out so they are not standardized.
#
# The list is filtered in place instead of calling list.remove(): remove()
# raises ValueError when the column is no longer (or never was) in the list,
# which happens as soon as this cell is executed a second time.
numeric_columns[:] = [col for col in numeric_columns if col not in binary_columns]

In [15]:
# Sanity check: the binary column list
binary_columns

['waterfront']

In [16]:
# Sanity check: the binary columns must no longer appear in this list
numeric_columns

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [17]:
# Sanity check: the columns that will be one-hot encoded
categorical_columns

['zipcode']

# Pipeline

In [18]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [19]:
# Categorical columns: replace missing values with the literal 'unknown', then
# one-hot encode. handle_unknown='ignore' makes categories that were not seen
# during fit encode as all zeros instead of raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [20]:
# Binary columns: only fill missing values (with the most frequent value);
# the 0/1 values themselves are left untouched.
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

In [21]:
# Variant 1: route each column group to its transformer and pass any column
# that is not listed through unchanged.
# NOTE: this object is overwritten by the remainder='drop' variant defined in
# the following cell; it is kept only to illustrate the option.
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='passthrough')

# remainder='passthrough' is optional. You don't have to use it.

In [22]:
# Variant 2 (the one actually used below): route each column group to its
# transformer and drop any column that is not listed.
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='drop')

# remainder='drop' discards every column not named in the three lists above.
# Here all input columns are listed, so nothing is actually dropped.

# Transform: fit_transform() for TRAIN

In [23]:
# Fit the preprocessor on the training inputs (imputation values, scaling
# statistics and one-hot categories are learned here) and transform them.
train_x = preprocessor.fit_transform(train_inputs)

train_x

<13724x88 sparse matrix of type '<class 'numpy.float64'>'
	with 233343 stored elements in Compressed Sparse Row format>

In [24]:
# (rows, columns) after preprocessing; the test matrix must have the same column count
train_x.shape

(13724, 88)

# Transform: transform() for TEST

In [25]:
# Transform the test data with the preprocessor that was fitted on the training
# data. Only transform() is called - nothing is re-learned from the test set.
test_x = preprocessor.transform(test_inputs)

test_x

<5882x88 sparse matrix of type '<class 'numpy.float64'>'
	with 100008 stored elements in Compressed Sparse Row format>

In [26]:
# The column count should match train_x
test_x.shape

(5882, 88)

In [31]:
# Look at the training target values before computing a baseline
print(train_target)

2799     265000
10843    285000
15800    875000
1597     255000
16169    500000
          ...  
16444    325000
2556     216000
5962     815000
24       190000
2451     425000
Name: price, Length: 13724, dtype: int64


# Calculate the Baseline

In [32]:
# The baseline prediction is the average sale price in the training set
mean_value = train_target.mean()

mean_value

455668.39616729814

In [33]:
# Baseline "model": one copy of the training mean for every test row
baseline_pred = np.full(len(test_target), mean_value)

baseline_pred

array([455668.3961673, 455668.3961673, 455668.3961673, ...,
       455668.3961673, 455668.3961673, 455668.3961673])

In [34]:
# The actual test prices that the baseline predictions are compared against
test_target

16954    220000
9036     593500
4602     648000
11640    257000
7024     212000
          ...  
8343     208000
1918     670000
4931     442500
16123    415000
18974    400000
Name: price, Length: 5882, dtype: int64

In [35]:
from sklearn.metrics import mean_squared_error

In [36]:
# RMSE of the mean-only prediction on the test set: the score every real
# model has to beat.
baseline_mse = mean_squared_error(y_true=test_target, y_pred=baseline_pred)
baseline_rmse = np.sqrt(baseline_mse)

print('Baseline RMSE: {}'.format(baseline_rmse))

Baseline RMSE: 180448.52076767175


# Calculate the baseline - new approach (not included in the tutorial video)

**This section is not included in the tutorial video. However, it is a better (and more foolproof) way of calculating the baseline, and I recommend using it instead of the approach above.**

**Note that it generates the same results as the above approach.**

In [37]:
from sklearn.dummy import DummyRegressor

# Baseline estimator: always predicts the mean of the target it was fitted on.
dummy_regr = DummyRegressor(strategy="mean")

# Fit on (features, target). DummyRegressor does not use the feature values,
# but the first argument must still be the feature matrix - not the target -
# so that fit() matches the later predict(train_x) / predict(test_x) calls.
dummy_regr.fit(train_x, train_target)

DummyRegressor()

In [38]:
from sklearn.metrics import mean_squared_error

# Baseline RMSE on the training set
dummy_train_pred = dummy_regr.predict(train_x)
baseline_train_mse = mean_squared_error(y_true=train_target, y_pred=dummy_train_pred)
baseline_train_rmse = np.sqrt(baseline_train_mse)

print('Baseline Train RMSE: {}'.format(baseline_train_rmse))

Baseline Train RMSE: 181744.3442482227


In [40]:
# Baseline RMSE on the test set
dummy_test_pred = dummy_regr.predict(test_x)
baseline_test_mse = mean_squared_error(y_true=test_target, y_pred=dummy_test_pred)
baseline_test_rmse = np.sqrt(baseline_test_mse)

print('Baseline Test RMSE: {}'.format(baseline_test_rmse))

Baseline Test RMSE: 180448.52076767175


# Train a SGD model (with no regularization)

In [41]:
from sklearn.linear_model import SGDRegressor

# Linear regression fitted with stochastic gradient descent, no regularization:
#   penalty=None  -> no regularization term
#   eta0=0.01     -> initial learning rate
#   max_iter=100  -> maximum number of passes over the training data (epochs)
sgd_reg = SGDRegressor(
    penalty=None,
    eta0=0.01,
    max_iter=100,
)

sgd_reg.fit(train_x, train_target)





SGDRegressor(max_iter=100, penalty=None)

In [42]:
# Quick look at the raw predictions for the test set
sgd_reg.predict(test_x)

array([279821.60738221, 708761.78089313, 653737.1865741 , ...,
       389380.40793177, 403380.1346091 , 422113.54431671])

In [43]:
# Collect the test-set predictions in a DataFrame for side-by-side inspection
predictions = pd.DataFrame({'Predicted': sgd_reg.predict(test_x)})

predictions

Unnamed: 0,Predicted
0,279821.607382
1,708761.780893
2,653737.186574
3,293474.319736
4,154220.473811
...,...
5877,163139.478397
5878,584045.773814
5879,389380.407932
5880,403380.134609


In [44]:
# Put the actual prices next to the predictions. The values are taken as a
# plain array so they are matched by position, not by the original row index.
predictions['Actual'] = test_target.to_numpy()

predictions

Unnamed: 0,Predicted,Actual
0,279821.607382,220000
1,708761.780893,593500
2,653737.186574,648000
3,293474.319736,257000
4,154220.473811,212000
...,...,...
5877,163139.478397,208000
5878,584045.773814,670000
5879,389380.407932,442500
5880,403380.134609,415000


### Generate the error metrics (Linear Regression)

In [46]:
# Train RMSE of the unregularized SGD model
reg_train_pred = sgd_reg.predict(train_x)
train_mse = mean_squared_error(y_true=train_target, y_pred=reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 77085.49574868185


In [47]:
# Test RMSE of the unregularized SGD model
reg_test_pred = sgd_reg.predict(test_x)
test_mse = mean_squared_error(y_true=test_target, y_pred=reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))



Test RMSE: 79175.09410603007


# Try L1 Regularization in SGD

In [48]:
# SGD linear regression with an L1 penalty; alpha sets the penalty strength
sgd_reg_L1 = SGDRegressor(
    penalty='l1',
    alpha=0.1,
    eta0=0.01,
    max_iter=50,
)

sgd_reg_L1.fit(train_x, train_target)





SGDRegressor(alpha=0.1, max_iter=50, penalty='l1')

### Generate the error metrics

In [58]:
# Train RMSE of the L1-regularized model
reg_train_pred = sgd_reg_L1.predict(train_x)
train_mse = mean_squared_error(y_true=train_target, y_pred=reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 78981.98947837703


In [59]:
# Test RMSE of the L1-regularized model
reg_test_pred = sgd_reg_L1.predict(test_x)
test_mse = mean_squared_error(y_true=test_target, y_pred=reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))



Test RMSE: 81258.33827054496


# Try L2 Regularization in SGD

In [53]:
# SGD linear regression with an L2 penalty; alpha sets the penalty strength
sgd_reg_L2 = SGDRegressor(
    penalty='l2',
    alpha=0.1,
    eta0=0.01,
    max_iter=50,
)

sgd_reg_L2.fit(train_x, train_target)

SGDRegressor(alpha=0.1, max_iter=50)

### Generate the error metrics

In [60]:
# Train RMSE of the L2-regularized model
reg_train_pred = sgd_reg_L2.predict(train_x)
train_mse = mean_squared_error(y_true=train_target, y_pred=reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 102630.11187197933


In [61]:
# Test RMSE of the L2-regularized model
reg_test_pred = sgd_reg_L2.predict(test_x)
test_mse = mean_squared_error(y_true=test_target, y_pred=reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))



Test RMSE: 104796.16642246874


# Try ElasticNet in SGD

In [63]:
# SGD linear regression with an elastic-net penalty.
# l1_ratio=0.5 weights the L1 and L2 parts equally; alpha sets the overall strength.
sgd_reg_elastic = SGDRegressor(
    penalty='elasticnet',
    l1_ratio=0.5,
    alpha=0.1,
    eta0=0.01,
    max_iter=50,
)
sgd_reg_elastic.fit(train_x, train_target)



SGDRegressor(alpha=0.1, l1_ratio=0.5, max_iter=50, penalty='elasticnet')

### Generate the error metrics

In [65]:
# Train RMSE of the elastic-net model
reg_train_pred = sgd_reg_elastic.predict(train_x)
train_mse = mean_squared_error(y_true=train_target, y_pred=reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 101067.12256599613


In [66]:
# Test RMSE of the elastic-net model
reg_test_pred = sgd_reg_elastic.predict(test_x)
test_mse = mean_squared_error(y_true=test_target, y_pred=reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 103137.12047924419


# Create Polynomial Features

Create polynomial features with degree = 2. 

In [67]:
from sklearn.preprocessing import PolynomialFeatures

# Create second degree terms and interaction terms.
# The expansion is fitted on the training matrix only and then applied to both sets.
poly_features = PolynomialFeatures(degree=2).fit(train_x)

train_x_poly = poly_features.transform(train_x)

test_x_poly = poly_features.transform(test_x)

# This also creates polynomial/interaction terms for the one-hot encoded
# categorical columns, because the expansion is applied to the whole matrix.

# With degree=3 and two inputs a and b, the generated terms are every product
# of total degree <= 3: 1, a, b, a^2, a.b, b^2, a^3, a^2.b, a.b^2, b^3
# (a^2.b^2 has degree 4, so it is NOT generated).

# NOTE(review): the models trained on these features report RMSE values around
# 1e12, far worse than the mean-only baseline (~1.8e5). That suggests SGD
# diverged; the expanded features are not re-standardized here. Consider
# rescaling them or lowering eta0 - TODO confirm.

In [69]:
# The model is still linear - only its inputs are now the polynomial features.
pol_lin_reg = SGDRegressor(
    penalty=None,
    eta0=0.01,
    max_iter=1000,
)

pol_lin_reg.fit(train_x_poly, train_target)

SGDRegressor(penalty=None)

In [71]:
# Train RMSE of the unregularized model on polynomial features
reg_train_pred = pol_lin_reg.predict(train_x_poly)
train_mse = mean_squared_error(y_true=train_target, y_pred=reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 4238666499659.779


In [72]:
# Test RMSE of the unregularized model on polynomial features
reg_test_pred = pol_lin_reg.predict(test_x_poly)
test_mse = mean_squared_error(y_true=test_target, y_pred=reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 1145472496395.4016


In [74]:
# Inspect the expanded matrix to see how many columns the polynomial step produced
train_x_poly

<13724x4005 sparse matrix of type '<class 'numpy.float64'>'
	with 2347469 stored elements in Compressed Sparse Row format>

# Try L2 Regularization in SGD (with polynomial features)

In [75]:
# L2-regularized SGD regression, this time trained on the polynomial features.
# NOTE: this rebinds sgd_reg_L2, replacing the model that was fitted on the
# original (non-polynomial) features earlier in the notebook.
sgd_reg_L2 = SGDRegressor(
    penalty='l2',
    alpha=0.1,
    eta0=0.01,
    max_iter=50,
)

sgd_reg_L2.fit(train_x_poly, train_target)

SGDRegressor(alpha=0.1, max_iter=50)

### Generate the error metrics

In [77]:
# Train RMSE of the L2-regularized model on polynomial features
reg_train_pred = sgd_reg_L2.predict(train_x_poly)
train_mse = mean_squared_error(y_true=train_target, y_pred=reg_train_pred)
train_rmse = np.sqrt(train_mse)

print('Train RMSE: {}'.format(train_rmse))

Train RMSE: 2905513213952.3267


In [78]:
# Test RMSE of the L2-regularized model on polynomial features
reg_test_pred = sgd_reg_L2.predict(test_x_poly)
test_mse = mean_squared_error(y_true=test_target, y_pred=reg_test_pred)
test_rmse = np.sqrt(test_mse)

print('Test RMSE: {}'.format(test_rmse))

Test RMSE: 1714155326235.5654
