# Exercise - Regression


The data set for this exercise includes information on house sales in King County, WA (between May 2014 and May 2015). (Each row in the data set pertains to one house. There is a total of 21,613 houses in the data set). Use this data set to predict the sale price of a house (i.e., the `price` column) based on the characteristics of the house. A model can be helpful for buyers, sellers, realtors, and lenders.

## Description of Variables

The description and type of each variable is provided in "KC house data - Data Dictionary.docx". Make sure to read this document to learn about the variables.

## Goal

Use the **kc_house_data.csv** data set and build a model to predict **price**. <br>

# Read and Prepare the Data

In [1]:
# Common imports

import pandas as pd
import numpy as np

np.random.seed(42)

# Get the data

In [2]:
#We will predict the "price" value in the data set:

housing = pd.read_csv("kc_house_data.csv")
housing.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,432000.0,5.0,2.75,2060.0,329903.0,1.5,0,3,5,7.0,2060,0,1989.0,0,98022.0,47.1776,-121.944,2240,220232.0
1,170000.0,2.0,1.0,810.0,8424.0,1.0,0,0,4,6.0,810,0,1959.0,0,98023.0,47.3286,-122.346,820,8424.0
2,235000.0,3.0,1.0,960.0,5030.0,1.0,0,0,3,7.0,960,0,1955.0,0,98118.0,47.5611,-122.28,1460,5400.0
3,350000.0,2.0,1.0,830.0,5100.0,1.0,0,0,4,7.0,830,0,1942.0,0,98126.0,47.5259,-122.379,1220,5100.0
4,397380.0,2.0,1.0,1030.0,5072.0,1.0,0,0,3,6.0,1030,0,1924.0,1958,98115.0,47.6962,-122.294,1220,6781.0


# Split data (train/test)

In [3]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(housing, test_size=0.3)

In [4]:
train.isna().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         1
floors           1
waterfront       0
view             0
condition        0
grade            1
sqft_above       0
sqft_basement    0
yr_built         1
yr_renovated     0
zipcode          2
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [5]:
test.isna().sum()

price            0
bedrooms         1
bathrooms        0
sqft_living      1
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       1
dtype: int64

# Data Prep

Perform your data prep here. You can use pipelines like we do in the tutorials. Otherwise, feel free to use your own data prep steps. Eventually, you should do the following at a minimum:<br>
- Separate inputs from target<br>
- Impute/remove missing values<br>
- Standardize the continuous variables<br>
- One-hot encode categorical variables<br>

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Separate the target variable 

In [7]:
train_target = train['price']
test_target = test['price']

train_inputs = train.drop(['price'], axis=1)
test_inputs = test.drop(['price'], axis=1)

## Set the data type of "zipcode" to categorical

In [8]:
train_inputs['zipcode'] = train_inputs['zipcode'].astype('object')
test_inputs['zipcode'] = test_inputs['zipcode'].astype('object')

##  Identify the numeric, binary, and categorical columns

In [9]:
# Identify the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [10]:
# Identify the binary columns so we can pass them through without transforming
binary_columns = ['waterfront']

In [11]:
# Be careful: numerical columns already includes the binary columns,
# So, we need to remove the binary columns from numerical columns.

for col in binary_columns:
    numeric_columns.remove(col)

In [12]:
binary_columns

['waterfront']

In [13]:
numeric_columns

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [14]:
categorical_columns

['zipcode']

# Pipeline

In [15]:
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [16]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=99999)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [17]:
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [18]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='passthrough')

#passtrough is an optional step. You don't have to use it.

# Transform: fit_transform() for TRAIN

In [19]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x
train_x.toarray()

array([[-0.39940789,  0.17591684, -0.34165211, ...,  0.        ,
         0.        ,  0.        ],
       [-1.46007876, -1.4459271 , -1.62003209, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66126298, -0.79718952, -0.40149117, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.39940789, -1.4459271 , -0.96724231, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39940789, -0.47282073, -0.58644827, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39940789, -0.47282073, -0.83668436, ...,  0.        ,
         0.        ,  0.        ]])

In [20]:
train_x.shape

(15129, 88)

# Tranform: transform() for TEST

In [21]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x
test_x.toarray()

array([[ 0.66126298, -0.14845195, -0.28181304, ...,  0.        ,
         0.        ,  0.        ],
       [-0.39940789,  0.17591684,  0.13162048, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.72193385,  1.1490232 ,  1.79623441, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.39940789,  1.1490232 ,  0.05546167, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.72193385, -0.79718952, -0.15125509, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.66126298, -0.14845195,  0.09898099, ...,  0.        ,
         0.        ,  0.        ]])

In [22]:
test_x.shape

(6484, 88)

# Calculate the Baseline

In [23]:
from sklearn.dummy import DummyRegressor
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(train_x, train_target)

In [24]:
from sklearn.metrics import mean_squared_error

In [25]:
dummy_train_pred = dummy_regr.predict(train_x)
baseline_train_mse = mean_squared_error(train_target, dummy_train_pred)
baseline_train_rmse = np.sqrt(baseline_train_mse)
print('Baseline Train RMSE: {}' .format(baseline_train_rmse))

Baseline Train RMSE: 368963.0257566831


In [26]:
dummy_test_pred = dummy_regr.predict(test_x)
baseline_test_mse = mean_squared_error (test_target, dummy_test_pred)
baseline_test_rmse = np.sqrt(baseline_test_mse)
print('Baseline Test RMSE: {}' .format(baseline_test_rmse))

Baseline Test RMSE: 363575.99565640965


# Train a SGD model (with no regularization)

In [27]:
from sklearn.linear_model import SGDRegressor 

sgd_reg = SGDRegressor(max_iter=100, penalty=None, eta0=0.01) 
sgd_reg.fit(train_x, train_target)

In [28]:
sgd_reg.predict(test_x)



array([425384.27033907, 325570.14955796, 930545.40815889, ...,
       519740.41374377, 648073.6462539 , 293905.71340202])

In [29]:
pd.options.display.float_format = '{:.2f}'.format
predictions = pd.DataFrame(sgd_reg.predict(test_x), columns=['Predicted'])
predictions

Unnamed: 0,Predicted
0,425384.27
1,325570.15
2,930545.41
3,246863.79
4,1249649.00
...,...
6479,1208015.01
6480,641577.15
6481,519740.41
6482,648073.65


In [30]:
predictions['Actual'] = np.array(test_target)
predictions

Unnamed: 0,Predicted,Actual
0,425384.27,378950.00
1,325570.15,285167.00
2,930545.41,979700.00
3,246863.79,360000.00
4,1249649.00,640000.00
...,...,...
6479,1208015.01,1200000.00
6480,641577.15,570000.00
6481,519740.41,509000.00
6482,648073.65,780000.00


### Generate the error metrics

In [31]:
#Train RMSE
reg_train_pred = sgd_reg.predict(train_x)
train_mse = mean_squared_error(train_target, reg_train_pred)
train_rmse = np.sqrt(train_mse)
print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 161674.68617822853


In [32]:
#Test RMSE
reg_test_pred = sgd_reg.predict(test_x)
test_mse = mean_squared_error (test_target, reg_test_pred)
test_rmse = np.sqrt(test_mse)
print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 163809.50145112647


# Try L1 Regularization in SGD

In [33]:
sgd_reg_L1 = SGDRegressor(max_iter=50, penalty='l1', alpha = 0.1, eta0=0.01)
sgd_reg_L1.fit(train_x, train_target)




### Generate the error metrics

In [34]:
#Train RMSE
reg_train_pred = sgd_reg_L1.predict(train_x)
train_mse = mean_squared_error(train_target, reg_train_pred)
train_rmse = np.sqrt(train_mse)
print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 164678.01993102988


In [35]:
#Test RMSE
reg_test_pred = sgd_reg_L1.predict(test_x)
test_mse = mean_squared_error (test_target, reg_test_pred)
test_rmse = np.sqrt(test_mse)
print('Test RMSE: {}' .format(test_rmse))

# This is a worse result as the original model

Test RMSE: 165909.94073577828


# Try L2 Regularization in SGD

In [36]:
sgd_reg_L2 = SGDRegressor(max_iter=50, penalty='l2', alpha = 0.1, eta0=0.01)
sgd_reg_L2.fit(train_x, train_target)





### Generate the error metrics

In [37]:
#Train RMSE
reg_train_pred = sgd_reg_L2.predict(train_x)
train_mse = mean_squared_error(train_target, reg_train_pred)
train_rmse = np.sqrt(train_mse)
print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 205017.94524840527


In [38]:
#Test RMSE
reg_test_pred = sgd_reg_L2.predict(test_x)
test_mse = mean_squared_error (test_target, reg_test_pred)
test_rmse = np.sqrt(test_mse)
print('Test RMSE: {}' .format(test_rmse))

#This one is even worse

Test RMSE: 200705.61100656056


# Try ElasticNet in SGD

In [39]:
sgd_reg_elastic = SGDRegressor(max_iter=50, penalty='elasticnet', l1_ratio=0.75, alpha = 0.1, eta0=0.01)
sgd_reg_elastic.fit(train_x, train_target)




### Generate the error metrics

In [40]:
#Train RMSE
reg_train_pred = sgd_reg_elastic.predict(train_x)
train_mse = mean_squared_error(train_target, reg_train_pred)
train_rmse = np.sqrt(train_mse)
print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 200978.50335422318


In [41]:
#Test RMSE
reg_test_pred = sgd_reg_elastic.predict(test_x)
test_mse = mean_squared_error (test_target, reg_test_pred)
test_rmse = np.sqrt(test_mse)
print('Test RMSE: {}' .format(test_rmse))

Test RMSE: 198630.98738646397


# Create Polynomial Features

Create polynomial features with degree = 2. 

In [42]:
from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(degree=2).fit(train_x)
train_x_poly = poly_features.transform(train_x)
test_x_poly = poly_features.transform(test_x)


In [43]:
train_x_poly.shape, test_x_poly.shape

((15129, 4005), (6484, 4005))

# Try L2 Regularization in SGD (with polynomial features)

In [44]:
poly_reg_L2 = SGDRegressor(max_iter=50, penalty='l2', alpha = 0.1, eta0=0.01)
poly_reg_L2.fit(train_x_poly, train_target)

### Generate the error metrics

In [45]:
poly_train_pred = poly_reg_L2.predict(train_x_poly)
train_mse = mean_squared_error(train_target, poly_train_pred)
train_rmse = np.sqrt(train_mse)
print('Train RMSE: {}' .format(train_rmse))

Train RMSE: 1566852169403.6216


In [46]:
poly_test_pred = poly_reg_L2.predict(test_x_poly)
test_mse = mean_squared_error(test_target, poly_test_pred)
test_rmse = np.sqrt(test_mse)
print('Test RMSE: {}' .format(test_rmse))

# Going in the wrong direction...

Test RMSE: 2123235777181.5342
