In [28]:
import numpy as np # NumPy is the fundamental package for scientific computing

import pandas as pd # Pandas is an easy-to-use data structures and data analysis tools
pd.set_option('display.max_columns', None) # To display all columns

import matplotlib.pyplot as plt # Matplotlib is a python 2D plotting library
%matplotlib inline 
# A magic command that tells matplotlib to render figures as static images in the Notebook.

import seaborn as sns # Seaborn is a visualization library based on matplotlib (attractive statistical graphics).
sns.set_style('whitegrid') # One of the five seaborn themes
import warnings
warnings.filterwarnings('ignore') # To ignore some of seaborn warning msg

from scipy import stats

from sklearn import linear_model # Scikit learn library that implements generalized linear models
from sklearn import neighbors # provides functionality for unsupervised and supervised neighbors-based learning methods
from sklearn.metrics import mean_squared_error # Mean squared error regression loss
from sklearn import preprocessing # provides functions and classes to change raw feature vectors

from math import log

In [31]:
data = pd.read_csv("kc_house_data.csv", parse_dates = ['date']) # load the data into a pandas dataframe
data.head(2) # Show the first 2 lines

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639


In [32]:
data.drop(['id', 'date'], axis = 1, inplace = True)

In [33]:
data['basement_present'] = data['sqft_basement'].apply(lambda x: 1 if x > 0 else 0) # Indicate whether there is a basement or not
data['renovated'] = data['yr_renovated'].apply(lambda x: 1 if x > 0 else 0) # 1 if the house has been renovated

In [34]:
categorial_cols = ['floors', 'view', 'condition', 'grade']

for cc in categorial_cols:
    dummies = pd.get_dummies(data[cc], drop_first=False)
    dummies = dummies.add_prefix("{}#".format(cc))
    data.drop(cc, axis=1, inplace=True)
    data = data.join(dummies)

In [35]:
dummies_zipcodes = pd.get_dummies(data['zipcode'], drop_first=False)
dummies_zipcodes.reset_index(inplace=True)
dummies_zipcodes = dummies_zipcodes.add_prefix("{}#".format('zipcode'))
dummies_zipcodes = dummies_zipcodes[['zipcode#98004','zipcode#98102','zipcode#98109','zipcode#98112','zipcode#98039','zipcode#98040']]
data.drop('zipcode', axis=1, inplace=True)
data = data.join(dummies_zipcodes)

In [36]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, train_size = 0.8, random_state = 10)

In [37]:
# A function that take one input of the dataset and return the RMSE (of the test data), and the intercept and coefficient
def simple_linear_model(train, test, input_feature):
    regr = linear_model.LinearRegression() # Create a linear regression object
    regr.fit(train.as_matrix(columns = [input_feature]), train.as_matrix(columns = ['price'])) # Train the model
    RMSE = mean_squared_error(test.as_matrix(columns = ['price']), 
                              regr.predict(test.as_matrix(columns = [input_feature])))**0.5 # Calculate the RMSE on test data
    return RMSE, regr.intercept_[0], regr.coef_[0][0]

In [38]:
RMSE, w0, w1 = simple_linear_model(train_data, test_data, 'sqft_living')
print ('RMSE for sqft_living is: %s ' %RMSE)
print ('intercept is: %s' %w0)
print ('coefficient is: %s' %w1)

RMSE for sqft_living is: 268279.6438833363 
intercept is: -36738.17734638124
coefficient is: 277.36412987021066


In [39]:
input_list = data.columns.values.tolist() # list of column name
input_list.remove('price')
simple_linear_result = pd.DataFrame(columns = ['feature', 'RMSE', 'intercept', 'coefficient'])

# loop that calculate the RMSE of the test data for each input 
for p in input_list:
    RMSE, w1, w0 = simple_linear_model(train_data, test_data, p)
    simple_linear_result = simple_linear_result.append({'feature':p, 'RMSE':RMSE, 'intercept':w0, 'coefficient': w1}
                                                       ,ignore_index=True)
simple_linear_result.sort_values('RMSE').head(10) # display the 10 best estimators


Unnamed: 0,feature,RMSE,intercept,coefficient
2,sqft_living,268279.643883,277.36413,-36738.18
5,sqft_above,304131.310592,266.306764,64617.14
11,sqft_living15,320686.541323,314.359911,-85025.9
1,bathrooms,324082.781919,246523.891877,18632.79
21,view#0,356019.00132,-435033.777431,932201.4
6,sqft_basement,357843.745395,258.126523,464296.6
40,grade#11,357964.423743,965286.415396,522266.3
39,grade#10,360773.700418,556992.601325,510702.4
0,bedrooms,361295.375626,117579.891853,143681.5
9,lat,365041.433662,814499.981062,-38197890.0


In [40]:
# A function that take multiple features as input and return the RMSE (of the test data), and the  intercept and coefficients
def multiple_regression_model(train, test, input_features):
    regr = linear_model.LinearRegression() # Create a linear regression object
    regr.fit(train.as_matrix(columns = input_features), train.as_matrix(columns = ['price'])) # Train the model
    RMSE = mean_squared_error(test.as_matrix(columns = ['price']), regr.predict(test.as_matrix(columns = input_features)))**0.5 # Calculate the RMSE on test data
    return RMSE, regr.intercept_[0], regr.coef_ 

In [42]:
print ('RMSE: %s, intercept: %s, coefficients: %s' %multiple_regression_model(train_data, test_data, ['sqft_living','bathrooms','bedrooms']))
print ('RMSE: %s, intercept: %s, coefficients: %s' %multiple_regression_model(train_data, test_data, ['sqft_above','view#0','bathrooms']))
print ('RMSE: %s, intercept: %s, coefficients: %s' %multiple_regression_model(train_data, test_data, ['bathrooms','bedrooms']))
print ('RMSE: %s, intercept: %s, coefficients: %s' %multiple_regression_model(train_data, test_data, ['view#0','grade#12','bedrooms','sqft_basement']))
print ('RMSE: %s, intercept: %s, coefficients: %s' %multiple_regression_model(train_data, test_data, ['sqft_living','bathrooms','view#0']))

RMSE: 264872.2835550954, intercept: 81100.95967753738, coefficients: [[   306.15090562   7913.53847651 -57658.90103459]]
RMSE: 282802.3649615518, intercept: 303531.2921883548, coefficients: [[ 1.98928518e+02 -3.17760670e+05  7.92684094e+04]]
RMSE: 323412.2692762604, intercept: -18432.905707209487, coefficients: [[235300.89998266  18030.65120532]]
RMSE: 320893.6584322489, intercept: 507958.88999206625, coefficients: [[-3.31957675e+05  1.35299763e+06  8.56179339e+04  1.24073575e+02]]
RMSE: 260210.04852365196, intercept: 205076.19892131374, coefficients: [[    258.00653033    -244.89749378 -223120.61245789]]


In [44]:
train_data['sqft_living_squared'] = train_data['sqft_living'].apply(lambda x: x**2) # create a new column in train_data
test_data['sqft_living_squared'] = test_data['sqft_living'].apply(lambda x: x**2) # create a new column in test_data
print ('RMSE: %s, intercept: %s, coefficients: %s' %multiple_regression_model(train_data, test_data, ['sqft_living','sqft_living_squared']))

RMSE: 246063.9592069224, intercept: 172106.56010562147, coefficients: [[9.31721994e+01 3.37454932e-02]]
