In [50]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [45]:
# Read kc_house_data.csv into a DataFrame, parsing the 'date' column as
# datetimes, then list the dtype of every column.
data = pd.read_csv(
    "kc_house_data.csv",
    parse_dates=["date"],
)
data.dtypes

id                        int64
date             datetime64[ns]
price                   float64
bedrooms                  int64
bathrooms               float64
sqft_living               int64
sqft_lot                  int64
floors                  float64
waterfront                int64
view                      int64
condition                 int64
grade                     int64
sqft_above                int64
sqft_basement             int64
yr_built                  int64
yr_renovated              int64
zipcode                   int64
lat                     float64
long                    float64
sqft_living15             int64
sqft_lot15                int64
dtype: object

In [46]:
# Summary statistics for every column, not only the numeric ones.
data.describe(include="all")

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
unique,,372,,,,,,,,,,,,,,,,,,,
top,,2014-06-23 00:00:00,,,,,,,,,,,,,,,,,,,
freq,,142,,,,,,,,,,,,,,,,,,,
first,,2014-05-02 00:00:00,,,,,,,,,,,,,,,,,,,
last,,2015-05-27 00:00:00,,,,,,,,,,,,,,,,,,,
mean,4580302000.0,,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652
std,2876566000.0,,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631
min,1000102.0,,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0


In [47]:
# Mean of every numeric column per zipcode, most expensive zipcodes first.
# numeric_only=True is passed explicitly so the datetime 'date' column is
# left out of the aggregation regardless of the pandas version's default.
# reset_index() turns 'zipcode' back into an ordinary column; the sort keeps
# the pre-sort row labels, so the index shows each zipcode's original position.
zipcode_data = (
    data.groupby('zipcode')
    .mean(numeric_only=True)
    .reset_index()
    .sort_values(by="price", ascending=False)
)
zipcode_data.head(5)

Unnamed: 0,zipcode,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
24,98039,3755708000.0,2160607.0,4.06,3.2,3800.9,17403.56,1.56,0.02,0.44,3.48,9.56,3290.9,510.0,1972.52,398.6,47.62584,-122.23354,3132.2,17291.1
3,98004,4394237000.0,1355927.0,3.85489,2.527603,2909.022082,13104.22082,1.432177,0.003155,0.305994,3.495268,8.687697,2419.842271,489.179811,1971.470032,208.14511,47.616183,-122.205189,2674.700315,12805.022082
25,98040,4199323000.0,1194230.0,4.028369,2.716312,3106.833333,13704.464539,1.505319,0.042553,0.783688,3.787234,8.960993,2474.329787,632.503546,1971.471631,233.574468,47.559846,-122.225592,2898.744681,12801.283688
48,98112,4826282000.0,1095499.0,3.509294,2.373606,2498.743494,4990.431227,1.806691,0.0,0.148699,3.516729,8.442379,2005.695167,493.048327,1944.449814,223.144981,47.629619,-122.297866,2280.078067,4898.159851
41,98102,4123034000.0,901258.3,3.219048,2.261905,2159.742857,3616.190476,2.004762,0.0,0.171429,3.371429,8.285714,1764.933333,394.809524,1952.52381,113.92381,47.634607,-122.322248,1958.333333,3310.133333


In [48]:
categorial_cols = ['floors', 'view', 'condition', 'grade']

# One-hot encode the categorical columns in a single call. get_dummies removes
# each listed column and appends its indicator columns, named
# "<column>#<value>", after the remaining columns in the order given above.
data = pd.get_dummies(data, columns=categorial_cols, prefix_sep='#', drop_first=False)

# Zipcodes: keep indicator columns only for the five zipcodes listed here
# (the ones hard-coded in this notebook); all other zipcodes get no indicator.
selected_zipcodes = [98039, 98004, 98040, 98112, 98102]
dummies_zipcodes = pd.get_dummies(data['zipcode'], prefix='zipcode', prefix_sep='#', drop_first=False)
dummies_zipcodes = dummies_zipcodes[['zipcode#{}'.format(z) for z in selected_zipcodes]]

# Replace the raw zipcode column with the selected indicators (joined on the row index).
data = data.drop('zipcode', axis=1).join(dummies_zipcodes)

data.dtypes

id                        int64
date             datetime64[ns]
price                   float64
bedrooms                  int64
bathrooms               float64
sqft_living               int64
sqft_lot                  int64
waterfront                int64
sqft_above                int64
sqft_basement             int64
yr_built                  int64
yr_renovated              int64
lat                     float64
long                    float64
sqft_living15             int64
sqft_lot15                int64
floors#1.0                uint8
floors#1.5                uint8
floors#2.0                uint8
floors#2.5                uint8
floors#3.0                uint8
floors#3.5                uint8
view#0                    uint8
view#1                    uint8
view#2                    uint8
view#3                    uint8
view#4                    uint8
condition#1               uint8
condition#2               uint8
condition#3               uint8
condition#4               uint8
conditio

In [51]:
# Binary flags derived from existing columns (vectorised comparisons rather
# than a per-row apply; the result is the same 0/1 int64 column).
data['basement_present'] = (data['sqft_basement'] > 0).astype('int64')  # 1 if there is a basement
data['renovated'] = (data['yr_renovated'] > 0).astype('int64')  # 1 if the house has been renovated

# Polynomial and log transforms of the living area.
data['sqft_living_squared'] = data['sqft_living'] ** 2
data['sqft_living_cubed'] = data['sqft_living'] ** 3
data['log_sqft_living'] = np.log(data['sqft_living'])

# Bedroom-count terms: square and interaction with bathrooms.
data['bedrooms_squared'] = data['bedrooms'] ** 2
data['bed_bath_rooms'] = data['bedrooms'] * data['bathrooms']

# The split below selects columns by position, so fail loudly if the leading
# columns are not laid out as expected instead of silently training on the
# wrong target.
assert list(data.columns[:3]) == ['id', 'date', 'price'], "unexpected column order in data"

data_arr = np.array(data)
print(data_arr.shape)

# Column 2 (price) is the target, kept 2-D via the 2:3 slice; columns 3 onward
# are the features, which leaves id and date out of the model inputs.
train_X, test_X, train_y, test_y = train_test_split(
    data_arr[:, 3:],
    data_arr[:, 2:3],
    train_size=0.8,
    random_state=0,
)

print("train X shape {0},train y shape {1}".format(train_X.shape,train_y.shape))
print("test X shape {0},test y shape {1}".format(test_X.shape,test_y.shape))

(21613, 56)
train X shape (17290, 53),train y shape (17290, 1)
test X shape (4323, 53),test y shape (4323, 1)


In [None]:
# Convert the splits to float32 matrices. np.asmatrix is used instead of the
# np.mat alias, which no longer exists in NumPy 2.0; both names refer to the
# same function on older NumPy versions.
train_X = np.asmatrix(train_X, dtype=np.float32)
train_y = np.asmatrix(train_y, dtype=np.float32)
test_X = np.asmatrix(test_X, dtype=np.float32)
test_y = np.asmatrix(test_y, dtype=np.float32)

# NOTE(review): feature_normalize is not defined in the visible part of this
# notebook. From the unpacking below it is presumed to return
# (normalized_data, stds, means), with stds/means indexable as [0, column] --
# confirm against its definition.
train_X, train_X_stds, train_X_means = feature_normalize(train_X)
train_y, train_y_stds, train_y_means = feature_normalize(train_y)

# Scale the test features column by column with the statistics computed on
# the training set, so no information from the test set is used.
for c in range(test_X.shape[1]):
    test_X[:, c] = (test_X[:, c] - train_X_means[0, c]) / train_X_stds[0, c]

# Same for the test target, using the training target's mean and std.
test_y_fn = (test_y - train_y_means[0, 0]) / train_y_stds[0, 0]

In [52]:
# Display the parsed 'date' column.
data.loc[:, 'date']

0       2014-10-13
1       2014-12-09
2       2015-02-25
3       2014-12-09
4       2015-02-18
5       2014-05-12
6       2014-06-27
7       2015-01-15
8       2015-04-15
9       2015-03-12
10      2015-04-03
11      2014-05-27
12      2014-05-28
13      2014-10-07
14      2015-03-12
15      2015-01-24
16      2014-07-31
17      2014-05-29
18      2014-12-05
19      2015-04-24
20      2014-05-14
21      2014-08-26
22      2014-07-03
23      2014-05-16
24      2014-11-20
25      2014-11-03
26      2014-06-26
27      2014-12-01
28      2014-06-24
29      2015-03-02
           ...    
21583   2014-06-10
21584   2014-12-02
21585   2014-08-28
21586   2014-10-15
21587   2015-03-05
21588   2014-11-13
21589   2014-09-10
21590   2014-05-14
21591   2014-10-02
21592   2015-04-16
21593   2015-03-17
21594   2014-10-17
21595   2014-10-31
21596   2014-08-13
21597   2015-04-21
21598   2014-10-13
21599   2014-09-15
21600   2014-10-15
21601   2015-04-07
21602   2014-06-26
21603   2014-08-25
21604   2015