In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import Imputer, StandardScaler
from keras import models
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor

Using TensorFlow backend.


### Load Training Data

In [2]:
data = pd.read_csv('train_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,logerror,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,yardbuildingsqft26,yearbuilt,numberofstories,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock
0,0,0.0276,1.0,,,2.0,3.0,,4.0,2.0,...,,1959.0,,122754.0,360170.0,2015.0,237416.0,6735.88,,60371070000000.0
1,1,-0.1684,,,,3.5,4.0,,,3.5,...,,2014.0,,346458.0,585529.0,2015.0,239071.0,10153.02,,
2,2,-0.004,1.0,,,3.0,2.0,,4.0,3.0,...,,1940.0,,61994.0,119906.0,2015.0,57912.0,11484.48,,60374640000000.0
3,3,0.0218,1.0,,,2.0,2.0,,4.0,2.0,...,,1987.0,,171518.0,244880.0,2015.0,73362.0,3048.74,,60372960000000.0
4,4,-0.005,,,,2.5,4.0,,,2.5,...,,1981.0,2.0,169574.0,434551.0,2015.0,264977.0,5488.96,,60590420000000.0


In [3]:
data.shape

(90275, 54)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90275 entries, 0 to 90274
Data columns (total 54 columns):
Unnamed: 0                      90275 non-null int64
logerror                        90275 non-null float64
airconditioningtypeid           28781 non-null float64
architecturalstyletypeid        261 non-null float64
basementsqft                    43 non-null float64
bathroomcnt                     90275 non-null float64
bedroomcnt                      90275 non-null float64
buildingclasstypeid             16 non-null float64
buildingqualitytypeid           57364 non-null float64
calculatedbathnbr               89093 non-null float64
decktypeid                      658 non-null float64
finishedfloor1squarefeet        6856 non-null float64
calculatedfinishedsquarefeet    89614 non-null float64
finishedsquarefeet12            85596 non-null float64
finishedsquarefeet13            33 non-null float64
finishedsquarefeet15            3564 non-null float64
finishedsquarefeet50        

In [5]:
data.describe()

Unnamed: 0.1,Unnamed: 0,logerror,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,yardbuildingsqft26,yearbuilt,numberofstories,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock
count,90275.0,90275.0,28781.0,261.0,43.0,90275.0,90275.0,16.0,57364.0,89093.0,...,95.0,89519.0,20570.0,89895.0,90274.0,90275.0,90274.0,90269.0,1783.0,89670.0
mean,45137.0,0.011457,1.816372,7.229885,713.581395,2.279474,3.031869,4.0,5.565407,2.309216,...,311.694737,1968.53287,1.440739,180093.4,457672.6,2015.0,278335.3,5983.975927,13.402692,60491510000000.0
std,26060.292113,0.161079,2.974168,2.716196,437.434198,1.004271,1.156436,0.0,1.900602,0.976172,...,346.35485,23.763475,0.544498,209129.9,554884.4,0.0,400495.5,6838.876956,2.715966,204660500000.0
min,0.0,-4.605,1.0,2.0,100.0,0.0,0.0,4.0,1.0,1.0,...,18.0,1885.0,1.0,100.0,22.0,2015.0,22.0,49.08,6.0,60371010000000.0
25%,22568.5,-0.0253,1.0,7.0,407.5,2.0,2.0,4.0,4.0,2.0,...,100.0,1953.0,1.0,81245.0,199023.2,2015.0,82228.0,2872.83,13.0,60373200000000.0
50%,45137.0,0.006,1.0,7.0,616.0,2.0,3.0,4.0,7.0,2.0,...,159.0,1970.0,1.0,132000.0,342872.0,2015.0,192970.0,4542.75,14.0,60376150000000.0
75%,67705.5,0.0392,1.0,7.0,872.0,3.0,4.0,4.0,7.0,3.0,...,361.0,1987.0,2.0,210534.5,540589.0,2015.0,345419.5,6901.09,15.0,60590420000000.0
max,90274.0,4.737,13.0,21.0,1555.0,20.0,16.0,4.0,12.0,20.0,...,1366.0,2015.0,4.0,9948100.0,27750000.0,2015.0,24500000.0,321936.09,99.0,61110090000000.0


In [6]:
missing_df = data.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / data.shape[0]
del_cols = missing_df.loc[missing_df['missing_ratio']>0.6]['column_name']

In [7]:
data.drop(list(del_cols), axis=1, inplace=True)

In [8]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90275 entries, 0 to 90274
Data columns (total 27 columns):
logerror                        90275 non-null float64
bathroomcnt                     90275 non-null float64
bedroomcnt                      90275 non-null float64
buildingqualitytypeid           57364 non-null float64
calculatedbathnbr               89093 non-null float64
calculatedfinishedsquarefeet    89614 non-null float64
finishedsquarefeet12            85596 non-null float64
fips                            90275 non-null float64
fullbathcnt                     89093 non-null float64
heatingorsystemtypeid           56080 non-null float64
latitude                        90275 non-null float64
longitude                       90275 non-null float64
lotsizesquarefeet               80125 non-null float64
propertylandusetypeid           90275 non-null float64
rawcensustractandblock          90275 non-null float64
regionidcity                    88472 non-null float64
regionidcou

In [10]:
data.fillna(data.mean(), inplace=True)

Unnamed: 0,logerror,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,fips,fullbathcnt,heatingorsystemtypeid,...,regionidzip,roomcnt,unitcnt,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock
0,0.0276,2.0,3.0,4.000000,2.0,1684.0,1684.000000,6037.0,2.0,2.000000,...,96370.0,0.0,1.000000,1959.0,122754.0,360170.0,2015.0,237416.0,6735.88,6.037107e+13
1,-0.1684,3.5,4.0,5.565407,3.5,2263.0,2263.000000,6059.0,3.0,3.926979,...,96962.0,0.0,1.110414,2014.0,346458.0,585529.0,2015.0,239071.0,10153.02,6.049151e+13
2,-0.0040,3.0,2.0,4.000000,3.0,2217.0,2217.000000,6037.0,3.0,2.000000,...,96293.0,0.0,1.000000,1940.0,61994.0,119906.0,2015.0,57912.0,11484.48,6.037464e+13
3,0.0218,2.0,2.0,4.000000,2.0,839.0,839.000000,6037.0,2.0,2.000000,...,96222.0,0.0,1.000000,1987.0,171518.0,244880.0,2015.0,73362.0,3048.74,6.037296e+13
4,-0.0050,2.5,4.0,5.565407,2.5,2283.0,2283.000000,6059.0,2.0,3.926979,...,96961.0,8.0,1.110414,1981.0,169574.0,434551.0,2015.0,264977.0,5488.96,6.059042e+13
5,-0.2705,4.0,4.0,1.000000,4.0,3067.0,3067.000000,6037.0,4.0,2.000000,...,96109.0,0.0,1.000000,1982.0,880650.0,2447951.0,2015.0,1567301.0,27126.57,6.037621e+13
6,0.0440,1.0,2.0,7.000000,1.0,1297.0,1297.000000,6037.0,1.0,7.000000,...,96091.0,0.0,1.000000,1939.0,64549.0,111521.0,2015.0,46972.0,2304.97,6.037542e+13
7,0.1638,2.5,3.0,5.565407,2.5,1763.0,1763.000000,6111.0,2.0,3.926979,...,97101.0,6.0,1.110414,1994.0,107000.0,306000.0,2015.0,199000.0,3745.50,6.111003e+13
8,-0.0030,1.0,2.0,5.565407,1.0,796.0,796.000000,6059.0,1.0,3.926979,...,96987.0,0.0,1.110414,1984.0,66834.0,210064.0,2015.0,143230.0,2172.88,6.059042e+13
9,0.0843,2.0,2.0,5.565407,2.0,1260.0,1260.000000,6059.0,2.0,3.926979,...,96963.0,5.0,1.110414,1977.0,109977.0,190960.0,2015.0,80983.0,1940.26,6.059063e+13


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90275 entries, 0 to 90274
Data columns (total 27 columns):
logerror                        90275 non-null float64
bathroomcnt                     90275 non-null float64
bedroomcnt                      90275 non-null float64
buildingqualitytypeid           90275 non-null float64
calculatedbathnbr               90275 non-null float64
calculatedfinishedsquarefeet    90275 non-null float64
finishedsquarefeet12            90275 non-null float64
fips                            90275 non-null float64
fullbathcnt                     90275 non-null float64
heatingorsystemtypeid           90275 non-null float64
latitude                        90275 non-null float64
longitude                       90275 non-null float64
lotsizesquarefeet               90275 non-null float64
propertylandusetypeid           90275 non-null float64
rawcensustractandblock          90275 non-null float64
regionidcity                    90275 non-null float64
regionidcou

## Feature Selection

In [12]:
X = data.drop(['logerror'], axis=1)

In [13]:
y = data['logerror']

In [14]:
(X.shape, y.shape)

((90275, 26), (90275,))

In [15]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor

In [16]:
rfe = RFE(DecisionTreeRegressor(), 8)
fit = rfe.fit(X, y)

In [17]:
fit.support_

array([False, False, False, False,  True, False, False, False, False,
        True,  True,  True, False, False, False, False, False, False,
       False, False,  True,  True, False,  True,  True, False], dtype=bool)

In [18]:
fit.ranking_

array([12,  8,  9, 10,  1,  4, 18, 14, 15,  1,  1,  1, 11,  7,  6, 16,  3,
       13, 17,  2,  1,  1, 19,  1,  1,  5])

In [19]:
X = data[['bedroomcnt','calculatedfinishedsquarefeet','latitude', 'longitude', 'lotsizesquarefeet', 
          'structuretaxvaluedollarcnt', 'taxvaluedollarcnt','landtaxvaluedollarcnt','taxamount', 'yearbuilt']]

In [20]:
X.head()

Unnamed: 0,bedroomcnt,calculatedfinishedsquarefeet,latitude,longitude,lotsizesquarefeet,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,yearbuilt
0,3.0,1684.0,34280990.0,-118488536.0,7528.0,122754.0,360170.0,237416.0,6735.88,1959.0
1,4.0,2263.0,33668120.0,-117677556.0,3643.0,346458.0,585529.0,239071.0,10153.02,2014.0
2,2.0,2217.0,34136312.0,-118175032.0,11423.0,61994.0,119906.0,57912.0,11484.48,1940.0
3,2.0,839.0,33755800.0,-118309000.0,70859.0,171518.0,244880.0,73362.0,3048.74,1987.0
4,4.0,2283.0,33485643.0,-117700234.0,6000.0,169574.0,434551.0,264977.0,5488.96,1981.0


## Training Data

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [22]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Decision Tree

In [23]:
dtree = DecisionTreeRegressor()

In [24]:
dtree.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [25]:
y_pred = dtree.predict(X_test)

In [26]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

-1.0001341417099785

## Random Forest

In [27]:
from sklearn.ensemble import RandomForestRegressor

In [61]:
rfr = RandomForestRegressor(n_estimators=800, max_depth=8)

In [62]:
rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=800, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [58]:
y_pred = rfr.predict(X_test)

In [59]:
r2_score(y_test, y_pred)

0.0089317308855977329

## Gradient Boosting

In [55]:
from sklearn.ensemble import GradientBoostingRegressor

In [56]:
gbr = GradientBoostingRegressor()

In [57]:
gbr.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [58]:
y_pred = gbr.predict(X_test)

In [59]:
r2_score(y_test, y_pred)

0.0070171587345255748

### Model Architecture

In [125]:
model = Sequential()

model.add(Dense(300, input_dim=X.shape[1], activation='relu'))
model.add(Dense(200, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mae')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_92 (Dense)             (None, 300)               3300      
_________________________________________________________________
dense_93 (Dense)             (None, 200)               60200     
_________________________________________________________________
dense_94 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_95 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_96 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_97 (Dense)             (None, 1)                 26        
Total params: 89,951
Trainable params: 89,951
Non-trainable params: 0
_________________________________________________________________


### Execute Model and Predict

In [126]:
model.fit(X_train, y_train, epochs=15, batch_size=32, verbose=2)

Epoch 1/15
6s - loss: 0.0683
Epoch 2/15
5s - loss: 0.0678
Epoch 3/15
5s - loss: 0.0677
Epoch 4/15
5s - loss: 0.0677
Epoch 5/15
5s - loss: 0.0676
Epoch 6/15
5s - loss: 0.0676
Epoch 7/15
5s - loss: 0.0676
Epoch 8/15
5s - loss: 0.0676
Epoch 9/15
5s - loss: 0.0675
Epoch 10/15
5s - loss: 0.0675
Epoch 11/15
5s - loss: 0.0675
Epoch 12/15
5s - loss: 0.0674
Epoch 13/15
5s - loss: 0.0674
Epoch 14/15
6s - loss: 0.0674
Epoch 15/15
5s - loss: 0.0673


<keras.callbacks.History at 0x192bd5b70>

In [127]:
score = model.evaluate(X_test, y_test, batch_size=16)
print("\n")
print(score)

y_pred = model.predict(X_test, batch_size=16)

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)


0.0682949297253


0.0044720819706025594