In [86]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import Imputer, StandardScaler
from keras import models
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasRegressor

### Load Training Data

In [3]:
data = pd.read_csv('input/train_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,logerror,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,yardbuildingsqft26,yearbuilt,numberofstories,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock
0,0,0.0276,1.0,,,2.0,3.0,,4.0,2.0,...,,1959.0,,122754.0,360170.0,2015.0,237416.0,6735.88,,60371070000000.0
1,1,-0.1684,,,,3.5,4.0,,,3.5,...,,2014.0,,346458.0,585529.0,2015.0,239071.0,10153.02,,
2,2,-0.004,1.0,,,3.0,2.0,,4.0,3.0,...,,1940.0,,61994.0,119906.0,2015.0,57912.0,11484.48,,60374640000000.0
3,3,0.0218,1.0,,,2.0,2.0,,4.0,2.0,...,,1987.0,,171518.0,244880.0,2015.0,73362.0,3048.74,,60372960000000.0
4,4,-0.005,,,,2.5,4.0,,,2.5,...,,1981.0,2.0,169574.0,434551.0,2015.0,264977.0,5488.96,,60590420000000.0


In [4]:
data.shape

(90275, 54)

In [75]:
data.fillna(-1.0) #Replace NaN with -1.0

Unnamed: 0.1,Unnamed: 0,logerror,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,yardbuildingsqft26,yearbuilt,numberofstories,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock
0,0,0.0276,1.0,-1.0,-1.0,2.0,3.0,-1.0,4.0,2.0,...,-1.0,1959.0,-1.0,122754.0,360170.0,2015.0,237416.0,6735.88,-1.0,6.037107e+13
1,1,-0.1684,-1.0,-1.0,-1.0,3.5,4.0,-1.0,-1.0,3.5,...,-1.0,2014.0,-1.0,346458.0,585529.0,2015.0,239071.0,10153.02,-1.0,-1.000000e+00
2,2,-0.0040,1.0,-1.0,-1.0,3.0,2.0,-1.0,4.0,3.0,...,-1.0,1940.0,-1.0,61994.0,119906.0,2015.0,57912.0,11484.48,-1.0,6.037464e+13
3,3,0.0218,1.0,-1.0,-1.0,2.0,2.0,-1.0,4.0,2.0,...,-1.0,1987.0,-1.0,171518.0,244880.0,2015.0,73362.0,3048.74,-1.0,6.037296e+13
4,4,-0.0050,-1.0,-1.0,-1.0,2.5,4.0,-1.0,-1.0,2.5,...,-1.0,1981.0,2.0,169574.0,434551.0,2015.0,264977.0,5488.96,-1.0,6.059042e+13
5,5,-0.2705,1.0,-1.0,-1.0,4.0,4.0,-1.0,1.0,4.0,...,-1.0,1982.0,-1.0,880650.0,2447951.0,2015.0,1567301.0,27126.57,-1.0,6.037621e+13
6,6,0.0440,-1.0,-1.0,-1.0,1.0,2.0,-1.0,7.0,1.0,...,-1.0,1939.0,-1.0,64549.0,111521.0,2015.0,46972.0,2304.97,-1.0,6.037542e+13
7,7,0.1638,-1.0,-1.0,-1.0,2.5,3.0,-1.0,-1.0,2.5,...,-1.0,1994.0,2.0,107000.0,306000.0,2015.0,199000.0,3745.50,-1.0,6.111003e+13
8,8,-0.0030,-1.0,-1.0,-1.0,1.0,2.0,-1.0,-1.0,1.0,...,-1.0,1984.0,-1.0,66834.0,210064.0,2015.0,143230.0,2172.88,-1.0,6.059042e+13
9,9,0.0843,-1.0,-1.0,-1.0,2.0,2.0,-1.0,-1.0,2.0,...,-1.0,1977.0,1.0,109977.0,190960.0,2015.0,80983.0,1940.26,-1.0,6.059063e+13


In [76]:
y = data['logerror']

In [77]:
y.shape

(90275,)

In [78]:
x = data.drop(['logerror'], axis=1)

In [79]:
x.shape

(90275, 53)

### Preprocess Data

In [80]:
x_all = x.values

In [81]:
x_all.shape

(90275, 53)

In [95]:
X_train, X_test, Y_train, Y_test = train_test_split(x_all, y, test_size=0.1, random_state=42)

In [96]:
X_train.shape

(81247, 53)

In [97]:
X_test.shape

(9028, 53)

In [98]:
imputer= Imputer()
imputer.fit(X_train)
X_train = imputer.transform(X_train)
imputer.fit(X_test)
X_test = imputer.transform(X_test)

In [99]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

### Model Architecture

In [102]:
model = Sequential()

model.add(Dense(32, activation='relu', input_dim=53))
model.add(Dropout(0.5))
model.add(Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mae', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 32)                1728      
_________________________________________________________________
dropout_14 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 33        
Total params: 1,761
Trainable params: 1,761
Non-trainable params: 0
_________________________________________________________________


### Execute Model and Predict

In [103]:
epochs = 20
batch_size = 64

early_stopping = EarlyStopping(monitor='val_loss', patience=2)

model_checkpoint = ModelCheckpoint(filepath='zestimate_model', verbose=1, save_best_only=True)

start = time.time()
model.fit(X_train, Y_train, validation_split=0.1, callbacks=[model_checkpoint, early_stopping], 
          epochs=epochs, batch_size=batch_size, verbose=2)
end = time.time()

print("Model took %0.2f seconds to train"%(end - start))

# Re-instantiate model to the best model saved
model = models.load_model('zestimate_model')

y_pred = model.predict(X_test, batch_size=batch_size)
score = model.evaluate(X_test, Y_test, verbose=0)

print(score)

Train on 73122 samples, validate on 8125 samples
Epoch 1/20
Epoch 00000: val_loss improved from inf to 0.06886, saving model to zestimate_model
2s - loss: 0.1765 - acc: 0.0089 - val_loss: 0.0689 - val_acc: 0.0090
Epoch 2/20
Epoch 00001: val_loss improved from 0.06886 to 0.06791, saving model to zestimate_model
1s - loss: 0.0698 - acc: 0.0095 - val_loss: 0.0679 - val_acc: 0.0090
Epoch 3/20
Epoch 00002: val_loss improved from 0.06791 to 0.06777, saving model to zestimate_model
1s - loss: 0.0686 - acc: 0.0095 - val_loss: 0.0678 - val_acc: 0.0090
Epoch 4/20
Epoch 00003: val_loss improved from 0.06777 to 0.06753, saving model to zestimate_model
1s - loss: 0.0685 - acc: 0.0095 - val_loss: 0.0675 - val_acc: 0.0090
Epoch 5/20
Epoch 00004: val_loss did not improve
1s - loss: 0.0684 - acc: 0.0095 - val_loss: 0.0676 - val_acc: 0.0090
Epoch 6/20
Epoch 00005: val_loss improved from 0.06753 to 0.06744, saving model to zestimate_model
2s - loss: 0.0683 - acc: 0.0095 - val_loss: 0.0674 - val_acc: 0.00