# Imports

In [126]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms

import pandas as pd
import numpy as np

import keras
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Flatten, Lambda, Conv2D, MaxPooling2D, Dropout, BatchNormalization
from keras import regularizers
from keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load Data

In [127]:
# Read the CSV into a DataFrame; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv(filepath_or_buffer="data/data_cleaned.csv")

# Data Preparation

In [128]:
# One-hot encode the listed columns: each of them is replaced by indicator
# columns, and the original column name is used as the prefix of the new ones.
categorical_columns = [
    "HEAT", "AC", "QUALIFIED", "STYLE", "STRUCT", "GRADE",
    "CNDTN", "EXTWALL", "INTWALL", "ROOF", "ASSESSMENT_SUBNBHD",
]
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    prefix=categorical_columns,
)

In [129]:
# Separate the regression target (PRICE) from the remaining feature columns.
y_data = data["PRICE"]
x_data = data.drop(columns=["PRICE"])

In [130]:
# Min-max scale every feature column: (value - column min) / (column max - column min).
# NOTE(review): the min/max are computed on the full dataset, i.e. before the
# train/test split performed further down, so test rows influence the scaling
# - confirm this is acceptable.
x_data = x_data.astype(float)
col_min = x_data.min()
col_range = x_data.max() - col_min
# A constant column has max == min; dividing by that zero range would turn the
# whole column into NaN and propagate NaN into training. Divide by 1 instead,
# which leaves such a column as all zeros.
col_range = col_range.where(col_range != 0, 1.0)
x_data = (x_data - col_min) / col_range

In [131]:
# Hold out 33% of the rows as the test split; the fixed random_state makes the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.33,
    random_state=42,
)

# Neural Network

In [132]:
# Fully connected regression network:
#   * three tanh Dense layers, each followed by batch normalisation and dropout
#   * three ReLU Dense layers, each followed by dropout (the first two carry
#     weight/bias regularisers)
#   * one output unit with no activation, i.e. a linear output

# Take the input width from the training data instead of hard-coding it, so the
# model still builds when the number of encoded feature columns changes.
n_features = x_train.shape[1]

model = Sequential()

# --- tanh blocks ---
model.add(Dense(2048, activation="tanh", kernel_initializer='normal', input_shape=(n_features,)))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Dense(1024, activation="tanh", kernel_initializer='normal'))
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Dense(1024, activation="tanh", kernel_initializer='normal'))
model.add(BatchNormalization())
model.add(Dropout(0.4))

# --- ReLU blocks ---
model.add(Dense(1024, activation="relu", kernel_initializer='normal',
    kernel_regularizer=regularizers.l1(0.02), bias_regularizer=regularizers.l1(0.02)))
model.add(Dropout(0.4))
# NOTE(review): l1_l2(0.02) passes a single positional factor; presumably only
# the l1 term is set and l2 keeps the library default - confirm this is intended.
model.add(Dense(128, activation="relu", kernel_initializer='normal',
    kernel_regularizer=regularizers.l1_l2(0.02), bias_regularizer=regularizers.l1_l2(0.02)))
model.add(Dropout(0.4))
model.add(Dense(128, activation="relu", kernel_initializer='normal'))
model.add(Dropout(0.4))

# --- output ---
model.add(Dense(1))

# Optimise mean squared error with Nadam; mean absolute error is tracked as an
# additional metric.
model.compile(
    loss='mean_squared_error',
    optimizer='nadam',
    metrics=["mae"])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_72 (Dense)             (None, 2048)              518144    
_________________________________________________________________
batch_normalization_10 (Batc (None, 2048)              8192      
_________________________________________________________________
dropout_59 (Dropout)         (None, 2048)              0         
_________________________________________________________________
dense_73 (Dense)             (None, 1024)              2098176   
_________________________________________________________________
batch_normalization_11 (Batc (None, 1024)              4096      
_________________________________________________________________
dropout_60 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_74 (Dense)             (None, 1024)              1049600   
__________

In [None]:
# Stop training once the validation loss has not improved for 3 epochs.
# NOTE(review): the held-out test split is passed as validation data, so it
# also drives early stopping - confirm this is intended.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=3,
    verbose=1,
)

fit_options = dict(
    batch_size=128,
    epochs=40,
    shuffle=True,
    validation_data=(x_test, y_test),
    callbacks=[es],
)
history = model.fit(x_train, y_train, **fit_options)

Train on 52058 samples, validate on 25641 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40

In [None]:
# Score the fitted model on both splits. The model was compiled with one loss
# and one metric, so each score holds the loss at index 0 and the MAE at index 1.
train_score = model.evaluate(x_train, y_train, verbose=0)
valid_score = model.evaluate(x_test, y_test, verbose=0)
train_loss, train_mae = train_score
valid_loss, valid_mae = valid_score

# Predictions on the held-out split, reused by the cells that follow.
y_pred = model.predict(x_test)

print('Train MAE: ', round(train_mae, 4), ', Train Loss: ', round(train_loss, 4))
print('Val MAE: ', round(valid_mae, 4), ', Val Loss: ', round(valid_loss, 4))

In [None]:
# R^2 of the predictions on the held-out split; as the last expression of the
# cell, its value is what the notebook displays.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Predicted vs. actual price on the held-out split. Points lying on the red
# diagonal are exact predictions.
axis_limit = 2000000  # both axes are cut off at this value

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, c='black')

# Draw y = x in data coordinates, so the line stays the identity line even if
# the two axis limits are later set to different values.
line = mlines.Line2D([0, axis_limit], [0, axis_limit], color='red')
ax.add_line(line)

ax.set_xlim(0, axis_limit)
ax.set_ylim(0, axis_limit)
ax.set_xlabel('Actual price')
ax.set_ylabel('Predicted price')
ax.set_title('Predicted vs actual price (test split)')
plt.show()

In [None]:
def _history_series(h, *candidate_keys):
    """Return the first series in ``h`` stored under one of ``candidate_keys``.

    Raises:
        KeyError: if none of the candidate keys is present in ``h``.
    """
    for key in candidate_keys:
        if key in h:
            return h[key]
    raise KeyError(
        "none of {} found in history; available keys: {}".format(
            list(candidate_keys), sorted(h)))


def plot_hist(h, xsize=6, ysize=10):
    """Plot training vs. validation MAE and loss, one panel each.

    Args:
        h: mapping of metric name to per-epoch values, e.g. the ``history``
            attribute of the object returned by ``model.fit``. It must contain
            ``loss`` and ``val_loss``, plus the MAE series under either the
            long (``mean_absolute_error``) or the short (``mae``) key.
        xsize: figure width in inches.
        ysize: figure height in inches.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        KeyError: if a required series is missing from ``h``.
    """
    # The metric key depends on the Keras version when the model is compiled
    # with metrics=["mae"], so accept both spellings.
    train_mae = _history_series(h, 'mean_absolute_error', 'mae')
    val_mae = _history_series(h, 'val_mean_absolute_error', 'val_mae')

    # Size only this figure instead of overwriting the global rcParams, and
    # create exactly the two panels that are drawn.
    fig, (ax_mae, ax_loss) = plt.subplots(nrows=2, ncols=1, figsize=(xsize, ysize))

    # summarize history for MAE
    ax_mae.plot(train_mae)
    ax_mae.plot(val_mae)
    ax_mae.set_title('Training vs Validation MAE')
    ax_mae.set_ylabel('MAE')
    ax_mae.set_xlabel('Epoch')
    ax_mae.legend(['Train', 'Validation'], loc='upper left')

    # summarize history for loss
    ax_loss.plot(h['loss'])
    ax_loss.plot(h['val_loss'])
    ax_loss.set_title('Training vs Validation Loss')
    ax_loss.set_ylabel('Loss')
    ax_loss.set_xlabel('Epoch')
    ax_loss.legend(['Train', 'Validation'], loc='upper left')

    plt.show()


plot_hist(history.history, xsize=8, ysize=12)