# Import Required Libraries

In [23]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import metrics

from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# Read the preprocessed dataset

In [2]:
df = pd.read_csv('ames_housing_preprocessed.csv')
df.shape

(1460, 80)

In [3]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,5,3,65.0,8450,1,1,3,3,4,...,0,3,4,1,0,1,2,8,4,12.247699
1,2,0,3,80.0,9600,1,1,3,3,2,...,0,3,4,1,0,4,1,8,4,12.109016
2,3,5,3,68.0,11250,1,1,0,3,4,...,0,3,4,1,0,8,2,8,4,12.317171
3,4,6,3,60.0,9550,1,1,0,3,0,...,0,3,4,1,0,1,0,8,0,11.849405
4,5,5,3,84.0,14260,1,1,0,3,2,...,0,3,4,1,0,11,2,8,4,12.42922


In [4]:
del df['Id']

In [5]:
df.shape

(1460, 79)

# Split the data into features and target

In [6]:
X = df.copy(deep=True)
del X['SalePrice']

In [7]:
df.shape

(1460, 79)

In [8]:
X.shape

(1460, 78)

In [9]:
y = df['SalePrice']

# Treat Categorical Data with OneHotEncoding

In [10]:
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold', 'MSZoning', 'LandContour', 'LotConfig', 'Neighborhood',
        'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'Foundation', 'GarageType', 'MiscFeature', 
        'SaleType', 'SaleCondition', 'Electrical', 'Heating')
colIndex = [X.columns.get_loc(c) for c in cols if c in X]

In [11]:
onehotencoder = OneHotEncoder(categorical_features = colIndex)
X_OHE = pd.DataFrame(onehotencoder.fit_transform(X).toarray())

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [12]:
X_OHE.shape

(1460, 659)

# Split the dataset

In [13]:
# Split data into train and test formate
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_OHE, y, test_size=0.2, random_state=7)

# Define the Model / Network

In [74]:
# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(1536, input_dim=659, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(0.1))
#     model.add(Dense(1344, kernel_initializer='normal', activation='relu')) # 1
    model.add(Dense(1152, kernel_initializer='normal', activation='relu'))
#     model.add(Dense(1024, kernel_initializer='normal', activation='relu')) # 1
#     model.add(Dropout(0.2))
    model.add(Dense(896, kernel_initializer='normal', activation='relu'))
    model.add(Dense(704, kernel_initializer='normal', activation='relu'))
#     model.add(Dense(544, kernel_initializer='normal', activation='relu')) # 1
#     model.add(Dropout(0.2))
# #  last baseline start
    model.add(Dense(448, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(0.1))
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(0.2))
    model.add(Dense(144, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(0.2))
    model.add(Dense(70, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(0.2))
    model.add(Dense(36, kernel_initializer='normal', activation='relu'))
#     model.add(Dense(18, kernel_initializer='normal', activation='relu')) # 1
    model.add(Dense(8, kernel_initializer='normal', activation='relu'))
#     model.add(Dense(4, kernel_initializer='normal', activation='relu')) # 1
# #   last baseline end
#     model.add(Dropout(0.2))
#     model.add(Dense(80, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(0.2))
#     model.add(Dense(56, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(0.2))
#     model.add(Dense(24, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(0.2))
#     model.add(Dense(12, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(0.2))
#     model.add(Dense(4, kernel_initializer='normal', activation='relu'))
#     model.add(Dropout(0.2))
    model.add(Dense(1, kernel_initializer='normal'))
    
#     sgd = SGD(lr=0.01, momentum=0.8, decay=0.1)
    # Compile model
#     model.compile(loss='mean_squared_error', optimizer='adam', metrics=[metrics.mae, metrics.categorical_accuracy])
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=[metrics.mae, metrics.mse])
    return model

In [None]:

from keras.callbacks import EarlyStopping
# simple early stopping
es = EarlyStopping(monitor='val_loss', verbose=1)

# evaluate model with standardized dataset
estimators = []
# estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=65, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
# kfold = KFold(n_splits=10)
# results = cross_val_score(pipeline, X_train, y_train)
# print(results)
# print("%.2f (%.2f) RMSE" % (np.sqrt(abs(results.mean())), results.std()))

pipeline.fit(X_train, y_train)
prediction = pipeline.predict(X_test)
# accuracy_score(y, prediction)

In [76]:
prediction

array([11.88307  , 12.144483 , 11.994382 , 11.516606 , 11.64677  ,
       12.39386  , 11.816831 , 11.442051 , 11.76962  , 11.639763 ,
       12.1228   , 11.822619 , 11.456859 , 11.965747 , 12.041239 ,
       11.902916 , 12.173543 , 11.861756 , 11.595575 , 11.634294 ,
       11.951531 , 11.737838 , 11.474831 , 12.340887 , 12.43928  ,
       11.650148 , 11.372947 , 11.864032 , 12.114497 , 11.515336 ,
       11.950676 , 12.157028 , 12.463193 , 12.324316 , 11.977704 ,
       12.28049  , 11.563451 , 11.64045  , 12.342609 , 12.170265 ,
       11.4026375, 11.8802395, 11.790701 , 11.869895 , 12.300369 ,
       12.103766 , 11.958725 , 11.928058 , 12.396799 , 12.211249 ,
       12.189862 , 11.890883 , 11.899503 , 12.101077 , 11.521686 ,
       11.503634 , 12.994422 , 11.854377 , 12.11477  , 12.005037 ,
       11.844972 , 12.246424 , 11.574608 , 11.499643 , 11.475742 ,
       11.960121 , 11.565345 , 11.955793 , 11.821952 , 11.933886 ,
       12.111592 , 11.890682 , 11.312657 , 11.603841 , 12.4669

In [77]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
rmse = np.sqrt(mean_squared_error(y_test, prediction))
mae = mean_absolute_error(y_test, prediction)
r2 = r2_score(y_test, prediction)
adj_r2 = 1-(1-r2)*(X.shape[0]-1)/(X.shape[0]-X.shape[1]-1)
print("RMSE: %f" % (rmse))
print("MAE: %f" % (mae))
print("R Squared: %f" % (r2))
print("Adj. R Squared: %f" % (adj_r2))

RMSE: 0.218916
MAE: 0.185507
R Squared: 0.714006
Adj. R Squared: 0.697853
