In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm, skew
#from scipy.special import boxcox1px
from scipy.stats import boxcox_normmax
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [2]:
import scipy.special as ss

In [3]:
%matplotlib inline

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
train = train[train.GrLivArea < 4500]
train.reset_index(drop=True, inplace=True)
train["SalePrice"] = np.log1p(train["SalePrice"])
y = train['SalePrice'].reset_index(drop=True).copy()

In [6]:
train.drop('Id',inplace=True, axis=1)
test.drop('Id',inplace=True, axis=1)

In [7]:
print(train.shape, test.shape)

(1458, 80) (1459, 79)


In [8]:
data = pd.concat((train,test)).reset_index(drop=True)
print(data.shape)

(2917, 80)


In [9]:
data_nans = data.isnull().sum()
data_nans = data_nans[data_nans > 0]
data_nans.sort_values(ascending=False)

PoolQC          2908
MiscFeature     2812
Alley           2719
Fence           2346
SalePrice       1459
FireplaceQu     1420
LotFrontage      486
GarageQual       159
GarageCond       159
GarageFinish     159
GarageYrBlt      159
GarageType       157
BsmtExposure      82
BsmtCond          82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtFullBath       2
BsmtHalfBath       2
Utilities          2
Functional         2
Electrical         1
BsmtUnfSF          1
Exterior1st        1
Exterior2nd        1
TotalBsmtSF        1
GarageCars         1
BsmtFinSF2         1
BsmtFinSF1         1
KitchenQual        1
SaleType           1
GarageArea         1
dtype: int64

In [10]:
strings = ['MSSubClass', 'YrSold', 'MoSold']
for var in strings:
    data[var] = data[var].apply(str)

In [11]:
groups = ['Exterior1st','Exterior2nd','SaleType','Electrical','KitchenQual']
for group in groups:
    mode = data[group].mode()[0]
    data[group] = data[group].fillna(mode)

    
data.MSZoning = data.MSZoning.fillna('RL')

for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond','BsmtQual',
            'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',"PoolQC"
           ,'Alley','Fence','MiscFeature','FireplaceQu','MasVnrType','Utilities']:
    data[col] = data[col].fillna('None')
    
for col in ('GarageArea', 'GarageCars','MasVnrArea','BsmtFinSF1','BsmtFinSF2'
           ,'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BsmtUnfSF','TotalBsmtSF', 'GarageYrBlt'):
    data[col] = data[col].fillna(0)
    

data.Functional = data.Functional.fillna('Typ')

data['LotFrontage'] = data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

In [12]:
data_nans = data.isnull().sum()
data_nans = data_nans[data_nans > 0]
data_nans.sort_values(ascending=False)

SalePrice    1459
dtype: int64

In [13]:
data.drop(['SalePrice'], axis=1, inplace=True)

In [14]:
data['TotalSF'] = data.TotalBsmtSF + data['1stFlrSF'] + data['2ndFlrSF']
data['TotalBath'] = data.FullBath + 0.5 * data.HalfBath + data.BsmtFullBath + 0.5 * data.BsmtHalfBath
data['TotalPorch'] = data.OpenPorchSF + data['3SsnPorch'] + data.EnclosedPorch + data.ScreenPorch + data.WoodDeckSF
data['YrBltAndRemod']=data.YearBuilt + data.YearRemodAdd

data['HasPool'] = data.PoolArea.apply(lambda x: 1 if x > 0 else 0)
data['HasGarage'] = data.GarageArea.apply(lambda x: 1 if x > 0 else 0)
data['HasBsmt'] = data.TotalBsmtSF.apply(lambda x: 1 if x > 0 else 0)
data['HasFirePl'] = data.Fireplaces.apply(lambda x: 1 if x > 0 else 0)

drops = ['Utilities', 'Street', 'PoolQC']
data = data.drop(drops,axis=1)

data.shape

(2917, 84)

In [15]:
qual_dict = {"None": 0, "Po": 1, "Fa": 2, "TA": 4, "Gd": 7, "Ex": 11}
qual_cols = ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC", 
             "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"]

for cat in data.columns:
    if cat in qual_cols:
        data[cat] = data[cat].map(qual_dict).astype('int64')

In [16]:
cat_features = data.select_dtypes(include=['object']).columns
print(cat_features)
num_features = data.select_dtypes(exclude=['object']).columns
print(num_features)

Index(['Alley', 'BldgType', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'Exterior1st',
       'Exterior2nd', 'Fence', 'Foundation', 'Functional', 'GarageFinish',
       'GarageType', 'Heating', 'HouseStyle', 'LandContour', 'LandSlope',
       'LotConfig', 'LotShape', 'MSSubClass', 'MSZoning', 'MasVnrType',
       'MiscFeature', 'MoSold', 'Neighborhood', 'PavedDrive', 'RoofMatl',
       'RoofStyle', 'SaleCondition', 'SaleType', 'YrSold'],
      dtype='object')
Index(['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtCond',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual',
       'BsmtUnfSF', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'FireplaceQu',
       'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars', 'GarageCond',
       'GarageQual', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'HeatingQC',
       'KitchenAbvGr', 'KitchenQual', 'LotArea', 'LotFrontage', 'LowQualFinSF',
       'Mas

In [44]:
feat_num = data[num_features]
feat_cat = data[cat_features]

In [45]:
feat_num_T = feat_num.describe().transpose()

In [46]:
feat_num = feat_num.apply(lambda x: (x - np.mean(x)) / np.std(x)).reset_index(drop=True)

In [47]:
feat_num

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtCond,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtQual,...,YearBuilt,YearRemodAdd,TotalSF,TotalBath,TotalPorch,YrBltAndRemod,HasPool,HasGarage,HasBsmt,HasFirePl
0,-0.783214,1.210473,-0.103366,0.169987,0.064058,0.601514,-0.293134,1.090257,-0.249810,0.547756,...,1.047231,0.897702,0.030064,1.590503,-0.761881,1.094096,-0.064271,0.239306,0.166843,-1.026755
1,0.270789,-0.784638,-0.103366,0.169987,0.064058,1.213983,-0.293134,-0.818675,3.821109,0.547756,...,0.155606,-0.394865,-0.023717,0.350990,0.721802,-0.076667,-0.064271,0.239306,0.166843,0.973942
2,-0.617065,1.238507,-0.103366,0.169987,0.064058,0.106135,-0.293134,1.090257,-0.249810,0.547756,...,0.981185,0.849829,0.209336,1.590503,-0.880826,1.029054,-0.064271,0.239306,0.166843,0.973942
3,-0.510627,0.981526,-0.103366,0.169987,3.094194,-0.501830,-0.293134,1.090257,-0.249810,-0.703135,...,-1.858805,-0.682102,-0.089023,-0.268767,0.778145,-1.529280,-0.064271,0.239306,0.166843,0.973942
4,-0.032951,1.675376,-0.103366,1.385311,0.064058,0.486676,-0.293134,1.090257,-0.249810,0.547756,...,0.948161,0.754083,1.025022,1.590503,0.584076,0.964011,-0.064271,0.239306,0.166843,0.973942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912,-1.587994,0.490925,-0.103366,0.169987,0.064058,-0.988202,-0.293134,-0.818675,-0.249810,-0.703135,...,-0.042533,-0.682102,-1.158250,-0.888523,-1.143758,-0.336836,-0.064271,-4.178759,0.166843,-1.026755
2913,-1.587994,0.490925,-0.103366,0.169987,0.064058,-0.420768,-0.293134,-0.818675,-0.249810,-0.703135,...,-0.042533,-0.682102,-1.158250,-0.888523,-0.993511,-0.336836,-0.064271,0.239306,0.166843,-1.026755
2914,0.172139,-0.784638,-0.103366,1.385311,0.064058,1.767907,-0.293134,1.090257,-0.249810,-0.703135,...,-0.372764,0.562592,-0.121036,-0.268767,1.823609,0.010056,-0.064271,0.239306,0.166843,0.973942
2915,-0.487262,-0.784638,-0.103366,0.169987,0.064058,-0.229372,-0.293134,-0.818675,3.821109,0.547756,...,0.683976,0.371101,-0.845806,-0.888523,-0.442608,0.617119,-0.064271,-4.178759,0.166843,-1.026755


In [48]:
cat_data = pd.get_dummies(feat_cat).reset_index(drop=True)

In [49]:
fin_data = pd.concat([feat_num,cat_data],axis=1).reset_index(drop=True)

In [50]:
fin_data

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtCond,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtQual,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,YrSold_2006,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010
0,-0.783214,1.210473,-0.103366,0.169987,0.064058,0.601514,-0.293134,1.090257,-0.249810,0.547756,...,0,0,0,0,1,0,0,1,0,0
1,0.270789,-0.784638,-0.103366,0.169987,0.064058,1.213983,-0.293134,-0.818675,3.821109,0.547756,...,0,0,0,0,1,0,1,0,0,0
2,-0.617065,1.238507,-0.103366,0.169987,0.064058,0.106135,-0.293134,1.090257,-0.249810,0.547756,...,0,0,0,0,1,0,0,1,0,0
3,-0.510627,0.981526,-0.103366,0.169987,3.094194,-0.501830,-0.293134,1.090257,-0.249810,-0.703135,...,0,0,0,0,1,1,0,0,0,0
4,-0.032951,1.675376,-0.103366,1.385311,0.064058,0.486676,-0.293134,1.090257,-0.249810,0.547756,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912,-1.587994,0.490925,-0.103366,0.169987,0.064058,-0.988202,-0.293134,-0.818675,-0.249810,-0.703135,...,0,0,0,0,1,1,0,0,0,0
2913,-1.587994,0.490925,-0.103366,0.169987,0.064058,-0.420768,-0.293134,-0.818675,-0.249810,-0.703135,...,0,0,0,0,1,1,0,0,0,0
2914,0.172139,-0.784638,-0.103366,1.385311,0.064058,1.767907,-0.293134,1.090257,-0.249810,-0.703135,...,0,0,0,0,1,1,0,0,0,0
2915,-0.487262,-0.784638,-0.103366,0.169987,0.064058,-0.229372,-0.293134,-0.818675,3.821109,0.547756,...,0,0,0,0,1,1,0,0,0,0


In [51]:
print(fin_data.shape)
train = fin_data.iloc[:len(y),:]
test = fin_data.iloc[len(y):,:]
print(train.shape,y.shape,test.shape)

(2917, 294)
(1458, 294) (1458,) (1459, 294)


In [52]:
outliers = [30, 88, 462, 631, 1322]
train = train.drop(train.index[outliers])
y_train = y.drop(y.index[outliers])

In [53]:
print(train.shape,y_train.shape,test.shape)

(1453, 294) (1453,) (1459, 294)


In [54]:
train['SalePrice'] = y_train

In [55]:
print(train.shape)

(1453, 295)


In [56]:
y_train

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
          ...    
1453    12.072547
1454    12.254868
1455    12.493133
1456    11.864469
1457    11.901590
Name: SalePrice, Length: 1453, dtype: float64

In [57]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

2.0.0


In [31]:
# !pip install -q git+https://github.com/tensorflow/docs

In [58]:
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

In [59]:
train_set = train.sample(frac=0.8, random_state=0)
test_set = train.drop(train_set.index)

In [60]:
train_labels = train_set.pop('SalePrice')
test_labels = test_set.pop('SalePrice')

In [61]:
train_stats = train_set.describe()
train_stats = train_stats.transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1stFlrSF,1162.0,-0.021105,0.961047,-2.138360,-0.736484,-0.201695,0.571284,5.374654
2ndFlrSF,1162.0,0.051499,1.025679,-0.784638,-0.784638,-0.784638,0.941811,4.039606
3SsnPorch,1162.0,0.023062,1.078181,-0.103366,-0.103366,-0.103366,-0.103366,16.052303
BedroomAbvGr,1162.0,0.018333,1.010668,-3.475984,-1.045337,0.169987,0.169987,6.246606
BsmtCond,1162.0,0.043197,0.999773,-3.976123,0.064058,0.064058,0.064058,3.094194
...,...,...,...,...,...,...,...,...
YrSold_2006,1162.0,0.218589,0.413467,0.000000,0.000000,0.000000,0.000000,1.000000
YrSold_2007,1162.0,0.237522,0.425748,0.000000,0.000000,0.000000,0.000000,1.000000
YrSold_2008,1162.0,0.202238,0.401841,0.000000,0.000000,0.000000,0.000000,1.000000
YrSold_2009,1162.0,0.227194,0.419200,0.000000,0.000000,0.000000,0.000000,1.000000


In [62]:
def build_model():
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[len(train_set.keys())]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])

    optimizer = tf.keras.optimizers.RMSprop(0.001)

    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae', 'mse'])
    return model

In [63]:
model = build_model()

In [64]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                18880     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 23,105
Trainable params: 23,105
Non-trainable params: 0
_________________________________________________________________


In [68]:
def norm(x):
    return (x - train_stats['mean']) / train_stats['std']
normed_train_data = norm(train_set)
normed_test_data = norm(test_set)


In [69]:
example_batch = normed_train_data[:10]
example_result = model.predict(example_batch)
example_result



array([[nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan]], dtype=float32)

In [67]:
EPOCHS = 1000

history = model.fit(
  train_set, train_labels,
  epochs=EPOCHS, validation_split = 0.2, verbose=0,
  callbacks=[tfdocs.modeling.EpochDots()])



Epoch: 0, loss:nan,  mae:nan,  mse:nan,  val_loss:nan,  val_mae:nan,  val_mse:nan,  
....................................................................................................
Epoch: 100, loss:nan,  mae:nan,  mse:nan,  val_loss:nan,  val_mae:nan,  val_mse:nan,  
....................................................................................................
Epoch: 200, loss:nan,  mae:nan,  mse:nan,  val_loss:nan,  val_mae:nan,  val_mse:nan,  
....................................................................................................
Epoch: 300, loss:nan,  mae:nan,  mse:nan,  val_loss:nan,  val_mae:nan,  val_mse:nan,  
....................................................................................................
Epoch: 400, loss:nan,  mae:nan,  mse:nan,  val_loss:nan,  val_mae:nan,  val_mse:nan,  
....................................................................................................
Epoch: 500, loss:nan,  mae:nan,  mse:nan,  val_loss:nan,  val