<h1><center>House Prices Predictions using Keras</center></h1>
<img src="https://i.ytimg.com/vi/LvfbopVq-WE/maxresdefault.jpg" width="500" height="600">
<br/>

<h2>References</h2>

[House Prices EDA, Lasso & LightGBM](https://www.kaggle.com/mviola/house-prices-eda-lasso-lightgbm-0-11635)

[ANN House Price Prediction](https://www.kaggle.com/ppsheth91/ann-keras-hyper-parameter-tuning-price-prediction)

<h2><center>In this notebook we are going to predict prices using Neural Network</center></h2>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler # To standardize the data
from sklearn.ensemble import IsolationForest # To find and eliminate the outliers.
from keras.models import Sequential # Sequential Neural Network
from keras.layers import Dense
from keras.callbacks import EarlyStopping # Early Stopping Callback in the NN
from keras.optimizers import Adam # Optimizer used in the NN
from kerastuner.tuners import RandomSearch # HyperParameter Tunining
import warnings
warnings.filterwarnings('ignore')

## Loading the Dataset

In [None]:
# Read the competition train/test CSVs and set the target aside before merging.
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
y = train['SalePrice'].values

# Stack train on top of test so that cleaning and encoding are applied to both
# at once; the target column is removed from the combined frame.
data = pd.concat([train, test], axis=0, sort=False).drop(columns=['SalePrice'])
data.head()

# Descriptive Statistics

In [None]:
# Column dtypes and non-null counts for the combined train + test frame.
data.info()

In [None]:
# Label every column of the combined frame as numeric or categorical and plot
# how many columns fall into each group. is_numeric_dtype covers every numeric
# dtype rather than only the literal 'int64'/'float64' names.
column_data_type = [
    'numeric' if pd.api.types.is_numeric_dtype(dtype) else 'categorical'
    for dtype in data.dtypes
]
plt.figure(figsize=(15,5))
sns.countplot(x=column_data_type)
plt.title('Number of numeric vs. categorical columns')
plt.show()

In [None]:
# Count missing values per column, keep only the columns that have any,
# and order them from most to least missing.
missing_values = data.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)

# Names of the columns that need imputation (used by the fill cells below).
NAN_col = missing_values.index.tolist()

# Summary table: one row per affected feature, with absolute and relative counts.
missing_values_data = missing_values.reset_index()
missing_values_data.columns = ['Feature','Number of Missing Values']
missing_values_data['Percentage of Missing Values'] = (
    missing_values_data['Number of Missing Values'] * 100.0 / len(data)
)
missing_values_data


# Filling NAN Values

In [None]:
# Targeted imputation. Each result is assigned back to the column: calling
# fillna/interpolate with inplace=True on data[col] mutates a temporary object
# under pandas Copy-on-Write and would leave `data` unchanged.

# Missing basement areas are filled with 0.
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'BsmtUnfSF']:
    data[col] = data[col].fillna(0)

# Categorical gaps are filled with a fixed category.
data['Electrical'] = data['Electrical'].fillna('FuseA')
data['KitchenQual'] = data['KitchenQual'].fillna('TA')

# LotFrontage: mean of the rows sharing the same 1stFlrSF, then linear
# interpolation for whatever is still missing.
data['LotFrontage'] = (
    data['LotFrontage']
    .fillna(data.groupby('1stFlrSF')['LotFrontage'].transform('mean'))
    .interpolate(method='linear')
)

# MasVnrArea: mean of the rows sharing the same MasVnrType, then linear
# interpolation for whatever is still missing.
data['MasVnrArea'] = (
    data['MasVnrArea']
    .fillna(data.groupby('MasVnrType')['MasVnrArea'].transform('mean'))
    .interpolate(method='linear')
)

In [None]:
# Fill whatever is still missing in the columns that had NaNs: numeric columns
# get their column mean, every other column gets the literal category 'NA'.
# The result is assigned back because chained `data[col].fillna(..., inplace=True)`
# does not modify `data` under pandas Copy-on-Write.
for col in NAN_col:
    if pd.api.types.is_numeric_dtype(data[col]):
        data[col] = data[col].fillna(data[col].mean())
    else:
        data[col] = data[col].fillna('NA')

## Adding New Features

In [None]:
# Engineered features, each derived from existing columns of the combined frame.

# Sum of the finished-basement, floor and total-basement areas.
total_square_feet = (
    data['BsmtFinSF1'] + data['BsmtFinSF2'] + data['1stFlrSF']
    + data['2ndFlrSF'] + data['TotalBsmtSF']
)

# Bathroom count where every half bath contributes 0.5.
total_bath = (
    data['FullBath'] + (0.5 * data['HalfBath'])
    + data['BsmtFullBath'] + (0.5 * data['BsmtHalfBath'])
)

# Combined porch and deck area.
total_porch_area = (
    data['OpenPorchSF'] + data['3SsnPorch']
    + data['EnclosedPorch'] + data['ScreenPorch'] + data['WoodDeckSF']
)

# Above-ground living area divided by the number of rooms, baths and kitchens.
room_count = (
    data['TotRmsAbvGrd'] + data['FullBath']
    + data['HalfBath'] + data['KitchenAbvGr']
)
sqft_per_room = data['GrLivArea'] / room_count

data['Total_Square_Feet'] = total_square_feet
data['Total_Bath'] = total_bath
data['Total_Porch_Area'] = total_porch_area
data['SqFtPerRoom'] = sqft_per_room


# One Hot Encoding for Categorical Features

In [None]:
# One-hot encode the categorical columns. The encoding runs on the combined
# frame, so the train and test parts split off later share the same dummy columns.
data = pd.get_dummies(data)
data.head()

# Splitting train and test data

In [None]:
# Split the combined frame back into its train and test parts. The train rows
# come first (they were concatenated first) and there is exactly one target
# value per train row, so len(y) gives the boundary without a hard-coded 1460.
n_train = len(y)
train = data.iloc[:n_train].copy()
test = data.iloc[n_train:].copy()

# Re-attach the target to the training rows.
train['SalePrice'] = y
train.head()

# Extracting Top Features

In [None]:
# Correlation of every column with SalePrice, keeping the 30 largest values.
price_corr = train.corr()[['SalePrice']]
top_features = price_corr.sort_values(by='SalePrice', ascending=False).head(30)

# Single-column heatmap of those correlations, annotated with the values.
plt.figure(figsize=(5,10))
sns.heatmap(
    top_features,
    cmap='rainbow',
    annot=True,
    annot_kws={"size": 16},
    vmin=-1,
)

## Now that we have extracted the top features that influence the SalePrice, we look at their distributions to find outliers.

In [None]:
def plot_data(col, discrete=False):
    """Plot one feature of the global `train` frame against SalePrice.

    Draws a two-panel figure: the left panel shows the feature versus
    SalePrice, the right panel shows how the feature's values are distributed.

    Parameters
    ----------
    col : str
        Name of a column in `train`.
    discrete : bool, default False
        If True, use a strip plot and a count plot (14x6 figure);
        otherwise use a scatter plot and a histogram (12x6 figure).
    """
    fig, ax = plt.subplots(1, 2, figsize=(14 if discrete else 12, 6))
    if discrete:
        sns.stripplot(x=col, y='SalePrice', data=train, ax=ax[0])
        # Pass the series as x=: recent seaborn treats the first positional
        # argument as `data`, not as the x variable.
        sns.countplot(x=train[col], ax=ax[1])
    else:
        sns.scatterplot(x=col, y='SalePrice', data=train, ax=ax[0])
        # histplot replaces the deprecated distplot(kde=False).
        sns.histplot(x=train[col], kde=False, ax=ax[1])
    fig.suptitle(str(col) + ' Analysis')

print('Plot Function is ready to use')

In [None]:
# OverallQual takes a small set of values, so use the discrete layout.
plot_data('OverallQual', discrete=True)

### We see there are two outliers with 10 overall quality and price less than 200000.

In [None]:
# Remove the rows rated OverallQual 10 that sold for less than 200000.
outlier_mask = (train['OverallQual'] == 10) & (train['SalePrice'] < 200000)
train = train.loc[~outlier_mask].copy()

In [None]:
# Continuous feature: use the scatter/histogram layout.
plot_data('Total_Square_Feet', discrete=False)

## This seems more or less appropriate distribution with no outliers whatsoever.

In [None]:
# Continuous feature: use the scatter/histogram layout.
plot_data('GrLivArea', discrete=False)

## Again no outliers that can be eliminated.

In [None]:
# Plotted with the scatter/histogram layout.
plot_data('Total_Bath', discrete=False)

## Here we clearly see two outliers that have Total_Bath more than 4 but with sale price less than 200000.

In [None]:
# Remove the rows with more than 4 total baths that sold for less than 200000.
outlier_mask = (train['Total_Bath'] > 4) & (train['SalePrice'] < 200000)
train = train.loc[~outlier_mask].copy()

In [None]:
# Continuous feature: use the scatter/histogram layout.
plot_data('TotalBsmtSF', discrete=False)

## Here as well we see 1 clear outlier that has TotalBsmtSF more than 3000 but sale price less than 400000.

In [None]:
# Remove the rows with TotalBsmtSF above 3000 that sold for less than 400000.
outlier_mask = (train['TotalBsmtSF'] > 3000) & (train['SalePrice'] < 400000)
train = train.loc[~outlier_mask].copy()

## After resetting the index, this is the final train data that we get

In [None]:
# reset_index returns a new frame, so the result has to be assigned back for
# the index to actually change. drop=True discards the old labels instead of
# inserting them as an extra 'index' column.
train = train.reset_index(drop=True)
train

## Outlier elimination through Isolation Forest!!
### We use this algorithm since it would be difficult to go through all the features and eliminate the outliers manually but it was important to do it for the features that have higher correlation with the SalePrice

In [None]:
# Fit an isolation forest on the training frame and label every row:
# 1 marks an inlier, -1 marks an outlier.
clf = IsolationForest(max_samples=100, random_state=42)
clf.fit(train)
y_noano = pd.DataFrame(clf.predict(train), columns=['Top'])

# Row positions of the inliers, and the number of rows flagged as outliers.
inlier_positions = y_noano.index[y_noano['Top'] == 1].values
outlier_count = int((y_noano['Top'] == -1).sum())

# Keep only the inlier rows and renumber them from 0.
train = train.iloc[inlier_positions].reset_index(drop=True)
print("Number of Outliers:", outlier_count)
print("Number of rows without outliers:", train.shape[0])

## Scaling the features using sklearn's StandardScaler

In [None]:
# Separate the target from the feature columns of the cleaned training frame.
y = train['SalePrice'].to_numpy()
X = train.drop(columns=['SalePrice'])
X.shape,y.shape

In [None]:
# Learn per-column mean and standard deviation on the training features, then
# standardize them. `scale` is reused later to transform the test features.
scale = StandardScaler().fit(X)
X = scale.transform(X)

# MODELLING


### We use the RandomSearch tuner from Keras Tuner for hyper-parameter tuning of the model.

In [None]:
def build_model(hp):
    """Build and compile a network from a sampled hyper-parameter set.

    The search space defined here is:
      * 'layers'        - number of hidden layers, 2 to 10
      * 'units_<i>'     - width of hidden layer i, 32 to 512 in steps of 32
      * 'learning_rate' - Adam learning rate, one of 1e-2, 1e-3, 1e-4

    Parameters
    ----------
    hp : hyper-parameter object supplied by the tuner.

    Returns
    -------
    A compiled Sequential model with ReLU hidden layers, a single linear
    output unit, MSE loss and an MSE metric.
    """
    hidden_layer_count = hp.Int('layers', 2, 10)

    model = Sequential()
    for layer_idx in range(hidden_layer_count):
        layer_width = hp.Int(f'units_{layer_idx}', min_value=32, max_value=512, step=32)
        model.add(Dense(units=layer_width, activation='relu'))
    model.add(Dense(1))

    learning_rate = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
    model.compile(optimizer=Adam(learning_rate), loss='mse', metrics=['mse'])
    return model

In [None]:
# Random search over the space defined in build_model, ranked by the
# validation MSE ('val_mse'). The search is limited to 10 trials with
# 3 executions per trial; results are stored under
# model_dir/House_Price_Prediction.
tuner = RandomSearch(
    build_model,
    objective='val_mse',
    max_trials=10,
    executions_per_trial=3,
    directory='model_dir',
    project_name='House_Price_Prediction')
# Print the hyper-parameters that will be searched.
tuner.search_space_summary()

In [None]:
# tuner.search(X[1100:],y[1100:],batch_size=128,epochs=200,validation_data=(X[:1100],y[:1100]))
# model = tuner.get_best_models(1)[0]

# After running this search and tuning further, we get the model below, which I implemented separately. The search itself is not run here.

In [None]:
def create_model():
    """Return a freshly initialised, compiled regression network.

    The architecture is fixed: seven ReLU hidden layers of widths
    320, 384, 352, 448, 160, 160 and 32, followed by one linear output unit.
    The input width is taken from the global feature matrix `X`. The model is
    compiled with Adam (learning rate 0.0001) and MSE loss.
    """
    hidden_widths = [320, 384, 352, 448, 160, 160, 32]

    model = Sequential()
    # Only the first layer declares the input size.
    model.add(Dense(hidden_widths[0], input_dim=X.shape[1], activation='relu'))
    for width in hidden_widths[1:]:
        model.add(Dense(width, activation='relu'))
    model.add(Dense(1))

    model.compile(optimizer=Adam(learning_rate=0.0001), loss = 'mse')
    return model

In [None]:
# Build the network and print its layer-by-layer summary.
model = create_model()
model.summary()

## We would be using early stopping callback and would use 1/10th of the training data as validation to estimate the optimum number of epochs that would prevent overfitting

In [None]:
# Stop training once val_loss has gone 10 epochs without improving.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
# 10% of the training rows are held out for validation; epochs=1000 is only an
# upper bound, since the callback is expected to end training earlier.
history = model.fit(x=X,y=y,
          validation_split=0.1,
          batch_size=128,epochs=1000, callbacks=[early_stop])

In [None]:
# Per-epoch training and validation loss recorded by the fit above.
losses = pd.DataFrame(model.history.history)
losses.plot()

In [None]:
# Discard the weights learned during the early-stopping run and start again
# from a freshly initialised network.
model = create_model() # Resetting the model.

## Training the model with full training data and optimum number of epochs!!

In [None]:
# Refit on all training rows (no validation split).
# NOTE(review): epochs=170 is hard-coded; presumably it is the epoch count at
# which the early-stopping run above halted — TODO confirm, or derive it from
# that run's history so it stays in sync.
history = model.fit(x=X,y=y,
          batch_size=128,epochs=170)

In [None]:
# Per-epoch training loss of the final fit (no validation data was used).
losses = pd.DataFrame(model.history.history)
losses.plot()

# Prediction & Evaluation

In [None]:
# MSE loss on the same data the model was trained on. This measures fit to the
# training set, not performance on unseen data.
model.evaluate(X,y)

In [None]:
# Standardize the test features with the scaler fitted on the training data,
# then predict one sale price per test row.
X_test = scale.transform(test)
predictions = model.predict(X_test)

# Build the submission frame positionally: prediction row i belongs to test
# row i. Using .values avoids relying on the test index happening to line up
# with the 0..n-1 index of the new frame.
result = pd.DataFrame({
    'Id': test['Id'].values,
    'SalePrice': predictions.ravel(),
})
result.head()

In [None]:
# Write the Id/SalePrice predictions to submission.csv without the index column.
result.to_csv('submission.csv',index=False)

## Note: I would just like to say that a neural network is not the most suitable model for this problem, since the dataset given here is too small to train one well!