In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

# Report the TensorFlow build in use and how many GPU devices it can see.
print("tensorflow version", tf.__version__)
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# DATA DESCRIPTIONS
<html>
<body>
<table style="width:100%">
  <tr align="left">
    <th>Column</th>
    <th>Description</th>
  </tr>
  <tr>
    <td>POSTED_BY</td>
    <td>Category marking who has listed the property</td>
  </tr>
  <tr>
    <td>UNDER_CONSTRUCTION</td>
    <td>Under Construction or Not</td>
  </tr>
  <tr>
    <td>RERA</td>
    <td>Rera approved or Not</td>
  </tr>
  <tr>
    <td>BHK_NO.</td>
    <td>Number of Rooms</td>
  </tr>
  <tr>
    <td>BHK_OR_RK</td>
    <td>Type of property</td>
  </tr>
  <tr>
    <td>SQUARE_FT</td>
    <td>Total area of the house in square feet</td>
  </tr>
  <tr>
    <td>READY_TO_MOVE</td>
    <td>Category marking Ready to move or Not</td>
  </tr>
  <tr>
    <td>RESALE</td>
    <td>Category marking Resale or not</td>
  </tr>
  <tr>
    <td>ADDRESS</td>
    <td>Address of the property</td>
  </tr>
  <tr>
    <td>LONGITUDE</td>
    <td>Longitude of the property</td>
  </tr>
  <tr>
    <td>LATITUDE</td>
    <td>Latitude of the property</td>
  </tr>
</table>

</body>
</html>

In [None]:
# Both CSV files are read from the same input directory.
DATA_DIR = "../input/house-price-prediction-challenge"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
train.head()

---
# Exploratory Data Analysis

### 1. Check Null values

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

### 2. Statistical information

In [None]:
# Summary statistics for every column, shown with one row per column.
train.describe(include="all").T

### 3. The distribution of actual target value

In [None]:
# `sns.distplot` is deprecated in seaborn; `histplot` with `kde=True` and
# `stat="density"` is its documented replacement (density-normalised
# histogram with a KDE curve drawn on top).
plt.figure(figsize=(10,6))
sns.histplot(train["TARGET(PRICE_IN_LACS)"], kde=True, stat="density")
plt.show()

There are some outliers in this dataset. It makes sense to drop them when they are just a few very extreme points, because training the model on such extreme values is unlikely to be useful.

In [None]:
print(f"The Dataset size before removing outliers : {train.shape}")
# Upper fence of the 1.5 * IQR rule: Q3 + 1.5 * (Q3 - Q1). The quartiles are
# computed once instead of calling describe() three times.
q1, q3 = train['TARGET(PRICE_IN_LACS)'].quantile([0.25, 0.75])
max_val = q3 + 1.5 * (q3 - q1)
# .copy() makes the filtered frame independent of the original, so later
# column assignments on `train` do not trigger SettingWithCopyWarning.
train = train[train['TARGET(PRICE_IN_LACS)'] < max_val].copy()
print(f"The Dataset size after removing outliers : {train.shape}")

In [None]:
# `sns.distplot` is deprecated in seaborn; `histplot` with `kde=True` and
# `stat="density"` is its documented replacement (density-normalised
# histogram with a KDE curve drawn on top).
plt.figure(figsize=(10,6))
sns.histplot(train["TARGET(PRICE_IN_LACS)"], kde=True, stat="density")
plt.show()

### 4. `POSTED_BY` feature

In [None]:
# Frequency of each POSTED_BY category, followed by a blank line.
posted_by_counts = train["POSTED_BY"].value_counts()
print(posted_by_counts, end='\n\n')

Since this feature's values are strings, they will all be mapped to numbers.

In [None]:
# Give every POSTED_BY label an integer code, numbered in the order the labels
# are returned by unique() on the training data.
dic = {label: code for code, label in enumerate(train["POSTED_BY"].unique())}

# Apply the same mapping to both frames. NOTE: a label that occurs only in
# `test` has no entry in `dic` and is therefore mapped to NaN.
train["POSTED_BY"] = train["POSTED_BY"].map(dic)
test["POSTED_BY"] = test["POSTED_BY"].map(dic)

### 5. `BHK_OR_RK` feature

In [None]:
# Frequency of each BHK_OR_RK category, followed by a blank line.
bhk_or_rk_counts = train["BHK_OR_RK"].value_counts()
print(bhk_or_rk_counts, end='\n\n')

This feature's values are very highly skewed, so dropping it from the dataset should have no real impact.

In [None]:
# Remove the BHK_OR_RK column from both frames.
train = train.drop(columns=["BHK_OR_RK"])
test = test.drop(columns=["BHK_OR_RK"])

### 6. The correlation between target and features

In [None]:
plt.figure(figsize=(10,6))
# Restrict the correlation matrix to numeric columns: `train` still contains
# the string ADDRESS column here, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 instead of silently skipping it.
numeric_corr = train.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, cmap="YlGnBu", annot=True)
plt.show()

`UNDER_CONSTRUCTION` feature has a perfect negative correlation with `READY_TO_MOVE` feature but their correlations with the target are very low!! `BHK_NO.` has the highest correlation with the target.

### 7. Analyze features that have high correlation with the target

In [None]:
# Price distribution for each number of rooms.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train, x="BHK_NO.", y="TARGET(PRICE_IN_LACS)", ax=ax)
plt.show()

In [None]:
# How many houses have each of the unusual room counts.
for rooms in (9, 11, 20):
    matching_rows = train[train["BHK_NO."] == rooms]
    print(f"House that has {rooms} rooms in dataset : ", len(matching_rows))

Since there is only 1 observation for each of these room counts (9, 11, and 20), it is better to remove them than to use them to train the model.

In [None]:
# Keep only the rows whose room count is not one of the single-observation values.
rare_room_counts = [9, 11, 20]
train = train[~train["BHK_NO."].isin(rare_room_counts)]

### 8. Analyze features that have high correlations between them

Features with high correlation are more linearly dependent and hence have almost the same effect on the dependent variable. So, when two features have high correlation, we can drop one of the two features.

In [None]:
# Row counts for value 0 and value 1 of each flag, printed in the order
# UNDER_CONSTRUCTION (0, 1) then READY_TO_MOVE (0, 1).
for column in ("UNDER_CONSTRUCTION", "READY_TO_MOVE"):
    for flag in (0, 1):
        print(len(train[train[column] == flag]))

This explains why these two features are so highly correlated: if the house is under construction (`UNDER_CONSTRUCTION` is 1), `READY_TO_MOVE` will be 0, and vice versa.

In [None]:
# Remove the UNDER_CONSTRUCTION column from both frames.
train = train.drop(columns=["UNDER_CONSTRUCTION"])
test = test.drop(columns=["UNDER_CONSTRUCTION"])

### 9. Distribution of prices by geographical location (latitude & longitude)

In [None]:
# Plot every property at its coordinates, coloured by price.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=train,
    x='LONGITUDE',
    y='LATITUDE',
    hue='TARGET(PRICE_IN_LACS)',
    palette='RdYlGn',
    alpha=0.4,
    ax=ax,
)
plt.show()

### 10. `ADDRESS` feature

This feature contains city information that may be related to the price.

In [None]:
def second_address_field(address):
    """Return the second comma-separated field of `address`."""
    return address.split(',')[1]


# NOTE(review): this treats the second field as the city — confirm that holds
# for addresses containing more than one comma.
city = train['ADDRESS'].apply(second_address_field)
print(city.value_counts(), end='\n\n')
print("Total of unique values :", city.nunique())

There are 428 unique values in this feature, so one-hot encoding it is impractical. For now, this feature will be dropped. (FYI, this feature could actually be useful if the cities were grouped by province or some higher-level region. Since I'm not from India, that would be difficult for me because I don't know India's administrative areas.)

In [None]:
# Remove the ADDRESS column from both frames.
train = train.drop(columns=["ADDRESS"])
test = test.drop(columns=["ADDRESS"])

---
# Feature Engineering

In [None]:
# Preview the first five rows of the training frame before feature engineering.
train.head(n=5)

### 1. OneHot encoding `POSTED_BY` feature

In [None]:
# train dataset
ohe = pd.get_dummies(train['POSTED_BY'], prefix="POSTED_BY", drop_first=True)
train.drop(labels=["POSTED_BY"], axis=1, inplace=True)
train = pd.concat([train, ohe], axis=1)

# test dataset
ohe = pd.get_dummies(test['POSTED_BY'], prefix="POSTED_BY", drop_first=True)
test.drop(labels=["POSTED_BY"], axis=1, inplace=True)
test = pd.concat([test, ohe], axis=1)

---
# Splitting The Dataset into Train and Test

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the features, as plain arrays.
target_col = 'TARGET(PRICE_IN_LACS)'
y = train[target_col].values
x = train.drop(columns=[target_col]).values

# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.1, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply the same scaling to
# the held-out split.
scaler = MinMaxScaler()
x_tr_scaled = scaler.fit_transform(x_tr)
x_ts_scaled = scaler.transform(x_ts)

---
# Create The Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.losses import mae
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
from tensorflow.keras.activations import linear, relu

class stopLearn(Callback):
    """Keras callback that stops training once the validation loss is low enough.

    Parameters
    ----------
    threshold : float, default 21
        Training is stopped at the end of the first epoch whose ``val_loss``
        is strictly below this value.
    """

    def __init__(self, threshold=21):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check ``val_loss`` in `logs` and request a stop when it is below the threshold."""
        # `logs` defaults to None instead of a mutable dict shared between calls.
        val_loss = (logs or {}).get('val_loss')
        # When `val_loss` is absent from the logs there is nothing to compare
        # (and `None < number` would raise TypeError), so training continues.
        if val_loss is not None and val_loss < self.threshold:
            print("\nStop training!!")
            self.model.stop_training=True

cb = stopLearn()
# checkpoint_name = 'models/Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
# cp = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
# callbacks_list = [cp, cb]

# Fully connected regression network: a 32-unit input layer, four hidden ReLU
# layers of increasing width, and a single linear output unit.
n_features = x_tr_scaled.shape[1]
hidden_widths = (64, 64, 128, 256)

model = Sequential([
    Dense(units=32, kernel_initializer='normal', input_dim=n_features, activation=relu),
    *[Dense(units=width, activation=relu) for width in hidden_widths],
    Dense(units=1, activation=linear),
])

model.compile(optimizer=Adam(learning_rate=0.001), loss=mae)
# model.compile(optimizer=SGD(learning_rate=0.001), loss=mae)

model.summary()

In [None]:
# Train for up to 500 epochs, validating on the held-out split; the `cb`
# callback can end training earlier.
model.fit(
    x=x_tr_scaled,
    y=y_tr,
    validation_data=(x_ts_scaled, y_ts),
    batch_size=128,
    epochs=500,
    callbacks=[cb],
)

# Evaluate The Model

In [None]:
# DataFrame.plot() creates its own figure, so a preceding plt.figure(...) only
# produces an extra empty figure and its figsize is never applied. Pass the
# size directly to .plot() instead.
history_df = pd.DataFrame(model.history.history)
ax = history_df.plot(figsize=(10, 6))
ax.set(xlabel="epoch", ylabel="loss")
plt.show()

In [None]:
# model.predict returns one column per output unit; flatten it to 1-D so it
# lines up with y_ts.
y_pred = model.predict(x_ts_scaled).ravel()
pred_df = pd.DataFrame({'Actual value':y_ts, 'Predicted value':y_pred})

print(pred_df.head())

from sklearn.metrics import mean_absolute_error

MAE_val = mean_absolute_error(y_true=pred_df['Actual value'], y_pred=pred_df['Predicted value'])
# Compute the relative error from the data instead of hard-coding a percentage
# in the message, so the printed statement always matches the actual run.
mean_actual = pred_df['Actual value'].mean()
relative_error_pct = MAE_val / mean_actual * 100
print("\nFrom the MAE result, on average the model is about {:.2f} off from the true price, which is about {:.1f}% of the mean actual value ({:.2f}).".format(MAE_val, relative_error_pct, mean_actual))

In [None]:
# Explained variance regression score:
# the best possible score is 1.0, lower values are worse.
from sklearn.metrics import explained_variance_score

explained_variance_score(y_true=y_ts, y_pred=y_pred)