In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line (same order as os.walk yields them).
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing Visualization Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Getting the Dataset

In [None]:
delhi_housing = pd.read_csv('/kaggle/input/delhi-house-price-prediction/MagicBricks.csv')

In [None]:
delhi_housing.head()

In [None]:
delhi_housing.info()

In [None]:
delhi_housing.describe()

In [None]:
delhi_housing.columns

In [None]:
sns.pairplot(delhi_housing)

From the pairplot we can see that Area, BHK and Bathroom are well correlated with Price.

In [None]:
# Distribution of the target variable.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation draws the equivalent figure. Trailing `;` hides the Axes repr.
sns.histplot(delhi_housing['Price'], kde=True, stat='density');

In [None]:
# Pairwise correlations between the numeric columns only.
# The frame still holds string columns (Furnishing, Locality, Status, ...) at
# this point, and recent pandas versions raise if `.corr()` is called on
# non-numeric data, so restrict the frame to numeric dtypes first.
sns.heatmap(delhi_housing.select_dtypes(include='number').corr(), annot=True);

The heatmap shows the same result as the pairplot more clearly.

Let's see how much of our data is missing.

In [None]:
sns.heatmap(delhi_housing.isnull(),yticklabels=False,cbar=False,cmap='viridis')

We can see that a lot of data is missing in Per_Sqft, a little in Parking, and apparently a single value in Bathroom. Let's handle it.

Per_Sqft is just Price/Area; since we already have both of those fields, it duplicates information and we can drop the column.

In [None]:
delhi_housing.drop('Per_Sqft',axis=1,inplace=True)

In [None]:
sns.heatmap(delhi_housing.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
delhi_housing['Parking'].mean()

We can see that the Parking column has an average value of about 1.9, so we will fill the missing parking values with 2 (parking can only be a whole number).

In [None]:
delhi_housing['Parking'].unique()

In [None]:
def average(parking, fill_value=2):
    """Return ``parking`` unchanged, or ``fill_value`` when it is missing.

    Parameters
    ----------
    parking : scalar
        A single value from the Parking column; may be NaN/None.
    fill_value : scalar, default 2
        Replacement used for missing values. The default of 2 is the column
        mean (about 1.9) rounded to a whole number of parking spots.

    Returns
    -------
    scalar
        ``fill_value`` if ``parking`` is null according to ``pd.isnull``,
        otherwise ``parking`` itself.
    """
    return fill_value if pd.isnull(parking) else parking

In [None]:
# Replace missing Parking values with 2 (the column mean of ~1.9 rounded to a
# whole number). `fillna` is the vectorised equivalent of applying a
# per-element null check to every row.
delhi_housing['Parking'] = delhi_housing['Parking'].fillna(2)

In [None]:
delhi_housing['Parking'].unique()

In [None]:
sns.heatmap(delhi_housing.isnull(),yticklabels=False,cbar=False,cmap='viridis')

The number of remaining missing values is very low, so we can simply drop those rows.

In [None]:
delhi_housing.dropna(inplace=True)

In [None]:
sns.heatmap(delhi_housing.isnull(),yticklabels=False,cbar=False,cmap='viridis')

Adding Dummy variables for Categorical Columns

In [None]:
delhi_housing.info()

In [None]:
delhi_housing['Furnishing'].unique()

In [None]:
furnished = pd.get_dummies(delhi_housing['Furnishing'],drop_first=True)

In [None]:
delhi_housing['Status'].unique()

In [None]:
status = pd.get_dummies(delhi_housing['Status'],drop_first=True)

In [None]:
status

In [None]:
delhi_housing['Transaction'].unique()

In [None]:
transaction = pd.get_dummies(delhi_housing['Transaction'],drop_first=True)

In [None]:
delhi_housing['Type'].unique()

In [None]:
types = pd.get_dummies(delhi_housing['Type'],drop_first=True)

In [None]:
locality=pd.get_dummies(delhi_housing['Locality'],drop_first=True)

In [None]:
locality

In [None]:
delhi_housing.drop(['Furnishing','Status','Transaction','Type','Locality'],axis=1,inplace=True)

In [None]:
delhi_housing = pd.concat([delhi_housing,furnished,status,transaction,types,locality ],axis=1)

In [None]:
delhi_housing.head()

In [None]:
delhi_housing.columns

Separating the target column from the features

In [None]:
# Target: the sale price. Features: every remaining column.
y = delhi_housing['Price']
X = delhi_housing.drop(columns='Price')

In [None]:
X

In [None]:
y

Splitting the data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split
# (and therefore every metric reported below) reproducible across
# "Restart & Run All" executions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

Applying LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lm = LinearRegression()

In [None]:
lm.fit(X_train,y_train)

In [None]:
print(lm.intercept_)

In [None]:
coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])
coeff_df

In [None]:
predictions = lm.predict(X_test)

In [None]:
plt.scatter(y_test,predictions)

The model performed very badly; the cause is not yet clear — possibly the large number of sparse dummy columns created from Locality.

In [None]:
# Distribution of the residuals (actual - predicted) of the linear model.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation draws the equivalent figure.
sns.histplot(y_test - predictions, bins=50, kde=True, stat='density');

In [None]:
from sklearn import metrics

In [None]:
# Compute each error metric once, then report them; RMSE is derived from MSE.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
for label, value in (('MAE:', mae), ('MSE:', mse), ('RMSE:', np.sqrt(mse))):
    print(label, value)

Scaling the data before feeding it to the ANN

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
X_train

In [None]:
# Fit the MinMaxScaler on the training features only and overwrite X_train
# with the scaled values. NOTE: this reassigns X_train in place, so the cell
# is not idempotent -- re-running it rescales already-scaled data.
X_train= scaler.fit_transform(X_train)

In [None]:
X_train

In [None]:
# Apply the scaling learned from the training data to the test features
# (transform only, no re-fit), overwriting X_test with the scaled values.
X_test = scaler.transform(X_test)

In [None]:
X_train.shape

In [None]:
X_test.shape

Training the ANN model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stop training once the validation loss has not improved for 25 epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights from 25 epochs *after* the
# best one.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=25,
    restore_best_weights=True,
)

In [None]:
# Fully connected regression network: four ReLU hidden layers of shrinking
# width, each followed by 50% dropout, then a single linear output unit.
model = Sequential()

for units in (371, 185, 93, 46):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.5))

model.add(Dense(1))

# Mean squared error loss with the Adam optimizer.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Train with early stopping. Validation data is carved out of the training
# set (the last 20% of the rows, which train_test_split has already shuffled)
# rather than taken from the test set: using X_test/y_test to decide when to
# stop and then scoring on the same data would leak test information into
# model selection and make the reported test metrics optimistic.
model.fit(x=X_train, y=y_train.values,
          validation_split=0.2,
          epochs=10000, callbacks=[early_stop])

In [None]:
losses = pd.DataFrame(model.history.history)

In [None]:
losses.plot()

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score

In [None]:
predictions = model.predict(X_test)

In [None]:
mean_absolute_error(y_test,predictions)

In [None]:
np.sqrt(mean_squared_error(y_test,predictions))

In [None]:
explained_variance_score(y_test,predictions)

Got an explained variance of about 73%.

In [None]:
# Our predictions
plt.scatter(y_test,predictions)

# Perfect predictions
plt.plot(y_test,y_test,'r')

At least it performed better than LinearRegression.

Predicting the price of a single house

In [None]:
single_house = delhi_housing.drop('Price',axis=1).iloc[0]

In [None]:
# Reshape the single row into a (1, n_features) array before scaling.
# reshape(1, -1) infers the feature count, so this keeps working if the number
# of dummy columns changes (the previous hard-coded 371 would not).
single_house = scaler.transform(single_house.values.reshape(1, -1))

In [None]:
# Actual price of the same house that was selected with .iloc[0] above.
# Use positional indexing: after dropna() the index label 0 may no longer
# exist (KeyError) or may not be the first row.
delhi_housing['Price'].iloc[0]

In [None]:
model.predict(single_house)