In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from sklearn.metrics import mean_absolute_error,mean_squared_error,explained_variance_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
# NOTE: os.walk ignores listing errors by default, so if the directory does not
# exist (e.g. when running outside Kaggle) this loop prints nothing.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the house-sales CSV from the read-only input directory into `df`.
df = pd.read_csv('../input/housesalesprediction/kc_house_data.csv')

In [None]:
# Preview the first two rows to see which columns are available.
df.head(2)

### Exploratory Data Analysis

In [None]:
# (rows, columns) of the raw data.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each column of `df` becomes one row.
df.describe().transpose()

### Checking the data for the presence of Null Values

In [None]:
# Number of missing values per column.
df.isnull().sum()

### Visualization

In [None]:
# Distribution of sale prices (histogram normalised to a density, with a KDE overlay).
# sns.distplot is deprecated and removed in recent seaborn releases; histplot with
# kde=True and stat="density" is its documented replacement.
plt.figure(figsize=(10, 5))
sns.histplot(df["price"], kde=True, stat="density")

#### Most prices fall between 0 and 2,000,000

In [None]:
# Number of houses per bedroom count.
# Pass the column by keyword: a positional Series is interpreted as `data`
# (not `x`) by current seaborn versions.
sns.countplot(x="bedrooms", data=df)

#### There appears to be an outlier: a house listed with 33 bedrooms

In [None]:
# Correlation of every numeric feature with price, sorted ascending, to find the
# features most highly correlated with price.
# Restrict to numeric columns first: `date` is still a string column at this point
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
df.select_dtypes(include="number").corr()["price"].sort_values()

In [None]:
# Living area plotted against sale price.
plt.figure(figsize=(10, 5))
sns.scatterplot(data=df, x='price', y='sqft_living')

In [None]:
# Price distribution for each bedroom count.
plt.figure(figsize=(10, 5))
sns.boxplot(data=df, x='bedrooms', y='price')

In [None]:
# Longitude plotted against sale price.
plt.figure(figsize=(10, 5))
sns.scatterplot(data=df, x='price', y='long')

In [None]:
# Latitude plotted against sale price.
plt.figure(figsize=(10, 5))
sns.scatterplot(data=df, x='price', y='lat')

In [None]:
# Plot each sale at its (longitude, latitude) position.
plt.figure(figsize=(12, 7))
sns.scatterplot(data=df, x='long', y='lat')

In [None]:
# Same location plot, with points coloured by sale price.
plt.figure(figsize=(12, 7))
sns.scatterplot(data=df, x='long', y='lat', hue='price')

In [None]:
# The 20 most expensive sales.
df.sort_values('price',ascending=False).head(20)

### Removing the top 1% most expensive houses (outliers)

In [None]:
# Number of rows that make up 1% of the data (used to size the outlier cut-off below).
len(df)*0.01

In [None]:
# Number of rows in the most expensive 1% of sales, derived from the data
# instead of being hard-coded.
top_1_perc_count = int(len(df) * 0.01)

# Sort by price (highest first) and drop that top 1%.
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
non_top_1_perc = (
    df.sort_values('price', ascending=False)
      .iloc[top_1_perc_count:]
      .copy()
)

In [None]:
# The 20 most expensive sales that remain after removing the top 1%
# (the frame is still sorted by price, highest first).
non_top_1_perc.head(20)

In [None]:
# Price range of the filtered data.
filtered_prices = non_top_1_perc["price"]
lowest_price = filtered_prices.min()
highest_price = filtered_prices.max()
print(f'Minimum value of price is {lowest_price}')
print(f'Maximum value of price is {highest_price}')

In [None]:
# Price distribution after removing the top 1% of sales.
# sns.distplot is deprecated and removed in recent seaborn releases; histplot with
# kde=True and stat="density" is its documented replacement.
plt.figure(figsize=(10, 5))
sns.histplot(non_top_1_perc['price'], kde=True, stat="density")

In [None]:
# Houses per bedroom count in the filtered data.
plt.figure(figsize=(10, 5))
sns.countplot(data=non_top_1_perc, x='bedrooms')

In [None]:
# The 20 filtered rows with the highest bedroom counts.
non_top_1_perc.sort_values("bedrooms",ascending=False).head(20)

In [None]:
# Location plot of the filtered data, coloured by price.
# Points are semi-transparent with no edge so that dense areas stay readable.
plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=non_top_1_perc,
    x='long',
    y='lat',
    hue='price',
    palette='plasma',
    alpha=0.2,
    edgecolor=None,
)

In [None]:
# Price distribution split by the waterfront flag (filtered data).
plt.figure(figsize=(10, 5))
sns.boxplot(data=non_top_1_perc, x='waterfront', y='price')

In [None]:
# Price distribution for each bedroom count (filtered data).
plt.figure(figsize=(12, 5))
sns.boxplot(data=non_top_1_perc, x='bedrooms', y='price')

In [None]:
def get_distro(df,df2,col,col2):
    """Draw side-by-side box plots of `col2` grouped by `col` for two frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame shown in the left panel, titled "Raw Data".
    df2 : pandas.DataFrame
        Frame shown in the right panel, titled "Filter Data".
    col : str
        Column used for the x-axis (grouping variable).
    col2 : str
        Column used for the y-axis (value whose distribution is shown).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, (ax_raw, ax_filtered) = plt.subplots(1, 2, figsize=(18, 6))

    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.boxplot(x=col, y=col2, data=df, ax=ax_raw)
    ax_raw.set_title("Raw Data", fontweight='bold')

    sns.boxplot(x=col, y=col2, data=df2, ax=ax_filtered)
    ax_filtered.set_title("Filter Data", fontweight='bold')

    plt.show()
    

In [None]:
# Compare price by waterfront flag: full data (left panel) vs. data without the
# top 1% of prices (right panel).
get_distro(df,non_top_1_perc,'waterfront','price')

### Feature Engineering

In [None]:
# Look at the columns again before engineering features.
df.head()

In [None]:
# Remove the `id` column from the feature table.
df = df.drop(columns='id')

In [None]:
# Parse the `date` strings into datetimes so year and month can be extracted.
df = df.assign(date=pd.to_datetime(df['date']))

In [None]:
# Derive sale year and month from the parsed date.
# The vectorised .dt accessor replaces a Python-level lambda call per row.
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month

In [None]:
# Check that the new `year` and `month` columns were added.
df.head()

In [None]:
# Price distribution per sale month.
plt.figure(figsize=(12, 5))
sns.boxplot(data=df, x='month', y='price')

In [None]:
# Price distribution per sale year.
plt.figure(figsize=(12, 5))
sns.boxplot(data=df, x='year', y='price')

In [None]:
# Mean sale price per month.
# Select `price` before aggregating so only that column is averaged, instead of
# computing the mean of every column and discarding all but one.
df.groupby("month")["price"].mean().plot(figsize=(12,6))
plt.ylabel("Price Mean",fontweight='bold')
plt.autoscale(tight=True)

In [None]:
# Mean sale price per year.
# Select `price` before aggregating so only that column is averaged, instead of
# computing the mean of every column and discarding all but one.
df.groupby("year")["price"].mean().plot(figsize=(12,6))
plt.ylabel("Price Mean",fontweight='bold')
plt.autoscale(tight=True)

In [None]:
# The raw date is no longer needed once year and month have been extracted.
df = df.drop(columns="date")

In [None]:
# Confirm that `date` is gone.
df.head()

In [None]:
# Remove `zipcode` from the feature table.
df = df.drop(columns="zipcode")

### Data Preprocessing

In [None]:
# Split into the feature matrix (everything except price) and the target vector (price),
# both as NumPy arrays.
y = df["price"].values
X = df.drop(columns='price').values

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Scaler for the full-data model.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training features only, then scale them.
X_train = scaler.fit_transform(X_train)

In [None]:
# Apply the training-set scaling to the test features (transform only, no re-fit).
X_test = scaler.transform(X_test)

### Model Building

In [None]:
# Inspect (rows, features) of the training matrix; the Dense layers below use 19 units.
X_train.shape

In [None]:
# Fully connected regression network: four hidden ReLU layers of 19 units each,
# followed by a single linear output unit for the predicted price.
model = Sequential([
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(1),
])

# Mean squared error loss, optimised with Adam.
model.compile(loss='mse', optimizer='adam')

In [None]:
# Train for 400 epochs in batches of 128; the test split is passed as validation
# data, so validation loss is tracked after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    batch_size=128,
    epochs=400,
)

In [None]:
# Collect the per-epoch metrics recorded during training into a DataFrame.
loss_df = pd.DataFrame(model.history.history)

In [None]:
# Plot the recorded training metrics per epoch.
loss_df.plot()

In [None]:
# Predict prices for the scaled test features.
predictions = model.predict(X_test)

In [None]:
# Regression metrics on the test split.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
explained_var = explained_variance_score(y_test, predictions)

print(f"MAE for the model predicted is {mae}")
print(f"MSE for the model predicted is {mse}")
print(f"RMSE for the model predicted is {rmse}")
print(f"Explained variance Score for the model predicted is {explained_var}")

In [None]:
# Predicted vs. true prices. The red line plots y_test against itself,
# i.e. where a perfect prediction would fall.
plt.figure(figsize=(12, 6))
plt.scatter(y_test, predictions)
plt.plot(y_test, y_test, 'r')
plt.xlabel("True Value", fontweight='bold')
plt.ylabel("Predicted Value", fontweight='bold')

In [None]:
# Features (everything except price) of the first row of `df`, for a spot check.
# NOTE(review): this row may have landed in the training split, so the check below
# is not necessarily an out-of-sample prediction.
single_house = df.drop('price',axis=1).iloc[0]

In [None]:
# Reshape the 1-D feature vector into a single-row 2-D array and scale it with the
# scaler fitted on the training data. reshape(1, -1) infers the feature count
# instead of hard-coding it.
single_house = scaler.transform(single_house.values.reshape(1, -1))

In [None]:
# Predicted price for that single house.
model.predict(single_house)

In [None]:
# Actual price of the same house (first row of `df`), for comparison.
df['price'].iloc[0]

### Retrain the model with `non_top_1_perc`


In [None]:
# Preview the filtered data before preparing it for training.
non_top_1_perc.head()

In [None]:
# Parse the `date` strings into datetimes.
# .assign returns a new frame, which avoids the SettingWithCopyWarning raised by
# assigning a column on a frame that was created by slicing.
non_top_1_perc = non_top_1_perc.assign(date=pd.to_datetime(non_top_1_perc['date']))

In [None]:
# Check the parsed `date` column.
non_top_1_perc.head(2)

In [None]:
# Derive sale month and year from the parsed date (month first, then year).
# The vectorised .dt accessor replaces a per-row lambda, and .assign returns a new
# frame, which avoids SettingWithCopyWarning on a frame created by slicing.
non_top_1_perc = non_top_1_perc.assign(
    month=non_top_1_perc['date'].dt.month,
    year=non_top_1_perc['date'].dt.year,
)

In [None]:
# Check that the new `month` and `year` columns were added.
non_top_1_perc.head(2)

In [None]:
# Price distribution per sale month (filtered data).
plt.figure(figsize=(12, 5))
sns.boxplot(data=non_top_1_perc, x='month', y='price')

In [None]:
# Mean sale price per month (filtered data).
# Select `price` before aggregating so only that column is averaged, instead of
# computing the mean of every column and discarding all but one.
plt.figure(figsize=(12,5))
non_top_1_perc.groupby('month')['price'].mean().plot()
plt.autoscale(tight=True)

In [None]:
# Mean sale price per year (filtered data).
# Select `price` before aggregating so only that column is averaged, instead of
# computing the mean of every column and discarding all but one.
plt.figure(figsize=(12,5))
non_top_1_perc.groupby('year')['price'].mean().plot()
plt.autoscale(tight=True)

In [None]:
# Drop the same three columns that were removed from `df`: id, date and zipcode.
non_top_1_perc = non_top_1_perc.drop(columns=["id", 'date', 'zipcode'])

In [None]:
# Feature matrix (everything except price) and target vector (price) for the
# filtered data, both as NumPy arrays.
y1 = non_top_1_perc["price"].values
X1 = non_top_1_perc.drop(columns='price').values

In [None]:
# Same 70/30 split and seed as used for the full-data model.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Separate scaler for the filtered data, so `scaler` from the first model is not re-fitted.
scaler_1 = MinMaxScaler()

In [None]:
# Fit the scaler on the training features only, then scale them.
X1_train = scaler_1.fit_transform(X1_train)

In [None]:
# Apply the training-set scaling to the test features (transform only, no re-fit).
X1_test = scaler_1.transform(X1_test)

In [None]:
# Same architecture as the first model: four hidden ReLU layers of 19 units each,
# followed by a single linear output unit for the predicted price.
model_retrain = Sequential([
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(1),
])

# Mean squared error loss, optimised with Adam.
model_retrain.compile(loss='mse', optimizer='adam')

In [None]:
# Train on the filtered data with the same settings as the first model
# (400 epochs, batches of 128, test split used as validation data).
model_retrain.fit(
    X1_train,
    y1_train,
    validation_data=(X1_test, y1_test),
    batch_size=128,
    epochs=400,
)

In [None]:
# Collect the per-epoch metrics recorded while training the retrained model.
losses_retrain = pd.DataFrame(model_retrain.history.history)

In [None]:
# Plot the recorded training metrics per epoch.
losses_retrain.plot()

In [None]:
# Predict prices for the scaled test features of the filtered data.
predictions_retrain = model_retrain.predict(X1_test)

In [None]:
# Regression metrics for the retrained model on its own test split.
mae_retrain = mean_absolute_error(y1_test, predictions_retrain)
mse_retrain = mean_squared_error(y1_test, predictions_retrain)
rmse_retrain = np.sqrt(mse_retrain)
explained_var_retrain = explained_variance_score(y1_test, predictions_retrain)

print(f"MAE for the model predicted is {mae_retrain}")
print(f"MSE for the model predicted is {mse_retrain}")
print(f"RMSE for the model predicted is {rmse_retrain}")
print(f"Explained variance Score for the model predicted is {explained_var_retrain}")

In [None]:
# Predicted vs. true prices for the retrained model. The red line plots y1_test
# against itself, i.e. where a perfect prediction would fall.
plt.scatter(y1_test, predictions_retrain)
plt.plot(y1_test, y1_test, 'r')
plt.xlabel("True Value", fontweight='bold')
plt.ylabel("Predicted Value", fontweight='bold')

In [None]:
# Features of the first row of the filtered frame. Because the frame was sorted by
# price (highest first) when it was created, this is the most expensive remaining house.
# NOTE(review): this row may have landed in the training split — confirm before
# treating the check below as out-of-sample.
single_house_retrain = non_top_1_perc.drop('price',axis=1).iloc[0]

In [None]:
# Reshape the 1-D feature vector into a single-row 2-D array and scale it with the
# scaler fitted on the filtered training data. reshape(1, -1) infers the feature
# count instead of hard-coding it.
single_house_retrain = scaler_1.transform(single_house_retrain.values.reshape(1, -1))

In [None]:
# Predicted price for that single house from the retrained model.
model_retrain.predict(single_house_retrain)

In [None]:
 non_top_1_perc['price'].iloc[0]

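#### We can conclude that the model trained on the full data, with an explained variance score of 0.8013325822336658, performs better than the model retrained after removing the top 1% of prices, which has an explained variance score of 0.7379875841167598. Note that the two scores are computed on different test sets, so the comparison is only indicative.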