In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the house sales CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/housesalesprediction/kc_house_data.csv')

In [None]:
# Preview the first rows to see which columns are available.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) per column.
df.describe()

In [None]:
# Distribution of sale prices with a KDE overlay.
# `displot` is a figure-level function that creates its own figure, so a
# preceding `plt.figure(figsize=...)` would only produce an extra empty figure
# and leave this plot at its default size. The 12x6 size is requested through
# `height`/`aspect` instead (width = height * aspect = 6 * 2).
sns.displot(data=df, x='price', kde=True, height=6, aspect=2)
# right skewed plot

In [None]:
# Number of rows for each distinct `bedrooms` value.
sns.countplot(data=df, x='bedrooms')

In [None]:
# Correlation of each numeric column with `price`, sorted ascending.
# `numeric_only=True` skips non-numeric columns (the `date` column has not been
# parsed yet at this point); without it pandas >= 2.0 raises instead of
# silently dropping them. The last entry after sorting is price's correlation
# with itself, so it is sliced off.
df.corr(numeric_only=True)['price'].sort_values()[:-1]

In [None]:
sns.scatterplot(data=df,x='sqft_living',y='price')

In [None]:
sns.boxplot(data=df,x='bedrooms',y='price')

In [None]:
plt.figure(figsize=(12,6))
sns.scatterplot(data=df,x='long',y='lat' , palette='plasma' , hue='price',alpha=0.2,edgecolor='black')
# price pattern doesn't look so clear because of the presence of outliers

In [None]:
df.sort_values('price',ascending=False)

In [None]:
len(df)*0.01

In [None]:
# Exclude the most expensive 1% of sales so a few extreme prices do not
# dominate the colour scale of the map plotted next. The cutoff is computed
# from the frame length rather than hard-coded (it was the literal 216), so
# the cell stays correct if the dataset changes size.
top_1_percent_count = int(len(df) * 0.01)
non_top_1_percent = df.sort_values('price', ascending=False).iloc[top_1_percent_count:]

In [None]:
plt.figure(figsize=(12,6))
sns.scatterplot(data=non_top_1_percent,x='long',y='lat' , palette='coolwarm' , hue='price',alpha=0.2,edgecolor='black')
# now we can see the price pattern more clearly

In [None]:
sns.boxplot(data=df,x='waterfront',y='price')

In [None]:
df.head()

In [None]:
# Remove the `id` column. Reassigning instead of using `inplace=True`, together
# with `errors='ignore'`, keeps this cell re-runnable: the original raised
# KeyError the second time it was executed because `id` was already gone.
df = df.drop(columns='id', errors='ignore')

In [None]:
df['date'] = pd.to_datetime(df['date'])

In [None]:
df['year'] = df['date'].apply(lambda x : x.year)
df['month'] = df['date'].apply(lambda x : x.month)

In [None]:
df.head()

In [None]:
plt.figure(figsize=(12,6))
sns.boxplot(data=df,x='month',y='price')
# no significant difference

In [None]:
plt.figure(figsize=(12,6))
sns.boxplot(data=df,x='year',y='price')

In [None]:
df.groupby('month').median()['price']

In [None]:
df.groupby('year').median()['price']

In [None]:
df.groupby('bedrooms').median()['price']

In [None]:
df.drop('date',axis=1,inplace=True)

In [None]:
df['zipcode'].nunique()

In [None]:
df.groupby('zipcode').median()['price'].sort_values() # isn't much fruitful, will drop it

In [None]:
df.drop('zipcode',axis=1,inplace=True)

In [None]:
# Feature matrix: every remaining column except the target, as a NumPy array.
X = df.drop(columns=['price']).to_numpy()
# Target vector: the sale price.
y = df['price'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
sc = MinMaxScaler()

In [None]:
X_train = sc.fit_transform(X_train)

In [None]:
X_test = sc.transform(X_test)

In [None]:
X_train.shape

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Fully connected regression network: four 19-unit ReLU layers followed by a
# single-unit output with no activation.
model = Sequential([
    Dense(19, activation='relu'),  # first layer, receives the scaled features
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(1),  # output layer
])

# Adam optimiser minimising mean squared error.
model.compile(loss='mse', optimizer='adam')

In [None]:
model.fit(x=X_train,y=y_train,validation_data=(X_test,y_test),epochs=600,verbose=1)

In [None]:
loss_df = pd.DataFrame(model.history.history)

In [None]:
loss_df.head()

In [None]:
loss_df.plot()

In [None]:
# Network predictions for the scaled test features.
pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score

In [None]:
np.sqrt(mean_squared_error(y_test,pred))

In [None]:
df['price'].median()

In [None]:
mean_absolute_error(y_test,pred)

In [None]:
explained_variance_score(y_test,pred)

In [None]:
plt.figure(figsize=(12,6))
plt.scatter(y_test,pred)
plt.plot(y_test,y_test,'r')

In [None]:
# Feature values of the first row (target removed), used as a single sample.
# NOTE(review): this row comes from the full frame, so it may have been part of
# the training split — treat the comparison below as a sanity check only.
new_val = df.drop('price', axis=1).iloc[0].values
# Scale with the already-fitted scaler. `reshape(1, -1)` produces one sample
# and infers the feature count, instead of hard-coding 19.
new_val = sc.transform(new_val.reshape(1, -1))

In [None]:
model.predict(new_val)

In [None]:
df['price'][0]

# pretty close

# comparing the ann model with catboost

In [None]:
from catboost import CatBoostRegressor

In [None]:
# CatBoost regressor with a fixed seed and a cap of 10000 boosting iterations.
cb = CatBoostRegressor(random_state=123,iterations=10000)

In [None]:
# Early stopping needs a held-out set to monitor. Using the test split for it
# (as before) lets the test data pick the stopping iteration, which biases the
# test metrics reported afterwards. Carve a validation split out of the
# training data instead, so the test split is only used for final scoring.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=101
)
cb.fit(X_fit, y_fit, eval_set=(X_val, y_val), early_stopping_rounds=200)

In [None]:
cb_pred = cb.predict(X_test)

In [None]:
mean_squared_error(y_test,cb_pred)**0.5

In [None]:
mean_absolute_error(y_test,cb_pred)

In [None]:
explained_variance_score(y_test,cb_pred)

In [None]:
cb.predict(new_val)

In [None]:
df['price'][0]

# In this case, catboost outperformed our ann model