In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

### Exploratory data analysis

In [None]:
# Load the house sales data (kc_house_data.csv) from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/housesalesprediction/kc_house_data.csv')

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics, one row per column so that wide frames stay readable.
df.describe().T

In [None]:
# Remove the 'id' column from the working frame.
df = df.drop(columns='id')

In [None]:
# Histogram of the target variable (price).

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='price', ax=ax)

In [None]:
# How many houses there are for each bedroom count (a potentially useful feature).
sns.countplot(data=df, x='bedrooms')

In [None]:
# Price distribution for each bedroom count.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x='bedrooms', y='price', ax=ax)

In [None]:
# Interior living-space square footage plotted against price.

fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=df, x='price', y='sqft_living', ax=ax)

In [None]:
# Longitude plotted against price.

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='price', y='long', ax=ax)

In [None]:
# Latitude plotted against price.

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='price', y='lat', ax=ax)

In [None]:
# Every house positioned by longitude/latitude and coloured by its price.

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='long', y='lat', hue='price', ax=ax)

In [None]:
# The 20 highest sale prices, largest first.
df['price'].sort_values(ascending=False).head(20)

In [None]:
# Number of rows that make up 1% of the dataset.
len(df)*(0.01)

In [None]:
# Keep everything except the most expensive 1% of houses. The cut-off is
# derived from the current row count instead of a hard-coded 216, so it stays
# correct if rows are added or filtered upstream.
top_1_perc_count = int(len(df) * 0.01)
non_top_1_perc = df.sort_values('price', ascending=False).iloc[top_1_perc_count:]

In [None]:
# Same longitude/latitude map, this time without the top 1% of prices.

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=non_top_1_perc,
    x='long',
    y='lat',
    hue='price',
    palette='RdYlGn',
    edgecolor=None,
    alpha=0.2,
    s=10,
    ax=ax,
)

In [None]:
# The map suggests that houses near the water are more expensive; compare the
# price distributions of waterfront and non-waterfront houses to check.

fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='waterfront', y='price', ax=ax)

In [None]:
# Feature engineering from the date column: split it into numeric month, year
# and day features, then drop the original column.
# The vectorised `.dt` accessor replaces the row-by-row `.apply(lambda ...)`.

df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['day'] = df['date'].dt.day

df = df.drop('date',axis=1)

In [None]:
# Price distribution for each sale year.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='year', y='price', ax=ax)

In [None]:
# Price distribution for each sale month.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='month', y='price', ax=ax)

In [None]:
# Price distribution for each day of the month.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='day', y='price', ax=ax)

In [None]:
# Mean price per sale month. Selecting 'price' before aggregating avoids
# averaging every other column only to throw the results away.
df.groupby('month')['price'].mean().plot()

In [None]:
# Mean price per sale year. Selecting 'price' before aggregating avoids
# averaging every other column only to throw the results away.
df.groupby('year')['price'].mean().plot()

In [None]:
# Mean price per day of the month. Selecting 'price' before aggregating avoids
# averaging every other column only to throw the results away.
df.groupby('day')['price'].mean().plot()

### Now let's prepare the data and run a simple neural network regression model

In [None]:
# Separate the target from the features, then hold out 30% of the rows for testing.

y = df['price']
X = df.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Min-max scale the features. The scaler is fitted on the training rows only
# and then applied to both splits.

scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

for split in (X_train, X_test):
    print(split.shape)

In [None]:
# Define and compile the regression network: four hidden ReLU layers of 19
# units each, followed by a single linear output unit for the price.

# Seed TensorFlow's global random generator before any layer is created, so
# that repeated runs start from the same state.
# NOTE(review): full determinism may additionally depend on hardware/backend.
tf.random.set_seed(42)

model = Sequential([
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(19, activation='relu'),
    Dense(1),
])

model.compile(optimizer='adam', loss='mse')

In [None]:
# Fit the network on the training split. The test split is passed as
# validation data, so its loss is recorded after every epoch.

model.fit(
    x=X_train,
    y=y_train.to_numpy(),
    validation_data=(X_test, y_test.to_numpy()),
    epochs=400,
    batch_size=128,
    verbose=1,
)

In [None]:
# Put the metrics stored on the model's history object into a DataFrame.
# NOTE(review): presumably one row per epoch of the preceding fit call — confirm.
loss_hist = pd.DataFrame(model.history.history)

In [None]:
# Plot every column of the recorded training history.
loss_hist.plot()

In [None]:
# Predict prices for the scaled test-split features.

preds = model.predict(X_test)

In [None]:
# Regression metrics on the test split.
# https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
explained_var = explained_variance_score(y_test, preds)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"Explained variance: {explained_var}")

In [None]:
# Mean and median price, to put the size of the errors above into context.
price = df['price']
print(price.mean())
print(price.median())

In [None]:
# Predicted price against actual price for every test row.
plt.scatter(x=y_test, y=preds)

# Reference line on which the points would fall if every prediction were exact.
plt.plot(y_test, y_test, color='r')

In [None]:
# Plot the distribution of the prediction errors (actual minus predicted).
# reshape(-1, 1) turns y_test into a column that lines up with the 2-D `preds`
# whatever the test-set size; the original hard-coded 6484 rows.

errors = y_test.values.reshape(-1, 1) - preds
sns.displot(errors)

In [None]:
# Predict the price of a single house, using the first row of the dataset as
# the example.

# iloc[[0]] (list indexer) keeps a one-row DataFrame, so no reshape with a
# hard-coded feature count is needed and the scaler receives the same column
# names it was fitted with.
new_house = df.drop('price',axis=1).iloc[[0]]

new_house = scaler.transform(new_house)

model.predict(new_house)