In [None]:
import os

# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the house sales CSV into a DataFrame and preview the first three rows.
housing_csv = '../input/housesalesprediction/kc_house_data.csv'
df = pd.read_csv(housing_csv)
df.head(n=3)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics with one row per column; show only the first two.
df.describe().transpose().iloc[:2]

In [None]:
# Build one table with the summary statistics and the dtype of every column.
# The right join keeps columns that `describe()` skipped (non-numeric ones);
# their statistics are missing and are shown as '-'.
numeric_summary = df.describe().T
column_dtypes = pd.DataFrame(df.dtypes, columns=['data type'])
summary = pd.merge(numeric_summary,
                   column_dtypes,
                   how='right',
                   left_index=True,
                   right_index=True)
# `DataFrame.applymap` is deprecated in recent pandas; masking with `where`
# gives the same table on both old and new versions.
summary.astype(object).where(summary.notna(), '-')

In [None]:
# Distribution of sale prices.
# `displot` is a figure-level function: it always creates its own figure, so a
# preceding `plt.figure(...)` only leaves an empty canvas behind and its
# figsize is ignored. The size is set through `height`/`aspect` instead
# (6 in tall, 10 in wide).
sns.displot(df['price'], height=6, aspect=10 / 6);

In [None]:
# Number of houses per bedroom count.
# The column is passed by keyword, like the other seaborn calls in this
# notebook: newer seaborn releases accept only `data` positionally, so a bare
# Series is no longer interpreted as the x variable.
sns.countplot(x='bedrooms', data=df);

In [None]:
plt.rcParams.update({'font.size': 13})

# Correlation of every numeric column with the price, ordered by absolute
# strength. Non-numeric columns (the raw `date` strings at this point) are
# excluded explicitly: pandas >= 2.0 no longer drops them silently and
# `df.corr()` would raise instead.
price_corr = df.select_dtypes(include='number').corr()['price']
price_corr.sort_values(key=lambda s: s.abs(), ascending=False).plot.bar(figsize=(10, 3));

In [None]:
# Living area plotted against sale price.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=df, x='sqft_living', y='price', marker='o', ax=ax);

In [None]:
# Price spread for each bedroom count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='bedrooms', y='price', ax=ax);

In [None]:
# Houses plotted by longitude/latitude and coloured by price.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='long', y='lat', hue='price', ax=ax);

In [None]:
# Keep every house except the most expensive 1%, so that a few extreme prices
# do not dominate the colour scale of the map.
# The previous filter (`>= quantile(0.01)`) removed the *cheapest* 1% instead.
price_cap = df['price'].quantile(q=0.99)
non_top_1_pct = df[df['price'] <= price_cap]
non_top_1_pct.shape, df.shape

In [None]:
# Same map as before, drawn from the filtered frame with a diverging palette
# and translucent, borderless markers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=non_top_1_pct,
    x='long',
    y='lat',
    hue='price',
    palette='RdYlGn',
    alpha=0.2,
    edgecolor=None,
    ax=ax,
);

In [None]:
# Remove the `id` column from the working frame.
df = df.drop(columns=['id'])

In [None]:
# Convert the `date` column to pandas datetimes and display it.
df = df.assign(date=pd.to_datetime(df['date']))
df['date']

In [None]:
# Derive the sale year and month from the parsed `date` column.
# The vectorised `.dt` accessor replaces the per-element Python lambdas.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df.head().T

In [None]:
# Price spread for each sale month.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='month', y='price', ax=ax);

In [None]:
# Plot the average sale price per month to see whether the price depends on
# the time of year the house was sold.
monthly_mean_price = df.groupby('month')['price'].mean()
monthly_mean_price.plot();

In [None]:
# `year` and `month` have been extracted, so drop the raw `date` column and
# list the columns that remain.
df = df.drop(columns=['date'])
df.columns

In [None]:
# Number of distinct zip codes compared with the number of rows.
df['zipcode'].nunique(), len(df)

In [None]:
# For simplicity the zip code is not used as a feature.
df = df.drop(columns=['zipcode'])

In [None]:
# How often each renovation year occurs.
renovation_years = df['yr_renovated']
renovation_years.value_counts()

Many rows have a `0` in the `yr_renovated` column, which implies that the house has never been renovated. Because the renovation year is positively correlated with the price, and `0` is the smallest possible value, we are going to keep the data as is.

There is a similar situation with `sqft_basement`

In [None]:
# How often each basement size occurs.
basement_sizes = df['sqft_basement']
basement_sizes.value_counts()

Although a `sqft_basement` of `0` means that the house has no basement, which is really a categorical distinction, we are going to keep the data as is, because the column is positively correlated with `price`.

In [None]:
# Target: the sale price. Features: every remaining column, as a NumPy array.
y = df['price'].to_numpy()
X = df.drop(columns=['price']).to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# (0, 1) is MinMaxScaler's default range; it is spelled out for the reader.
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Fit the scaler on the training split only, then apply the same scaling to
# both splits.
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Shapes of the scaled training and test feature matrices.
tuple(split.shape for split in (X_train, X_test))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Four ReLU hidden layers of 19 units each, followed by a single-unit output
# layer with no activation that produces the price estimate.
hidden_layers = [Dense(19, activation='relu') for _ in range(4)]
model = Sequential(hidden_layers + [Dense(1)])

# Train with Adam on the mean squared error.
model.compile(loss='mse', optimizer='adam')

In [None]:
# Train silently for 100 epochs in batches of 128, evaluating the loss on the
# test split after every epoch.
fit_options = {
    'validation_data': (X_test, y_test),
    'batch_size': 128,
    'epochs': 100,
    'verbose': 0,
}
model.fit(X_train, y_train, **fit_options)

In [None]:
# Collect the per-epoch metrics recorded during training into a DataFrame.
losses = pd.DataFrame.from_dict(model.history.history)

In [None]:
# Line plot of every recorded metric against the epoch index.
losses.plot.line()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score

In [None]:
# Predict the prices of the held-out test split.
predictions = model.predict(x=X_test)

In [None]:
# Root mean squared error on the test split, in the same unit as the price.
mse = mean_squared_error(y_test, predictions)
root_mse = np.sqrt(mse)
root_mse

In [None]:
# Mean absolute error on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
mae

In [None]:
# How much of the variance in the test prices is explained by the model.
explained_variance_score(y_true=y_test, y_pred=predictions)

In [None]:
# Predicted against actual prices; the red line marks a perfect prediction.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, predictions)
ax.plot(y_test, y_test, 'r')