In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the house sales CSV (kc_house_data.csv) into the working DataFrame.
df = pd.read_csv("../input/housesalesprediction/kc_house_data.csv")


In [None]:
# Quick structural overview of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Summary statistics, flipped so each column of the data becomes one row.
df.describe().T

# Features

In [None]:
# Distribution of sale prices.
# NOTE(review): distplot is flagged as deprecated by newer seaborn releases;
# consider histplot/displot once the installed seaborn version is confirmed.
plt.figure(figsize=(12,8))
sns.distplot(df["price"])

In [None]:
# Number of houses per bedroom count.
# The column is passed by keyword (x=..., data=df), like every other plot in this
# notebook; a positional Series is interpreted as `data` by newer seaborn versions.
sns.countplot(x="bedrooms", data=df)

In [None]:
# Price aggregated per bedroom count.
sns.barplot(data=df, x="bedrooms", y="price")

In [None]:
# Price against living area.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x="sqft_living", y="price")

In [None]:
# Price against basement area.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x="sqft_basement", y="price")

In [None]:
# Price against longitude.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x="long", y="price")

In [None]:
# Price against latitude.
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x="lat", y="price")

In [None]:
# Map-like view: every house placed at its coordinates and coloured by price.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x="long",
    y="lat",
    hue="price",
    palette="RdYlBu",
    edgecolor=None,
)

In [None]:
# Price spread by waterfront flag.
sns.boxplot(data=df, x="waterfront", y="price")

In [None]:
# Price spread by number of floors.
sns.boxplot(data=df, x="floors", y="price")

In [None]:
# Price spread by condition rating.
sns.boxplot(data=df, x="condition", y="price")

In [None]:
# Price spread by grade.
sns.boxplot(data=df, x="grade", y="price")

In [None]:
# Price spread by view rating.
sns.boxplot(data=df, x="view", y="price")

In [None]:
# The 50 most expensive houses, highest price first.
df.sort_values(by="price", ascending=False).iloc[:50]

# Remove houses worth more than 3 million and build a model for houses priced up to 3 million. This discards the 45 most expensive houses out of 21,613.

In [None]:
# Sort by price (highest first) and drop the first 45 rows, i.e. the 45 most
# expensive houses. The frame stays sorted by descending price afterwards.
# NOTE(review): the cut is positional rather than a price threshold, and
# re-running this cell removes another 45 rows.
df = df.sort_values("price", ascending=False).iloc[45:]

In [None]:
# Same coordinate plot as before, redrawn on the trimmed frame.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x="long",
    y="lat",
    hue="price",
    palette="RdYlBu",
    edgecolor=None,
)

In [None]:
# Preview the first rows of the trimmed, price-sorted frame.
df.head()

In [None]:
# Remove the "id" column from the working frame.
df = df.drop(columns=["id"])

In [None]:
# Replace the raw "date" column with parsed datetime values.
df = df.assign(date=pd.to_datetime(df["date"]))

In [None]:
# Sale year, taken with the vectorized .dt accessor instead of a per-row lambda
# (requires "date" to already be a datetime column).
df["year"] = df["date"].dt.year

In [None]:
# Sale month (1-12), taken with the vectorized .dt accessor instead of a per-row
# lambda (requires "date" to already be a datetime column).
df["month"] = df["date"].dt.month

In [None]:
# Mean sale price per sale year. "price" is selected before aggregating so the
# mean is not computed for every other column (including the datetime "date"
# column) only to be thrown away.
df.groupby("year")["price"].mean()

In [None]:
# Price aggregated per sale month.
sns.barplot(data=df, x="month", y="price")

In [None]:
# The raw "date" column is dropped; "year" and "month" were derived from it above.
df = df.drop(columns=["date"])

In [None]:
# Preview after the date handling: "date" is gone, "year" and "month" were added.
df.head()

In [None]:
# How many houses fall into each zipcode.
df["zipcode"].value_counts()

Turning zipcode into a useful feature would require domain expertise, so the column is dropped instead.

In [None]:
# Remove the "zipcode" column from the working frame.
df = df.drop(columns=["zipcode"])

In [None]:
# Frequency of each distinct "yr_renovated" value.
df["yr_renovated"].value_counts()

In [None]:
# Preview the frame after the "id", "date" and "zipcode" columns were dropped.
df.head()

In [None]:
# Columns included in the correlation heatmap.
features = ["price", "bedrooms", "sqft_living", "sqft_lot", "floors", "waterfront",
            "view", "condition", "grade", "sqft_above", "sqft_basement", "yr_built",
            "yr_renovated", "lat", "long", "sqft_living15", "sqft_lot15", "year", "month"]

# Compute the Pearson correlation matrix once and reuse it for the mask and the plot.
corr = df[features].corr()

# Mask the upper triangle (diagonal included): the matrix is symmetric, so that
# half only repeats values. The builtin `bool` is used because the `np.bool`
# alias no longer exists in current NumPy releases.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(16, 12))
plt.title('Pearson Correlation Matrix',fontsize=25)

# Trailing semicolon suppresses the Axes repr in the cell output.
sns.heatmap(corr,linewidths=0.25,vmax=0.7,square=True,cmap="BuGn",
            linecolor='w',annot=True,annot_kws={"size":8},mask=mask,cbar_kws={"shrink": .9});


# Train Test Split

In [None]:
# Target is the sale price; every remaining column is a predictor.
y = df["price"]
X = df.drop(columns=["price"])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the final test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.2,
)

In [None]:
# Carve the validation set out of the remaining 80%: 0.25 * 0.8 = 20% of all rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    random_state=0,
    test_size=0.25,
)

60% train, 20% validation and 20% test

In [None]:
# (rows, columns) of the training split.
X_train.shape

In [None]:
# (rows, columns) of the validation split.
X_val.shape

In [None]:
# (rows, columns) of the test split.
X_test.shape

# Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with default settings; it is fitted on the training split below.
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training split only, then scale that split.
# NOTE(review): X_train is rebound to the scaled result, so re-running this cell
# fits the scaler on already-scaled data.
X_train = scaler.fit_transform(X_train)

In [None]:
# Apply the training-fitted scaler to the validation split (transform only, no refit).
X_val = scaler.transform(X_val)

In [None]:
# Apply the training-fitted scaler to the test split (transform only, no refit).
X_test = scaler.transform(X_test)

# Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

In [None]:
# Fully connected regression network: four ReLU hidden layers (19, 16, 16, 8 units)
# followed by a single linear output unit for the predicted price.
model = Sequential([
    Dense(19, activation="relu"),
    Dense(16, activation="relu"),
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    Dense(1),
])

# Mean squared error loss, optimised with Adam at its default settings.
model.compile(loss="mse", optimizer="adam")

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
#early_stop = EarlyStopping(monitor="val_loss", patience=1000)

In [None]:
# Write the model to best_model.h5 only when the monitored validation loss improves.
check_point = ModelCheckpoint(
    filepath="best_model.h5",
    monitor="val_loss",
    save_best_only=True,
    verbose=0,
)

In [None]:
# Train on the training split and evaluate on the validation split after every
# epoch; the checkpoint callback is the only callback attached to this run.
model.fit(
    X_train,
    y_train.values,
    epochs=10000,
    batch_size=32,
    validation_data=(X_val, y_val.values),
    callbacks=[check_point],
    verbose=0,
)

In [None]:
# Per-epoch metrics recorded during training, collected into a DataFrame.
losses = pd.DataFrame(model.history.history)

In [None]:
# Plot the recorded training curves, one line per metric column.
losses.plot()

In [None]:
# Take load_model from tensorflow.keras, the same package the model in this
# notebook is built with, instead of the standalone keras package.
from tensorflow.keras.models import load_model

In [None]:
# Reload the checkpoint file written during training.
saved_model = load_model('best_model.h5')

In [None]:
# Predict prices for the scaled test split with the reloaded model.
predictions = saved_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Root mean squared error on the test split, in the same units as price.
rmse = np.sqrt(mean_squared_error(y_test,predictions))

In [None]:
# R-squared score of the predictions against the true test prices.
r2 = r2_score(y_test, predictions)

In [None]:
# Mean true price in the test split, used as a yardstick for the RMSE.
avg = y_test.mean()

In [None]:
# Report the evaluation metrics next to the average test price for scale.
print(f"Average house price in test set: {avg}")
print(f"RMSE: {rmse}")
print(f"R-squared score: {r2}")