In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read in King County data file

housing_df = pd.read_csv("../input/housesalesprediction/kc_house_data.csv")

# Exploratory data analysis

In [None]:
housing_df.info()

**Notes:**
* 21 columns
    * Almost all numerical (date = only object column) 
    * Most look useful
    * No NULL values

In [None]:
housing_df.head()

In [None]:
housing_df.describe().transpose()

## Examine the distribution of prices and how they relate to various other property attributes, e.g. bedrooms, sq ft, etc.

In [None]:
# Plot home price distribution
# `sns.distplot` is deprecated; `histplot` with `kde=True` and `stat='density'`
# draws the equivalent density-normalised histogram with a KDE overlay.

plt.figure(figsize=(10, 8))
sns.histplot(housing_df['price'], bins=50, kde=True, stat='density',
             edgecolor="white", linewidth=1)

# Show plain numbers on the axes instead of scientific notation
plt.ticklabel_format(style='plain')

**Notes:**
* Fairly normal distribution
* Long tail on the high end

In [None]:
# Examine relationship of bedrooms to price

fig, axes = plt.subplots(1, 2, figsize=(12, 8))

# Left panel: how many homes have each bedroom count.
# The column is passed by keyword because newer seaborn releases treat the
# first positional argument as `data`, not as the `x` variable.
axes[0].set_title('Bedrooms Countplot')
sns.countplot(x='bedrooms', data=housing_df, ax=axes[0], color='maroon')

# Right panel: price distribution per bedroom count
axes[1].set_title('Bedrooms vs. Price')
sns.boxplot(x='bedrooms', y='price', data=housing_df, ax=axes[1])

plt.tight_layout()

**Notes:**
* Homes mostly concentrated in 2-4 bedroom range
* High variance in bedroom number vs. price
* There apparently are 10, 11, and 33-bedroom homes, but median prices of these homes do not seem very high. This may be an error in the data

In [None]:
# Scatter of `sqft_living` against `price`

plt.figure(figsize=(10, 6))
sqft_ax = sns.scatterplot(data=housing_df, x='sqft_living', y='price')
sqft_ax.set_title('Square Feet vs. Price')

# Plain tick labels rather than scientific notation
plt.ticklabel_format(style='plain')

**Notes:**
* Predictably, a pretty good correlation between living space and price

## Plot homes using latitude & longitude

In [None]:
# Plot every sale at its longitude/latitude, coloured by `price`

plt.figure(figsize=(12,8))
sales_map_ax = sns.scatterplot(data=housing_df, x='long', y='lat', hue='price')
sales_map_ax.set_title('King County Home Sales')

### Difficult to distinguish most home prices because prices have such a long tail on the high end... Find an appropriate cut-off point where we can reduce the number of outliers without sacrificing too much data. And maybe use a better color scheme.

In [None]:
# Look at top 50 home sales

housing_df.sort_values('price', ascending=False).head(50)

**Notes:**
* Record volume appears to start increasing around \\$3.0M - \\$3.5M, in accordance with the histogram shown above.

In [None]:
# Find the price of the record that sits 1% of the way down the price-sorted data

nn_perc_index = round(0.01 * len(housing_df))

prices_descending = housing_df['price'].sort_values(ascending=False)
nn_perc = prices_descending.iloc[nn_perc_index]

print("99th percentile home price = {}".format(nn_perc))

### 99th percentile appears to be right around where the tail really starts to go long --> rerun geographic analysis using bottom 99% of data

In [None]:
# Sort by descending price and keep everything from position `nn_perc_index` onward,
# i.e. discard the `nn_perc_index` most expensive sales.
bottom_99_perc = housing_df.sort_values('price', ascending=False).iloc[nn_perc_index:]

In [None]:
# Same longitude/latitude map, restricted to `bottom_99_perc` and coloured by `price`

plt.figure(figsize=(12,8))
bottom_99_ax = sns.scatterplot(
    data=bottom_99_perc, x='long', y='lat', hue='price',
    palette='RdYlGn', alpha=0.2, edgecolor=None,
)
bottom_99_ax.set_title('King County Home Sales')

**Notes:**
* High-priced homes appear to be concentrated in a few geographic areas
* Many of the areas where high-price homes are concentrated appear to be on the water

In [None]:
# Box plots of `price` grouped by `waterfront` (left) and by `view` (right)

fig, (waterfront_ax, view_ax) = plt.subplots(1, 2, figsize=(12, 8))

waterfront_ax.set_title('Waterfront Impact on Price')
sns.boxplot(data=housing_df, x='waterfront', y='price', ax=waterfront_ax)

view_ax.set_title('View Impact on Price')
sns.boxplot(data=housing_df, x='view', y='price', ax=view_ax)

**Notes:**
* Predictably, having a waterfront property or a better view appears to correlate with higher prices

# Feature engineering

In [None]:
housing_df.head()

`id` column appears to be just random numbers... drop `id` column

In [None]:
# Remove the `id` column from the working frame

housing_df = housing_df.drop(columns=['id'])

### Extract date features

In [None]:
housing_df['date'] = pd.to_datetime(housing_df['date'])

In [None]:
# Extract year
# `date` was converted with pd.to_datetime above, so the vectorized `.dt`
# accessor replaces the slower per-row lambda.

housing_df['year'] = housing_df['date'].dt.year

In [None]:
# Extract month
# `date` was converted with pd.to_datetime above, so the vectorized `.dt`
# accessor replaces the slower per-row lambda.

housing_df['month'] = housing_df['date'].dt.month

In [None]:
# Extract day
# `date` was converted with pd.to_datetime above, so the vectorized `.dt`
# accessor replaces the slower per-row lambda.

housing_df['day'] = housing_df['date'].dt.day

In [None]:
# Plot relationship of year to price

# Build x and y from the same grouped result so every year is paired with its
# own mean price. `unique()` returns values in order of appearance, which is
# not guaranteed to match the sorted order produced by `groupby`.
year_avg_prices = housing_df.groupby('year')['price'].mean().reset_index()

sns.lineplot(x='year', y='price', data=year_avg_prices).set_title('Year vs. Price')

In [None]:
# Mean sale price for each calendar month

months_avg_prices = housing_df.groupby('month')['price'].mean()
months = sorted(housing_df['month'].unique())

sns.lineplot(x=months, y=months_avg_prices).set_title('Month vs. Price')

In [None]:
# Mean sale price for each day of the month

days_avg_prices = housing_df.groupby('day')['price'].mean()
days = sorted(housing_df['day'].unique())

sns.lineplot(x=days, y=days_avg_prices).set_title('Day vs. Price')

**Notes:**
* Strong relationship between year and price (there are only two years represented in this dataset)
* Looks like there could be some relationship between month and price
* There does not seem to be any relationship between day and price... drop `day` column

In [None]:
# Remove the raw `date` column and the derived `day` column

housing_df = housing_df.drop(columns=['date', 'day'])

In [None]:
housing_df.info()

### Zipcode is a categorical variable, so we cannot feed it into our model as is... Extract zipcode features

In [None]:
housing_df['zipcode'].value_counts()

### Too many values for dummy variables... convert zipcode to a numeric feature by replacing each zipcode with its average home price (mean/target encoding)

In [None]:
# Per-zipcode mean of every column; the frame is indexed by zipcode
zipcode_data = housing_df.groupby('zipcode').mean()

# Display the zipcodes from highest to lowest mean price (result is not stored)
zipcode_data.sort_values('price', ascending=False)

In [None]:
# Add each home's zipcode-average price as a new feature
# NOTE: `zipcode_data` is averaged over the full dataset, before any train/test
# split, so this feature carries price information from rows that later end up
# in the test sets (target leakage). Consider computing it on training rows only.

# Vectorized lookup: `map` matches each zipcode against the index of the mean-price Series
housing_df['zipcode_avg'] = housing_df['zipcode'].map(zipcode_data['price'])

In [None]:
# Remove the raw `zipcode` column now that `zipcode_avg` has been added

housing_df = housing_df.drop(columns=['zipcode'])

### Explore year built vs. year renovated... whether either has strong predictive power

In [None]:
def most_recent_year(year_built, year_renovated):
    """Return the later of the two years.

    `year_built` is returned only when it is strictly greater than
    `year_renovated`; otherwise (including ties) `year_renovated` is returned.
    """
    return year_built if year_built > year_renovated else year_renovated

In [None]:
# Explore whether looking at the most recent of the two is more powerful than looking at each one individually

# Row-wise apply: for every record, keep the later of `yr_built` and `yr_renovated`
housing_df['most_recent_work'] = housing_df.apply(lambda x : most_recent_year(x['yr_built'], x['yr_renovated']), axis=1)

In [None]:
# Three side-by-side scatters of price against `yr_built`, `yr_renovated` and `most_recent_work`

fig, (built_ax, renovated_ax, recent_ax) = plt.subplots(1, 3, figsize=(18, 4))

sns.scatterplot(data=housing_df, x='yr_built', y='price', ax=built_ax)
sns.scatterplot(data=housing_df, x='yr_renovated', y='price', ax=renovated_ax)
sns.scatterplot(data=housing_df, x='most_recent_work', y='price', ax=recent_ax)

Not helpful... drop `most_recent_work` column

In [None]:
# Remove the experimental `most_recent_work` feature again

housing_df = housing_df.drop(columns=['most_recent_work'])

### Look for bad data

In [None]:
# Show the records whose bedroom count is 33, 11 or 10
suspect_bedroom_counts = [33, 11, 10]
housing_df[housing_df['bedrooms'].isin(suspect_bedroom_counts)]

Some of these records seem like bad data.
* 33 bedrooms, 1.75 bathrooms, only 1600 sq ft, and a \\$640K sale price... Seems wrong. Let's drop it.
* Skeptical of others as well, but going to leave them for now

In [None]:
# Remove every row whose `bedrooms` value is 33

is_33_bedroom = housing_df['bedrooms'] == 33
drop_index = housing_df.index[is_33_bedroom]
housing_df = housing_df.drop(index=drop_index)

Cannot think of any other features to engineer at this time... Let's move on to building our model

# Linear regression

### Train test split

In [None]:
# Target is `price`; every remaining column is a feature
y = housing_df['price']
X = housing_df.drop(columns=['price'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Build model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
linear_model = LinearRegression()

### Train model

In [None]:
linear_model.fit(X_train, y_train)

### Make predictions

In [None]:
linear_predictions = linear_model.predict(X_test)

### Evaluate model

In [None]:
# Actual vs. predicted prices for the linear regression, labelled like the
# SVR / random-forest comparison figures so the plot stands on its own.
plt.figure(figsize=(12, 8))
plt.scatter(y_test, linear_predictions)
plt.plot(y_test, y_test, color='r')  # y = x reference line (perfect predictions)
plt.title('Linear Regression Predictions')
plt.xlabel('Actual Sale Prices')
plt.ylabel('Predicted Prices')

In [None]:
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error
from math import sqrt

In [None]:
# Held-out metrics for the linear regression
linear_mse = mean_squared_error(y_test, linear_predictions)
linear_rmse = sqrt(linear_mse)
linear_mae = mean_absolute_error(y_test, linear_predictions)
linear_score = explained_variance_score(y_test, linear_predictions)

print("Linear Regression Score: {:.4f}".format(linear_score))
print("Linear Regression MAE: {:.4f}".format(linear_mae))
print("Linear Regression RMSE: {:.4f}".format(linear_rmse))

# Support vector regression

### Train test split

In [None]:
X = housing_df.drop('price', axis=1).values
y = housing_df['price'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Scale data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
x_scaler = StandardScaler()
y_scaler = StandardScaler()

In [None]:
# Transform feature training & testing values
# The scaler is fitted on the training features only and then reused for the test features
X_train = x_scaler.fit_transform(X_train)
X_test = x_scaler.transform(X_test)

# Transform target training values
# StandardScaler needs a 2-D array, hence the reshape to a single column.
# y_test is left unscaled; predictions are inverse-transformed before scoring.
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1))

### Build models

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

### RBF Kernel

In [None]:
rbf_regressor = SVR(kernel='rbf')

In [None]:
# Cross validation parameters

param_grid = {
    'C': [1, 5, 10, 15],
    'gamma': [0.1, 0.01, 0.001]
}

In [None]:
rbf_cross_val = GridSearchCV(rbf_regressor, param_grid, n_jobs=-1, verbose=3)

In [None]:
rbf_cross_val.fit(X_train, np.ravel(y_train))

In [None]:
rbf_predictions = rbf_cross_val.predict(X_test)

In [None]:
# StandardScaler operates on 2-D arrays: reshape the 1-D predictions into a
# single column before un-scaling, then flatten so they line up with y_test.
rbf_predictions = y_scaler.inverse_transform(rbf_predictions.reshape(-1, 1)).ravel()

In [None]:
# Held-out metrics for the RBF-kernel SVR
rbf_mse = mean_squared_error(y_test, rbf_predictions)
rbf_rmse = sqrt(rbf_mse)
rbf_mae = mean_absolute_error(y_test, rbf_predictions)
rbf_score = explained_variance_score(y_test, rbf_predictions)

print("RBF Score: {:.4f}".format(rbf_score))
print("RBF MAE: {:.4f}".format(rbf_mae))
print("RBF RMSE: {:.4f}".format(rbf_rmse))

### Linear Kernel

In [None]:
lin_regressor = SVR(kernel='linear')

In [None]:
lin_regressor.fit(X_train, np.ravel(y_train))

In [None]:
lin_predictions = lin_regressor.predict(X_test)

In [None]:
# StandardScaler operates on 2-D arrays: reshape the 1-D predictions into a
# single column before un-scaling, then flatten so they line up with y_test.
lin_predictions = y_scaler.inverse_transform(lin_predictions.reshape(-1, 1)).ravel()

In [None]:
# Held-out metrics for the linear-kernel SVR
lin_mse = mean_squared_error(y_test, lin_predictions)
lin_rmse = sqrt(lin_mse)
lin_mae = mean_absolute_error(y_test, lin_predictions)
lin_score = explained_variance_score(y_test, lin_predictions)

print("Linear Score: {:.4f}".format(lin_score))
print("Linear MAE: {:.4f}".format(lin_mae))
print("Linear RMSE: {:.4f}".format(lin_rmse))

### Polynomial Kernel

In [None]:
poly_regressor = SVR(kernel='poly')

In [None]:
poly_regressor.fit(X_train, np.ravel(y_train))

In [None]:
poly_predictions = poly_regressor.predict(X_test)

In [None]:
# StandardScaler operates on 2-D arrays: reshape the 1-D predictions into a
# single column before un-scaling, then flatten so they line up with y_test.
poly_predictions = y_scaler.inverse_transform(poly_predictions.reshape(-1, 1)).ravel()

In [None]:
# Held-out metrics for the polynomial-kernel SVR
poly_mse = mean_squared_error(y_test, poly_predictions)
poly_rmse = sqrt(poly_mse)
poly_mae = mean_absolute_error(y_test, poly_predictions)
poly_score = explained_variance_score(y_test, poly_predictions)

print("Polynomial Score: {:.4f}".format(poly_score))
print("Polynomial MAE: {:.4f}".format(poly_mae))
print("Polynomial RMSE: {:.4f}".format(poly_rmse))

### Evaluate models

In [None]:
def _svr_scores(predictions):
    """Return (explained variance, MAE, RMSE) of `predictions` against `y_test`."""
    return (
        explained_variance_score(y_test, predictions),
        mean_absolute_error(y_test, predictions),
        sqrt(mean_squared_error(y_test, predictions)),
    )

# RBF kernel
rbf_score, rbf_mae, rbf_rmse = _svr_scores(rbf_predictions)

print("RBF Score: {:.4f}".format(rbf_score))
print("RBF MAE: {:.4f}".format(rbf_mae))
print("RBF RMSE: {:.4f}\n".format(rbf_rmse))

# Linear kernel
lin_score, lin_mae, lin_rmse = _svr_scores(lin_predictions)

print("Linear Score: {:.4f}".format(lin_score))
print("Linear MAE: {:.4f}".format(lin_mae))
print("Linear RMSE: {:.4f}\n".format(lin_rmse))

# Polynomial kernel
poly_score, poly_mae, poly_rmse = _svr_scores(poly_predictions)

print("Polynomial Score: {:.4f}".format(poly_score))
print("Polynomial MAE: {:.4f}".format(poly_mae))
print("Polynomial RMSE: {:.4f}".format(poly_rmse))

In [None]:
# One actual-vs-predicted panel per SVR kernel. The three panels only differ in
# title and prediction vector, so they are drawn in a loop rather than copy-pasted.
fig, ax = plt.subplots(1, 3, figsize=(20, 8))

svr_panels = [
    ('RBF SVR Predictions', rbf_predictions),
    ('Linear SVR Predictions', lin_predictions),
    ('Polynomial SVR Predictions', poly_predictions),
]

for panel, (title, predictions) in zip(ax, svr_panels):
    sns.scatterplot(x=y_test, y=predictions, ax=panel)
    panel.plot(y_test, y_test, color='r')  # y = x reference line (perfect predictions)
    panel.set_title(title)
    panel.set_xlabel('Actual Sale Prices')
    panel.set_ylabel('Predicted Prices')

**Notes:**
* RBF using cross validation has best explained variance score by far
* Other models might work with cross validation as well, but model fitting is far too slow
* Manually tested some different hyperparameter combinations for the others, and I think it's unlikely that either will outperform RBF

# Random forest

### Train test split

In [None]:
X = housing_df.drop('price', axis=1).values
y = housing_df['price'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Build model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

### Create standard random forest

In [None]:
# Fix the seed so the forest (and therefore its reported metrics) is reproducible
# across runs; 42 matches the seed used for the train/test splits.
rand_forest = RandomForestRegressor(n_estimators=300, random_state=42)

In [None]:
rand_forest.fit(X_train, y_train)

In [None]:
rand_forest_predictions = rand_forest.predict(X_test)

In [None]:
# Held-out metrics for the standard random forest
rand_forest_mse = mean_squared_error(y_test, rand_forest_predictions)
rand_forest_rmse = sqrt(rand_forest_mse)
rand_forest_mae = mean_absolute_error(y_test, rand_forest_predictions)
rand_forest_score = explained_variance_score(y_test, rand_forest_predictions)

print("Random Forest Score: {:.4f}".format(rand_forest_score))
print("Random Forest MAE: {:.4f}".format(rand_forest_mae))
print("Random Forest RMSE: {:.4f}\n".format(rand_forest_rmse))

### Create randomized cross-validation model

In [None]:
# Base estimator for the randomized search; seeded so each candidate's fit is reproducible
rf_rand = RandomForestRegressor(random_state=42)

In [None]:
# Search space sampled by the randomized cross-validation search below

# Number of trees: 100, 200, ..., 1000
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

# Number of features at every split
# 1.0 (use all features) replaces 'auto', which current scikit-learn no longer
# accepts for RandomForestRegressor and which meant "all features" for regressors.
max_features = [1.0, 'sqrt', 'log2']

# Max levels in tree: 10, 20, ..., 100, plus None (no depth limit)
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)

# Min samples to split node
min_samples_split = [2, 5, 10]

# Min samples at each leaf
min_samples_leaf = [1, 2, 4]

# Bootstrapping
bootstrap = [True, False]

random_param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

print(random_param_grid)

In [None]:
# Seed the parameter sampling so the same candidate settings are drawn on every run
rf_rand_regressor = RandomizedSearchCV(rf_rand, random_param_grid, cv=3, verbose=3,
                                       n_jobs=-1, random_state=42)

In [None]:
rf_rand_regressor.fit(X_train, y_train)

In [None]:
rf_rand_predictions = rf_rand_regressor.predict(X_test)

In [None]:
# Held-out metrics for the randomized-search random forest
rf_rand_mse = mean_squared_error(y_test, rf_rand_predictions)
rf_rand_rmse = sqrt(rf_rand_mse)
rf_rand_mae = mean_absolute_error(y_test, rf_rand_predictions)
rf_rand_score = explained_variance_score(y_test, rf_rand_predictions)

print("Randomized RF Score: {:.4f}".format(rf_rand_score))
print("Randomized RF MAE: {:.4f}".format(rf_rand_mae))
print("Randomized RF RMSE: {:.4f}".format(rf_rand_rmse))

### Evaluate models

In [None]:
# One actual-vs-predicted panel per random-forest variant. The panels only differ
# in title and prediction vector, so they are drawn in a loop rather than copy-pasted.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))

rf_panels = [
    ('Standard RF Predictions', rand_forest_predictions),
    ('Randomized CV RF Predictions', rf_rand_predictions),
]

for panel, (title, predictions) in zip(ax, rf_panels):
    sns.scatterplot(x=y_test, y=predictions, ax=panel)
    panel.plot(y_test, y_test, color='r')  # y = x reference line (perfect predictions)
    panel.set_title(title)
    panel.set_xlabel('Actual Sale Prices')
    panel.set_ylabel('Predicted Prices')

In [None]:
def _rf_scores(predictions):
    """Return (explained variance, MAE, RMSE) of `predictions` against `y_test`."""
    return (
        explained_variance_score(y_test, predictions),
        mean_absolute_error(y_test, predictions),
        sqrt(mean_squared_error(y_test, predictions)),
    )

# Standard random forest
rand_forest_score, rand_forest_mae, rand_forest_rmse = _rf_scores(rand_forest_predictions)

print("Random Forest Score: {:.4f}".format(rand_forest_score))
print("Random Forest MAE: {:.4f}".format(rand_forest_mae))
print("Random Forest RMSE: {:.4f}\n".format(rand_forest_rmse))

# Randomized-search random forest
rf_rand_score, rf_rand_mae, rf_rand_rmse = _rf_scores(rf_rand_predictions)

print("Randomized RF Score: {:.4f}".format(rf_rand_score))
print("Randomized RF MAE: {:.4f}".format(rf_rand_mae))
print("Randomized RF RMSE: {:.4f}".format(rf_rand_rmse))

# Neural network

### Train test split

In [None]:
X = housing_df.drop('price', axis=1).values
y = housing_df['price'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42 )

### Scale data

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

### Build model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [None]:
X_train.shape

In [None]:
# Feed-forward regression network: four ReLU hidden layers followed by one linear output unit
model = Sequential()

# Hidden layers narrow from 20 units down to 5
model.add(Dense(20, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(5, activation='relu'))

# Single output with no activation (the predicted price)
model.add(Dense(1))

# Loss is MSE on the unscaled target: only the features were scaled above
model.compile(optimizer='adam', loss='mse')

# Stop training once val_loss has not improved by at least `min_delta` for `patience` epochs.
# NOTE(review): restore_best_weights is not set, so the model keeps the weights
# of the last epoch trained rather than the best one - confirm this is intended.
earlystop = EarlyStopping(monitor='val_loss', min_delta=250000, patience=50)

### Train model

In [None]:
model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val),
          callbacks=[earlystop], batch_size=128, epochs=4000)

In [None]:
losses = pd.DataFrame(model.history.history)

In [None]:
losses.plot()

### Make predictions

In [None]:
nn_predictions = model.predict(X_test)

In [None]:
# Actual vs. predicted prices for the neural network, labelled like the
# SVR / random-forest comparison figures so the plot stands on its own.
plt.figure(figsize=(12, 8))
plt.scatter(y_test, nn_predictions)
plt.plot(y_test, y_test, color='r')  # y = x reference line (perfect predictions)
plt.title('Neural Network Predictions')
plt.xlabel('Actual Sale Prices')
plt.ylabel('Predicted Prices')

### Evaluate model

In [None]:
# Held-out metrics for the neural network
nn_mse = mean_squared_error(y_test, nn_predictions)
nn_rmse = sqrt(nn_mse)
nn_mae = mean_absolute_error(y_test, nn_predictions)
nn_score = explained_variance_score(y_test, nn_predictions)

print("Neural Network Score: {:.4f}".format(nn_score))
print("Neural Network MAE: {:.4f}".format(nn_mae))
print("Neural Network RMSE: {:.4f}".format(nn_rmse))

# Summarize Results

In [None]:
# Final report: one section per model family, every entry printed with the same
# three metric labels (this also fixes the "Varaince" typo and the differently
# worded labels previously used for the randomized random forest).
# NOTE: the neural network was scored on a 20% test split while the other models
# used a 30% split, so its numbers are not measured on the same test rows.
separator = "=========================================="

# (section header, [(entry label or None, score, mae, rmse), ...])
result_sections = [
    ("Linear Regression Results:", [
        (None, linear_score, linear_mae, linear_rmse),
    ]),
    ("Support Vector Regression Results:", [
        ("RBF Kernel:", rbf_score, rbf_mae, rbf_rmse),
        ("Linear Kernel:", lin_score, lin_mae, lin_rmse),
        ("Polynomial Kernel:", poly_score, poly_mae, poly_rmse),
    ]),
    ("Random Forest Regression Results:", [
        ("Standard Random Forest:", rand_forest_score, rand_forest_mae, rand_forest_rmse),
        ("Randomized CV Random Forest:", rf_rand_score, rf_rand_mae, rf_rand_rmse),
    ]),
    ("Neural Network Results:", [
        (None, nn_score, nn_mae, nn_rmse),
    ]),
]

for header, entries in result_sections:
    print(separator)
    print(header + "\n")
    for position, (label, score, mae, rmse) in enumerate(entries):
        if label is not None:
            print(label)
        print("Explained Variance Score = {:.4f}".format(score))
        print("Mean Absolute Error = {:.4f}".format(mae))
        # Blank line between entries of the same section, none after the last one
        trailer = "\n" if position < len(entries) - 1 else ""
        print("Root Mean Squared Error = {:.4f}".format(rmse) + trailer)
print(separator)