# King County Housing Data Analysis and Price Predictions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the house sales records from CSV; the path is relative to the
# notebook's working directory.
houses = pd.read_csv('../input/kc-house-data/kc_house_data.csv')

In [None]:
houses.head()

In [None]:
houses.info()

### Missing Values

In [None]:
houses.isnull().sum()

In [None]:
# Show the rows whose sqft_above value is missing.
missing_sqft_above = houses['sqft_above'].isna()
houses.loc[missing_sqft_above]

In [None]:
# Distribution of sqft_above within each grade.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=houses, x='grade', y='sqft_above', ax=ax)

I will fill these null values with the mean sqft_above of the houses that share the same grade.

In [None]:
# Mean sqft_above of each house's grade, aligned row-by-row with `houses`.
# It is computed once with a grouped transform instead of re-running the
# groupby for every row, and only the sqft_above column is aggregated so
# non-numeric columns never enter the mean.
grade_mean_sqft_above = houses.groupby('grade')['sqft_above'].transform('mean')

# Only the missing entries are replaced; observed values are left unchanged.
houses['sqft_above'] = houses['sqft_above'].fillna(grade_mean_sqft_above)

### Correlation Matrix

Full Correlation Heatmap:

In [None]:
# Correlation is only defined for numeric columns, so select them explicitly
# instead of relying on DataFrame.corr() to discard the others (the `date`
# column has not been parsed yet at this point in the notebook).
numeric_features = houses.drop('id', axis=1).select_dtypes(include='number')

plt.figure(figsize=(20, 10))
sns.heatmap(numeric_features.corr(), annot=True, cmap='viridis_r')

Correlation of house price with the other features:

In [None]:
# Correlation of every numeric column with price, sorted ascending.
# Numeric columns are selected explicitly so non-numeric columns never reach
# corr(); the price row is dropped because its self-correlation is always 1.
price_correlations = (
    houses.select_dtypes(include='number')
    .corr()['price']
    .drop('price')
    .sort_values()
)

plt.figure(figsize=(10, 7))
price_correlations.plot(kind='bar', title='Correlation with house prices in King County')

### Number of Bedrooms

In [None]:
# Number of houses for each bedroom count.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=houses, x='bedrooms', ax=ax)
ax.set(ylabel='Count', title='Number of different houses depending on bedrooms', xlabel='Number of bedrooms')

In [None]:
# Price distribution for each bedroom count.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=houses, x='bedrooms', y='price', ax=ax)
ax.set(xlabel='Number of Bedrooms', ylabel='Price', title='Comparison of House price and number of bedrooms')

The number of bedrooms does not seem to have a very high correlation with house price in King County. Most houses are priced below 1 million USD, as the median line of every box falls below this price.

In [None]:
# Correlate the two columns directly: this avoids building the full
# correlation matrix (and passing non-numeric columns to corr()) just to
# read a single entry.
print('Correlation value between Number of Bedrooms and Price: ', houses['price'].corr(houses['bedrooms']))

### Sqft Living Space

In [None]:
# Price against living-space square footage.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(data=houses, x='sqft_living', y='price', ax=ax)
ax.set(xlabel='Sqft Living Space', ylabel='Price of the house')

It seems that sqft living space has a high correlation with house price. However, there are some very large outliers here.
These outliers can have an adverse effect on the accuracy of our model.

### Waterfront

In [None]:
# Price distribution split by the waterfront flag.
waterfront_tick_labels = ['Do not have waterfront', 'Have waterfront']

fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=houses, x='waterfront', y='price', ax=ax)
ax.set(
    xlabel='',
    ylabel='Price',
    title='Comparison of prices for houses having a waterfront or not',
    xticklabels=waterfront_tick_labels,
)

In [None]:
# Number of houses with and without a waterfront.
# A count plot's y axis is the number of rows per category, so it is
# labelled 'Count' (it was previously mislabelled 'Price').
plt.figure(figsize=(10, 7))
sns.countplot(x='waterfront', data=houses).set(xlabel='', ylabel='Count', title='Number of waterfront and non waterfront houses',
                                             xticklabels=['Do not have waterfront', 'Have waterfront'])

Houses that are waterfront properties do tend to have higher prices. But the number of waterfront properties in the sample is very low compared to the number of non-waterfront properties.

### Year Built and Year Renovated

Year Built:

In [None]:
# Price against construction year.
fig, ax = plt.subplots(figsize=(10, 7))
sns.lineplot(data=houses, x='yr_built', y='price', ax=ax)

In [None]:
# Flag houses built in 1915 or earlier: 1 for those, 0 for everything else.
built_by_1915 = houses['yr_built'] <= 1915
houses['century_old'] = built_by_1915.astype('int64')

In [None]:
# Price distribution split by the century_old flag.
age_tick_labels = ['Less than 100 years old', 'Over a 100 years Old']

fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=houses, x='century_old', y='price', ax=ax)
ax.set(
    xlabel='',
    ylabel='Price',
    title='Comparison of prices depending on the year the house was built',
    xticklabels=age_tick_labels,
)

In [None]:
# Number of houses in each age group.
# A count plot's y axis is the number of rows per category, so it is
# labelled 'Count' (it was previously mislabelled 'Price').
plt.figure(figsize=(10, 7))
sns.countplot(x='century_old', data=houses).set(xlabel='', ylabel='Count', title='Number of Houses according to thier age',
                                                xticklabels=['Less than 100 years old', 'Over a 100 years Old'])

Houses that are over a 100 years old tend to have a higher price point.

Let's use the newly created feature century_old instead of yr_built for training the model.

In [None]:
# yr_built is replaced by the century_old flag, so remove it from the frame.
houses.drop(columns=['yr_built'], inplace=True)

Year Renovated:

In [None]:
houses['yr_renovated'].value_counts()

A lot of values are 0, which suggests that no renovation work was done on those houses. Let's see how renovation affects the prices of the houses:

In [None]:
# Only rows with a non-zero yr_renovated are plotted.
renovated_houses = houses[houses['yr_renovated'] != 0]

fig, ax = plt.subplots(figsize=(10, 7))
sns.lineplot(data=renovated_houses, x='yr_renovated', y='price', ax=ax)
ax.set(
    xlabel='Year of Renotation',
    ylabel='Price',
    title='Relation between house price and year of renovation',
)

Recently renovated houses seem to have higher price.

### Latitude and Longitude

Let's see the map of King County, for reference:

<img src="https://www.kingcounty.gov/about/region/~/media/about/maps/KC_simplemap_Oct2013.ashx" />

In [None]:
# Houses positioned by longitude/latitude and coloured by price.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=houses, x='long', y='lat', hue='price', palette='magma_r', alpha=0.15, ax=ax)

It looks like the prices are spread very evenly across the County. But there is an issue: if we look at the legend, the price shown is 3 million USD, while we saw earlier that most of the houses are priced below 1 million. So let's see what percentage of houses fall below the 3, 2.5, 2 and 1.5 million price limits.

In [None]:
# Share of houses (in percent) priced strictly below each candidate cap.
total_houses = len(houses)
price_caps = (('3', 3000000), ('2.5', 2500000), ('2', 2000000), ('1.5', 1500000))

for cap_label, price_cap in price_caps:
    houses_below_cap = houses[houses['price'] < price_cap]
    print(f'Percentage of houses priced below {cap_label} million USD: ', len(houses_below_cap) / total_houses * 100)

It seems that if we put a cap on price at 2 million we will lose just 1% of the data. Let's see how adding this cap affects our latitude and longitude analysis.

In [None]:
# Same map as before, restricted to houses priced below 2,000,000.
houses_under_cap = houses[houses['price'] < 2000000]

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=houses_under_cap, x='long', y='lat', hue='price', palette='magma_r', alpha=0.15, ax=ax)

If we refer to the attached map, the high-price areas correspond to urban areas in Seattle, Mercer Island and Bellevue.

As we can get the area related information from the combination of latitude and longitude, I believe zipcode can be dropped.

In [None]:
# Location is carried by lat/long, so zipcode is removed from the frame.
houses.drop(columns=['zipcode'], inplace=True)

I am creating a categorical column zone by splitting the map into 4 parts depending on the latitude and longitude columns.

In [None]:
# Midpoint of the observed latitude and longitude ranges.
lat_min, lat_max = houses['lat'].min(), houses['lat'].max()
long_min, long_max = houses['long'].min(), houses['long'].max()

lat_mid = lat_min + ((lat_max - lat_min) / 2)
long_mid = long_min + ((long_max - long_min) / 2)

In [None]:
# Assign each house to one of four zones by comparing its coordinates with
# the midpoints of the latitude and longitude ranges:
#   1: lat <  lat_mid and long <  long_mid
#   2: lat >= lat_mid and long <  long_mid
#   3: lat <  lat_mid and long >= long_mid
#   4: everything else
# The comparisons are evaluated on whole columns rather than row by row, which
# also avoids positional indexing (row[0], row[1]) into a label-indexed Series.
lat_below_mid = houses['lat'] < lat_mid
lat_at_or_above_mid = houses['lat'] >= lat_mid
long_below_mid = houses['long'] < long_mid
long_at_or_above_mid = houses['long'] >= long_mid

houses['zone'] = np.select(
    [
        lat_below_mid & long_below_mid,
        lat_at_or_above_mid & long_below_mid,
        lat_below_mid & long_at_or_above_mid,
    ],
    [1, 2, 3],
    default=4,
)

In [None]:
# One-hot encode zone (first level dropped) and replace lat/long/zone with
# the encoded columns. The dummies are prefixed ('zone_2', 'zone_3', ...) so
# that every column label in the frame is a string; without the prefix the
# new columns are labelled with the bare integer zone ids.
# NOTE(review): recent scikit-learn releases reject frames whose column labels
# mix str and int -- confirm against the installed version.
zone_dummies = pd.get_dummies(houses['zone'], prefix='zone', drop_first=True)
houses = pd.concat([houses.drop(['lat', 'long', 'zone'], axis=1), zone_dummies], axis=1)

In [None]:
houses.head()

### Date

In [None]:
# Parse the date column into pandas datetimes so the year and month of sale
# can be read from it.
houses['date'] = pd.to_datetime(houses['date'])

In [None]:
# Split the sale date into separate year and month columns.
sale_dates = houses['date']
houses['year_sold'] = sale_dates.map(lambda sold_on: sold_on.year)
houses['month_sold'] = sale_dates.map(lambda sold_on: sold_on.month)

In [None]:
# Price distribution for each year of sale.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=houses, x='year_sold', y='price', ax=ax)

In [None]:
# Price distribution for each month of sale.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=houses, x='month_sold', y='price', ax=ax)

In [None]:
# Earliest and latest sale dates in the data.
first_sale = houses['date'].min()
last_sale = houses['date'].max()
print('min: ', first_sale, '\nmax: ', last_sale)

Both year and month of sale seem to have no effect on the price of the house. It does make sense to me, as the data spans just 13 months, which is not a big enough window for fluctuations in house prices.

Since the date and its extracted features do not have much effect on house price, I will drop them.

In [None]:
# Remove the sale date and the two columns derived from it.
date_columns = ['date', 'year_sold', 'month_sold']
houses.drop(columns=date_columns, inplace=True)

### ID

Id column is just a unique identifier and not a feature so I will drop it.

In [None]:
# Remove the id column from the frame.
houses.drop(columns=['id'], inplace=True)

### Train Test Split and Data Standardization

In [None]:
houses.info()

In [None]:
# Target: the price column. Features: every remaining column.
y = houses['price']
X = houses.drop(columns=['price'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=11)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to the test features, so no test-set statistics influence
# the scaling.
# Both names are rebound to the scaled output, so re-running this cell would
# scale data that is already scaled.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lr_model = LinearRegression()

In [None]:
lr_model.fit(X_train, y_train)

In [None]:
lr_predictions = lr_model.predict(X_test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate hyper-parameter values sampled by the randomized search.
random_grid = {
    # 100, 200, ..., 1200 trees.
    'n_estimators': list(range(100, 1300, 100)),
    # 1.0 means "consider every feature at each split", which is what the
    # 'auto' alias selected for a random forest regressor. The alias is no
    # longer accepted by current scikit-learn releases, so the explicit
    # fraction is used instead.
    'max_features': [1.0, 'sqrt'],
    # 10, 20, ..., 100 levels.
    'max_depth': list(range(10, 110, 10)),
    'min_samples_split': [2, 5, 10, 15, 100],
    'min_samples_leaf': [1, 2, 5, 10]
}

In [None]:
# Base estimator for the hyper-parameter search. The seed is fixed (same
# value as the train/test split and the search itself) so the fitted forests,
# and therefore the selected parameters and predictions, are repeatable.
rf = RandomForestRegressor(random_state=11)

In [None]:
# Randomized search: 10 sampled parameter combinations, each evaluated with
# 5-fold cross-validation and scored by negative mean squared error.
search_settings = dict(
    estimator=rf,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=1,
    random_state=11,
)
rf_random_search = RandomizedSearchCV(**search_settings)

In [None]:
rf_random_search.fit(X_train, y_train)

In [None]:
rf_random_search.best_params_

In [None]:
rf_predictions = rf_random_search.predict(X_test)

### ANN

In [None]:
from tensorflow.keras.models import Sequential

In [None]:
from tensorflow.keras.layers import Dense

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
ann = Sequential()

In [None]:
# Four identical hidden layers of 18 ReLU units each.
for _ in range(4):
    ann.add(Dense(18, activation='relu'))

# Single linear output unit for the predicted price.
ann.add(Dense(1))

In [None]:
ann.compile(optimizer='adam', loss='mse')

In [None]:
# Stop training once val_loss has not improved for 25 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights from the final epoch,
# i.e. 25 epochs after the best one.
early_stop = EarlyStopping(monitor='val_loss', mode='min', patience=25, verbose=1, restore_best_weights=True)

In [None]:
# Train with early stopping. The validation loss that drives early stopping is
# computed on a 20% hold-out of the *training* data (validation_split) rather
# than on the test set: using X_test here would let the test set decide when
# training stops, making the test metrics reported later optimistically biased.
ann.fit(x=X_train, y=y_train.values, verbose=1, batch_size=32, epochs=10000, validation_split=0.2, callbacks=[early_stop])

In [None]:
ann_predictions = ann.predict(X_test)

### Comparison

In [None]:
# Actual test prices alongside each model's predictions, one column per model.
model_predictions = (
    ('Linear Regression', lr_predictions),
    ('Random Forrest Regressor', rf_predictions),
    ('Artifical Neural Network', ann_predictions),
)

predictions_df = pd.DataFrame(y_test)
for model_column, predicted_prices in model_predictions:
    predictions_df[model_column] = predicted_prices

In [None]:
predictions_df.head()

In [None]:
# Actual price (y axis) against each model's predictions (one panel per model).
model_columns = ['Linear Regression', 'Random Forrest Regressor', 'Artifical Neural Network']
sns.pairplot(predictions_df, x_vars=model_columns, y_vars=['price'], height=7)

In [None]:
from sklearn.metrics import mean_squared_error, explained_variance_score, mean_absolute_error, r2_score

In [None]:
# Error metrics of the linear regression predictions against the true prices.
regression_metrics = (
    ('Mean Absolute Error:', mean_absolute_error),
    ('Mean Squared Error:', mean_squared_error),
    ('Explained Variance Score:', explained_variance_score),
    ('R2 Score:', r2_score),
)

print('Linear Regression:')
for metric_label, metric_fn in regression_metrics:
    print(metric_label, metric_fn(predictions_df['price'], predictions_df['Linear Regression']))

In [None]:
# Error metrics of the random forest predictions against the true prices.
regression_metrics = (
    ('Mean Absolute Error:', mean_absolute_error),
    ('Mean Squared Error:', mean_squared_error),
    ('Explained Variance Score:', explained_variance_score),
    ('R2 Score:', r2_score),
)

print('Random Forrest Regressor:')
for metric_label, metric_fn in regression_metrics:
    print(metric_label, metric_fn(predictions_df['price'], predictions_df['Random Forrest Regressor']))

In [None]:
# Error metrics of the neural network predictions against the true prices.
regression_metrics = (
    ('Mean Absolute Error:', mean_absolute_error),
    ('Mean Squared Error:', mean_squared_error),
    ('Explained Variance Score:', explained_variance_score),
    ('R2 Score:', r2_score),
)

print('Artifical Neural Network:')
for metric_label, metric_fn in regression_metrics:
    print(metric_label, metric_fn(predictions_df['price'], predictions_df['Artifical Neural Network']))