This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between May 2014 and May 2015.

# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error as mse, explained_variance_score as evs

In [None]:
# Path to the house sales CSV, kept in one place so it is easy to change
DATA_PATH = '../input/housesalesprediction/kc_house_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the raw data
df.head(n=5)

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts)
df.info()

# House Prices

In [None]:
# Histogram of sale prices over 100 bins, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12,6))
ax.hist(df['price'], bins=100, rwidth=0.8)
ax.set(xlabel='Price', ylabel='Count', title='Distribution of house prices')
plt.show()

A large proportion of the houses sold for under $1 million.

# Bedrooms

In [None]:
# Number of sales for each bedroom count
fig, ax = plt.subplots(figsize=(10,6))
sns.countplot(x='bedrooms', data=df, ax=ax)
ax.set(xlabel='No. of bedrooms', ylabel='Count', title='Distribution of bedrooms')
plt.show()

# Bathrooms

In [None]:
# Number of sales for each bathroom count
fig, ax = plt.subplots(figsize=(15,6))
sns.countplot(x='bathrooms', data=df, ax=ax)
ax.set_xlabel('No. of bathrooms')
# Rotate the tick labels so the many distinct bathroom values do not overlap
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_ylabel('Count')
ax.set_title('Distribution of bathrooms')
plt.show()

# Living area

In [None]:
# Histogram of living area (sqft_living) over 100 bins
fig, ax = plt.subplots(figsize=(12,6))
ax.hist(df['sqft_living'], bins=100, rwidth=0.8, color='brown')
ax.set(xlabel='Living area', ylabel='Count', title='Distribution of living area')
plt.show()

# Lot area

In [None]:
# Histogram of lot area (sqft_lot) over 100 bins
fig, ax = plt.subplots(figsize=(12,6))
ax.hist(df['sqft_lot'], bins=100, rwidth=0.8, color='teal')
ax.set(xlabel='Lot area', ylabel='Count', title='Distribution of lot area')
plt.show()

# Max & Min Living area

In [None]:
# Sort once by living area (largest first) and read both extremes from the same frame
liv_area_sorted = df[['id', 'sqft_living']].sort_values(by='sqft_living', ascending=False)
# Five largest living areas
max_liv_area = liv_area_sorted.head(5)
print(max_liv_area)
# Five smallest living areas (still listed in descending order)
min_liv_area = liv_area_sorted.tail(5)
print(min_liv_area)

# Max & Min Lot area

In [None]:
# Sort once by lot area (largest first) and read both extremes from the same frame
lot_area_sorted = df[['id', 'sqft_lot']].sort_values(by='sqft_lot', ascending=False)
# Five largest lots
max_lot_area = lot_area_sorted.head(5)
print(max_lot_area)
# Five smallest lots (still listed in descending order)
min_lot_area = lot_area_sorted.tail(5)
print(min_lot_area)

Some houses have very large living and lot areas; these probably belong to wealthy owners. Most of the data, however, appears to represent middle-class and upper-middle-class homes.

# Floors

In [None]:
# Number of sales for each floor count
fig, ax = plt.subplots(figsize=(10,6))
sns.countplot(x='floors', data=df, ax=ax)
ax.set(xlabel='No. of floors', ylabel='Count', title='No. of floors')
plt.show()

# Waterfront

In [None]:
# Number of sales for each value of the waterfront flag.
# The column is passed as the keyword `x`, as in the earlier count plots:
# recent seaborn releases make x/y keyword-only, so a positional column name fails.
plt.figure(figsize=(8,6))
sns.countplot(x='waterfront', data=df)
plt.xlabel('Waterfront')
plt.ylabel('Count')
plt.title('Distribution of waterfront')
plt.show()

# Condition

In [None]:
# Number of sales for each condition rating.
# The column is passed as the keyword `x`, as in the earlier count plots:
# recent seaborn releases make x/y keyword-only, so a positional column name fails.
plt.figure(figsize=(8,6))
sns.countplot(x='condition', data=df)
plt.xlabel('Condition')
plt.ylabel('Count')
plt.title('Distribution of condition')
plt.show()

# Grade

In [None]:
# Number of sales for each grade.
# The column is passed as the keyword `x`, as in the earlier count plots:
# recent seaborn releases make x/y keyword-only, so a positional column name fails.
plt.figure(figsize=(8,6))
sns.countplot(x='grade', data=df)
plt.xlabel('Grade')
plt.ylabel('Count')
plt.title('Distribution of grade')
plt.show()

# Year built

In [None]:
# Number of sales for each construction year.
# Use x=/data= explicitly: a bare positional Series is treated as `data`
# by recent seaborn releases instead of as the variable to count.
plt.figure(figsize=(22,8))
sns.countplot(x='yr_built', data=df)
plt.xlabel('Year built')
# Rotate the tick labels so the many distinct years stay readable
plt.xticks(rotation=90)
plt.ylabel('Count')
plt.title('Distribution of year built')
plt.show()

# Year renovated

In [None]:
# Frequency of each renovation year, most common first
df['yr_renovated'].value_counts()

The year of renovation is 0 in most cases, which suggests that the house was never renovated or that the renovation year is unknown. Either way, this is not a very useful feature to learn from.

# Geographical location of houses

In [None]:
#read the road-network shape file used as the base map
kc_map = gpd.read_file('../input/king-countyshape-file/shape file_king county/tl_2017_53033_roads.shp')
#select coordinate reference system (latitude/longitude, EPSG:4326);
#the authority string replaces the deprecated {'init': 'epsg:4326'} dict form
crs = 'EPSG:4326'
#convert lat & long into points (x = longitude, y = latitude)
geo = [Point(xy) for xy in zip(df['long'],df['lat'])]
#create geo dataframe
#NOTE(review): depending on the pandas/geopandas versions, building the
#GeoDataFrame directly from `df` may also attach the 'geometry' column to
#`df` itself. The modelling cell drops a 'geometry' column, so `df` is
#deliberately passed here without copying.
geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geo)
#plot the points on geographical map
fig, ax = plt.subplots(figsize=(17,11))
kc_map.plot(ax=ax, alpha=0.4, color='gray')
geo_df.plot(ax=ax, markersize=20, color='red', marker='o')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Geographical location of houses')
plt.show()

# Pairplot of select features 

In [None]:
# Columns compared against each other in the pairwise scatter-plot grid
pairplot_cols = [
    'price',
    'bedrooms',
    'sqft_living',
    'bathrooms',
    'yr_built',
    'grade',
    'lat',
    'long',
]
df_sub = df[pairplot_cols]
sns.pairplot(df_sub)

# Correlation between price and independent variables

In [None]:
# Drop identifier-like and unused columns before modelling
df_model = df.drop(['id', 'date', 'yr_renovated', 'zipcode'], axis=1)
# Correlate numeric columns only. If the geo-plot cell attached a 'geometry'
# column to `df` (version-dependent), a plain .corr() raises on recent pandas
# because non-numeric columns are no longer silently skipped.
df_model.select_dtypes(include='number').corr()

# Split data into train and test sets

In [None]:
# Features: everything except the target and the point geometry.
# errors='ignore' because the 'geometry' column exists only if the
# GeoDataFrame construction in the geo-plot cell modified `df` in place,
# which depends on the pandas/geopandas versions and on execution order.
X = df_model.drop(columns=['price', 'geometry'], errors='ignore')
# Target: sale price
y = df_model['price']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# 1. Multiple linear regression

In [None]:
# Fit a linear regression on the training split and predict both splits
lr = LinearRegression()
model_lr = lr.fit(X_train, y_train)
y_pred_te = lr.predict(X_test)
y_pred_tr = lr.predict(X_train)
# RMSE is computed as sqrt(MSE) instead of mse(..., squared=False): the
# `squared` argument is deprecated in recent scikit-learn releases, while
# np.sqrt works on every version. Both numbers are errors (lower is better),
# so both are labelled as RMSE.
print(f'train RMSE: {np.sqrt(mse(y_train, y_pred_tr))}')
print(f'test RMSE: {np.sqrt(mse(y_test, y_pred_te))}')

# 2. Random forest regressor

In [None]:
# Fit a random forest on the training split and predict both splits.
# A fixed random_state (same seed as the train/test split) makes the forest,
# and therefore every number reported below, reproducible across runs.
rf = RandomForestRegressor(random_state=42)
model_rf = rf.fit(X_train, y_train)
# NOTE: y_pred_te / y_pred_tr now hold the random-forest predictions and
# replace the linear-regression ones from the previous cell.
y_pred_te = rf.predict(X_test)
y_pred_tr = rf.predict(X_train)
# RMSE is computed as sqrt(MSE) instead of mse(..., squared=False): the
# `squared` argument is deprecated in recent scikit-learn releases, while
# np.sqrt works on every version. Both numbers are errors (lower is better).
print(f'train RMSE: {np.sqrt(mse(y_train, y_pred_tr))}')
print(f'test RMSE: {np.sqrt(mse(y_test, y_pred_te))}')

# Explained variance score

In [None]:
# Explained variance of the most recent test predictions held in y_pred_te
evs(y_true=y_test, y_pred=y_pred_te)

# Plot predicted vs actual test data

In [None]:
# Scatter of predicted against true prices for the test split
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(y_test, y_pred_te, label='Predicted')
# Plotting y_test against itself traces the y = x line where predictions would be exact
ax.plot(y_test, y_test, "r", label='Actual')
ax.legend(fontsize=15)
ax.set_xlabel('y_true', fontsize=15)
ax.set_ylabel('y_predicted', fontsize=15)
ax.set_title('Actual vs Predicted prices for test observations', fontsize=20)
plt.show()

# Checking model performance

In [None]:
# Spot-check one test observation, selected by its position in the test split
i = 1239
# iloc[[i]] keeps the row as a one-row DataFrame, so the forest receives the
# same feature names it was fitted with. A bare reshaped ndarray drops them
# and makes recent scikit-learn emit a feature-name warning.
predicted_price = rf.predict(X_test.iloc[[i]])
print(f"Predicted price: {predicted_price[0]}\n Actual price: {y_test.iloc[i]}")