In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem Statement: Predicting London House Prices
The housing prices in London are extortionate. As a strapped-for-cash university graduate, it seems financially implausible that I would be able to purchase a house here. If I could just get onto the property ladder, on the other hand, I might be able to begin my climb toward a deluxe property at the foot of Hyde Park.

Using housing data from as early as 1995, I might be able to predict future prices. I expect either to be filled with hope, or to have those hopes smashed to smithereens, condemning me to a life spent living in my parents' basement.

* **Data:** 2 csv files, comprising monthly and yearly variables about the houses in London dating from 1995 to 2020.
* **Target variable:** Average yearly price

# Data Exploration and Visualization

## Read in the Data

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Expose the Kaggle input directory as ./input so the CSV paths below stay short.
# The guard keeps this cell re-runnable: os.symlink raises FileExistsError when
# the link already exists (lexists also catches a dangling link).
if not os.path.lexists('input'):
    os.symlink('../input', 'input')

# Parse 'date' as datetimes on load; later cells rely on the .dt accessor.
monthly_data = pd.read_csv('input/housing-in-london/housing_in_london_monthly_variables.csv',
                           parse_dates=['date'])
yearly_data = pd.read_csv('input/housing-in-london/housing_in_london_yearly_variables.csv',
                          parse_dates=['date'])

In [None]:
# preview the first five monthly records
monthly_data.head(n=5)

In [None]:
# preview the first five yearly records
yearly_data.head(n=5)

## Missing Values
It is immediately visible that some columns have missing values. To quantify this, let's see what proportion of each column is _not_ missing (i.e. how much is potentially useful data):

In [None]:
# fraction of values in each column that are not null
# (the mean of a boolean mask is the share of True entries)
yearly_data.notnull().mean()

In [None]:
# fraction of values in each column that are not null
monthly_data.notnull().mean()

Notably, roughly 33% of life satisfaction data is potentially useful, and 50% for the number of crimes data. We may want to first try and construct a simple model without using these features, and find a way to include them later if required.

The data for both area size and number of houses accounts for ~62% of the total dataset. Recycling percentage has ~80% and number of jobs has ~86%. The rest are above ~95% present. We will try imputation for missing data of these features, keeping in mind the percentage of missing values that have to be imputed for each feature in subsequent analysis.

# Filtering Data from Outside of London
It appears that we have some data for houses outside of London. It is reasonable to expect that if we are predicting *London* housing prices, just the data from London will be sufficient for accurate predictions. Conveniently, the data has a 'borough_flag' indicating whether the house is located in a London borough or not. We will use this to filter out the non-London data:

In [None]:
# restrict both datasets to the rows whose borough_flag is 1 (London boroughs)
london_yearly = yearly_data[yearly_data['borough_flag'].eq(1)]
london_monthly = monthly_data[monthly_data['borough_flag'].eq(1)]

In [None]:
# sanity check: both filtered datasets should cover exactly the same areas
monthly_areas = set(london_monthly['area'].unique())
yearly_areas = set(london_yearly['area'].unique())
monthly_areas == yearly_areas

# Converting Monthly to Yearly Data
We will calculate yearly average prices for each area, using the monthly 'average_price' and 'houses_sold' columns.
We will also find the total number of crimes in each year, but must deal with the missing values first.

First, let's calculate the average prices, for which we appear to have all the data:

In [None]:
# grouping keys shared by both aggregations: calendar year and area
# (the year Series keeps the name 'date', so the result is indexed by ('date', 'area'))
sale_year = london_monthly['date'].dt.year
sale_area = london_monthly['area']

# the sum of prices over all the houses sold (average times sale count)
sum_prices = london_monthly['average_price'] * london_monthly['houses_sold']
# Sum per (year, area). The Series is grouped directly instead of a frame that
# also carries the datetime 'date' column: summing datetimes is not supported,
# and recent pandas versions raise a TypeError rather than dropping the column.
sum_prices = sum_prices.groupby([sale_year, sale_area]).sum()

# total number of houses sold per (year, area)
sum_sales = london_monthly['houses_sold'].groupby([sale_year, sale_area]).sum()

# sales-weighted average price per (year, area): total sale value / total sales.
# Despite its name this holds *yearly* averages; the name and the one-column
# frame shape are kept because later cells consume it that way.
monthly_average = sum_prices.div(sum_sales).to_frame('average_price')
monthly_average

Before naively imputing missing crime data, we might want to see if more data is missing for particular years:

In [None]:
# rows with no recorded crime count, tallied per calendar year
crime_is_missing = london_monthly['no_of_crimes'].isnull()
missing_crimes = london_monthly[crime_is_missing]
missing_crimes.groupby(missing_crimes['date'].dt.year).size()

It appears years further into the past have more missing values. This may support an argument for considering more recent data. This would also make sense when considering inflation and other changes over time. For now, we will use some simple imputation to have some data available for early years.

Is there a chronological trend to the crime rates?

In [None]:
# total number of crimes by year and area
# (a (year, area) group with only missing values sums to 0)
sum_crimes = london_monthly['no_of_crimes'].groupby([london_monthly['date'].dt.year, london_monthly['area']]).sum()
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally. reset_index() turns the ('date', 'area') index into columns.
sns.lineplot(data=sum_crimes.reset_index(), x='date', y='no_of_crimes')

The drops to zero are in line with our missing data. For the data we have, it appears that there is a small dip, but not too much variation over 20 years. This suggests imputing missing data with mean values.

In [None]:
from sklearn.impute import SimpleImputer

# imputation: replace missing monthly crime counts with the mean of the observed ones
imputer = SimpleImputer(strategy='mean')
imputed_crimes = pd.DataFrame(
    imputer.fit_transform(london_monthly[['no_of_crimes']]),
    # Reuse the source index: the column assignment below aligns on index labels,
    # so a fresh 0..n-1 index would pair crimes with the wrong date/area (or NaN)
    # whenever the filtered frame's index is not exactly 0..n-1.
    index=london_monthly.index,
    columns=['num_crimes'],
)
imputed_crimes[['date', 'area']] = london_monthly[['date', 'area']]

# sum of crimes by year and area (only the numeric column is aggregated, so the
# datetime 'date' column never reaches .sum())
sum_crimes = imputed_crimes['num_crimes'].groupby(
    [imputed_crimes['date'].dt.year, imputed_crimes['area']]).sum()

# putting it together with our average prices.
# The concat order must match the column labels assigned below: prices first,
# then crimes. (Previously crimes came first, so the two columns were mislabelled.)
yearly_aggregates = pd.concat([monthly_average, sum_crimes], axis=1)
yearly_aggregates.columns = ['average_price', 'num_crimes']
yearly_aggregates

In [None]:
# For example: the row of yearly aggregates for Hounslow in 2014.
# The tuple selects one (year, area) entry of the two-level row index.
yearly_aggregates.loc[(2014, 'hounslow')]

Finally, let's merge our results with the rest of the yearly data:

In [None]:
features_to_use = ['area', 'date', 'median_salary', 'mean_salary', 'recycling_pct', 'population_size', 'number_of_jobs', 'area_size', 'no_of_houses']
# .copy() gives an independent frame, so the 'date' assignment below does not
# write into a slice of london_yearly (which triggers SettingWithCopyWarning
# and is not guaranteed to take effect).
total_data = london_yearly[features_to_use].copy()
# the date is the same day of each year, so we can simplify our values by dropping day and month
total_data['date'] = total_data['date'].dt.year

# join with aggregated monthly data on the shared (date, area) index,
# then restore 'date' and 'area' as ordinary columns
total_data = total_data.set_index(['date', 'area']).join(yearly_aggregates).reset_index()
total_data

# Visualisations

How does the average price depend on area? Looks like the far right (Westminster) is a clear winner for expensive houses.

In [None]:
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally
sns.barplot(data=total_data, x='area', y='average_price')
# one bar per area: rotate the labels so the area names do not overlap
plt.xticks(rotation=90);

How about mean and median salary? In trying to plot these, I discovered that there are some missing values in mean salary:

In [None]:
# rows where mean_salary holds the '#' placeholder instead of a number
total_data[total_data['mean_salary'].eq('#')]

In [None]:
# swap the '#' placeholder for the string 'NaN', which the float conversion
# turns into a missing value
salary_text = total_data['mean_salary'].replace('#', 'NaN')
total_data['mean_salary'] = salary_text.astype(float)

Notice the two distinct arms below. 
For one arm, there is a low median salary, but high housing prices. This appears to correspond to City of London. The other appears to be Barking and Dagenham, Barnet, and Sutton, where the median salary is high, but the prices are low.

In [None]:
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally
sns.scatterplot(data=total_data, x='median_salary', y='average_price', hue='area')
# place the legend outside the axes, to the right of the plot
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally
sns.scatterplot(data=total_data, x='mean_salary', y='average_price', hue='area')
# place the legend outside the axes, to the right of the plot
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# Creating Train, Val, Test Splits and Constructing a Pipeline

In [None]:
# swap the 'na' placeholder for the string 'NaN', which the float conversion
# turns into a missing value
recycling_text = total_data['recycling_pct'].replace('na', 'NaN')
total_data['recycling_pct'] = recycling_text.astype(float)
total_data.dtypes

In [None]:
from sklearn.model_selection import train_test_split

# target: the yearly average price; features: every other column
y = total_data['average_price']
X = total_data.drop(columns=['average_price'])

# hold out part of the data for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Split the feature columns by dtype. Columns that are neither object nor
# float64 fall into neither list and are therefore not passed to the model.
categorical_features = [col for col in X_train.columns if X_train[col].dtype == "object"]
numeric_features = [col for col in X_train.columns if X_train[col].dtype in ['float64']]

# numeric columns: fill missing values (SimpleImputer's default strategy)
numerical_preprocessor = Pipeline([('imputer', SimpleImputer())])
# categorical columns: dense one-hot encoding. handle_unknown='ignore' encodes a
# category that was absent from the training split as all zeros instead of
# raising at prediction time.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` - confirm against the installed version.
categorical_preprocessor = Pipeline([('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

# apply each sub-pipeline to its own group of columns
preprocessor = ColumnTransformer(
                    transformers=[
                        ('num', numerical_preprocessor, numeric_features),
                        ('cat', categorical_preprocessor, categorical_features)])

def train_model(model):
    """Fit ``model`` behind the shared preprocessor and score it on the validation split.

    Reads the notebook-level ``preprocessor``, ``X_train``, ``y_train``,
    ``X_val`` and ``y_val``.

    Parameters
    ----------
    model : estimator
        Regressor used as the final pipeline step.

    Returns
    -------
    float
        Mean absolute error between the validation targets and the
        pipeline's predictions for ``X_val``.
    """
    fitted = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', model)]).fit(X_train, y_train)
    # MAE is symmetric in its two arguments, so the order does not affect the score
    return mean_absolute_error(y_val, fitted.predict(X_val))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Seed both models so the reported scores are reproducible across runs
# (the train/validation split above is already seeded the same way).
xgb_model = XGBRegressor(random_state=0)
random_forest_model = RandomForestRegressor(random_state=0)

# validation MAE for each model (lower is better)
random_forest_score = train_model(random_forest_model)
xgb_score = train_model(xgb_model)
print(f"MAE for random forest: {random_forest_score}\nMAE for XGB Regressor: {xgb_score}")

# Explainability and Feature Importance

In [None]:
from xgboost import plot_importance as plot_xgb_importance

# xgboost's built-in importance plot is less versatile than what sklearn models
# offer, but it still gives a first look at which features matter
plot_xgb_importance(booster=xgb_model)

## Permutation Importance: Relative Feature Importances

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Fit the preprocessor on the *training* split so the validation features are
# imputed and encoded with the same statistics and categories as the data the
# models were trained on (fitting it on X_val would change both).
preprocessor.fit(X_train)

# Column names in the order the ColumnTransformer emits them: the numeric
# columns first, then one column per one-hot category. The category names are
# read from the fitted encoder, whose ordering need not match the order in
# which areas first appear in X_val.
encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
preprocessed_features = list(numeric_features) + [
    category for categories in encoder.categories_ for category in categories]

preprocessed = pd.DataFrame(preprocessor.transform(X_val), columns=preprocessed_features)
# The notebook never defines `model`; the fitted XGBoost regressor from the
# training cell is used here.
# NOTE(review): swap in random_forest_model if that was the intended model.
perm = PermutationImportance(xgb_model, random_state=1).fit(preprocessed, y_val)
eli5.show_weights(perm, feature_names=preprocessed.columns.tolist())

## Partial Dependence Plots: Effect of Individual Features

In [None]:
from pdpbox import pdp, get_dataset, info_plots

# The notebook never defines `model`; the fitted XGBoost regressor from the
# training cell is used here.
# NOTE(review): swap in random_forest_model if that was the intended model.
pdp_num_jobs = pdp.pdp_isolate(model=xgb_model, dataset=preprocessed, model_features=preprocessed_features,
                               feature='number_of_jobs')

pdp_recycling_pct = pdp.pdp_isolate(model=xgb_model, dataset=preprocessed, model_features=preprocessed_features,
                                    feature='recycling_pct')

# one partial dependence plot per feature
pdp.pdp_plot(pdp_num_jobs, 'Number of Jobs')
pdp.pdp_plot(pdp_recycling_pct, 'Recycling Percentage')
plt.show()

# SHAP Values: Feature Influence on Individual Predictions

In [None]:
import shap

# a single preprocessed validation row (tenth from the end) to explain
data_for_prediction = preprocessed.iloc[-10]

# The notebook never defines `model`; the fitted XGBoost regressor from the
# training cell is used here.
# NOTE(review): swap in random_forest_model if that was the intended model.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(data_for_prediction)

# initjs loads the JavaScript needed to render the interactive force plot
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, data_for_prediction)

# SHAP Summaries

In [None]:
# SHAP values for every preprocessed validation row (replaces the single-row
# values computed for the force plot), summarised across all features
shap_values = explainer.shap_values(preprocessed)
shap.summary_plot(shap_values, preprocessed)

It appears that a small number of houses has the effect of decreasing housing prices, while a small area size increases prices. Perhaps we should be thinking more about the interaction between these two variables, in other words: the house density?

# SHAP Dependence Plots
To investigate if there is such an interaction, let's try a dependence plot:

In [None]:
# SHAP values of 'no_of_houses' plotted with 'area_size' as the interaction feature
shap.dependence_plot('no_of_houses', shap_values, preprocessed, interaction_index='area_size')

It seems like for the smaller area sizes, a smaller number of houses makes a big difference to the price.