## London borough with the best House Price to Crime Rate ratio

### Data Sources

##### London Crime data by borough from Sept 2016 - Aug 2018
https://data.london.gov.uk/dataset/recorded_crime_summary


##### London House Prices data by borough from Jan 2016 - July 2018
https://data.london.gov.uk/dataset/uk-house-price-index


##### Population of London boroughs (Estimate Jun 2017)
https://www.citypopulation.de/php/uk-greaterlondon.php

In [19]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib notebook

In [20]:
# Load data
# Crime counts: downstream code uses the 'Borough' column and the monthly
# '201701'..'201712' columns (see format_crime_data).
crime_raw_df = pd.read_csv('./data/MPS Borough Level Crime.csv')
# House prices: the 'Average price' sheet of the house price index workbook.
# NOTE(review): format_house_price_data slices rows with date-string labels
# ('2017-01-01':'2017-12-01'), but no index_col is set here — confirm the
# frame is indexed by date before that slice runs.
house_price_raw_df = pd.read_excel('./data/UK House price index.xls', sheet_name='Average price')
# Population: header=1 takes the column names from the second row of the sheet.
pop_raw_df = pd.read_excel('./data/london_borough_population.xlsx', header=1)

### Format/Aggregate Datasets

In [21]:
def format_crime_data(df):
    """
    Crime Data: collapse the raw crime table into one 2017 total per borough.

    Keeps the 'Borough' column and the twelve monthly 2017 columns, removes
    the 'London Heathrow and London City Airports' rows, adds up the months
    of every row and then adds up all rows belonging to the same borough.

    Returns a series named 'Total Crime', indexed by borough.

    """
    months_2017 = [
        '201701', '201702', '201703', '201704', '201705', '201706',
        '201707', '201708', '201709', '201710', '201711', '201712',
    ]

    # Only the borough label and the 2017 counts are needed; borough becomes the index
    monthly_counts = df[['Borough'] + months_2017].set_index('Borough')

    # The airports entry is not one of the boroughs being compared
    monthly_counts = monthly_counts.drop('London Heathrow and London City Airports', axis=0)

    # One yearly figure per input row (a borough can own several rows)
    yearly_per_row = monthly_counts.sum(axis=1)

    # Fold the rows of each borough into a single total
    totals = yearly_per_row.groupby('Borough').sum()
    totals.name = 'Total Crime'

    return totals
    
# Per-borough 2017 crime totals (note: a Series, despite the _df suffix)
crime_df = format_crime_data(crime_raw_df)

In [22]:
def format_house_price_data(df):
    """
    House Price data: reduce the monthly price table to one 2017 figure per borough.

    Takes the rows labelled '2017-01-01' through '2017-12-01', keeps the 32
    borough columns listed below, replaces '&' with 'and' in their names,
    averages the selected rows per borough and casts the result to int
    (the fractional part is dropped).

    NOTE(review): the row slice uses date-string labels, so `df` presumably
    has to be indexed by date — confirm against the loading code.

    Returns a series named 'Average House Price', indexed by borough name.

    """
    boroughs = [
        'Barking & Dagenham', 'Barnet', 'Bexley', 'Brent',
        'Bromley', 'Camden', 'Croydon', 'Ealing', 'Enfield', 'Greenwich',
        'Hackney', 'Hammersmith & Fulham', 'Haringey', 'Harrow', 'Havering',
        'Hillingdon', 'Hounslow', 'Islington', 'Kensington & Chelsea',
        'Kingston upon Thames', 'Lambeth', 'Lewisham', 'Merton', 'Newham',
        'Redbridge', 'Richmond upon Thames', 'Southwark', 'Sutton',
        'Tower Hamlets', 'Waltham Forest', 'Wandsworth', 'Westminster'
    ]

    # 2017 rows only, restricted to the borough columns
    prices_2017 = df.loc['2017-01-01': '2017-12-01'][boroughs]

    # Spell '&' as 'and' in the borough names
    prices_2017.columns = [name.replace('&', 'and') for name in prices_2017.columns]

    # Boroughs become the index; average across the selected months
    yearly_mean = prices_2017.T.mean(axis=1)

    # Whole numbers only
    result = yearly_mean.astype('int')
    result.name = 'Average House Price'

    return result
    
# Per-borough 2017 average house price (note: a Series, despite the _df suffix)
house_price_df = format_house_price_data(house_price_raw_df)

### Format Population data

In [23]:
def format_population_data(df):
    """
    Population data: rename Westminster and index the table by borough.

    The input frame is left untouched; the rename is applied to a new frame
    so the raw data stays reusable when the cell is re-run.

    Returns a DataFrame indexed by 'Borough', with the remaining columns of
    `df` unchanged.

    """
    # 'City of Westminster' is called 'Westminster' in other tables
    boroughs = df['Borough'].replace('City of Westminster', 'Westminster')

    # assign() builds a new frame instead of overwriting the caller's column
    return df.assign(Borough=boroughs).set_index('Borough')

# Population table indexed by borough name
pop_df = format_population_data(pop_raw_df)

### Merge Crime, House Price and Population data into one table

In [24]:
# Join the three per-borough tables side by side (axis=1), aligning rows on the
# borough-name index; sort=True sorts that index when the inputs are not aligned.
# NOTE(review): a borough spelled differently in any of the sources would show
# up as a row containing NaNs — worth checking there are none.
final_df = pd.concat([house_price_df, crime_df, pop_df], axis=1, sort=True)
final_df.head()

Unnamed: 0,Average House Price,Total Crime,Population
Barking and Dagenham,287708,18671,210711
Barnet,538128,26989,387803
Bexley,335742,14815,246124
Brent,487991,29741,329102
Bromley,441077,22774,329391


### Get Crime Percentage

In [40]:
# NOTE(review): leftover inspection cell (run as In [40], out of order). Its saved
# output shows float64 values, whereas format_house_price_data casts to int, so
# the output looks stale — re-run the notebook top to bottom to confirm.
house_price_df.head()

Barking and Dagenham    287708.844250
Barnet                  538128.896375
Bexley                  335742.353075
Brent                   487991.766475
Bromley                 441077.412875
Name: Average House Price, dtype: float64

ValueError: can not merge DataFrame with instance of type <class 'pandas.core.series.Series'>

In [25]:
# Crimes per 100 residents for each borough, rounded to 1 d.p.
final_df['Crime Percent'] = (final_df['Total Crime'] / final_df['Population'] * 100).round(1)
final_df.head()

Unnamed: 0,Average House Price,Total Crime,Population,Crime Percent
Barking and Dagenham,287708,18671,210711,8.9
Barnet,538128,26989,387803,7.0
Bexley,335742,14815,246124,6.0
Brent,487991,29741,329102,9.0
Bromley,441077,22774,329391,6.9


### Get the arrays for plotting

In [26]:
def avg(series):
    """Return an array with one entry per row of `series`, every entry being its mean."""
    mean_value = series.mean()
    return np.full(len(series), mean_value)

# Constant arrays (one entry per row of final_df) used later to draw the
# mean lines on the scatter chart.
# Mean House Price array
house_price_avg = avg(final_df['Average House Price'])

# Mean Crime Percent array
crime_percent_avg = avg(final_df['Crime Percent'])

### Add colour column

In [33]:
def add_colour(df):
    """
    Add a 'Colours' column classifying each row against the column means.

    - green:  house price at or below the mean AND crime percent at or below the mean
    - red:    house price above the mean AND crime percent above the mean
    - orange: exactly one of the two is above its mean

    Values exactly equal to a mean count as "at or below" in every branch,
    so each row with both values present gets exactly one colour. Rows with
    a missing price or crime value match no condition and get no colour.

    Modifies `df` in place and returns it.
    """
    colours = [
        '#5DFF29', # Green
        '#FF9B51', # Orange
        '#FF3F3F', # Red
    ]
    h_avg = df['Average House Price'].mean()
    c_avg = df['Crime Percent'].mean()

    # Each pair of masks is complementary for non-missing values, so the
    # three assignments below leave no gap at the mean itself.
    low_price = df['Average House Price'] <= h_avg
    high_price = df['Average House Price'] > h_avg
    low_crime = df['Crime Percent'] <= c_avg
    high_crime = df['Crime Percent'] > c_avg

    # Fill Colours column with correct colours
    df.loc[low_price & low_crime, 'Colours'] = colours[0]
    df.loc[(low_price & high_crime) | (high_price & low_crime), 'Colours'] = colours[1]
    df.loc[high_price & high_crime, 'Colours'] = colours[2]

    return df

# Tag every borough with its colour, then preview the result
final_df = add_colour(final_df)
final_df.head()

Unnamed: 0,Average House Price,Total Crime,Population,Crime Percent,Colours
Barking and Dagenham,287708,18671,210711,8.9,#5DFF29
Barnet,538128,26989,387803,7.0,#FF9B51
Bexley,335742,14815,246124,6.0,#5DFF29
Brent,487991,29741,329102,9.0,#5DFF29
Bromley,441077,22774,329391,6.9,#5DFF29


In [28]:
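# NOTE: disabled experiment, kept for reference. If re-enabled, the last line
# reassigns final_df to its first five sorted rows, so later cells would only see those.
# # Add a column which is (Average house price * Crime Percent)
# # Idea is that the smaller this number, the better
# final_df['House Price x Crime Rate'] = final_df['Average House Price'] * final_df['Crime Percent']
# final_df = final_df.sort_values('House Price x Crime Rate').head()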

In [32]:
plt.figure()

# One point per borough, coloured by add_colour()'s classification
plt.scatter(final_df['Average House Price'], final_df['Crime Percent'], s=12, color=final_df['Colours'])

# Draw mean lines for house price and crime percent
plt.plot(final_df['Average House Price'], crime_percent_avg, color='grey')
plt.plot(house_price_avg, final_df['Crime Percent'], color='grey')

# Label the chart so it can be read without the surrounding cells
plt.xlabel('Average House Price (2017)')
plt.ylabel('Crime Percent (crimes per 100 residents, 2017)')
plt.title('London boroughs: house price vs crime')

# remove the top and right lines of frame of the chart
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.show()

<IPython.core.display.Javascript object>