In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime

In [2]:
crime_data_path = 'data/crime'

crime_data_paths = list(Path(crime_data_path).glob('*'))

In [3]:
def get_street_data(year, month):
    """Gets all street data in a given year and month. It concatenates the data for all forces."""
    
    ym_data_path = Path(crime_data_path, '%s-%02d' % (year, month))
    
    data_frames = []
    
    for file_path in ym_data_path.glob('*'):
        if file_path.stem.endswith('street'):
            df = pd.read_csv(file_path)
            data_frames.append(df)
    
    df_total = pd.concat(data_frames).reset_index(drop=True)
            
    return df_total

def get_street_data_from_to(from_y, from_m, to_y, to_m):
    """Gets all street data from a given year and month to a given year and month."""
    
    from_date = datetime.strptime('%s-%02d' % (from_y, from_m), '%Y-%m')
    to_date = datetime.strptime('%s-%02d' % (to_y, to_m), '%Y-%m')
    
    data_frames = []
    
    for ym_data_path in crime_data_paths:
        crime_data_date = datetime.strptime(ym_data_path.stem, '%Y-%m')
        
        if crime_data_date >= from_date and crime_data_date <= to_date:
            df = get_street_data(crime_data_date.year, crime_data_date.month)
            data_frames.append(df)
            
    df_total = pd.concat(data_frames).reset_index(drop=True)
    return df_total

In [4]:
street_data_2015 = get_street_data_from_to(2013, 1, 2014, 12)
street_data_2016_18 = get_street_data_from_to(2016, 6, 2018, 16)
IMD15 = pd.read_csv('data/IMD_LSOA_2015.csv')
IMD15 = pd.read_csv('data/IMD_LSOA_2019.csv')
LSOA_pops = pd.read_excel('data/LSOA_populations_2012.xlsx', sheet_name='Population Denominators')
LSOA_workplace_pops = pd.read_csv('data/LSOA_workplace_population_2011.csv')

ValueError: unconverted data remains: 6

In [None]:
LSOA_workplace_pops

In [None]:
IMD15.columns

In [None]:
IMD15 = IMD15.sort_values(by='LSOA code (2011)').reset_index(drop=True)
LSOA_pops = LSOA_pops.sort_values(by='LSOA code (2011)').reset_index(drop=True)
LSOA_workplace_pops = LSOA_workplace_pops.sort_values(by='geography code').reset_index(drop=True)

In [None]:
LSOA_workplace_pops = LSOA_workplace_pops[LSOA_workplace_pops['geography code'].str.startswith('E')]
LSOA_workplace_pops = LSOA_workplace_pops.reset_index(drop=True)

In [None]:
# Clean crime data

# Remove NaN LSOA entries
na_mask = street_data_2015['LSOA code'].isna()
street_data_2015 = street_data_2015[~na_mask]

# Remove non-English crimes
street_data_2015 = street_data_2015[street_data_2015['LSOA code'].str.startswith('E')]

street_data_2015 = street_data_2015.reset_index(drop=True)

In [None]:
crime_types = list(set(street_data_2015['Crime type']))
LSOAs = IMD15['LSOA code (2011)']

In [None]:
crimes_per_LSOA_dict = {}

for LSOA in LSOAs:
    crimes_per_LSOA_dict[LSOA] = np.zeros(len(crime_types), dtype=int)

current_LSOA = street_data_2015.iloc[0,7]
current_LSOA_data = crimes_per_LSOA_dict[current_LSOA]

na_mask = street_data_2015['LSOA code'].isna()

for row in street_data_2015[~na_mask].itertuples():
    LSOA = row[8]
    crime = row[10]
    crime_i = crime_types.index(crime)
    
    if LSOA != current_LSOA:
        current_LSOA = LSOA
        current_LSOA_data = crimes_per_LSOA_dict[current_LSOA]
    
    current_LSOA_data[crime_i] += 1
    
rows = []

for k, v in crimes_per_LSOA_dict.items():
    rows.append([k] + list(v))

crimes_per_LSOA = pd.DataFrame(rows)
crimes_per_LSOA.columns = ['LSOA'] + crime_types
crimes_per_LSOA = crimes_per_LSOA.sort_values(by='LSOA').reset_index(drop=True)

crimes_per_LSOA.head()

In [None]:
crimes_rates_per_LSOA = crimes_per_LSOA.copy()
crimes_rates_per_LSOA.loc[:, 'Violence and sexual offences':] = crimes_rates_per_LSOA.iloc[:,1:].divide(LSOA_pops.iloc[:,-1], axis=0)

In [None]:
crimes_rates_per_LSOA = crimes_per_LSOA.copy()
crimes_rates_per_LSOA.loc[:, 'Violence and sexual offences':] = crimes_rates_per_LSOA.iloc[:,1:].divide(LSOA_pops['Total population: mid 2012 (excluding prisoners)'], axis=0)

In [None]:
pops = LSOA_pops['Total population: mid 2012 (excluding prisoners)']
pops += LSOA_workplace_pops['Population: All usual residents aged 16 to 74; measures: Value'] * (63.91 / 63.02)

crimes_rates_per_LSOA = crimes_per_LSOA.copy()
crimes_rates_per_LSOA.loc[:, 'Violence and sexual offences':] = crimes_rates_per_LSOA.iloc[:,1:].divide(pops, axis=0)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

y = IMD15['Crime Score']
y = y
X = np.array(crimes_rates_per_LSOA.iloc[:, 1:])
#X = np.random.random(X.shape)

X_train = X
X_test = X
y_train = y
y_test = y

degree=1

polyreg=make_pipeline(PolynomialFeatures(degree),LinearRegression())

polyreg.fit(X_train,y_train)

y_pred = polyreg.predict(X_test)
fig, ax = plt.subplots(figsize=(8,8))
ax.plot(y_test, y_test, 'red')
ax.scatter(y_test, y_pred, s=0.2)
ax.set_xlabel('Data')
ax.set_ylabel('Prediction')
ax.set_title('2015 IMD Crime scores')

ax.set_xlim(np.min(y_test), np.max(y_test))
ax.set_ylim(np.min(y_test), np.max(y_test))

plt.show()

In [None]:
d= np.array(y_pred_ranking) - np.array(y_test_ranking)

np.mean(np.abs(d)), np.sqrt(np.mean(d**2))

In [None]:
sorted_y_pred = np.sort(y_pred)

y_pred_ranking = []

for y in y_pred:
    y_pred_ranking.append(np.where(sorted_y_pred == y)[0][0])
    
    
sorted_y_test= np.sort(y_test)

y_test_ranking = []

for y in y_test:
    y_test_ranking.append(np.where(sorted_y_test == y)[0][0])
    
fig, ax = plt.subplots(figsize=(8,8))
#ax.plot(sorted(y_test), sorted(y_test), 'red')
ax.scatter(y_test_ranking, y_pred_ranking, s=1)
ax.set_xlabel('Data')
ax.set_ylabel('Prediction')
ax.set_title('2015 IMD Crime ranking')

plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

y = IMD15['Crime Score']
y = y
X = np.array(crimes_rates_per_LSOA.iloc[:, 1:])
#X = np.random.random(X.shape)

X_train = X
X_test = X
y_train = y
y_test = y

reg = Ridge(alpha=.5)
reg.fit(X_train,y_train)

y_pred = reg.predict(X_test)
fig, ax = plt.subplots(figsize=(8,8))
ax.plot(y_test, y_test, 'red')
ax.scatter(y_test, y_pred)
ax.set_xlabel('Data')
ax.set_ylabel('Prediction')
ax.set_title('2015 IMD Crime scores')

ax.set_xlim(np.min(y_test), np.max(y_test))
ax.set_ylim(np.min(y_test), np.max(y_test))

plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

y = IMD15['Crime Score']
y = np.exp(0.6*y)
X = np.array(crimes_rates_per_LSOA.iloc[:, 1:])
#X = np.random.random(X.shape)

X_train = X
X_test = X
y_train = y
y_test = y

degree=1

polyreg=make_pipeline(PolynomialFeatures(degree),LinearRegression())

polyreg.fit(X_train,y_train)

y_pred = polyreg.predict(X_test)
fig, ax = plt.subplots(figsize=(8,8))
ax.plot(sorted(y_test), sorted(y_test), 'red')
ax.scatter(y_test, y_pred)
ax.set_xlabel('Data')
ax.set_ylabel('Prediction')
ax.set_title('2015 IMD Crime scores')

ax.set_xscale('log')
ax.set_yscale('log')

plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

y = IMD15['Crime Score']
X = np.array(crimes_rates_per_LSOA.iloc[:, 1:])
#X = np.random.random(X.shape)

p_train = 0.3
train_mask = np.random.choice([True, False], crimes_rates_per_LSOA.shape[0], replace=True, p=[p_train, 1-p_train])
X_train = X[train_mask,:]
X_test = X[~train_mask,:]
y_train = y[train_mask]
y_test = y[~train_mask]

degree=3

polyreg=make_pipeline(PolynomialFeatures(degree),LinearRegression())

polyreg.fit(X_train,y_train)

y_pred = polyreg.predict(X_test)
fig, ax = plt.subplots(figsize=(8,8))
ax.plot(y_test, y_test, 'red')
ax.scatter(y_test, y_pred)
ax.set_xlabel('Data')
ax.set_ylabel('Prediction')
ax.set_title('2015 IMD Crime scores')

ax.set_xlim(np.min(y_test), np.max(y_test))
ax.set_ylim(np.min(y_test), np.max(y_test))

plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

y = IMD15['Crime Score']
y = np.exp(1.3*y)
X = np.array(crimes_rates_per_LSOA.iloc[:, 1:])
#X = np.random.random(X.shape)

p_train = 0.3
train_mask = np.random.choice([True, False], crimes_rates_per_LSOA.shape[0], replace=True, p=[p_train, 1-p_train])
X_train = X[train_mask,:]
X_test = X[~train_mask,:]
y_train = y[train_mask]
y_test = y[~train_mask]

degree=3

polyreg=make_pipeline(PolynomialFeatures(degree),LinearRegression())

polyreg.fit(X_train,y_train)

y_pred = polyreg.predict(X_test)
fig, ax = plt.subplots(figsize=(8,8))
ax.plot(sorted(y_test), sorted(y_test), 'red')
ax.scatter(y_test, y_pred, s=0.3)
ax.set_xlabel('Data')
ax.set_ylabel('Prediction')
ax.set_title('2015 IMD Crime scores')

ax.set_xscale('log')
ax.set_yscale('log')

ax.set_xlim(np.min(y_test), np.max(y_test))
ax.set_ylim(np.min(y_test), np.max(y_test))

plt.show()

In [None]:
kendle correlation

In [None]:
sorted_y_pred = np.sort(y_pred)

y_pred_ranking = []

for y in y_pred:
    y_pred_ranking.append(np.where(sorted_y_pred == y)[0][0])
    
    
sorted_y_test= np.sort(y_test)

y_test_ranking = []

for y in y_test:
    y_test_ranking.append(np.where(sorted_y_test == y)[0][0])
    
fig, ax = plt.subplots(figsize=(8,8))
#ax.plot(sorted(y_test), sorted(y_test), 'red')
ax.scatter(y_test_ranking, y_pred_ranking, s=1)
ax.set_xlabel('Data')
ax.set_ylabel('Prediction')
ax.set_title('2015 IMD Crime ranking')

plt.show()

In [None]:
d= np.array(y_pred_ranking) - np.array(y_test_ranking)

np.mean(np.abs(d)), np.sqrt(np.mean(d**2))

Point 4.7.7 of [the IMD technical report](https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/464485/English_Indices_of_Deprivation_2015_-_Technical-Report.pdf) says that shrinkage was applied to the scores. Could this be the issue?