# Crime data - feature engineering

In [2]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


import datetime

## LSOA level data

In [3]:
# Load the London crime table; later cells read its lsoa_code, borough,
# minor_category, value, year and month columns.
crime = pd.read_csv('data/london_crime_by_lsoa.csv')

In [4]:
# Text label "<year>-<month>"; the month is not zero-padded (e.g. "2008-1").
crime['date'] = crime['year'].astype(str) + "-" + crime['month'].astype(str)

# Target variable 'is_commited': 0 when the recorded count for this
# LSOA / crime type / month is zero, 1 for any other value.
crime['is_commited'] = (crime['value'] != 0).astype('int64')

# Stamp every row with day 1 so a proper timestamp can be assembled from
# the year / month / day columns.
crime['day'] = 1
crime['datetime'] = pd.to_datetime(crime[['year', 'month', 'day']])

# Chronological order inside each (LSOA, crime type) series.
# reset_index() without drop=True keeps the pre-sort row labels as an
# 'index' column in the frame.
crime = (
    crime
    .sort_values(by=['lsoa_code', 'minor_category', 'datetime'])
    .reset_index()
)

In [5]:
# Preview the first rows after sorting; the 'index' column holds the pre-sort row labels.
crime.head()

Unnamed: 0,index,lsoa_code,borough,major_category,minor_category,value,year,month,date,is_commited,day,datetime
0,3150451,E01000001,City of London,Burglary,Burglary in Other Buildings,0,2008,1,2008-1,0,1,2008-01-01
1,4046291,E01000001,City of London,Burglary,Burglary in Other Buildings,0,2008,2,2008-2,0,1,2008-02-01
2,7496239,E01000001,City of London,Burglary,Burglary in Other Buildings,0,2008,3,2008-3,0,1,2008-03-01
3,12990257,E01000001,City of London,Burglary,Burglary in Other Buildings,0,2008,4,2008-4,0,1,2008-04-01
4,5751091,E01000001,City of London,Burglary,Burglary in Other Buildings,0,2008,5,2008-5,0,1,2008-05-01


In [6]:
# Distinct years covered by the data, in ascending order.
np.sort(crime['year'].unique())

array([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016])

In [7]:
# Looking for null categories: total recorded count per minor category
# over the whole data set.
crime['value'].groupby(crime['minor_category']).sum()

minor_category
Assault with Injury                       451001
Burglary in Other Buildings               263011
Burglary in a Dwelling                    491282
Business Property                          21295
Common Assault                            413690
Counted per Victim                          3840
Criminal Damage To Dwelling               154116
Criminal Damage To Motor Vehicle          265463
Criminal Damage To Other Building          66003
Drug Trafficking                           35819
Going Equipped                              5530
Handling Stolen Goods                      16100
Harassment                                458124
Motor Vehicle Interference & Tampering     56224
Murder                                       949
Offensive Weapon                           37983
Other Criminal Damage                     145356
Other Drugs                                 2998
Other Fraud & Forgery                       1485
Other Notifiable                          100819
Other

In [12]:
# From 2012 there are several null categories due to reclassification.
# Also, census data is from 2011.
crime.loc[crime['year'].gt(2011)].groupby('minor_category')['value'].sum()

minor_category
Assault with Injury                       234177
Burglary in Other Buildings               140561
Burglary in a Dwelling                    254880
Business Property                           9490
Common Assault                            254570
Counted per Victim                             0
Criminal Damage To Dwelling                69832
Criminal Damage To Motor Vehicle          124748
Criminal Damage To Other Building          33237
Drug Trafficking                           18732
Going Equipped                              2906
Handling Stolen Goods                       7909
Harassment                                290840
Motor Vehicle Interference & Tampering     39259
Murder                                       482
Offensive Weapon                           19282
Other Criminal Damage                      76873
Other Drugs                                 1601
Other Fraud & Forgery                          0
Other Notifiable                           61655
Other

#### Getting the previous month's crime count and target value for each LSOA & crime type

In [13]:
# Previous-row (i.e. previous month, given the chronological sort) values of
# `value` and `is_commited` within each (LSOA, crime type) series.
prev = (crime[['lsoa_code','minor_category','datetime','value','is_commited']]
        .sort_values(by=['lsoa_code','minor_category','datetime']))

# Select the columns with a *list*: subsetting a groupby with a bare tuple of
# names was deprecated and is rejected by pandas 2.x. The built-in grouped
# shift gives the same result as transform(lambda grp: grp.shift(1)) without
# calling a Python function once per group.
# The first row of every series has no predecessor and gets NaN / NaT.
prev[['prev_date', 'prev_value', 'prev_is_commited']] = (
    prev
    .groupby(['lsoa_code', 'minor_category'])[['datetime', 'value', 'is_commited']]
    .shift(1))

# Attach the lagged columns to the main frame (prev_date stays on `prev` only).
crime = crime.merge(
    prev[['lsoa_code','minor_category','datetime','prev_value','prev_is_commited']],
    on=['lsoa_code','minor_category','datetime'])

#### Getting previous year crime and target value for LSOA & Crime type

In [14]:
# Rolling 12-row sum of `value` inside each (LSOA, crime type) group.
# min_periods=12 leaves the first 11 rows of every group as NaN.
# reset_index() turns the group keys (and the original row label) into columns.
cumsum = (
    crime
    .groupby(['lsoa_code', 'minor_category'])['value']
    .rolling(window=12, min_periods=12)
    .sum()
    .reset_index()
)


In [15]:
# Rolling total with the current row's own count removed, matched to `crime`
# by row label.
# NOTE(review): the rolling window is 12 rows including the current one, so
# this is the total of the 11 preceding months - confirm that is intended
# for a "previous year" feature.
crime['prev_year_crime_sum'] = cumsum['value'] - crime['value']

#### Getting previous year crime and target value for LSOA

In [16]:
# Monthly total per LSOA, summed over every crime type.
lsoa = crime.groupby(['lsoa_code', 'datetime'])['value'].sum().reset_index()
lsoa = lsoa.sort_values(by=['lsoa_code', 'datetime'])

# Rolling 12-row sum of those monthly totals within each LSOA
# (NaN until 12 rows are available).
cumsum_lsoa = (
    lsoa
    .groupby(['lsoa_code'])['value']
    .rolling(window=12, min_periods=12)
    .sum()
    .reset_index()
)

# Drop the current month from the window: what remains is the total of the
# 11 preceding rows for that LSOA.
lsoa['prev_year_lsoa_sum'] = cumsum_lsoa['value'] - lsoa['value']

# Broadcast the LSOA-level feature onto every (LSOA, crime type, month) row.
crime = pd.merge(
    crime,
    lsoa[['lsoa_code', 'datetime', 'prev_year_lsoa_sum']],
    on=['lsoa_code', 'datetime'],
)

#### Getting previous year crime and target value for crime type

In [17]:
# Monthly total per crime type, summed over every LSOA, in chronological
# order within each crime type.
minor_category = crime.groupby(['minor_category', 'datetime'])['value'].sum().reset_index()
minor_category = minor_category.sort_values(by=['minor_category', 'datetime'])

In [18]:
# Rolling 12-row sum of the monthly totals within each crime type
# (NaN for the first 11 rows of every crime type).
cumsum_minorc = (
    minor_category
    .groupby(['minor_category'])['value']
    .rolling(window=12, min_periods=12)
    .sum()
    .reset_index()
)

In [19]:
# Rolling total for the crime type with the current month's own total removed.
# The value subtracted must come from `minor_category` itself: subtracting
# `lsoa['value']` (as this cell previously did) pairs each crime-type row with
# an unrelated per-LSOA total that merely shares the same row label.
# The window is 12 rows including the current one, so the result is the total
# of the 11 preceding months.
minor_category['prev_year_category_sum'] = cumsum_minorc['value'] - minor_category['value']

In [20]:
# Broadcast the crime-type-level feature onto every row of the main frame.
category_feature = minor_category[['minor_category', 'datetime', 'prev_year_category_sum']]
crime = pd.merge(crime, category_feature, on=['minor_category', 'datetime'])

In [21]:
# Optional checkpoint of the LSOA-level feature frame (disabled).
# NOTE(review): the next section reassigns `crime` from the raw CSV, so the
# LSOA-level features are lost unless the dump below is enabled first.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
# with open('crime_lsoa_month_prev_year_values.pkl', 'wb') as picklefile:
#     pickle.dump(crime, picklefile)

# with open('crime_lsoa_month_prev_year_values.pkl', 'rb') as picklefile:
#     crime = pickle.load(picklefile)

## Borough level data

In [23]:
# Start the borough section from the raw file again. This rebinds `crime`,
# discarding the LSOA-level feature columns built above.
crime = pd.read_csv('data/london_crime_by_lsoa.csv')

# "<year>-<month>" label plus a constant day-of-month of 1.
crime = crime.assign(
    date=crime['year'].astype(str) + "-" + crime['month'].astype(str),
    day=1,
)

# Month-start timestamp assembled from the year / month / day columns.
crime['datetime'] = pd.to_datetime(crime[['year', 'month', 'day']])



In [24]:
# Collapse the LSOA rows into one row per borough, crime type and month.
crime_borough = crime.groupby(
    ['borough', 'minor_category', 'year', 'month', 'datetime']
)['value'].sum().reset_index()

# Target: 0 when the borough recorded no crime of that type in the month,
# 1 for any other total.
crime_borough['is_commited'] = (crime_borough['value'] != 0).astype('int64')

#### Getting previous year crime total for borough & crime type

In [25]:
# Chronological order within each (borough, crime type) series. Rows are
# relabelled 0..n-1 so they line up one-to-one with the rows of the grouped
# rolling result, which is emitted in the same group / time order.
crime_borough = (crime_borough
                 .sort_values(by=['borough','minor_category','datetime'])
                 .reset_index(drop=True))

# Rolling 12-row sum of `value` per series (NaN for the first 11 rows of each).
cumsum = (crime_borough[['borough','minor_category','value']]
          .groupby(['borough','minor_category'])
          .rolling(min_periods=12, window=12)
          .sum()
          .reset_index())

# Remove the current month from the window. The value subtracted must be the
# borough-level total in `crime_borough`: the raw `crime` frame is LSOA-level,
# so subtracting `crime['value']` (as this cell previously did) pairs each
# borough row with an unrelated LSOA row that merely shares the same row label.
# The window is 12 rows including the current one, so the result is the total
# of the 11 preceding months.
crime_borough['prev_year_crime_sum'] = cumsum['value'] - crime_borough['value']

#### Getting previous year crime and target value for borough

In [27]:
# Monthly total per borough, summed over every crime type.
borough = crime_borough.groupby(['borough', 'datetime'])['value'].sum().reset_index()
borough = borough.sort_values(by=['borough', 'datetime'])

# Rolling 12-row sum of those totals within each borough
# (NaN until 12 rows are available).
cumsum_borough = (
    borough
    .groupby(['borough'])['value']
    .rolling(window=12, min_periods=12)
    .sum()
    .reset_index()
)

# Drop the current month from the window: what remains is the total of the
# 11 preceding rows for that borough.
borough['prev_year_borough_sum'] = cumsum_borough['value'] - borough['value']

# Broadcast the borough-level feature onto every (borough, crime type, month) row.
crime_borough = pd.merge(
    crime_borough,
    borough[['borough', 'datetime', 'prev_year_borough_sum']],
    on=['borough', 'datetime'],
)


#### Getting previous value for crime type for last 12 months

In [29]:
# Monthly total per crime type, summed over every borough.
minor_category = crime_borough.groupby(['minor_category', 'datetime'])['value'].sum().reset_index()
minor_category = minor_category.sort_values(by=['minor_category', 'datetime'])

# Rolling 12-row sum of those totals within each crime type
# (NaN until 12 rows are available).
cumsum_minorc = (
    minor_category
    .groupby(['minor_category'])['value']
    .rolling(window=12, min_periods=12)
    .sum()
    .reset_index()
)

# Drop the current month from the window: what remains is the total of the
# 11 preceding rows for that crime type.
minor_category['prev_year_category_sum'] = cumsum_minorc['value'] - minor_category['value']

# Broadcast the crime-type-level feature onto every row of the borough frame.
crime_borough = pd.merge(
    crime_borough,
    minor_category[['minor_category', 'datetime', 'prev_year_category_sum']],
    on=['minor_category', 'datetime'],
)

#### Getting the previous month's crime count and target value for each borough & crime type

In [30]:
# Previous-row (i.e. previous month, given the chronological sort) values of
# `value` and `is_commited` within each (borough, crime type) series.
month = (crime_borough[['borough','minor_category','datetime','value','is_commited']]
         .sort_values(by=['borough','minor_category','datetime']))

# Select the columns with a *list*: subsetting a groupby with a bare tuple of
# names was deprecated and is rejected by pandas 2.x. The built-in grouped
# shift gives the same result as transform(lambda grp: grp.shift(1)).
# The first row of every series has no predecessor and gets NaN / NaT.
month[['prev_date', 'prev_value', 'prev_is_commited']] = (
    month
    .groupby(['borough', 'minor_category'])[['datetime', 'value', 'is_commited']]
    .shift(1))

# Attach the lagged columns to the borough frame (prev_date stays on `month` only).
crime_borough = crime_borough.merge(
    month[['borough','minor_category','datetime','prev_value','prev_is_commited']],
    on=['borough','minor_category','datetime'])

In [32]:
# Optional checkpoint of the borough-level feature frame (disabled).
# NOTE(review): the frame built in this section is `crime_borough`; `crime` only
# holds the reloaded CSV with its date columns, so the dump targets `crime_borough`.
# with open('pickle/crime_borough_month_prev_year_values.pkl', 'wb') as picklefile:
#     pickle.dump(crime_borough, picklefile)

