# Notebook 022: Build Model Features

This notebook combines the cleansed crime incidents dataset with a number of other datasets, appending to each crime incident record the additional engineered features used in our analysis.

In [1]:
import urllib
import os
import operator
import pathlib
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Root directories for project data and for generated figures
DATA_ROOT = '../data/'
FIGURES_ROOT = '../figures/crime'

# Sub-directories of DATA_ROOT: raw inputs, interim outputs, processed feature files
READDIR_ROOT = os.path.join(DATA_ROOT, 'raw')
WRITEDIR_ROOT = os.path.join(DATA_ROOT, 'interim/crime')
FEATURES_ROOT = os.path.join(DATA_ROOT, 'processed')

# Processed tabular datasets
readfile_crime = os.path.join(FEATURES_ROOT, 'crime-records-for-model-no-features-added.csv')
readfile_property = os.path.join(FEATURES_ROOT, 'property-assessment-features-2013-2019.csv')
readfile_weather = os.path.join(FEATURES_ROOT, 'boston-daily-weather-20140101-20190831.csv')
# TODO: paths for the datasets below have not been defined yet
# readfile_streetlights = 
# readfile_demographics = 
# readfile_colleges = 
# readfile_schools = 
# readfile_violations = 

# Raw shapefiles
readfile_zipshapes = os.path.join(READDIR_ROOT, 'shapefile/zipcodes/ZIP_Codes.shp')
readfile_cityshape = os.path.join(READDIR_ROOT, 'shapefile/city-boundary/City_of_Boston_Boundary.shp')
readfile_streetshapes = os.path.join(READDIR_ROOT, 'shapefile/street-segments/Boston_Street_Segments.shp')
readfile_tractshapes = os.path.join(READDIR_ROOT, 'shapefile/census-tracts/Census_2010_Tracts.shp')
readfile_hoodshapes = os.path.join(READDIR_ROOT, 'shapefile/boston-neighborhoods/Boston_Neighborhoods.shp')
readfile_zonesubshapes = os.path.join(READDIR_ROOT, 'shapefile/zoning-subdistricts/Zoning_Subdistricts.shp')
readfile_openshapes = os.path.join(READDIR_ROOT, 'shapefile/open-spaces/Open_Space.shp')

# Keep exactly one placeholder per argument: str.format silently ignores surplus
# positional arguments, so a mismatch hides paths from this report.
print(
    'readfile paths for datasets used in this notebook are:\n\t{}\n\t{}\n\t{}\n\t{}'.format(
        readfile_crime, readfile_property, readfile_weather, readfile_zipshapes
    )
)
print('\ninterim write directory is:\n\t{}'.format(WRITEDIR_ROOT))

readfile paths for datasets used in this notebook are:
	../data/processed/crime-records-for-model-no-features-added.csv
	../data/processed/property-assessment-features-2013-2019.csv
	../data/processed/boston-daily-weather-20140101-20190831.csv


## Read in data sources for merging

In [38]:
# Load the three processed source tables that are merged further down.
# Crime records are read entirely as strings; the property table only pins the
# dtypes of its id and fiscal-year columns; the weather table uses pandas' inference.
property_dtypes = {'shape-id': str, 'fiscal-year': int}

df_crime = pd.read_csv(readfile_crime, dtype=str)
df_property = pd.read_csv(readfile_property, dtype=property_dtypes)
df_weather = pd.read_csv(readfile_weather)

In [39]:
# summarize each source dataframe in turn: banner, column info, then the first rows
summaries = [
    ('CRIME DATAFRAME\n', df_crime),
    ('\n\nPROPERTY ASSESSMENT FEATURES DATAFRAME\n', df_property),
    ('\n\nWEATHER FEATURES DATAFRAME\n', df_weather),
]

for banner, frame in summaries:
    print(banner)
    frame.info()
    display(frame.head())

CRIME DATAFRAME

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151072 entries, 0 to 151071
Data columns (total 18 columns):
crime-type             151072 non-null object
INCIDENT_NUMBER        151072 non-null object
OFFENSE_DESCRIPTION    151072 non-null object
timestamp              151072 non-null object
lat                    151072 non-null object
lon                    151072 non-null object
year                   151072 non-null object
month                  151072 non-null object
day-of-week            151072 non-null object
hour                   151072 non-null object
ZIP5                   151044 non-null object
ZIP5_area              151044 non-null object
Name                   151002 non-null object
Neighborhood_area      151002 non-null object
Neighborhood_area_2    151002 non-null object
TRACTCE10              151072 non-null object
TRACTCE10_area         151072 non-null object
TRACTCE10_area_2       151072 non-null object
dtypes: object(18)
memory usage: 20.7+ MB


Unnamed: 0,crime-type,INCIDENT_NUMBER,OFFENSE_DESCRIPTION,timestamp,lat,lon,year,month,day-of-week,hour,ZIP5,ZIP5_area,Name,Neighborhood_area,Neighborhood_area_2,TRACTCE10,TRACTCE10_area,TRACTCE10_area_2
0,fraud,I192078177,forgery / counterfeiting,2019-08-01 17:46:00,42.30492239,-71.10298073,2019,8,Thursday,17,2130,105221153.02392578,Roxbury,91844545.97481573,3.29,980300,22640579.7627,3847996.3515625
1,harassment-disturbance,I192078061,harassment,2019-06-12 21:00:00,42.35555336,-71.15274721,2019,6,Wednesday,21,2135,78620825.92480469,Brighton,80167877.560494,2.88,202,6475526.5708,1102312.71484375
2,theft,I192078038,larceny theft of mv parts & accessories,2019-03-10 08:00:00,42.34562521,-71.04129066,2019,3,Sunday,8,2210,32373210.07080078,South Boston Waterfront,27087395.391558822,0.97,60600,20379799.0356,3468813.69921875
3,theft,I192078015,larceny all others,2019-07-08 10:29:00,42.33930416,-71.05160423,2019,7,Monday,10,2127,67536142.7421875,South Boston,62721306.14391709,2.25,60700,1380119.5437,234830.15234375
4,theft,I192077997,auto theft - leased/rented vehicle,2019-04-13 08:00:00,42.32856401,-71.06835343,2019,4,Saturday,8,2119,44511498.52441406,Roxbury,91844545.97481573,3.29,80100,11047170.5891,1879154.8203125




PROPERTY ASSESSMENT FEATURES DATAFRAME

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13002 entries, 0 to 13001
Data columns (total 5 columns):
shape-id       13002 non-null object
fiscal-year    13002 non-null int64
value          12927 non-null float64
shape-type     13002 non-null object
metric         13002 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 508.0+ KB


Unnamed: 0,shape-id,fiscal-year,value,shape-type,metric
0,0,2013,37000.0,zipcode,residential-median-value
1,2026,2013,,zipcode,residential-median-value
2,2108,2013,691300.0,zipcode,residential-median-value
3,2109,2013,456950.0,zipcode,residential-median-value
4,2110,2013,566512.5,zipcode,residential-median-value




WEATHER FEATURES DATAFRAME

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 13 columns):
date            1339 non-null object
weathercodes    660 non-null object
sunrise         1339 non-null int64
sunset          1339 non-null int64
tempavg         1339 non-null float64
tempmax         1339 non-null float64
tempmin         1339 non-null float64
humavg          1338 non-null float64
windavg         1339 non-null float64
windsustain     1339 non-null int64
windmax         1339 non-null int64
precip          1339 non-null float64
snowfall        1339 non-null float64
dtypes: float64(7), int64(4), object(2)
memory usage: 136.1+ KB


Unnamed: 0,date,weathercodes,sunrise,sunset,tempavg,tempmax,tempmin,humavg,windavg,windsustain,windmax,precip,snowfall
0,2016-01-01,,713,1622,2.8,5.0,0.6,58.0,12.7,23,32,0.0,0.0
1,2016-01-02,,714,1623,2.2,4.4,-0.6,53.0,13.3,22,28,0.0,0.0
2,2016-01-03,,714,1624,3.3,6.7,-0.6,57.0,11.7,21,28,0.0,0.0
3,2016-01-04,SN,714,1625,-3.9,2.2,-10.0,59.0,12.4,21,28,0.0,0.0
4,2016-01-05,,714,1626,-8.3,-3.3,-13.3,41.0,10.0,18,23,0.0,0.0


## Merge property assessment features

In [40]:
# subset property features data to contain only census-tract level metrics
# (.copy() yields an independent frame, so adding the 'shape_match' column to it
# later does not raise pandas' SettingWithCopyWarning or write to a temporary slice)
df_property = df_property.loc[df_property['shape-type']=='census tract'].copy()
df_property.shape

(9570, 5)

In [41]:
# tally the records per metric once and reuse the tally for both printouts
metric_counts = df_property['metric'].value_counts()
print(metric_counts)

# metric names for matching, in the same order as the tally above
prop_metrics_list = metric_counts.index.tolist()

print()
print(prop_metrics_list)

commercial-mix-ratio                 1267
industrial-mix-ratio                 1267
owner-occupied-ratio                 1190
residential-gini-coef                1183
residential-median-value             1183
industrial-mix-ratio-3yr-cagr         724
commercial-mix-ratio-3yr-cagr         724
owner-occupied-ratio-3yr-cagr         680
residential-median-value-3yr-cagr     676
residential-gini-coef-3yr-cagr        676
Name: metric, dtype: int64

['commercial-mix-ratio', 'industrial-mix-ratio', 'owner-occupied-ratio', 'residential-gini-coef', 'residential-median-value', 'industrial-mix-ratio-3yr-cagr', 'commercial-mix-ratio-3yr-cagr', 'owner-occupied-ratio-3yr-cagr', 'residential-median-value-3yr-cagr', 'residential-gini-coef-3yr-cagr']


In [42]:
# build a temporary matching key on the crime side: census tract code followed by year
tract_codes = df_crime['TRACTCE10'].astype(str)
incident_years = df_crime['year'].astype(str)
df_crime['tract-match-key'] = (tract_codes + incident_years).values

df_crime['tract-match-key'].head()

0    9803002019
1    0002022019
2    0606002019
3    0607002019
4    0801002019
Name: tract-match-key, dtype: object

In [43]:
# Start the list of temporary helper columns (added only to support record matching)
# so that they can be dropped before the final dataframe is saved

temp_drop_list = ['tract-match-key']

In [44]:
# build the equivalent matching key on the property side: shape id followed by fiscal year
shape_ids = df_property['shape-id'].astype(str)
fiscal_years = df_property['fiscal-year'].astype(str)
df_property['shape_match'] = (shape_ids + fiscal_years).values
df_property.head()

Unnamed: 0,shape-id,fiscal-year,value,shape-type,metric,shape_match
420,100,2013,402150.0,census tract,residential-median-value,1002013
421,201,2013,444900.0,census tract,residential-median-value,2012013
422,202,2013,447581.0,census tract,residential-median-value,2022013
423,301,2013,412700.0,census tract,residential-median-value,3012013
424,302,2013,326650.0,census tract,residential-median-value,3022013


In [45]:
############################################
# Merge property features to crime dataframe
############################################

# Each metric becomes its own crime-dataframe column: build a
# {shape_match key -> value} lookup from that metric's rows, then map the
# crime records' tract/year key through it (unmatched keys become NaN).
for metric in prop_metrics_list:
    metric_rows = df_property.loc[df_property['metric'] == metric, ['shape_match', 'value']]
    key_to_value = dict(zip(metric_rows['shape_match'], metric_rows['value']))
    df_crime[metric] = df_crime['tract-match-key'].map(key_to_value)

In [46]:
# view resulting dataframe with added property features
# (DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line)
df_crime.info()
df_crime.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151072 entries, 0 to 151071
Data columns (total 29 columns):
crime-type                           151072 non-null object
INCIDENT_NUMBER                      151072 non-null object
OFFENSE_DESCRIPTION                  151072 non-null object
timestamp                            151072 non-null object
lat                                  151072 non-null object
lon                                  151072 non-null object
year                                 151072 non-null object
month                                151072 non-null object
day-of-week                          151072 non-null object
hour                                 151072 non-null object
ZIP5                                 151044 non-null object
ZIP5_area                            151044 non-null object
Name                                 151002 non-null object
Neighborhood_area                    151002 non-null object
Neighborhood_area_2                  151002 non-n

Unnamed: 0,crime-type,INCIDENT_NUMBER,OFFENSE_DESCRIPTION,timestamp,lat,lon,year,month,day-of-week,hour,...,commercial-mix-ratio,industrial-mix-ratio,owner-occupied-ratio,residential-gini-coef,residential-median-value,industrial-mix-ratio-3yr-cagr,commercial-mix-ratio-3yr-cagr,owner-occupied-ratio-3yr-cagr,residential-median-value-3yr-cagr,residential-gini-coef-3yr-cagr
0,fraud,I192078177,forgery / counterfeiting,2019-08-01 17:46:00,42.30492239,-71.10298073,2019,8,Thursday,17,...,0.0,0.000294,0.0,0.0,756500.0,-0.034814,0.0,0.0,0.024264,0.0
1,harassment-disturbance,I192078061,harassment,2019-06-12 21:00:00,42.35555336,-71.15274721,2019,6,Wednesday,21,...,0.069416,0.0,0.550355,0.199606,745950.0,0.0,-0.010424,-0.028224,0.068142,0.011166
2,theft,I192078038,larceny theft of mv parts & accessories,2019-03-10 08:00:00,42.34562521,-71.04129066,2019,3,Sunday,8,...,0.47813,0.000938,0.456287,0.228793,538500.0,-0.084828,0.010549,-0.014131,0.06034,0.018436
3,theft,I192078015,larceny all others,2019-07-08 10:29:00,42.33930416,-71.05160423,2019,7,Monday,10,...,0.0,0.0,,,,0.0,0.0,,,
4,theft,I192077997,auto theft - leased/rented vehicle,2019-04-13 08:00:00,42.32856401,-71.06835343,2019,4,Saturday,8,...,0.375058,0.076862,0.460751,0.172898,355500.0,0.060168,-0.007779,-0.023444,0.082123,0.025972


## Merge weather data features

In [47]:
# derive a temporary date-match column from the first 10 characters of the timestamp
df_crime['date'] = df_crime['timestamp'].str.slice(stop=10)

# 'date' is only a matching helper, so add it to the drop list
temp_drop_list = [*temp_drop_list, 'date']

In [51]:
# capture the weather dataframe's column names for later manipulation
# (this is the full df_weather.columns Index, so it also contains the 'date' join key)
weather_match_cols = df_weather.columns

weather_match_cols

Index(['date', 'weathercodes', 'sunrise', 'sunset', 'tempavg', 'tempmax',
       'tempmin', 'humavg', 'windavg', 'windsustain', 'windmax', 'precip',
       'snowfall'],
      dtype='object')

In [52]:
# merge weather values to crime data: the left join keeps every crime record, and
# validate='many_to_one' makes pandas raise a MergeError if any date occurs more
# than once in df_weather, instead of silently duplicating crime records
df_crime = df_crime.merge(df_weather, how='left', on='date', validate='many_to_one')

# print results
df_crime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151072 entries, 0 to 151071
Data columns (total 42 columns):
crime-type                           151072 non-null object
INCIDENT_NUMBER                      151072 non-null object
OFFENSE_DESCRIPTION                  151072 non-null object
timestamp                            151072 non-null object
lat                                  151072 non-null object
lon                                  151072 non-null object
year                                 151072 non-null object
month                                151072 non-null object
day-of-week                          151072 non-null object
hour                                 151072 non-null object
ZIP5                                 151044 non-null object
ZIP5_area                            151044 non-null object
Name                                 151002 non-null object
Neighborhood_area                    151002 non-null object
Neighborhood_area_2                  151002 non-n

## Create night time feature using sunrise and sunset data from weather merge