# Capstone - Seattle Crime Data (June, 2009 - August, 2017)

* Student name: Andrew Wester
* Student pace: Full Time
* Scheduled project review date/time: Thursday, March 28, 2019
* Instructor name: Rafael Carrasco
* Blog post URL:


### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import folium
from folium import plugins
import os
import shutil
from IPython.display import Image, IFrame, display_html
from IPython.core.display import HTML 

import warnings
warnings.filterwarnings('ignore')

#!pip install jupyternotify
import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)
## Run %%notify to create notification for completed cell
%notify

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
# Raw Seattle PD 911 incident-response export, read relative to the notebook.
incident_csv_path = 'Data/seattle_crime/Seattle_Police_Department_911_Incident_Response.csv'
crime_df = pd.read_csv(incident_csv_path)

In [3]:
crime_df.head()

Unnamed: 0,CAD CDW ID,CAD Event Number,General Offense Number,Event Clearance Code,Event Clearance Description,Event Clearance SubGroup,Event Clearance Group,Event Clearance Date,Hundred Block Location,District/Sector,Zone/Beat,Census Tract,Longitude,Latitude,Incident Location,Initial Type Description,Initial Type Subgroup,Initial Type Group,At Scene Time
0,﻿15736,10000246357,2010246357,242.0,FIGHT DISTURBANCE,DISTURBANCES,DISTURBANCES,07/17/2010 08:49:00 PM,3XX BLOCK OF PINE ST,M,M2,8100.2001,-122.338147,47.610975,"(47.610975163, -122.338146748)",,,,
1,15737,10000246471,2010246471,65.0,THEFT - MISCELLANEOUS,THEFT,OTHER PROPERTY,07/17/2010 08:50:00 PM,36XX BLOCK OF DISCOVERY PARK BLVD,Q,Q1,5700.1012,-122.404613,47.658325,"(47.658324899, -122.404612874)",,,,
2,15738,10000246255,2010246255,250.0,"MISCHIEF, NUISANCE COMPLAINTS","NUISANCE, MISCHIEF COMPLAINTS","NUISANCE, MISCHIEF",07/17/2010 08:55:00 PM,21XX BLOCK OF 3RD AVE,M,M2,7200.2025,-122.342843,47.613551,"(47.613551471, -122.342843234)",,,,
3,15739,10000246473,2010246473,460.0,TRAFFIC (MOVING) VIOLATION,TRAFFIC RELATED CALLS,TRAFFIC RELATED CALLS,07/17/2010 09:00:00 PM,7XX BLOCK OF ROY ST,D,D1,7200.1002,-122.341847,47.625401,"(47.625401388, -122.341846999)",,,,
4,15740,10000246330,2010246330,250.0,"MISCHIEF, NUISANCE COMPLAINTS","NUISANCE, MISCHIEF COMPLAINTS","NUISANCE, MISCHIEF",07/17/2010 09:00:00 PM,9XX BLOCK OF ALOHA ST,D,D1,6700.1009,-122.339709,47.627425,"(47.627424837, -122.339708605)",,,,


In [4]:
crime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1433853 entries, 0 to 1433852
Data columns (total 19 columns):
CAD CDW ID                     1433853 non-null object
CAD Event Number               1433853 non-null int64
General Offense Number         1433853 non-null int64
Event Clearance Code           1422435 non-null float64
Event Clearance Description    1422434 non-null object
Event Clearance SubGroup       1422434 non-null object
Event Clearance Group          1422434 non-null object
Event Clearance Date           1422269 non-null object
Hundred Block Location         1430366 non-null object
District/Sector                1432684 non-null object
Zone/Beat                      1433852 non-null object
Census Tract                   1431089 non-null float64
Longitude                      1433852 non-null float64
Latitude                       1433852 non-null float64
Incident Location              1433852 non-null object
Initial Type Description       856040 non-null object
Initia

### Clean the Data: Drop Unneeded Columns and Rows with Null Values

In [5]:
# Columns to discard: the "Initial Type" fields and "At Scene Time" are the
# sparsest columns in the file. They are listed by name (instead of the
# positional slice columns[15:]) so a change in CSV column order cannot make
# the wrong columns disappear, and filtered against the current frame so the
# cell stays safe to re-run after the drop has already happened.
sparse_columns = [
    'Initial Type Description',
    'Initial Type Subgroup',
    'Initial Type Group',
    'At Scene Time',
]
drops = [col for col in sparse_columns if col in crime_df.columns]

In [6]:
crime_df.isna().sum()

CAD CDW ID                           0
CAD Event Number                     0
General Offense Number               0
Event Clearance Code             11418
Event Clearance Description      11419
Event Clearance SubGroup         11419
Event Clearance Group            11419
Event Clearance Date             11584
Hundred Block Location            3487
District/Sector                   1169
Zone/Beat                            1
Census Tract                      2764
Longitude                            1
Latitude                             1
Incident Location                    1
Initial Type Description        577813
Initial Type Subgroup           577813
Initial Type Group              577813
At Scene Time                  1023030
dtype: int64

In [7]:
# Remove the sparse columns collected in `drops`, rebinding the name rather
# than mutating the frame in place.
crime_df = crime_df.drop(columns=drops)

In [8]:
# Keep only rows that have a value in every remaining column.
crime_df = crime_df.dropna()

In [9]:
%%timeit
crime_df['Event Clearance Date'] = pd.to_datetime(crime_df['Event Clearance Date'])

4.18 ms ± 253 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
crime_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1415994 entries, 0 to 1433852
Data columns (total 15 columns):
CAD CDW ID                     1415994 non-null object
CAD Event Number               1415994 non-null int64
General Offense Number         1415994 non-null int64
Event Clearance Code           1415994 non-null float64
Event Clearance Description    1415994 non-null object
Event Clearance SubGroup       1415994 non-null object
Event Clearance Group          1415994 non-null object
Event Clearance Date           1415994 non-null datetime64[ns]
Hundred Block Location         1415994 non-null object
District/Sector                1415994 non-null object
Zone/Beat                      1415994 non-null object
Census Tract                   1415994 non-null float64
Longitude                      1415994 non-null float64
Latitude                       1415994 non-null float64
Incident Location              1415994 non-null object
dtypes: datetime64[ns](1), float64(4), int64(2), obj

In [11]:
crime_df['Event Clearance Date'].describe()

count                 1415994
unique                1186168
top       2015-07-11 21:41:00
freq                       12
first     2009-06-17 16:14:00
last      2017-08-29 11:44:01
Name: Event Clearance Date, dtype: object

In [12]:
# Distinct clearance groups (as a list) and distinct sectors (as an array),
# followed by a short textual summary of the cleaned frame.
crime_groups = crime_df['Event Clearance Group'].unique().tolist()
districts = crime_df['District/Sector'].unique()

print("There are {} different types of Crime in Seattle".format(len(crime_groups)))
print("There are {} different Districts in Seattle".format(len(districts)))
print("There are {} total Crimes committed in Seattle".format(len(crime_df)))

There are 44 different types of Crime in Seattle
There are 19 different Districts in Seattle
There are 1415994 total Crimes committed in Seattle


In [13]:
districts

array(['M', 'Q', 'D', 'R', 'U', 'K', 'N', 'S', 'F', 'J', 'L', 'W', 'E',
       'C', 'G', 'B', 'O', '99', 'H'], dtype=object)

In [14]:
# Five most frequent clearance groups in sector 'M', as {group: count}.
in_sector_m = crime_df['District/Sector'] == 'M'
tops = (
    crime_df.loc[in_sector_m, 'Event Clearance Group']
    .value_counts()
    .nlargest(5)
    .to_dict()
)
tops

{'DISTURBANCES': 18727,
 'TRAFFIC RELATED CALLS': 14931,
 'LIQUOR VIOLATIONS': 14550,
 'SUSPICIOUS CIRCUMSTANCES': 11743,
 'SHOPLIFTING': 8029}

In [15]:
# Share of sector 'M' incidents taken by each of its top groups.
# The sector total does not change between iterations, so the full-frame
# filter is evaluated once here instead of once per group.
total = len(crime_df[crime_df['District/Sector'] == 'M'])
for key in tops.keys():
    percent = round((tops[key] / total)*100, 3)
    print(percent)

14.62
11.656
11.359
9.168
6.268


In [16]:
# For every sector, build a one-line summary of its five most frequent
# clearance groups and the percentage of that sector's rows each represents.
# Result shape: {sector: {"top_5": "<Group> = <pct> percent of Crime, ..."}}
crimes_dict = {}
for district in districts:
    district_rows = crime_df[crime_df['District/Sector'] == district]
    total = len(district_rows)
    top_5 = district_rows['Event Clearance Group'].value_counts().nlargest(5).to_dict()
    # join() puts the separator only between entries, so the text (used later
    # as map popup content) no longer ends with a dangling ", ".
    crimes_dict[district] = {
        "top_5": ", ".join(
            "{} = {} percent of Crime".format(key.title(), round((count / total)*100, 3))
            for key, count in top_5.items()
        )
    }


In [17]:
crimes_dict

{'M': {'top_5': 'Disturbances = 14.62 percent of Crime, Traffic Related Calls = 11.656 percent of Crime, Liquor Violations = 11.359 percent of Crime, Suspicious Circumstances = 9.168 percent of Crime, Shoplifting = 6.268 percent of Crime, '},
 'Q': {'top_5': 'Traffic Related Calls = 21.557 percent of Crime, Suspicious Circumstances = 14.751 percent of Crime, Disturbances = 11.935 percent of Crime, Car Prowl = 4.642 percent of Crime, False Alarms = 4.214 percent of Crime, '},
 'D': {'top_5': 'Traffic Related Calls = 17.376 percent of Crime, Disturbances = 13.992 percent of Crime, Suspicious Circumstances = 12.059 percent of Crime, Liquor Violations = 5.786 percent of Crime, Car Prowl = 4.686 percent of Crime, '},
 'R': {'top_5': 'Traffic Related Calls = 21.72 percent of Crime, Suspicious Circumstances = 18.362 percent of Crime, Disturbances = 11.712 percent of Crime, False Alarms = 4.243 percent of Crime, Motor Vehicle Collision Investigation = 3.864 percent of Crime, '},
 'U': {'top_5'

In [18]:
# Print every stored summary string, one per line, in sector order.
# (The unused `string = ()` scratch assignment and the intermediate list
# copy from the earlier version were dropped; the printed output is the same.)
for summary in crimes_dict.values():
    for text in summary.values():
        print(text)

Disturbances = 14.62 percent of Crime, Traffic Related Calls = 11.656 percent of Crime, Liquor Violations = 11.359 percent of Crime, Suspicious Circumstances = 9.168 percent of Crime, Shoplifting = 6.268 percent of Crime, 
Traffic Related Calls = 21.557 percent of Crime, Suspicious Circumstances = 14.751 percent of Crime, Disturbances = 11.935 percent of Crime, Car Prowl = 4.642 percent of Crime, False Alarms = 4.214 percent of Crime, 
Traffic Related Calls = 17.376 percent of Crime, Disturbances = 13.992 percent of Crime, Suspicious Circumstances = 12.059 percent of Crime, Liquor Violations = 5.786 percent of Crime, Car Prowl = 4.686 percent of Crime, 
Traffic Related Calls = 21.72 percent of Crime, Suspicious Circumstances = 18.362 percent of Crime, Disturbances = 11.712 percent of Crime, False Alarms = 4.243 percent of Crime, Motor Vehicle Collision Investigation = 3.864 percent of Crime, 
Disturbances = 17.26 percent of Crime, Suspicious Circumstances = 16.625 percent of Crime, Tra

In [19]:
# Per-sector breakdown across *all* clearance groups (not just the top five).
# Result shape: {"District <sector>": {"<Group>": "<sentence with percentage>"}}
detailed_crime_dict = {}
for district in districts:
    district_rows = crime_df[crime_df['District/Sector'] == district]
    total = len(district_rows)
    # One counting pass per sector instead of re-filtering the frame for each
    # of the groups; groups absent from the sector fall back to a count of 0.
    group_counts = district_rows['Event Clearance Group'].value_counts()
    label = "District {}".format(district)
    detailed_crime_dict[label] = {}
    for group in crime_groups:
        percent = round((int(group_counts.get(group, 0)) / total)*100, 3)
        # The sentence now has a third placeholder: previously `district` was
        # passed to format() but silently ignored, so every sentence ended in
        # a bare "in District".
        detailed_crime_dict[label][group.capitalize()] = \
            "{} account for {} percent of Crime in District {}".format(group.title(), percent, district)

In [20]:
detailed_crime_dict

{'District M': {'Disturbances': 'Disturbances account for 14.62 percent of Crime in District',
  'Other property': 'Other Property account for 3.522 percent of Crime in District',
  'Nuisance, mischief ': 'Nuisance, Mischief  account for 1.361 percent of Crime in District',
  'Traffic related calls': 'Traffic Related Calls account for 11.656 percent of Crime in District',
  'Suspicious circumstances': 'Suspicious Circumstances account for 9.168 percent of Crime in District',
  'Mental health': 'Mental Health account for 1.413 percent of Crime in District',
  'Liquor violations': 'Liquor Violations account for 11.359 percent of Crime in District',
  'Trespass': 'Trespass account for 5.754 percent of Crime in District',
  'Assaults': 'Assaults account for 2.264 percent of Crime in District',
  'Narcotics complaints': 'Narcotics Complaints account for 5.8 percent of Crime in District',
  'Accident investigation': 'Accident Investigation account for 1.842 percent of Crime in District',
  '

## Work with a Smaller Slice of the Data Because the Full Dataset Is Large

In [21]:
# Positional window of 4,000 rows (rows 1000 through 4999 of the cleaned
# frame) used as a lightweight sample for the first marker map.
crimes = crime_df.iloc[1000:5000]
crimes.head()

Unnamed: 0,CAD CDW ID,CAD Event Number,General Offense Number,Event Clearance Code,Event Clearance Description,Event Clearance SubGroup,Event Clearance Group,Event Clearance Date,Hundred Block Location,District/Sector,Zone/Beat,Census Tract,Longitude,Latitude,Incident Location
1000,16764,10000247611,2010247611,65.0,THEFT - MISCELLANEOUS,THEFT,OTHER PROPERTY,2010-07-18 21:41:00,88XX BLOCK OF 13TH AVE SW,F,F3,11300.3004,-122.35168,47.523774,"(47.523774444, -122.351679629)"
1001,16765,10000247756,2010247756,200.0,ALARMS - COMMERCIAL BURGLARY (FALSE),BURGLARY ALARMS (FALSE),FALSE ALARMS,2010-07-18 21:49:00,81XX BLOCK OF GREENWOOD AVE N,B,B3,2900.1007,-122.3553,47.688028,"(47.68802767, -122.355300312)"
1002,16766,10000247785,2010247785,460.0,TRAFFIC (MOVING) VIOLATION,TRAFFIC RELATED CALLS,TRAFFIC RELATED CALLS,2010-07-18 21:56:00,10XX BLOCK OF S LANE ST,G,G2,9100.1006,-122.318055,47.596662,"(47.596662187, -122.318054901)"
1003,16767,10000247789,2010247789,177.0,LIQUOR VIOLATION - INTOXICATED PERSON,LIQUOR VIOLATIONS,LIQUOR VIOLATIONS,2010-07-18 21:59:00,5XX BLOCK OF QUEEN ANNE AVE N,Q,Q3,7100.2003,-122.356715,47.62393,"(47.623929893, -122.356714541)"
1004,16768,10000247788,2010247788,177.0,LIQUOR VIOLATION - INTOXICATED PERSON,LIQUOR VIOLATIONS,LIQUOR VIOLATIONS,2010-07-18 22:09:00,18XX BLOCK OF NW MARKET ST,J,J2,4700.4002,-122.381042,47.668674,"(47.668674212, -122.381041843)"


In [22]:
crimes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 1000 to 5006
Data columns (total 15 columns):
CAD CDW ID                     4000 non-null object
CAD Event Number               4000 non-null int64
General Offense Number         4000 non-null int64
Event Clearance Code           4000 non-null float64
Event Clearance Description    4000 non-null object
Event Clearance SubGroup       4000 non-null object
Event Clearance Group          4000 non-null object
Event Clearance Date           4000 non-null datetime64[ns]
Hundred Block Location         4000 non-null object
District/Sector                4000 non-null object
Zone/Beat                      4000 non-null object
Census Tract                   4000 non-null float64
Longitude                      4000 non-null float64
Latitude                       4000 non-null float64
Incident Location              4000 non-null object
dtypes: datetime64[ns](1), float64(4), int64(2), object(8)
memory usage: 500.0+ KB


In [23]:
# Narrow the sample to the fields needed for mapping and labelling.
map_columns = [
    'Event Clearance Group',
    'Event Clearance Description',
    'Event Clearance Date',
    'District/Sector',
    'Zone/Beat',
    'Latitude',
    'Longitude',
    'Incident Location',
]
crime = crimes[map_columns]
crime.head()

Unnamed: 0,Event Clearance Group,Event Clearance Description,Event Clearance Date,District/Sector,Zone/Beat,Latitude,Longitude,Incident Location
1000,OTHER PROPERTY,THEFT - MISCELLANEOUS,2010-07-18 21:41:00,F,F3,47.523774,-122.35168,"(47.523774444, -122.351679629)"
1001,FALSE ALARMS,ALARMS - COMMERCIAL BURGLARY (FALSE),2010-07-18 21:49:00,B,B3,47.688028,-122.3553,"(47.68802767, -122.355300312)"
1002,TRAFFIC RELATED CALLS,TRAFFIC (MOVING) VIOLATION,2010-07-18 21:56:00,G,G2,47.596662,-122.318055,"(47.596662187, -122.318054901)"
1003,LIQUOR VIOLATIONS,LIQUOR VIOLATION - INTOXICATED PERSON,2010-07-18 21:59:00,Q,Q3,47.62393,-122.356715,"(47.623929893, -122.356714541)"
1004,LIQUOR VIOLATIONS,LIQUOR VIOLATION - INTOXICATED PERSON,2010-07-18 22:09:00,J,J2,47.668674,-122.381042,"(47.668674212, -122.381041843)"


In [25]:
crime['Event Clearance Group'].nunique()

39

## Create List of Locations by Latitude and Longitude from Sliced Data

In [26]:
# [latitude, longitude] pairs for every row of the sample, in the nested-list
# form that the marker-cluster plugin is given below.
locations = crime.loc[:, ['Latitude', 'Longitude']]
locationlist = locations.values.tolist()
print(len(locationlist))
locationlist[:5]

4000


[[47.523774444, -122.351679629],
 [47.68802767, -122.35530031200001],
 [47.596662187, -122.31805490100001],
 [47.623929893, -122.356714541],
 [47.668674212, -122.38104184299999]]

In [27]:
# Clustered marker map of the sampled incidents, centred on their mean position.
crime_clusters=folium.Map(location=[crime.Latitude.mean(),crime.Longitude.mean()],
                      zoom_start=11,tiles='OpenStreetMap')
plugins.MarkerCluster(locationlist).add_to(crime_clusters)
plugins.ScrollZoomToggler().add_to(crime_clusters)
# Create the output folder first so save() does not fail on a fresh checkout.
os.makedirs('Maps/', exist_ok=True)
crime_clusters.save(os.path.join('Maps/', 'crime_clusters.html'))
crime_clusters

In [28]:
# Path to the SPD beat-boundary GeoJSON used for the choropleth layers.
# The default is the original author's absolute local path, which will not
# exist on other machines; set the SEATTLE_BEATS_GEOJSON environment variable
# to point at your own copy of spd-beats.geojson instead of editing this cell.
seattle_geo = os.environ.get(
    'SEATTLE_BEATS_GEOJSON',
    '/Users/steeznation/Flatiron/Module05/capstone/seattle-boundaries-data/data/spd-beats.geojson',
)

# (latitude, longitude) used as the initial centre of the Seattle maps.
seattle_coords = (47.6062, -122.3321)

In [29]:
# Beat, sector and coordinates for every cleaned incident (full dataset).
zone_columns = ['Zone/Beat', 'District/Sector', 'Latitude', 'Longitude']
zones = crime_df[zone_columns]
zones.head()

Unnamed: 0,Zone/Beat,District/Sector,Latitude,Longitude
0,M2,M,47.610975,-122.338147
1,Q1,Q,47.658325,-122.404613
2,M2,M,47.613551,-122.342843
3,D1,D,47.625401,-122.341847
4,D1,D,47.627425,-122.339709


In [30]:
# Drop incomplete rows by rebinding rather than mutating in place: `zones` is
# a column subset of crime_df, and in-place edits on such a derived frame are
# what pandas flags with SettingWithCopyWarning (silenced in this notebook by
# warnings.filterwarnings('ignore')).
zones = zones.dropna()

In [31]:
# Distinct sector codes present in `zones`: shown once, then kept as a list.
sector_codes = zones['District/Sector'].unique()
print(sector_codes)
zlist = list(sector_codes)

['M' 'Q' 'D' 'R' 'U' 'K' 'N' 'S' 'F' 'J' 'L' 'W' 'E' 'C' 'G' 'B' 'O' '99'
 'H']


In [32]:
zones[zones['Zone/Beat'] == 'M2'].head()

Unnamed: 0,Zone/Beat,District/Sector,Latitude,Longitude
0,M2,M,47.610975,-122.338147
2,M2,M,47.613551,-122.342843
37,M2,M,47.613343,-122.338524
78,M2,M,47.611706,-122.338048
92,M2,M,47.612887,-122.343703


In [33]:
zones.isna().sum()

Zone/Beat          0
District/Sector    0
Latitude           0
Longitude          0
dtype: int64

In [34]:
crimes_dict.keys()

dict_keys(['M', 'Q', 'D', 'R', 'U', 'K', 'N', 'S', 'F', 'J', 'L', 'W', 'E', 'C', 'G', 'B', 'O', '99', 'H'])

In [35]:
# Per-sector marker data: mean coordinates of the sector's incidents plus the
# top-five summary text computed earlier in crimes_dict.
z_dict = {}
for sector in zlist:
    sector_rows = zones.loc[zones['District/Sector'] == sector]
    z_dict[sector] = {
        'Lat': sector_rows['Latitude'].mean(),
        'Long': sector_rows['Longitude'].mean(),
        "Top 5": crimes_dict[sector]["top_5"],
    }

In [36]:
crimes_dict

{'M': {'top_5': 'Disturbances = 14.62 percent of Crime, Traffic Related Calls = 11.656 percent of Crime, Liquor Violations = 11.359 percent of Crime, Suspicious Circumstances = 9.168 percent of Crime, Shoplifting = 6.268 percent of Crime, '},
 'Q': {'top_5': 'Traffic Related Calls = 21.557 percent of Crime, Suspicious Circumstances = 14.751 percent of Crime, Disturbances = 11.935 percent of Crime, Car Prowl = 4.642 percent of Crime, False Alarms = 4.214 percent of Crime, '},
 'D': {'top_5': 'Traffic Related Calls = 17.376 percent of Crime, Disturbances = 13.992 percent of Crime, Suspicious Circumstances = 12.059 percent of Crime, Liquor Violations = 5.786 percent of Crime, Car Prowl = 4.686 percent of Crime, '},
 'R': {'top_5': 'Traffic Related Calls = 21.72 percent of Crime, Suspicious Circumstances = 18.362 percent of Crime, Disturbances = 11.712 percent of Crime, False Alarms = 4.243 percent of Crime, Motor Vehicle Collision Investigation = 3.864 percent of Crime, '},
 'U': {'top_5'

In [37]:
# One marker per sector at its mean incident position; the popup carries the
# sector's top-five summary and the tooltip shows the sector code.
m = folium.Map(location=seattle_coords, zoom_start=11)
for sector, info in z_dict.items():
    popup = folium.Popup(info['Top 5'], max_width=150)
    marker = folium.Marker([info['Lat'], info['Long']], popup=popup, tooltip=sector)
    marker.add_to(m)
plugins.ScrollZoomToggler().add_to(m)
m

In [41]:
# GeoJSON file that defines the beat boundaries drawn on the map
district_geo = seattle_geo

# total number of incidents per beat (the choropleth is keyed on beat name)
crimedata = pd.DataFrame(crime_df['Zone/Beat'].value_counts().astype(float))
crimedata.to_json('crimeagg.json')
crimedata = crimedata.reset_index()
crimedata.columns = ['Beat', 'Number']

m = folium.Map(location=seattle_coords, zoom_start=11)
folium.Choropleth(geo_data=district_geo,
                  data=crimedata,
                  columns=['Beat', 'Number'],
                  key_on='feature.properties.name',
                  fill_color='YlGnBu',
                  fill_opacity=0.8,
                  line_opacity=0.2,
                  bins=9,
                  name='Incidents per Beat',
                  legend_name='Incidents per Beat',
                  highlight=True).add_to(m)

# one marker per sector, with its top-five summary as the popup
for k, v in (z_dict.items()):
    folium.Marker([v['Lat'], v['Long']], popup=folium.Popup(v["Top 5"], max_width=150), 
                  tooltip=("District: "+str(k))).add_to(m)
# clustered markers for the 4,000-row sample
plugins.MarkerCluster(locationlist, name='Crime Clusters').add_to(m)
folium.LayerControl().add_to(m)
plugins.ScrollZoomToggler().add_to(m)

# make sure the output folder exists before writing the HTML file
os.makedirs('Maps/', exist_ok=True)
m.save(os.path.join('Maps/', 'crime_districts.html'))
m

In [42]:
len(crime_df)

1415994

In [43]:
# Coordinates of the first 50,000 cleaned incidents, as [lat, lon] lists,
# for the larger marker-cluster layer of the final map.
locations1 = crime_df[['Latitude', 'Longitude']].iloc[:50000]
locationlist1 = locations1.values.tolist()
print(len(locationlist1))
locationlist1[:5]

50000


[[47.610975163, -122.33814674799999],
 [47.65832489899999, -122.404612874],
 [47.613551471, -122.342843234],
 [47.625401388, -122.341846999],
 [47.627424837, -122.339708605]]

In [44]:
%%time
# definition of the boundaries in the map
district_geo = seattle_geo

# calculating total number of incidents per district
crimedata = pd.DataFrame(crime_df['Zone/Beat'].value_counts().astype(float))
crimedata.to_json('crimeagg.json')
crimedata = crimedata.reset_index()
crimedata.columns = ['Beat', 'Number']

m = folium.Map(location=seattle_coords, zoom_start=11)

folium.Choropleth(geo_data=seattle_geo,
                  data=crimedata,
                  columns=['Beat', 'Number'],
                  key_on='feature.properties.name',
                  fill_color='YlGn',
                  fill_opacity=0.8,
                  line_opacity=0.2,
                  bins=9,
                  name='Incidents per Beat',
                  legend_name='Incidents per Beat',
                  highlight=True).add_to(m)

fg = folium.FeatureGroup(name='All Groups')
m.add_child(fg)
g1 = plugins.FeatureGroupSubGroup(fg, 'Districts')
m.add_child(g1)
g2 = plugins.FeatureGroupSubGroup(fg, 'Clusters')
m.add_child(g2)

for k, v in (z_dict.items()):
    folium.Marker([v['Lat'], v['Long']], popup=folium.Popup(v["Top 5"], max_width=200), 
                  tooltip=("District: "+str(k))).add_to(g1)
folium.TileLayer(tiles="OpenStreetMap", name='Open Street Map').add_to(m)
folium.TileLayer(tiles="Stamen Terrain", name='Stamen Terrain').add_to(m)
folium.TileLayer(tiles="Mapbox Bright", name='Mapbox Bright').add_to(m)

layers = folium.LayerControl().add_to(m)
plugins.ScrollZoomToggler().add_to(m)

plugins.MarkerCluster(locationlist1, name='Crime Clusters').add_to(g2)

m.save(os.path.join('Maps/', 'final_crime.html'))

CPU times: user 24.7 s, sys: 883 ms, total: 25.6 s
Wall time: 26.5 s
