In [1]:
# Dependencies (install once into the kernel's environment):
# %pip install geopandas dash jupyter-dash

In [2]:
import json
from typing import Optional, Tuple

import pandas as pd
import geopandas as gpd
# import cudf

import plotly.express as px
import plotly.graph_objects as go

from jupyter_dash import JupyterDash
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

In [28]:
# Load the cleaned crime records and keep only rows whose Year is before 2021.
df = (
    pd.read_csv('data/crime-clean.csv')
      .loc[lambda crimes: crimes['Year'] < 2021]
)

In [29]:
# Eyeball a random handful of rows (unseeded, so the rows differ on every run).
df.sample(n=15)

Unnamed: 0,Crime Type,Latitude,Longitude,Neighborhood,Zip Code,Adult Population,Crime Score,Neigh Score,CSperCapita,Year,Month
2294212,PROSTITUTION,41.757022,-87.558073,South Shore,60649,35883,0.1517,30898.0216,0.861077,2012,6
1565541,ROBBERY,41.791286,-87.734473,West Elsdon,60629,78922,0.0773,3089.4539,0.039146,2011,5
1644684,PUBLIC PEACE VIOLATION,41.692197,-87.618266,Roseland,60628,50492,0.0196,23904.388,0.473429,2016,8
2399054,NARCOTICS,41.774792,-87.615149,Woodlawn,60637,36044,0.0167,13245.7723,0.367489,2010,3
344096,BATTERY,41.909826,-87.736247,Humboldt Park,60639,65556,0.7753,33809.9941,0.515742,2019,5
2035706,OTHER OFFENSE,41.798801,-87.599727,Hyde Park,60615,34631,0.0135,5237.0923,0.151226,2010,11
3028760,CRIMINAL DAMAGE,41.917838,-87.638125,Old Town,60614,61612,0.0501,5325.3836,0.086434,2011,1
895541,MOTOR VEHICLE THEFT,41.77514,-87.672589,Englewood,60636,24046,0.2266,48625.587,2.02219,2015,12
2317796,CRIMINAL DAMAGE,41.762101,-87.564823,South Shore,60649,35883,0.2251,30898.0216,0.861077,2014,10
2939323,CRIMINAL DAMAGE,41.97651,-87.677893,Lincoln Square,60640,60243,0.0513,6508.2831,0.108034,2011,2


In [30]:
# Summarise the filtered frame: column dtypes, row count and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3149595 entries, 0 to 3187329
Data columns (total 11 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Crime Type        object 
 1   Latitude          float64
 2   Longitude         float64
 3   Neighborhood      object 
 4   Zip Code          int64  
 5   Adult Population  int64  
 6   Crime Score       float64
 7   Neigh Score       float64
 8   CSperCapita       float64
 9   Year              int64  
 10  Month             int64  
dtypes: float64(5), int64(4), object(2)
memory usage: 288.4+ MB


In [31]:
# Per-neighborhood averages of the raw crime score and the per-capita score;
# this frame feeds the neighborhood choropleth.
df_neigh = (
    df.groupby('Neighborhood', as_index=False)[['Crime Score', 'CSperCapita']]
      .mean()
)
df_neigh.head()

Unnamed: 0,Neighborhood,Crime Score,CSperCapita
0,Albany Park,0.273863,0.134928
1,Andersonville,0.259717,0.015856
2,Archer Heights,0.259546,0.045801
3,Armour Square,0.261752,0.042559
4,Ashburn,0.267841,0.236414


In [32]:
# Per-ZIP-code averages of the raw crime score and the per-capita score;
# this frame feeds the ZIP code choropleth.
df_zip = (
    df.groupby('Zip Code', as_index=False)[['Crime Score', 'CSperCapita']]
      .mean()
)
df_zip.head()

Unnamed: 0,Zip Code,Crime Score,CSperCapita
0,60601,0.282481,1.590697
1,60602,0.273787,18.654383
2,60603,0.28039,19.703245
3,60604,0.266001,26.765898
4,60605,0.272231,0.472532


In [33]:
# Citywide yearly totals: number of recorded crimes (non-null 'Crime Type' rows)
# and the mean per-capita score.
df_crime = (
    df.groupby(by='Year', as_index=False)
      .agg({'Crime Type': 'count', 'CSperCapita': 'mean'})
      .rename(columns={'Crime Type': 'Count'})
)
df_crime.head(10)

Unnamed: 0,Year,Count,CSperCapita
0,2010,369008,0.777052
1,2011,350350,0.778162
2,2012,334599,0.802661
3,2013,305572,0.824509
4,2014,273045,0.843986
5,2015,257242,0.854416
6,2016,266515,0.887717
7,2017,264231,0.945128
8,2018,262675,0.96686
9,2019,258429,0.95797


In [34]:
# Yearly crime counts broken down by neighborhood (drives the hover line chart).
df_crime_neigh = (
    df.groupby(by=['Year', 'Neighborhood'], as_index=False)
      .agg({'Crime Type': 'count'})
      .rename(columns={'Crime Type': 'Count'})
)
df_crime_neigh.head()

Unnamed: 0,Year,Neighborhood,Count
0,2010,Albany Park,3491
1,2010,Andersonville,436
2,2010,Archer Heights,1358
3,2010,Armour Square,783
4,2010,Ashburn,3447


In [35]:
# Yearly crime counts broken down by ZIP code (drives the hover line chart).
df_crime_zip = (
    df.groupby(by=['Year', 'Zip Code'], as_index=False)
      .agg({'Crime Type': 'count'})
      .rename(columns={'Crime Type': 'Count'})
)
df_crime_zip.head()

Unnamed: 0,Year,Zip Code,Count
0,2010,60601,1567
1,2010,60602,1818
2,2010,60603,884
3,2010,60604,1015
4,2010,60605,2727


In [36]:
# Load the neighborhood boundaries. GeoJSON is UTF-8 by specification, so the
# encoding is set explicitly instead of relying on the platform default
# (which is not UTF-8 on every system).
with open('data/geo/Neighborhoods.geojson', encoding='utf-8') as neigh_file:
    geodict = {'json_neigh': json.load(neigh_file)}

In [37]:
# Load the ZIP code boundaries into the same lookup dict. GeoJSON is UTF-8 by
# specification, so the encoding is set explicitly instead of relying on the
# platform default.
with open('data/geo/ZIP.geojson', encoding='utf-8') as zip_file:
    geodict['json_zip'] = json.load(zip_file)

In [38]:
# Serve the dashboard through JupyterDash so it can run from inside the notebook.
# (Plain-Dash alternative: app = dash.Dash(__name__))
external_stylesheets = [
    'https://codepen.io/chriddyp/pen/bWLwgP.css',
]
app = JupyterDash(
    __name__,
    external_stylesheets=external_stylesheets,
)

In [39]:
# Page layout, top to bottom: title, address search box, area-type selector,
# choropleth map, a placeholder div, and the yearly crime-count chart.
app.layout = html.Div(children=[
    html.H1('Chicago Crime & Real Estate'),
    dcc.Input(id='address', type='search'),
    html.Br(),
    # Switches both the map and the chart between neighborhoods and ZIP codes.
    dcc.RadioItems(
        id='location-select',
        options=[
            {'label': 'Neighborhood', 'value': 'Neighborhood'},
            {'label': 'Zip Code', 'value': 'Zip Code'},
        ],
        value='Neighborhood',
        labelStyle={'display': 'inline-block'},
    ),
    html.Br(),
    dcc.Graph(id='chicago-map'),
    html.Br(),
    html.Div(id='hover-test'),
    html.Br(),
    dcc.Graph(id='crime-chart'),
])

In [40]:
@app.callback(
    Output('chicago-map', 'figure'),
    Input('location-select', 'value')
)
def update_chicago_map(value):
    """Build the choropleth of mean crime score for the selected area type.

    Parameters
    ----------
    value : str
        Current value of the 'location-select' radio buttons. 'Neighborhood'
        selects the neighborhood aggregation; anything else selects ZIP codes.

    Returns
    -------
    plotly.graph_objects.Figure
        Mapbox choropleth coloured by 'Crime Score', with the per-capita score
        shown in the hover box.
    """
    if value == 'Neighborhood':
        data_frame = df_neigh
        locations = 'Neighborhood'
        geojson = geodict['json_neigh']
        featureidkey = 'properties.pri_neigh'
    else:
        data_frame = df_zip
        locations = 'Zip Code'
        geojson = geodict['json_zip']
        featureidkey = 'properties.zip'

    fig = px.choropleth_mapbox(
        data_frame = data_frame,
        locations = locations,
        geojson = geojson,
        featureidkey = featureidkey,
        mapbox_style = 'carto-positron',
        color = 'Crime Score',
        center = {'lat': 41.881832, 'lon': -87.623177},
        zoom = 9,
        opacity = 0.5,
        # Surface the per-capita score so the label below has something to apply to.
        hover_data = ['CSperCapita'],
        # `labels` maps column name -> display label. The previous mapping was
        # inverted ('Weighted Score' is not a column), so it had no effect.
        labels = {'CSperCapita': 'Weighted Score'})

    # Let the map fill the whole graph area.
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})

    return fig

In [41]:
@app.callback(
    Output('crime-chart', 'figure'),
    [Input('chicago-map', 'hoverData'),
     Input('location-select', 'value')]
)
def update_crime_chart(hoverData, locationSelectValue):
    """Plot yearly crime counts for the hovered area, or for all of Chicago.

    Parameters
    ----------
    hoverData : dict or None
        Hover payload of the 'chicago-map' graph; None until the user hovers.
    locationSelectValue : str
        Current value of the 'location-select' radio buttons. 'Neighborhood'
        selects the neighborhood counts; anything else selects ZIP code counts.

    Returns
    -------
    plotly.graph_objects.Figure
        Line chart of 'Count' per 'Year', titled with the area shown.
    """
    # Default: the citywide series. Used when nothing has been hovered yet or
    # when the hovered location does not exist in the selected aggregation.
    df_crime_chart = df_crime
    location = 'Chicago'

    if hoverData is not None:
        if locationSelectValue == 'Neighborhood':
            df_by_area = df_crime_neigh
            column = 'Neighborhood'
        else:
            df_by_area = df_crime_zip
            column = 'Zip Code'

        hovered = hoverData['points'][0]['location']
        df_hovered = df_by_area[(df_by_area[column] == hovered)]

        # hoverData survives a switch of the radio buttons, so it may still name
        # an area of the other type; that used to produce an empty chart with a
        # misleading title. Only use the hovered area when it matched rows.
        if not df_hovered.empty:
            df_crime_chart = df_hovered
            location = hovered

    fig = px.line(
        data_frame = df_crime_chart,
        x = 'Year',
        y = 'Count',
        title = str(location)  # ZIP codes arrive as numbers
    )

    return fig

In [42]:
# app.run_server()

In [43]:
# app.run_server(mode="inline")

In [44]:
# app.run_server(mode="jupyterlab")

In [45]:
# Run to terminate the server if necessary:
# app._terminate_server_for_port("localhost", 8050)

## K Means Clustering

In [51]:
# K MEANS CLUSTERING
#
# Precedes GMM in pipeline
#
# INPUT: a ZIP code or neighborhood, a distance threshold
# OUTPUT: crime cluster centers (the number and locations of which will be inputs/parameters for GMM), the number of clusters

from typing import Union
import numpy as np
from shapely.geometry import shape
from sklearn.cluster import KMeans

# Distinct area identifiers that occur in the crime data
# (these could possibly be folded into get_crime_centers later).
neighborhoods = df_crime_neigh['Neighborhood'].unique()
zips = df_crime_zip['Zip Code'].unique()

# Lookup tables: area identifier -> (x, y) of the centroid of the area's geometry.
zip_to_center = {}
neighborhood_to_center = {}

for feature in geodict['json_neigh']['features']:
    label = feature['properties']['pri_neigh']
    centroid = shape(feature['geometry']).centroid
    neighborhood_to_center[label] = (centroid.x, centroid.y)

for feature in geodict['json_zip']['features']:
    # ZIP codes are keyed as ints so they can be looked up with integer inputs.
    code = int(feature['properties']['zip'])
    centroid = shape(feature['geometry']).centroid
    zip_to_center[code] = (centroid.x, centroid.y)

In [22]:
# TO-DO: will want to later allow different data inputs to be specified, since we'll want to filter by crime type
def get_crime_centers(location: Union[str, int], thresh_dist: float=0.0075,
                      inertia_ratio: float=1.05, max_k: int=50,
                      random_state: Optional[int]=None) -> Optional[Tuple[np.ndarray, int]]:
    '''
    Given a neighborhood or ZIP code, finds k-means cluster centers for the crimes
        within the threshold distance of the center of the neighborhood or ZIP code.

    location: neighborhood name (matched case-insensitively) or integer ZIP code
    thresh_dist: maximum L1 (Manhattan) distance from the area center, measured
        in the units of the Latitude/Longitude columns
    inertia_ratio: k stops growing as soon as inertia(k) * inertia_ratio exceeds
        inertia(k - 1), i.e. once an extra cluster no longer pays off
    max_k: largest number of clusters that is ever used
    random_state: seed forwarded to KMeans; pass an int for reproducible centers

    Returns the tuple (array of the cluster centers, number of centers).
    Returns None if the location is not known to both the crime data and the GeoJSON.
    Returns (empty (0, 2) array, 0) if no crime lies within the threshold distance.
    '''
    if isinstance(location, str):
        # Case-insensitive match against the real names. str.title() would
        # mangle names that contain inner capitals.
        wanted = location.casefold()
        location = next((name for name in neighborhood_to_center
                         if isinstance(name, str) and name.casefold() == wanted), None)
        if location is None:
            return None
        known, centers = neighborhoods, neighborhood_to_center
    else:
        known, centers = zips, zip_to_center

    # return None if the location isn't valid (unknown to the data or to the GeoJSON)
    if location not in known or location not in centers:
        return None

    # Centroids are stored as (x, y); the crime columns are (Latitude, Longitude),
    # so the pair is flipped to line up with them.
    center_x, center_y = centers[location]
    center = np.array([center_y, center_x])

    coords = df[['Latitude', 'Longitude']]
    distances = np.linalg.norm((coords - center), axis=1, ord=1)
    crime_thresholded = df.loc[distances < thresh_dist, ['Latitude', 'Longitude']]

    n_points = len(crime_thresholded)
    if n_points == 0:
        return np.empty((0, 2)), 0

    # KMeans cannot fit more clusters than there are samples.
    upper_k = min(max_k, n_points)

    # the actual k-means: grow k until the inertia stops improving enough
    best_model = None
    prev_model = None
    prev_inertia = np.inf
    for k in range(2, upper_k):
        model = KMeans(n_clusters=k, random_state=random_state).fit(crime_thresholded)
        if model.inertia_ * inertia_ratio > prev_inertia:
            # Keep the model already fitted for k - 1 instead of refitting, so the
            # returned centers are the ones that were actually evaluated.
            best_model = prev_model
            break
        prev_model, prev_inertia = model, model.inertia_

    if best_model is None:
        # Inertia kept improving (or there were too few points to iterate).
        best_model = KMeans(n_clusters=upper_k, random_state=random_state).fit(crime_thresholded)

    return best_model.cluster_centers_, best_model.n_clusters

In [23]:
# Example: crime cluster centers around the center of the Beverly neighborhood,
# using the default distance threshold (lower-case input is accepted).
get_crime_centers('beverly')

(array([[ 41.71309896, -87.66855674],
        [ 41.71103062, -87.67904904],
        [ 41.71671584, -87.67524779],
        [ 41.71421828, -87.68008497],
        [ 41.70757552, -87.67611822],
        [ 41.71134658, -87.67681451],
        [ 41.70984051, -87.67260277],
        [ 41.71664015, -87.6775091 ],
        [ 41.71490638, -87.67180153],
        [ 41.7129353 , -87.68161532],
        [ 41.70945776, -87.67613315],
        [ 41.71879352, -87.67399759],
        [ 41.71492556, -87.67421702],
        [ 41.71266301, -87.67050361],
        [ 41.71484508, -87.67784477],
        [ 41.71303903, -87.67695105],
        [ 41.71299234, -87.67315216],
        [ 41.71670066, -87.67287579],
        [ 41.71841603, -87.67650625],
        [ 41.7113207 , -87.67426316]]),
 20)