In [None]:
import pandas as pd 
import numpy as np
import json
import requests 
from pandas.io.json import json_normalize 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster
from folium import plugins
from folium.plugins import HeatMap
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import warnings 
warnings.filterwarnings("ignore")


In [None]:
# Load the two year-to-date NYPD exports. Both paths are relative, so the
# CSV files are resolved against the notebook's working directory.
arrest_data = pd.read_csv(
    filepath_or_buffer='NYPD_Arrest_Data__Year_to_Date_.csv',
)
shots_data = pd.read_csv(
    filepath_or_buffer='NYPD_Shooting_Incident_Data__Year_To_Date_.csv',
)

# Examining the Data visually

In [None]:
# Preview the first five arrest records to eyeball the available columns.
arrest_data.head(n=5)

In [None]:
# Preview the first five shooting-incident records.
shots_data.head(n=5)

In [None]:
# Plain Python lists of the shooting coordinates, consumed by the folium
# marker loop further down.
lat_shots = shots_data["Latitude"].tolist()
lon_shots = shots_data["Longitude"].tolist()

## Filtering for crimes that are inflicted on another individual

In [None]:
# Keep only arrests whose PD_DESC mentions an offence against another person
# or their property. The original author's note says this brings the data
# down to roughly 68k rows.
# Rows with a missing value in ANY column are dropped first, so the string
# match below never sees NaN.
arrest_data = arrest_data.dropna()

# Case-insensitive keywords matched against PD_DESC. The original pattern
# listed 'assault' twice; the duplicate alternative matched nothing extra.
personal_crime_keywords = ['assault', 'theft', 'burglary', 'weapons', 'sexual abuse', 'larceny']
personal_crime_pattern = '|'.join(personal_crime_keywords)
arrest_data = arrest_data[arrest_data["PD_DESC"].str.contains(personal_crime_pattern, case=False)]

# Coordinate-only frames for the mapping / clustering cells. They are taken
# as explicit copies because cluster-label columns are added to them later,
# and that must never write into (or warn about) the parent frames.
arrestloc = arrest_data[['Latitude', 'Longitude']].copy()
shotsloc = shots_data[['Latitude', 'Longitude']].copy()

In [None]:
# Offence description plus coordinates for the first 45,000 filtered arrests
# (presumably capped to keep the heat map and DBSCAN tractable -- TODO confirm).
# Taken as an explicit copy because a cluster-label column is added later.
arrestloc = arrest_data[['OFNS_DESC', 'Latitude', 'Longitude']].iloc[:45000].copy()
heat = arrestloc

# [[lat, lon], ...] pairs in the shape folium's HeatMap expects.
# DataFrame.as_matrix() was removed in pandas 1.0, so select the two
# coordinate columns explicitly and convert with to_numpy() instead.
data = heat[['Latitude', 'Longitude']].to_numpy().tolist()

In [None]:
# Extract name, line and longitude/latitude of every subway entrance
import pygeoj

testfile = pygeoj.load("Subway Entrances.geojson")

# Materialise the features once, then build the four parallel lists from them.
subway_features = [feature for feature in testfile]

lines = [entry.properties['line'] for entry in subway_features]
names = [entry.properties['name'] for entry in subway_features]
# Coordinates are read as [longitude, latitude]: index 1 is stored as the
# latitude and index 0 as the longitude.
latitude_sub = [entry.geometry.coordinates[1] for entry in subway_features]
longitude_sub = [entry.geometry.coordinates[0] for entry in subway_features]

# Heatmap of NYC crime superimposed with subway entrances

In [None]:
# Map centre used by this and the later maps (decimal degrees).
latitude = 40.7128
longitude = -74.006
map_ny = folium.Map(location=[latitude, longitude], zoom_start=10)

# Heat-map colour ramp: relative intensity -> colour.
colormap = {0.0: 'pink', 0.3: 'blue', 0.5: 'green', 0.7: 'yellow', 1.0: 'red'}

# One blue circle per subway entrance, with stop name and line in the popup.
for lat, lng, name, line in zip(latitude_sub, longitude_sub, names, lines):
    label = 'Stop Name:{} Line:{}'.format(name, line)
    # parse_html is a Popup option, not a CircleMarker one. Setting it here
    # escapes characters such as '&' and quotes in station names so they are
    # shown as text instead of being interpreted as markup.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_ny)

# Arrest-density layer drawn over the subway markers.
# Dropped from the original call: max_val (deprecated by folium, and 1.0 was
# its default anyway) plus fill_opacity / use_local_extrema, which are not
# options of the Leaflet.heat layer that HeatMap wraps.
HeatMap(data,
        min_opacity=0.5,
        max_zoom=18,
        radius=15,
        blur=20,
        gradient=colormap,
        overlay=True).add_to(map_ny)

map_ny

# Visualization of shooting incidents YTD 2019, NYC


In [None]:
# Centre the shooting-incident map on the same New York City coordinates.
latitude, longitude = 40.7128, -74.006
map_nys = folium.Map(location=[latitude, longitude], zoom_start=10)

# One unfilled crimson ring per shooting incident; the popup is the borough.
shot_records = zip(lat_shots, lon_shots, list(shots_data.BORO))
for latshot, lonshot, boro in shot_records:
    shot_ring = folium.Circle(
        [latshot, lonshot],
        radius=5,
        popup=boro,
        color='crimson',
        fill=False,
    )
    shot_ring.add_to(map_nys)


map_nys

# Density Based Clustering to split shooting incident data into distinct spatial areas

In [None]:
## using DBSCAN to visualize shooting incidents
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler

# The original cell called sklearn.utils.check_random_state(1000) here and
# discarded the result. That call only builds a RandomState object and seeds
# nothing, so it was removed rather than left looking like a reproducibility
# guarantee.

# Feature matrix for clustering: one (latitude, longitude) row per incident.
Clus_dataSet = shots_data[['Latitude','Longitude']]
# Missing coordinates become 0.0 so the row count stays aligned with shotsloc.
# NOTE(review): any such row would sit at (0, 0) and skew the scaling below --
# confirm the shooting data has no missing coordinates.
Clus_dataSet = np.nan_to_num(Clus_dataSet)
# Standardise both columns to zero mean / unit variance; DBSCAN's eps is
# therefore expressed in standard deviations, not degrees.
Clus_dataSet = StandardScaler().fit_transform(Clus_dataSet)

In [None]:
# Density-based clustering of the standardised shooting coordinates.
db = DBSCAN(eps=.30, min_samples=55).fit(Clus_dataSet)
labels = db.labels_

# Boolean mask flagging the samples DBSCAN identified as core points.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Attach the labels through assign() so a new frame is produced instead of
# writing a column into a frame that was sliced out of shots_data (the
# original item assignment triggers SettingWithCopyWarning, which the
# notebook-wide warnings filter hides).
shotsloc = shotsloc.assign(Clus_Db=labels)

# Label -1 is DBSCAN's noise label, so it is excluded from the real cluster count.
realClusterNum = len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))

shotsloc.Clus_Db.unique()

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Build exactly one colour per label that is present (noise label -1 included).
# The original hard-coded five colours and indexed them with rainbow[cluster-1],
# which gave noise (-1) the same colour as cluster 4 and raised IndexError as
# soon as a label exceeded the palette size.
cluster_ids = sorted(shotsloc['Clus_Db'].unique())
colors_array = cm.rainbow(np.linspace(0, 1, len(cluster_ids)))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]
cluster_color = dict(zip(cluster_ids, rainbow))

# One circle per shooting incident, coloured by its DBSCAN label.
for lat, lon, cluster in zip(shotsloc['Latitude'], shotsloc['Longitude'], shotsloc['Clus_Db']):
    label = folium.Popup(' Cluster ' + str(cluster))
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color[cluster],
        fill=True,
        fill_color=cluster_color[cluster],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

### Looking at the map above, the algorithm picks out several broad areas, such as northern Manhattan, the Bronx, and Brooklyn, but the resolution is not fine enough to separate distinct neighborhoods. In Manhattan, Washington Heights and Harlem are grouped together, and in Brooklyn, Bed-Stuy, Bushwick, and Crown Heights all fall into the same cluster.

# Density based clustering on arrest records 

In [None]:
# First DBSCAN pass over the whole arrest sample (all boroughs); the cells
# below re-cluster the Manhattan cluster on its own.
latitude = 40.7128
longitude = -74.006

# Feature matrix: standardised (latitude, longitude) of every sampled arrest.
# Missing coordinates, if any, are mapped to 0.0 by nan_to_num.
Clus_arrestSet = arrestloc[['Latitude','Longitude']]
Clus_arrestSet = np.nan_to_num(Clus_arrestSet)
Clus_arrestSet = StandardScaler().fit_transform(Clus_arrestSet)

# Compute DBSCAN. Other settings noted by the author: eps=.1 with
# min_samples=750, and eps=.075 with min_samples=200.
# Best clustering found: eps=.075, min_samples=150.
db = DBSCAN(eps=.075, min_samples=150).fit(Clus_arrestSet)
labels = db.labels_

# Boolean mask flagging the samples DBSCAN identified as core points.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# assign() returns a new frame, so the labels are never written into the
# [:45000] slice of arrest_data (the original item assignment triggers
# SettingWithCopyWarning, hidden by the notebook-wide warnings filter).
arrestloc = arrestloc.assign(Clus_Db=labels)

# Label -1 is DBSCAN's noise label, so it is excluded from the real cluster count.
realClusterNum = len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))

arrestloc.Clus_Db.unique()

In [None]:
nyc_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Build exactly one colour per label that is present (noise label -1 included).
# The original hard-coded 39 colours and indexed them with rainbow[cluster-1],
# so noise (-1) and cluster 0 borrowed the colours of the two highest slots and
# any change in the number of clusters either collided or raised IndexError.
cluster_ids = sorted(arrestloc['Clus_Db'].unique())
colors_array = cm.rainbow(np.linspace(0, 1, len(cluster_ids)))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]
cluster_color = dict(zip(cluster_ids, rainbow))

# One circle per arrest, coloured by its DBSCAN label.
for lat, lon, cluster in zip(arrestloc['Latitude'], arrestloc['Longitude'], arrestloc['Clus_Db']):
    label = folium.Popup(' Cluster ' + str(cluster))
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color[cluster],
        fill=True,
        fill_color=cluster_color[cluster],
        fill_opacity=0.7).add_to(nyc_clusters)
nyc_clusters

### Several areas stand out as clusters; however, the resolution is not fine enough to separate neighborhoods within Manhattan, so we will run DBSCAN again on the Manhattan cluster alone.

In [None]:
map_ny1 = folium.Map(location=[latitude, longitude], zoom_start=10)

# Arrests assigned to cluster 2 by the first DBSCAN pass, which the author
# identified as Manhattan.
# NOTE(review): the label number is hard-coded; re-check it against the
# cluster map if the input data or DBSCAN parameters change.
manhattan = arrestloc[arrestloc['Clus_Db'] == 2]

# One blue circle per arrest in that cluster, with the offence in the popup.
for lat, lng, crime in zip(manhattan.Latitude.to_list(), manhattan.Longitude.to_list(), manhattan.OFNS_DESC.to_list()):
    label = 'Crime:{}'.format(crime)
    # parse_html is a Popup option, not a CircleMarker one. Offence
    # descriptions can contain characters such as '&', which must be escaped
    # to be shown as text in the popup.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_ny1)

map_ny1

In [None]:
# Arrests in cluster 2 of the first DBSCAN pass (the Manhattan cluster).
# Taken as an explicit copy because a second cluster-label column is added to
# this frame later; without the copy that write targets a filtered view of
# arrestloc and triggers SettingWithCopyWarning.
manhattan = arrestloc[arrestloc['Clus_Db'] == 2].copy()


In [None]:
## Second DBSCAN pass, restricted to the Manhattan cluster (eps=.1, min_samples=200)
# Coordinates are re-standardised on this subset, so eps is relative to the
# spread of the Manhattan points only.
Clus_dataSet2 = manhattan[['Latitude','Longitude']]
Clus_dataSet2 = np.nan_to_num(Clus_dataSet2)
Clus_dataSet2 = StandardScaler().fit_transform(Clus_dataSet2)

db = DBSCAN(eps=.1, min_samples=200).fit(Clus_dataSet2)
labels = db.labels_

# Boolean mask flagging the samples DBSCAN identified as core points.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# assign() returns a new frame instead of writing a column into the
# boolean-mask subset of arrestloc (the original item assignment triggers
# SettingWithCopyWarning, hidden by the notebook-wide warnings filter).
manhattan = manhattan.assign(Clus_Db_second=labels)

# Label -1 is DBSCAN's noise label, so it is excluded from the real cluster count.
realClusterNum = len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))
manhattan.Clus_Db_second.unique()


# Manhattan split



In [None]:
man_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: exactly one colour per label present
# (noise label -1 included). The original hard-coded 15 colours and indexed
# them with rainbow[cluster-1], so colours collided or raised IndexError
# whenever the number of clusters differed from that guess.
cluster_ids = sorted(manhattan['Clus_Db_second'].unique())
colors_array = cm.rainbow(np.linspace(0, 1, len(cluster_ids)))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]
cluster_color = dict(zip(cluster_ids, rainbow))

# add markers to the map, coloured by the second-pass DBSCAN label
for lat, lon, cluster in zip(manhattan['Latitude'], manhattan['Longitude'], manhattan['Clus_Db_second']):
    label = folium.Popup(' Cluster ' + str(cluster))
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color[cluster],
        fill=True,
        fill_color=cluster_color[cluster],
        fill_opacity=0.7).add_to(man_clusters)
man_clusters

In [None]:

# Third DBSCAN pass: re-cluster only the points that the second pass put in
# cluster 0.
manhattanzoom = manhattan[manhattan['Clus_Db_second'] == 0]
Clus_dataSet3 = manhattanzoom[['Latitude','Longitude']]
Clus_dataSet3 = np.nan_to_num(Clus_dataSet3)
Clus_dataSet3 = StandardScaler().fit_transform(Clus_dataSet3)

db = DBSCAN(eps=.25, min_samples=100).fit(Clus_dataSet3)
labels = db.labels_

# Boolean mask flagging the samples DBSCAN identified as core points.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# assign() returns a new frame instead of writing a column into the
# boolean-mask subset of manhattan (the original item assignment triggers
# SettingWithCopyWarning, hidden by the notebook-wide warnings filter).
manhattanzoom = manhattanzoom.assign(Clus_Db_third=labels)

# Label -1 is DBSCAN's noise label, so it is excluded from the real cluster count.
realClusterNum = len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))
manhattanzoom.Clus_Db_third.unique()
# NOTE(review): the original trailing note read "eps=.2 min samples=100",
# which does not match the eps=.25 used above -- presumably an earlier trial.

In [None]:
man_clusters_zoom = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: exactly one colour per label present
# (noise label -1 included). The original hard-coded 9 colours and indexed
# them with rainbow[cluster-1], so colours collided or raised IndexError
# whenever the number of clusters differed from that guess.
cluster_ids = sorted(manhattanzoom['Clus_Db_third'].unique())
colors_array = cm.rainbow(np.linspace(0, 1, len(cluster_ids)))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]
cluster_color = dict(zip(cluster_ids, rainbow))

# add markers to the map, coloured by the third-pass DBSCAN label
for lat, lon, cluster in zip(manhattanzoom['Latitude'], manhattanzoom['Longitude'], manhattanzoom['Clus_Db_third']):
    label = folium.Popup(' Cluster ' + str(cluster))
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color[cluster],
        fill=True,
        fill_color=cluster_color[cluster],
        fill_opacity=0.7).add_to(man_clusters_zoom)
man_clusters_zoom

# Picking stops to explore with Foursquare, based on visual inspection of the heatmap

In [None]:
# Assemble the parallel subway-entrance lists into one table, dropping any
# entrance that has a missing field.
dataff = dict(
    Name=names,
    Line=lines,
    Latitude=latitude_sub,
    Longitude=longitude_sub,
)
subway = pd.DataFrame(dataff).dropna()

subway

In [None]:
def _entrances_matching(fragment):
    """Return the rows of `subway` whose Name contains `fragment`, ignoring case."""
    name_matches = subway["Name"].str.contains(fragment, case=False)
    return subway[name_matches]


# One selection of subway entrances per stop picked from the heat map.
canal = _entrances_matching('canal')
fourteenth = _entrances_matching('14th')
Penn = _entrances_matching('34th')
eightsix = _entrances_matching('Lexington Ave & 86th')
broadway = _entrances_matching('Broadway & 96th')
harlem = _entrances_matching('Lexington Ave & 125th')
jerome = _entrances_matching('Jerome Ave & 170th')
gc = _entrances_matching('Grand Concourse & Fordham')
fdr = _entrances_matching('Junction Blvd & Roosevelt')
hillside = _entrances_matching('169th st & Hillside')
flushing = _entrances_matching('Flushing Ave & Broadway')
hoyt = _entrances_matching('Hoyt st & Fulton')
nostrand = _entrances_matching('nostrand ave & avenue h')
beach = _entrances_matching('Beach 22nd St & Mott Av')
surf = _entrances_matching('Stillwell ave & surf ave')

In [None]:
# The entrance selections chosen for further exploration, in a fixed order.
stations = (canal, fourteenth, Penn, eightsix, broadway, harlem, jerome, gc,
            fdr, hillside, flushing, hoyt, nostrand, beach, surf)
crimestops = folium.Map(location=[latitude, longitude], zoom_start=12)
POI = []

# One marker per selection, placed at the mean position of its entrances.
for station in stations:
    # A name filter that matched nothing has no position to plot; the
    # original sum()/len() would raise ZeroDivisionError here.
    if station.empty:
        continue

    # The original passed stations[0] (the whole `canal` DataFrame) as the
    # popup of every marker. Label each marker with the name of its own
    # first matching entrance instead; parse_html escapes the '&' that
    # appears in these names.
    station_name = str(station.Name.iloc[0])
    label = folium.Popup(station_name, parse_html=True)

    folium.CircleMarker(
        [station.Latitude.mean(), station.Longitude.mean()],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7).add_to(crimestops)
crimestops
    

In [None]:
# Sanity check: mean latitude of the entrances matched by the 'canal' filter.
sum(canal["Latitude"]) / len(canal["Latitude"])

In [None]:
# Entrance names contained in the first selection of `stations`.
stations[0]["Name"]

