-
Notifications
You must be signed in to change notification settings - Fork 0
/
crime_clustering.py
174 lines (144 loc) · 7.15 KB
/
crime_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# K MEANS CLUSTERING
#
# Precedes GMM in pipeline
#
# INPUT: a ZIP code or neighborhood, a distance threshold
# OUTPUT: crime cluster centers (the number and locations of which will be inputs/parameters for GMM), the number of clusters
from typing import Union
import numpy as np
from shapely.geometry import shape
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import euclidean_distances
import pandas as pd
from scipy.spatial import cKDTree
import geopy
from geopy.geocoders import Nominatim
from geopy import Point, distance
from collections import Counter
class CrimeCluster:
def __init__(self, df_crime: pd.DataFrame, neighborhoods: dict, zip_codes: dict):
self._df_crime = df_crime
self._neighborhoods = df_crime.Neighborhood.unique()
self._zip_codes = df_crime['Zip Code'].unique()
self._neighborhood_to_center = self.__property_to_center(neighborhoods, 'pri_neigh')
self._zip_to_center = self.__property_to_center(zip_codes, 'zip')
self._tree = cKDTree(df_crime[['Latitude', 'Longitude']])
def __property_to_center(self, geo: dict, property: str) -> dict:
property_to_center = {}
for location in geo['features']:
name = location['properties'][property]
if property == 'zip':
name = int(name)
center = shape(location['geometry']).centroid
center = center.x, center.y
property_to_center[name] = center
return property_to_center
# TO-DO: will want to later allow different data inputs to be specified, since we'll want to filter by crime type
def GMM(self, location: Union[str, int], crime_type: str, thresh_dist: float=0.025) -> (pd.DataFrame, int):
'''
Given a neighborhood or ZIP code, finds the best k-means cluster centers for crimes within the threshold
distance of the center of the neighborhood or ZIP code, then returns the GMM means and covariance matrices.
Returns the tuple (number of cluster centers, array of the cluster centers , covariance matrices)
'''
# convert to upper-case
if isinstance(location, str):
location = location.title()
# return None if the location isn't valid
if location not in self._neighborhoods and location not in self._zip_codes:
return None
if isinstance(location, str):
center_y, center_x = self._neighborhood_to_center[location]
else:
center_y, center_x = self._zip_to_center[location]
center = np.array([center_x, center_y])
if crime_type != 'ALL':
df_crime = self._df_crime[self._df_crime['Crime Type'] == crime_type]
else:
df_crime = self._df_crime
crime_locations = df_crime[['Latitude', 'Longitude']]
distances = np.linalg.norm((crime_locations - center), axis=1, ord=1)
crime_thresholded = df_crime[distances < thresh_dist][['Latitude', 'Longitude']]
# the actual k-means
old_inertia = np.inf
best_k = 8
for k in range(2, 8):
model = KMeans(n_clusters=k).fit(crime_thresholded)
new_inertia = model.inertia_
n_per_cluster = Counter(model.labels_)
if new_inertia * 1.05 > old_inertia or min(n_per_cluster.values()) < 0.1 * sum(n_per_cluster.values()):
best_k = k - 1
break
else:
old_inertia = new_inertia
k_means = KMeans(n_clusters=best_k).fit(crime_thresholded)
centers = k_means.cluster_centers_
df_centers = pd.DataFrame(data=centers, columns=['lat', 'lon'])
model = GaussianMixture(n_components=best_k, means_init=df_centers).fit(crime_thresholded)
n_per_cluster = Counter(k_means.labels_)
n_weights = np.zeros(best_k)
for i in n_per_cluster.keys():
n_weights[i] = n_per_cluster[i]
return best_k, model.means_, model.covariances_, n_weights
def GMM_Address(self, address: str, radius: int, crime_type: str, thresh_dist: float=0.025):
geolocator = Nominatim(user_agent='team175')
location = geolocator.geocode(address)
print(location)
start = Point(location.latitude, location.longitude)
dist = distance.geodesic(miles=radius)
end = dist.destination(start, bearing=0)
degrees = end.latitude - start.latitude
center = (location.latitude, location.longitude)
points = self._tree.query_ball_point(center, degrees)
if crime_type != 'ALL':
df_crime = self._df_crime.iloc[points]
df_crime = df_crime[df_crime['Crime Type'] == crime_type]
else:
df_crime = self._df_crime.iloc[points]
crime_thresholded = df_crime[['Latitude', 'Longitude']]
old_inertia = np.inf
best_k = 8
for k in range(2, 8):
model = KMeans(n_clusters=k).fit(crime_thresholded)
new_inertia = model.inertia_
n_per_cluster = Counter(model.labels_)
if new_inertia * 1.05 > old_inertia or min(n_per_cluster.values()) < 0.1 * sum(n_per_cluster.values()):
best_k = k - 1
break
else:
old_inertia = new_inertia
k_means = KMeans(n_clusters=best_k).fit(crime_thresholded)
centers = k_means.cluster_centers_
df_centers = pd.DataFrame(data=centers, columns=['lat', 'lon'])
model = GaussianMixture(n_components=best_k, means_init=df_centers).fit(crime_thresholded)
n_per_cluster = Counter(k_means.labels_)
n_weights = np.zeros(best_k)
for i in n_per_cluster.keys():
n_weights[i] = n_per_cluster[i]
return best_k, model.means_, model.covariances_, df_crime, n_weights, start
def GMM_Chicago(self, crime_type):
if crime_type != 'ALL':
df_crime = self._df_crime[self._df_crime['Crime Type'] == crime_type]
else:
df_crime = self._df_crime
crime_thresholded = df_crime[['Latitude', 'Longitude']]
old_inertia = np.inf
best_k = 8
for k in range(2, 8):
model = KMeans(n_clusters=k).fit(crime_thresholded)
new_inertia = model.inertia_
n_per_cluster = Counter(model.labels_)
if new_inertia * 1.05 > old_inertia or min(n_per_cluster.values()) < 0.1 * sum(n_per_cluster.values()):
best_k = k - 1
break
else:
old_inertia = new_inertia
k_means = KMeans(n_clusters=best_k).fit(crime_thresholded)
centers = k_means.cluster_centers_
df_centers = pd.DataFrame(data=centers, columns=['lat', 'lon'])
model = GaussianMixture(n_components=best_k, means_init=df_centers).fit(crime_thresholded)
n_per_cluster = Counter(k_means.labels_)
n_weights = np.zeros(best_k)
for i in n_per_cluster.keys():
n_weights[i] = n_per_cluster[i]
return best_k, model.means_, model.covariances_, n_weights