In [1]:
import numpy as np
import json
import bokeh
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.tile_providers import CARTODBPOSITRON, CARTODBPOSITRON_RETINA
from datetime import datetime

import math

from pylab import cm
import matplotlib

### Clustering 

import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

In [2]:
def coords_to_plot(lat, lon):
    r_major = 6378137.000
    x = r_major * math.radians(lon)
    scale = x/lon
    y = 180.0/math.pi * math.log(math.tan(math.pi/4.0 + 
        lat * (math.pi/180.0)/2.0)) * scale
    x = int(x)
    y = int(y)
    return (x, y)

def normalize(coords_price):
    cord_min = coords_price.min()
    cord_max = coords_price.max()
    def scale(cord):
        return (cord - cord_min)/(cord_max-cord_min)
    return coords_price.apply(lambda x: scale(x))

def make_json(row):
    return {'lat': row['lat'], 'long': row['long'], 'value': round(row['normed'], 6)} 

In [3]:
def get_cluster_dict(data, lat_col, lon_col, kms_radius, min_samples):
    
    def get_centermost_point(cluster):
        try:
            centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
            centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
            return tuple(centermost_point)
        except Exception as e:
            #print(e, MultiPoint(cluster))
            return None
        
    def cluster_dict(row):
        return {'lat': row[lat_col], 'lon': row[lon_col], 'weight': row['weight']}
    
    coords = data[[lat_col, lon_col]].values
    data = data[[lat_col, lon_col]].dropna()
    kms_per_radian = 6371.0088
    epsilon = kms_radius / kms_per_radian
    db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_
    num_clusters = len(set(cluster_labels))
    clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
    print('Number of clusters: {}'.format(num_clusters))
    
    data['label'] = db.labels_
    #data.groupby('label').count().head(20)
    
    centermost_points = clusters.map(get_centermost_point)
    
    centermost_points.dropna(inplace=True)
    
    lats, lons = zip(*centermost_points)
    rep_points = pd.DataFrame({'lon':lons, 'lat':lats})

    ### This doesn't handle cases where there is no noise. ### (.values[1:])
    
    rs = rep_points.apply(lambda row: data[(data[lat_col]==row['lat']) & (data[lon_col]==row['lon'])].iloc[0], axis=1)
    
    if min_samples > 1:
        rs['count'] = [x[0] for x in data.groupby('label').count().values[1:]]
    else:
        rs['count'] = [x[0] for x in data.groupby('label').count().values]
    rs['weight'] = rs['count'].apply(lambda x: round(x/rs['count'].sum(), 6))

    return list(rs.apply(lambda x: cluster_dict(x), axis=1))

In [4]:
def plot_weights(cluster_lat_lons):
    rs = pd.DataFrame(cluster_lat_lon)
    p = figure(x_range=(-13630000, -13610000), y_range=(6038000, 6046000),
                   x_axis_type="mercator", y_axis_type="mercator")
    p.add_tile(CARTODBPOSITRON_RETINA)
    rs['merc'] = rs.apply(lambda x: coords_to_plot(x['lat'], x['lon']), axis=1).values
    for i in rs.index:
        x, y, w = rs.loc[i, 'merc'][0], rs.loc[i, 'merc'][1], rs.loc[i, 'weight']
        p.circle(x = x, 
                 y = y,
                 size=w*1000,
                 fill_alpha=.5,
                 line_alpha=0,
                )
    output_notebook()
    show(p)

In [5]:
def plot_by_year(df, year_col, lat_col, lon_col):
    p = figure(x_range=(-13630000, -13610000), y_range=(6038000, 6046000),
               x_axis_type="mercator", y_axis_type="mercator")
    p.add_tile(CARTODBPOSITRON_RETINA)
    cmap = cm.get_cmap('seismic', len(df[year_col].unique()))
    for cmi, year in enumerate(df[year_col].unique()[-5:]): # last 5 years
        rgb = cmap(cmi)[:3]
        rgb = matplotlib.colors.rgb2hex(rgb)
        print(year)
        coords = df.loc[df['year'] == year, [lat_col, lon_col]]
        coords = coords.apply(lambda x: coords_to_plot(x[lat_col], x[lon_col]), axis=1).values
        for coord in coords[:]:
            x, y = coord[0], coord[1]
            p.circle(x = x, 
                     y = y,
                     size=30,
                     fill_alpha=.1,
                     line_alpha=0,
                     fill_color = rgb
                    )
    output_notebook()
    show(p)

In [14]:
data = pd.read_csv('./seattle-code-violation-cases/code-violation-cases.csv')
data.head()

print('len:', len(data))
print('start:', data['Date Case Created'].min())
print('end:', data['Date Case Created'].max(), '\n')
# print('nulls:')
# print(data.isnull().sum())

data = data.loc[data['Case Type'] == 'TENANT RELOCATION ORDINANCE']
# data.loc[data['Case Type'] == 'TENANT RELOCATION ORDINANCE', 'Date Case Created'].sort_values()

data = data.dropna(subset=['Latitude', 'Longitude'])

# print('nulls:')
# print(data.isnull().sum())

data['year'] = sorted(data['Date Case Created'].apply(lambda x: int(x[:4])))

# for year in data['year'].unique():
#     print(year, len(data.loc[data['year'] == year]))

w1 = get_cluster_dict(data, 'Latitude', 'Longitude', .1, 5)

len: 41485
start: 2003-09-17T00:00:00
end: 2018-04-27T00:00:00 

Number of clusters: 92


In [15]:
data = pd.read_csv('./SPD_Reports.csv')
data['Offence Start Date'] = pd.to_datetime(data['Offense Start Date'])

print('len:', len(data))
print('start:', data['Offence Start Date'].min())
print('end:', data['Offence Start Date'].max(), '\n')
print('nulls:')
print(data.isnull().sum())

# cluster_lat_lon = get_cluster_dict(data.loc[data['Offense Type'] == 'THEFT-OTH'], 'Latitude', 'Longitude', .1, 30)
# plot_weights(cluster_lat_lon)

w2 = get_cluster_dict(data.loc[data['Offense Type'] == 'THEFT-OTH'], 'Latitude', 'Longitude', .1, 30)

len: 684472
start: 1965-01-10 00:00:00
end: 2016-09-19 13:00:00 

nulls:
Offense Type                0
Offense Description         1
Report Date                 1
Offense Start Date          1
Offense End Date       332148
Block                       1
District                  503
Beat                      497
2000 Census Tract        1813
Longitude                   1
Latitude                    1
Offence Start Date          1
dtype: int64
Number of clusters: 128


In [16]:
# https://www.kaggle.com/sohier/seattle-police-department-911-incident-response (380 MB)

data = pd.read_csv('/Users/sam/Downloads/Seattle_Police_Department_911_Incident_Response.csv')

data.drop(columns=['Initial Type Description', 
                   'Initial Type Subgroup', 
                   'Initial Type Group', 
                   'At Scene Time'], inplace=True)

print('len:', len(data))

# Downsample
rand_ind = np.random.choice(data.index, 50000)
data = data.loc[rand_ind]

print('Downsample len:', len(data))
data['Event Clearance Date'] = pd.to_datetime(data['Event Clearance Date'])

print('start:', data['Event Clearance Date'].min())
print('end:', data['Event Clearance Date'].max(), '\n')
print('nulls:')
print(data.isnull().sum())

# data['year'] = data['Event Clearance Date'].apply(lambda x: x.year)

# data['Event Clearance Description'].value_counts()

data.loc[data['Event Clearance Description'] == 'SUSPICIOUS PERSON']

# cluster_lat_lon = get_cluster_dict(data.loc[data['Event Clearance Description'] == 'SUSPICIOUS PERSON'], 'Latitude', 'Longitude', .1, 2)
# plot_weights(cluster_lat_lon)

w3 = get_cluster_dict(data.loc[data['Event Clearance Description'] == 'SUSPICIOUS PERSON'], 
                      'Latitude', 'Longitude', 
                      .1, 2)


  interactivity=interactivity, compiler=compiler, result=result)


len: 1433853
Downsample len: 50000
start: 2009-10-07 00:25:00
end: 2017-08-29 11:25:31 

nulls:
CAD CDW ID                       0
CAD Event Number                 0
General Offense Number           0
Event Clearance Code           431
Event Clearance Description    431
Event Clearance SubGroup       431
Event Clearance Group          431
Event Clearance Date           437
Hundred Block Location         112
District/Sector                 46
Zone/Beat                        0
Census Tract                   115
Longitude                        0
Latitude                         0
Incident Location                0
dtype: int64
Number of clusters: 783


In [17]:
data = pd.read_csv('./kc_house_data.csv')

print(len(data))

data['year'] = data['date'].apply(lambda x: int(str(x)[:4]))
data['date'] = pd.to_datetime(data['date'])

print('start:', data['date'].min())
print('end:', data['date'].max(), '\n')

print('nulls:')
print(data.isnull().sum())

# Make sample data
coords = data.loc[data['year'] == 2014, ['price', 'lat', 'long']]
coords['normed'] = normalize(coords['price'])
coords = coords[:2000]

data = list(coords.apply(lambda x: make_json(x), axis=1).values)

# with open('data.json', 'w') as fp:
#     json.dump(data, fp)

# cluster_lat_lon = get_cluster_dict(pd.DataFrame(data), 'lat', 'long', .5, 5)
# cluster_lat_lon
# plot_weights(data)

w4 = get_cluster_dict(pd.DataFrame(data), 'lat', 'long', .5, 5)

21613
start: 2014-05-02 00:00:00
end: 2015-05-27 00:00:00 

nulls:
id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
year             0
dtype: int64
Number of clusters: 67


In [18]:
data = pd.read_csv('./seattle-land-use-permits/land-use-permits.csv')
len(data)

for col in [col for col in data.columns if 'Date' in col]:
    try:
        data[col] = pd.to_datetime(data[col])
    except:
        pass
    
    print(col, 'min:', data[col].min())
    print(col, 'max:', data[col].max())
    
data.dtypes

data.isnull().sum()

data['Category'].unique()

data.loc[data['Category'] == 'COMMERCIAL'].isnull().sum()

# cluster_lat_lon = get_cluster_dict(data.loc[data['Category'] == 'COMMERCIAL'].dropna(), 'Latitude', 'Longitude', .5, 2)
# cluster_lat_lon
# plot_weights(data)

# data.loc[data['Category'] == 'MULTIFAMILY']
# cluster_lat_lon = get_cluster_dict(data.loc[data['Category'] == 'MULTIFAMILY'].dropna(), 'Latitude', 'Longitude', .5, 2)
# plot_weights(data)

w5 = get_cluster_dict(data.loc[data['Category'] == 'COMMERCIAL'].dropna(), 'Latitude', 'Longitude', .5, 2)
w6 = get_cluster_dict(data.loc[data['Category'] == 'MULTIFAMILY'].dropna(), 'Latitude', 'Longitude', .5, 2)


Application Date min: 1992-01-30 00:00:00
Application Date max: 2018-04-27 00:00:00
Decision Date min: 1992-12-24 00:00:00
Decision Date max: 2018-04-27 00:00:00
Issue Date min: 2002-08-06 00:00:00
Issue Date max: 2018-04-27 00:00:00
Number of clusters: 54
Number of clusters: 44


In [25]:
# w1, w2, w3, w4, w5, w6

In [23]:
w1.extend(w2 + w3 + w4 + w5 + w6)

In [48]:
cluster_lat_lon = get_cluster_dict(pd.DataFrame(w1), 'lat', 'lon', .5, 1)

Number of clusters: 139


In [49]:
plot_weights(cluster_lat_lon)

In [41]:
with open('data2.json', 'w') as fp:
    json.dump(cluster_lat_lon, fp)