In [15]:
import numpy as np
import json
import bokeh
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.tile_providers import CARTODBPOSITRON, CARTODBPOSITRON_RETINA
from datetime import datetime

import math

# from pylab import *

In [16]:
def coords_to_plot(lat, lon):
    r_major = 6378137.000
    x = r_major * math.radians(lon)
    scale = x/lon
    y = 180.0/math.pi * math.log(math.tan(math.pi/4.0 + 
        lat * (math.pi/180.0)/2.0)) * scale
    x = int(x)
    y = int(y)
    return (x, y)

# Code violation cases

In [54]:
data = pd.read_csv('./seattle-code-violation-cases/code-violation-cases.csv')
data.head()

print('len:', len(data))
print('start:', data['Date Case Created'].min())
print('end:', data['Date Case Created'].max(), '\n')
# print('nulls:')
# print(data.isnull().sum())

data = data.loc[data['Case Type'] == 'TENANT RELOCATION ORDINANCE']
# data.loc[data['Case Type'] == 'TENANT RELOCATION ORDINANCE', 'Date Case Created'].sort_values()

data = data.dropna(subset=['Latitude', 'Longitude'])

# print('nulls:')
# print(data.isnull().sum())

data['year'] = sorted(data['Date Case Created'].apply(lambda x: int(x[:4])))

# for year in data['year'].unique():
#     print(year, len(data.loc[data['year'] == year]))

len: 41485
start: 2003-09-17T00:00:00
end: 2018-04-27T00:00:00 



In [62]:
### Clustering 

import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

coords = data.loc[data['year'] == 2014, ['Latitude', 'Longitude']].values

In [64]:
kms_per_radian = 6371.0088
epsilon = 1.0 / kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 36


In [65]:
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)
centermost_points = clusters.map(get_centermost_point)


In [67]:
# centermost_points

In [32]:
def plot_by_year(df, year_col, lat_col, lon_col):
    
    p = figure(x_range=(-13630000, -13610000), y_range=(6038000, 6046000),
               x_axis_type="mercator", y_axis_type="mercator")
    p.add_tile(CARTODBPOSITRON_RETINA)
    
    cmap = cm.get_cmap('seismic', len(df[year_col].unique()))
    
    for cmi, year in enumerate(df[year_col].unique()[-5:]): # last 5 years
        
        rgb = cmap(cmi)[:3]
        rgb = matplotlib.colors.rgb2hex(rgb)
        
        print(year)
        
        coords = df.loc[df['year'] == year, [lat_col, lon_col]]
        coords = coords.apply(lambda x: coords_to_plot(x[lat_col], x[lon_col]), axis=1).values
        for coord in coords[:]:
            x, y = coord[0], coord[1]

            p.circle(x = x, 
                     y = y,
                     size=30,
                     fill_alpha=.1,
                     line_alpha=0,
                     fill_color = rgb
                    )
            
    output_notebook()
    show(p)

In [38]:
# plot_by_year(data, 'year', 'Latitude', 'Longitude')

# SPD Reports

In [39]:
data = pd.read_csv('./SPD_Reports.csv')
data['Offence Start Date'] = pd.to_datetime(data['Offense Start Date'])

print('len:', len(data))
print('start:', data['Offence Start Date'].min())
print('end:', data['Offence Start Date'].max(), '\n')
print('nulls:')
print(data.isnull().sum())

len: 684472
start: 1965-01-10 00:00:00
end: 2016-09-19 13:00:00 

nulls:
Offense Type                0
Offense Description         1
Report Date                 1
Offense Start Date          1
Offense End Date       332148
Block                       1
District                  503
Beat                      497
2000 Census Tract        1813
Longitude                   1
Latitude                    1
Offence Start Date          1
dtype: int64


In [42]:
data['Offense Type'].value_counts()

THEFT-CARPROWL                    121113
THEFT-OTH                          43870
VEH-THEFT-AUTO                     38864
PROPERTY DAMAGE-NON RESIDENTIA     32671
BURGLARY-FORCE-RES                 31485
ASSLT-NONAGG                       24963
THEFT-SHOPLIFT                     23264
DISTURBANCE-OTH                    22145
PROPERTY FOUND                     21675
BURGLARY-NOFORCE-RES               20252
THEFT-BUILDING                     20117
WARRARR-FELONY                     16440
WARRARR-MISDEMEANOR                14934
BURGLARY-FORCE-NONRES              14626
FRAUD-CREDIT CARD                  13228
THEFT-BICYCLE                      11288
FRAUD-IDENTITY THEFT               10776
HARASSMENT                         10413
TRESPASS                           10319
PROPERTY DAMAGE-RESIDENTIAL         9761
PROPERTY DAMAGE - GRAFFITI          8415
THEFT-AUTOACC                       8390
ASSLT-AGG-WEAPON                    7801
BURGLARY-NOFORCE-NONRES             7778
BURGLARY-SECURE 

# 911 Incidents

In [43]:
# https://www.kaggle.com/sohier/seattle-police-department-911-incident-response (380 MB)

data = pd.read_csv('/Users/sam/Downloads/Seattle_Police_Department_911_Incident_Response.csv')

data.drop(columns=['Initial Type Description', 
                   'Initial Type Subgroup', 
                   'Initial Type Group', 
                   'At Scene Time'], inplace=True)

print('len:', len(data))

# Downsample
rand_ind = np.random.choice(data.index, 5000)
data = data.loc[rand_ind]

print('Downsample len:', len(data))
data['Event Clearance Date'] = pd.to_datetime(data['Event Clearance Date'])

print('start:', data['Event Clearance Date'].min())
print('end:', data['Event Clearance Date'].max(), '\n')
print('nulls:')
print(data.isnull().sum())

  interactivity=interactivity, compiler=compiler, result=result)


len: 1433853
Downsample len: 5000
start: 2010-07-17 20:37:00
end: 2017-08-28 23:13:06 

nulls:
CAD CDW ID                      0
CAD Event Number                0
General Offense Number          0
Event Clearance Code           40
Event Clearance Description    40
Event Clearance SubGroup       40
Event Clearance Group          40
Event Clearance Date           42
Hundred Block Location         11
District/Sector                 2
Zone/Beat                       0
Census Tract                    9
Longitude                       0
Latitude                        0
Incident Location               0
dtype: int64


In [47]:
data['year'] = data['Event Clearance Date'].apply(lambda x: x.year)

In [48]:
data

Unnamed: 0,CAD CDW ID,CAD Event Number,General Offense Number,Event Clearance Code,Event Clearance Description,Event Clearance SubGroup,Event Clearance Group,Event Clearance Date,Hundred Block Location,District/Sector,Zone/Beat,Census Tract,Longitude,Latitude,Incident Location,year
1164771,2115146,14000237946,2014237946,130.0,PROPERTY DESTRUCTION,PROPERTY DAMAGE,PROPERTY DAMAGE,2014-07-20 23:12:00,15XX BLOCK OF NE 145 ST,L,L1,200.4001,-122.311398,47.733936,"(47.733936473, -122.311397618)",2014.0
891889,1550387,15000322459,2015322459,41.0,"HARASSMENT, THREATS","THREATS, HARASSMENT","THREATS, HARASSMENT",2015-09-14 13:50:00,65XX BLOCK OF 23 AV NW,J,J2,3300.4000,-122.385357,47.676833,"(47.676833004, -122.385357427)",2015.0
148471,396872,11000236609,2011236609,130.0,PROPERTY DESTRUCTION,PROPERTY DAMAGE,PROPERTY DAMAGE,2011-07-21 19:48:00,143XX BLOCK OF 30TH AVE NE,L,L1,100.4001,-122.296538,47.732345,"(47.732345401, -122.296537763)",2011.0
305458,1682844,16000070951,201670951,177.0,LIQUOR VIOLATION - INTOXICATED PERSON,LIQUOR VIOLATIONS,LIQUOR VIOLATIONS,2016-02-28 03:17:15,3XX BLOCK OF 2 AV S,K,K3,9200.2028,-122.330810,47.600464,"(47.600464, -122.33081)",2016.0
711503,1736180,16000166811,2016166811,161.0,TRESPASS,TRESPASS,TRESPASS,2016-05-13 09:30:38,3XX BLOCK OF 2 AV S,K,K3,9200.2028,-122.330810,47.600464,"(47.600464, -122.33081)",2016.0
1281904,2249114,15000017742,201517742,203.0,ALARMS - RESIDENTIAL PANIC (FALSE),PANIC ALARMS (FALSE),FALSE ALARMS,2015-01-16 17:05:00,XX BLOCK OF FLORENTIA ST,Q,Q2,6000.4008,-122.356198,47.646889,"(47.646888612, -122.356197841)",2015.0
1049128,2000411,17000174244,2017174244,245.0,"DISTURBANCE, OTHER",DISTURBANCES,DISTURBANCES,2017-05-18 01:38:01,57XX BLOCK OF 22 AV NW,B,B1,4700.4005,-122.384740,47.670547,"(47.670547, -122.38474)",2017.0
437303,801110,12000149413,2012149413,65.0,THEFT - MISCELLANEOUS,THEFT,OTHER PROPERTY,2012-05-15 15:51:00,11XX BLOCK OF 5TH AVE,K,K1,8200.3008,-122.332778,47.607623,"(47.607622885, -122.332777573)",2012.0
949400,1631035,15000433877,2015433877,470.0,PARKING VIOLATION (EXCEPT ABANDONED VEHICLES),PARKING VIOLATIONS,TRAFFIC RELATED CALLS,2015-12-15 09:44:11,39XX BLOCK OF S FERDINAND ST,R,R3,10300.5011,-122.282104,47.557842,"(47.557842, -122.282104)",2015.0
505785,875646,12000268093,2012268093,470.0,PARKING VIOLATION (EXCEPT ABANDONED VEHICLES),PARKING VIOLATIONS,TRAFFIC RELATED CALLS,2012-08-13 16:20:00,17XX BLOCK OF MINOR AVE,D,D2,7300.2016,-122.330060,47.616394,"(47.616394277, -122.330060373)",2012.0


In [45]:
data['Event Clearance Description'].value_counts()

SUSPICIOUS PERSON                                  558
DISTURBANCE, OTHER                                 494
PARKING VIOLATION (EXCEPT ABANDONED VEHICLES)      378
TRAFFIC (MOVING) VIOLATION                         338
LIQUOR VIOLATION - INTOXICATED PERSON              218
SUSPICIOUS VEHICLE                                 152
THEFT - CAR PROWL                                  149
MOTOR VEHICLE COLLISION                            148
ACCIDENT INVESTIGATION                             141
MISCHIEF, NUISANCE COMPLAINTS                      139
SHOPLIFT                                           132
THEFT - MISCELLANEOUS                              129
NOISE DISTURBANCE                                  126
TRESPASS                                           108
ASSAULTS, OTHER                                     91
AUTO THEFT                                          83
LIQUOR VIOLATION - ADULT                            78
ALARMS - RESIDENTIAL BURGLARY (FALSE)               69
MENTAL COM

In [50]:
data

Unnamed: 0,CAD CDW ID,CAD Event Number,General Offense Number,Event Clearance Code,Event Clearance Description,Event Clearance SubGroup,Event Clearance Group,Event Clearance Date,Hundred Block Location,District/Sector,Zone/Beat,Census Tract,Longitude,Latitude,Incident Location,year
1164771,2115146,14000237946,2014237946,130.0,PROPERTY DESTRUCTION,PROPERTY DAMAGE,PROPERTY DAMAGE,2014-07-20 23:12:00,15XX BLOCK OF NE 145 ST,L,L1,200.4001,-122.311398,47.733936,"(47.733936473, -122.311397618)",2014.0
891889,1550387,15000322459,2015322459,41.0,"HARASSMENT, THREATS","THREATS, HARASSMENT","THREATS, HARASSMENT",2015-09-14 13:50:00,65XX BLOCK OF 23 AV NW,J,J2,3300.4000,-122.385357,47.676833,"(47.676833004, -122.385357427)",2015.0
148471,396872,11000236609,2011236609,130.0,PROPERTY DESTRUCTION,PROPERTY DAMAGE,PROPERTY DAMAGE,2011-07-21 19:48:00,143XX BLOCK OF 30TH AVE NE,L,L1,100.4001,-122.296538,47.732345,"(47.732345401, -122.296537763)",2011.0
305458,1682844,16000070951,201670951,177.0,LIQUOR VIOLATION - INTOXICATED PERSON,LIQUOR VIOLATIONS,LIQUOR VIOLATIONS,2016-02-28 03:17:15,3XX BLOCK OF 2 AV S,K,K3,9200.2028,-122.330810,47.600464,"(47.600464, -122.33081)",2016.0
711503,1736180,16000166811,2016166811,161.0,TRESPASS,TRESPASS,TRESPASS,2016-05-13 09:30:38,3XX BLOCK OF 2 AV S,K,K3,9200.2028,-122.330810,47.600464,"(47.600464, -122.33081)",2016.0
1281904,2249114,15000017742,201517742,203.0,ALARMS - RESIDENTIAL PANIC (FALSE),PANIC ALARMS (FALSE),FALSE ALARMS,2015-01-16 17:05:00,XX BLOCK OF FLORENTIA ST,Q,Q2,6000.4008,-122.356198,47.646889,"(47.646888612, -122.356197841)",2015.0
1049128,2000411,17000174244,2017174244,245.0,"DISTURBANCE, OTHER",DISTURBANCES,DISTURBANCES,2017-05-18 01:38:01,57XX BLOCK OF 22 AV NW,B,B1,4700.4005,-122.384740,47.670547,"(47.670547, -122.38474)",2017.0
437303,801110,12000149413,2012149413,65.0,THEFT - MISCELLANEOUS,THEFT,OTHER PROPERTY,2012-05-15 15:51:00,11XX BLOCK OF 5TH AVE,K,K1,8200.3008,-122.332778,47.607623,"(47.607622885, -122.332777573)",2012.0
949400,1631035,15000433877,2015433877,470.0,PARKING VIOLATION (EXCEPT ABANDONED VEHICLES),PARKING VIOLATIONS,TRAFFIC RELATED CALLS,2015-12-15 09:44:11,39XX BLOCK OF S FERDINAND ST,R,R3,10300.5011,-122.282104,47.557842,"(47.557842, -122.282104)",2015.0
505785,875646,12000268093,2012268093,470.0,PARKING VIOLATION (EXCEPT ABANDONED VEHICLES),PARKING VIOLATIONS,TRAFFIC RELATED CALLS,2012-08-13 16:20:00,17XX BLOCK OF MINOR AVE,D,D2,7300.2016,-122.330060,47.616394,"(47.616394277, -122.330060373)",2012.0


In [49]:
plot_by_year(data, 'year', 'Latitude', 'Longitude')

2017.0
2012.0
2010.0
2013.0
nan


# King County House Sales

In [None]:
data = pd.read_csv('./kc_house_data.csv')

print(len(data))

data['year'] = data['date'].apply(lambda x: int(str(x)[:4]))
data['date'] = pd.to_datetime(data['date'])

print('start:', data['date'].min())
print('end:', data['date'].max(), '\n')

print('nulls:')
print(data.isnull().sum())

In [None]:
# p = figure(x_range=(-13630000, -13610000), y_range=(6038000, 6046000),
#            x_axis_type="mercator", y_axis_type="mercator")
# p.add_tile(CARTODBPOSITRON_RETINA)

# colors_dict = {2014: '#008000', 2015: '#FF0000'}

# for year in data['year'].unique():
    
#     coords = data.loc[data['year'] == year, ['price', 'lat', 'long']]
#     coords['merc'] = coords.apply(lambda x: coords_to_plot(x['lat'], x['long']), axis=1).values
    
#     for coord in coords.index[:500]:
        
#         x, y, price = coords.loc[coord]['merc'][0], coords.loc[coord]['merc'][1], coords.loc[coord]['price']

#         p.circle(x = x,
#                  y = y,
#                  size=price/100000,
#                  fill_alpha=.2,
#                  line_color = colors_dict[year],
#                  fill_color = colors_dict[year]
#                 )

# output_notebook()
# show(p)

In [None]:
def normalize(coords_price):
    cord_min = coords_price.min()
    cord_max = coords_price.max()
    def scale(cord):
        return (cord - cord_min)/(cord_max-cord_min)
    return coords_price.apply(lambda x: scale(x))

In [None]:
def make_json(row):
    return {'lat': row['lat'], 'long': row['long'], 'value': round(row['normed'], 6)} 

In [None]:
# Make sample data
coords = data.loc[data['year'] == 2014, ['price', 'lat', 'long']]
coords['normed'] = normalize(coords['price'])
coords = coords[:2000]

data = list(coords.apply(lambda x: make_json(x), axis=1).values)

with open('data.json', 'w') as fp:
    json.dump(data, fp)

# Not used

In [None]:
# # No lat/long
# data = pd.read_csv('/Users/sam/Downloads/seattle-use-of-force/use-of-force.csv')
# data

In [None]:
# # No lat/long
# data = pd.read_csv('/Users/sam/Downloads/seattle-rent-and-income-restricted-housing/rent-and-income-restricted-housing.csv')
# data

# Land use permits

In [None]:
data = pd.read_csv('/Users/sam/Downloads/seattle-land-use-permits/land-use-permits.csv')
len(data)

for col in [col for col in data.columns if 'Date' in col]:
    try:
        data[col] = pd.to_datetime(data[col])
    except:
        pass
    
    print(col, 'min:', data[col].min())
    print(col, 'max:', data[col].max())
    
data.dtypes

In [None]:
data.isnull().sum()

In [None]:
# Applied
# Decisions/Applied
# Issues/Decisions
# Is there Issues/Applied?

# Find denied permits

In [None]:
# Or find permits approved

In [None]:
# for col in ['Permit Type', 'Category', 'Value', 'Applicant Name', 'Contractor']:
#     print(col)
#     print(data[col].unique())

In [None]:
data['Category'].unique()

In [None]:
data.loc[data['Category'] == 'COMMERCIAL']

In [None]:
data.loc[data['Category'] == 'MULTIFAMILY']

In [None]:
data

In [None]:
# p = figure(x_range=(-13630000, -13610000), y_range=(6038000, 6046000),
#            x_axis_type="mercator", y_axis_type="mercator")
# p.add_tile(CARTODBPOSITRON_RETINA)

# colors_dict = {2014: '#008000', 2015: '#FF0000'}

# for year in data['year'].unique():
    
#     coords = data.loc[data['year'] == year, ['price', 'lat', 'long']]
#     coords['merc'] = coords.apply(lambda x: coords_to_plot(x['lat'], x['long']), axis=1).values
    
#     for coord in coords.index[:500]:
        
#         x, y, price = coords.loc[coord]['merc'][0], coords.loc[coord]['merc'][1], coords.loc[coord]['price']

#         p.circle(x = x,
#                  y = y,
#                  size=price/100000,
#                  fill_alpha=.2,
#                  line_color = colors_dict[year],
#                  fill_color = colors_dict[year]
#                 )

# output_notebook()
# show(p)