In [1]:
import numpy as np
import json
import bokeh
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.tile_providers import CARTODBPOSITRON, CARTODBPOSITRON_RETINA
from datetime import datetime

import math

# from pylab import *

In [2]:
def coords_to_plot(lat, lon):
    r_major = 6378137.000
    x = r_major * math.radians(lon)
    scale = x/lon
    y = 180.0/math.pi * math.log(math.tan(math.pi/4.0 + 
        lat * (math.pi/180.0)/2.0)) * scale
    x = int(x)
    y = int(y)
    return (x, y)

# Code violation cases

In [3]:
data = pd.read_csv('./seattle-code-violation-cases/code-violation-cases.csv')
data.head()

print('len:', len(data))
print('start:', data['Date Case Created'].min())
print('end:', data['Date Case Created'].max(), '\n')
# print('nulls:')
# print(data.isnull().sum())

data = data.loc[data['Case Type'] == 'TENANT RELOCATION ORDINANCE']
# data.loc[data['Case Type'] == 'TENANT RELOCATION ORDINANCE', 'Date Case Created'].sort_values()

data = data.dropna(subset=['Latitude', 'Longitude'])

# print('nulls:')
# print(data.isnull().sum())

data['year'] = sorted(data['Date Case Created'].apply(lambda x: int(x[:4])))

# for year in data['year'].unique():
#     print(year, len(data.loc[data['year'] == year]))

len: 41485
start: 2003-09-17T00:00:00
end: 2018-04-27T00:00:00 



In [4]:
### Clustering 

import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

# coords = data[['Latitude', 'Longitude']].values

In [46]:
def get_cluster_dict(data, lat_col, lon_col, kms_radius, min_samples):
    
    def get_centermost_point(cluster):
        try:
            centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
            centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
            return tuple(centermost_point)
        except Exception as e:
            #print(e, MultiPoint(cluster))
            return None
        
    def cluster_dict(row):
        return {'lat': row[lat_col], 'lon': row[lon_col], 'weight': row['weight']}
    
    coords = data[[lat_col, lon_col]].values
    data = data[[lat_col, lon_col]]
    kms_per_radian = 6371.0088
    epsilon = kms_radius / kms_per_radian
    db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_
    num_clusters = len(set(cluster_labels))
    clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
    print('Number of clusters: {}'.format(num_clusters))
    
    data['label'] = db.labels_
    #data.groupby('label').count().head(20)
    
    centermost_points = clusters.map(get_centermost_point)
    centermost_points.dropna(inplace=True)
    
    lats, lons = zip(*centermost_points)
    rep_points = pd.DataFrame({'lon':lons, 'lat':lats})

    ### This doesn't handle cases where there is no noise. ### (.values[1:])
    
    rs = rep_points.apply(lambda row: data[(data[lat_col]==row['lat']) & (data[lon_col]==row['lon'])].iloc[0], axis=1)
    rs['count'] = [x[0] for x in data.groupby('label').count().values[1:]]
    rs['weight'] = rs['count'].apply(lambda x: round(x/rs['count'].sum(), 6))

    return list(rs.apply(lambda x: cluster_dict(x), axis=1))

In [48]:
cluster_lat_lon = get_cluster_dict(data, 'Latitude', 'Longitude', .25, 2)
cluster_lat_lon

Number of clusters: 179


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[{'lat': 47.61573981, 'lon': -122.31225414, 'weight': 0.150023},
 {'lat': 47.65973215, 'lon': -122.34843383, 'weight': 0.032987},
 {'lat': 47.63918281, 'lon': -122.37129946, 'weight': 0.011749},
 {'lat': 47.66281648, 'lon': -122.31609893, 'weight': 0.052418},
 {'lat': 47.67181654, 'lon': -122.38048547, 'weight': 0.099413},
 {'lat': 47.63359668, 'lon': -122.27880585, 'weight': 0.001356},
 {'lat': 47.65249444, 'lon': -122.37121017, 'weight': 0.011297},
 {'lat': 47.55247918, 'lon': -122.27080703, 'weight': 0.000904},
 {'lat': 47.56672929999999, 'lon': -122.36880075, 'weight': 0.010393},
 {'lat': 47.69437249, 'lon': -122.35484457, 'weight': 0.002259},
 {'lat': 47.68650052, 'lon': -122.39675162, 'weight': 0.000904},
 {'lat': 47.685204799999994, 'lon': -122.37919115, 'weight': 0.004519},
 {'lat': 47.54369248, 'lon': -122.32108633, 'weight': 0.004067},
 {'lat': 47.64144415, 'lon': -122.3249605, 'weight': 0.037054},
 {'lat': 47.67744717, 'lon': -122.31989440000001, 'weight': 0.020786},
 {'lat'

In [5]:
# df = data[['Latitude', 'Longitude']]
# df.tail()

In [40]:
# kms_per_radian = 6371.0088
# epsilon = .25 / kms_per_radian
# db = DBSCAN(eps=epsilon, min_samples=2, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
# cluster_labels = db.labels_
# num_clusters = len(set(cluster_labels))
# clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
# print('Number of clusters: {}'.format(num_clusters))

In [14]:
# set(db.labels_)
# db.labels_

In [41]:
# df['label'] = db.labels_
# df.groupby('label').count().head(20)

In [26]:
# def get_centermost_point(cluster):
#     centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
#     centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
#     return tuple(centermost_point)

# def get_centermost_point(cluster):
#     try:
#         centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
#         centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
#         return tuple(centermost_point)
#     except Exception as e:
#         #print(e, MultiPoint(cluster))
#         return None
    
# centermost_points = clusters.map(get_centermost_point)

In [42]:
# centermost_points.dropna(inplace=True)
# centermost_points

In [28]:
# lats, lons = zip(*centermost_points)
# rep_points = pd.DataFrame({'lon':lons, 'lat':lats})

In [29]:
# rep_points.tail()

In [30]:
# rs = rep_points.apply(lambda row: df[(df['Latitude']==row['lat']) & (df['Longitude']==row['lon'])].iloc[0], axis=1)
# rs.tail()

In [31]:
# rs

In [32]:
# rs['count'] = [x[0] for x in df.groupby('label').count().values[1:]]

In [44]:
# rs['weight'] = rs['count'].apply(lambda x: round(x/rs['count'].sum(), 6))
# rs

In [34]:
# def cluster_dict(row):
#     return {'lat': row['Latitude'], 'lon': row['Longitude'], 'weight': row['weight']}

In [35]:
# def cluster_json()

In [36]:
# list(rs.apply(lambda x: cluster_dict(x), axis=1))

[{'lat': 47.61573981, 'lon': -122.31225414, 'weight': 0.150023},
 {'lat': 47.65973215, 'lon': -122.34843383, 'weight': 0.032987},
 {'lat': 47.63918281, 'lon': -122.37129946, 'weight': 0.011749},
 {'lat': 47.66281648, 'lon': -122.31609893, 'weight': 0.052418},
 {'lat': 47.67181654, 'lon': -122.38048547, 'weight': 0.099413},
 {'lat': 47.63359668, 'lon': -122.27880585, 'weight': 0.001356},
 {'lat': 47.65249444, 'lon': -122.37121017, 'weight': 0.011297},
 {'lat': 47.55247918, 'lon': -122.27080703, 'weight': 0.000904},
 {'lat': 47.56672929999999, 'lon': -122.36880075, 'weight': 0.010393},
 {'lat': 47.69437249, 'lon': -122.35484457, 'weight': 0.002259},
 {'lat': 47.68650052, 'lon': -122.39675162, 'weight': 0.000904},
 {'lat': 47.685204799999994, 'lon': -122.37919115, 'weight': 0.004519},
 {'lat': 47.54369248, 'lon': -122.32108633, 'weight': 0.004067},
 {'lat': 47.64144415, 'lon': -122.3249605, 'weight': 0.037054},
 {'lat': 47.67744717, 'lon': -122.31989440000001, 'weight': 0.020786},
 {'lat'

In [None]:
# fig, ax = plt.subplots(figsize=[10, 6])
# rs_scatter = ax.scatter(rs['Longitude'], rs['Latitude'], c='#99cc99', edgecolor='None', alpha=0.7, s=300)
# df_scatter = ax.scatter(df['Longitude'], df['Latitude'], c='k', alpha=0.9, s=3)
# ax.set_title('Full data set vs DBSCAN reduced set')
# ax.set_xlabel('Longitude')
# ax.set_ylabel('Latitude')
# ax.legend([df_scatter, rs_scatter], ['Full set', 'Reduced set'], loc='upper right')
# plt.show()

In [37]:
p = figure(x_range=(-13630000, -13610000), y_range=(6038000, 6046000),
               x_axis_type="mercator", y_axis_type="mercator")
p.add_tile(CARTODBPOSITRON_RETINA)

rs['merc'] = rs.apply(lambda x: coords_to_plot(x['Latitude'], x['Longitude']), axis=1).values
# print(rs.head())

for i in rs.index:
    
    x, y, w = rs.loc[i, 'merc'][0], rs.loc[i, 'merc'][1], rs.loc[i, 'weight']

    p.circle(x = x, 
             y = y,
             size=w*1000,
             fill_alpha=.5,
             line_alpha=0,
            )
            
output_notebook()
show(p)

In [39]:
# rs

In [None]:
def plot_by_year(df, year_col, lat_col, lon_col):
    
    p = figure(x_range=(-13630000, -13610000), y_range=(6038000, 6046000),
               x_axis_type="mercator", y_axis_type="mercator")
    p.add_tile(CARTODBPOSITRON_RETINA)
    
    cmap = cm.get_cmap('seismic', len(df[year_col].unique()))
    
    for cmi, year in enumerate(df[year_col].unique()[-5:]): # last 5 years
        
        rgb = cmap(cmi)[:3]
        rgb = matplotlib.colors.rgb2hex(rgb)
        
        print(year)
        
        coords = df.loc[df['year'] == year, [lat_col, lon_col]]
        coords = coords.apply(lambda x: coords_to_plot(x[lat_col], x[lon_col]), axis=1).values
        for coord in coords[:]:
            x, y = coord[0], coord[1]

            p.circle(x = x, 
                     y = y,
                     size=30,
                     fill_alpha=.1,
                     line_alpha=0,
                     fill_color = rgb
                    )
            
    output_notebook()
    show(p)

In [None]:
# plot_by_year(data, 'year', 'Latitude', 'Longitude')

# SPD Reports

In [None]:
data = pd.read_csv('./SPD_Reports.csv')
data['Offence Start Date'] = pd.to_datetime(data['Offense Start Date'])

print('len:', len(data))
print('start:', data['Offence Start Date'].min())
print('end:', data['Offence Start Date'].max(), '\n')
print('nulls:')
print(data.isnull().sum())

In [None]:
data['Offense Type'].value_counts()

# 911 Incidents

In [None]:
# https://www.kaggle.com/sohier/seattle-police-department-911-incident-response (380 MB)

data = pd.read_csv('/Users/sam/Downloads/Seattle_Police_Department_911_Incident_Response.csv')

data.drop(columns=['Initial Type Description', 
                   'Initial Type Subgroup', 
                   'Initial Type Group', 
                   'At Scene Time'], inplace=True)

print('len:', len(data))

# Downsample
rand_ind = np.random.choice(data.index, 5000)
data = data.loc[rand_ind]

print('Downsample len:', len(data))
data['Event Clearance Date'] = pd.to_datetime(data['Event Clearance Date'])

print('start:', data['Event Clearance Date'].min())
print('end:', data['Event Clearance Date'].max(), '\n')
print('nulls:')
print(data.isnull().sum())

In [None]:
data['year'] = data['Event Clearance Date'].apply(lambda x: x.year)

In [None]:
data

In [None]:
data['Event Clearance Description'].value_counts()

In [None]:
data

In [None]:
plot_by_year(data, 'year', 'Latitude', 'Longitude')

# King County House Sales

In [None]:
data = pd.read_csv('./kc_house_data.csv')

print(len(data))

data['year'] = data['date'].apply(lambda x: int(str(x)[:4]))
data['date'] = pd.to_datetime(data['date'])

print('start:', data['date'].min())
print('end:', data['date'].max(), '\n')

print('nulls:')
print(data.isnull().sum())

In [None]:
# p = figure(x_range=(-13630000, -13610000), y_range=(6038000, 6046000),
#            x_axis_type="mercator", y_axis_type="mercator")
# p.add_tile(CARTODBPOSITRON_RETINA)

# colors_dict = {2014: '#008000', 2015: '#FF0000'}

# for year in data['year'].unique():
    
#     coords = data.loc[data['year'] == year, ['price', 'lat', 'long']]
#     coords['merc'] = coords.apply(lambda x: coords_to_plot(x['lat'], x['long']), axis=1).values
    
#     for coord in coords.index[:500]:
        
#         x, y, price = coords.loc[coord]['merc'][0], coords.loc[coord]['merc'][1], coords.loc[coord]['price']

#         p.circle(x = x,
#                  y = y,
#                  size=price/100000,
#                  fill_alpha=.2,
#                  line_color = colors_dict[year],
#                  fill_color = colors_dict[year]
#                 )

# output_notebook()
# show(p)

In [None]:
def normalize(coords_price):
    cord_min = coords_price.min()
    cord_max = coords_price.max()
    def scale(cord):
        return (cord - cord_min)/(cord_max-cord_min)
    return coords_price.apply(lambda x: scale(x))

In [None]:
def make_json(row):
    return {'lat': row['lat'], 'long': row['long'], 'value': round(row['normed'], 6)} 

In [None]:
# Make sample data
coords = data.loc[data['year'] == 2014, ['price', 'lat', 'long']]
coords['normed'] = normalize(coords['price'])
coords = coords[:2000]

data = list(coords.apply(lambda x: make_json(x), axis=1).values)

with open('data.json', 'w') as fp:
    json.dump(data, fp)

# Not used

In [None]:
# # No lat/long
# data = pd.read_csv('/Users/sam/Downloads/seattle-use-of-force/use-of-force.csv')
# data

In [None]:
# # No lat/long
# data = pd.read_csv('/Users/sam/Downloads/seattle-rent-and-income-restricted-housing/rent-and-income-restricted-housing.csv')
# data

# Land use permits

In [None]:
data = pd.read_csv('/Users/sam/Downloads/seattle-land-use-permits/land-use-permits.csv')
len(data)

for col in [col for col in data.columns if 'Date' in col]:
    try:
        data[col] = pd.to_datetime(data[col])
    except:
        pass
    
    print(col, 'min:', data[col].min())
    print(col, 'max:', data[col].max())
    
data.dtypes

In [None]:
data.isnull().sum()

In [None]:
# Applied
# Decisions/Applied
# Issues/Decisions
# Is there Issues/Applied?

# Find denied permits

In [None]:
# Or find permits approved

In [None]:
# for col in ['Permit Type', 'Category', 'Value', 'Applicant Name', 'Contractor']:
#     print(col)
#     print(data[col].unique())

In [None]:
data['Category'].unique()

In [None]:
data.loc[data['Category'] == 'COMMERCIAL']

In [None]:
data.loc[data['Category'] == 'MULTIFAMILY']

In [None]:
data

In [None]:
# p = figure(x_range=(-13630000, -13610000), y_range=(6038000, 6046000),
#            x_axis_type="mercator", y_axis_type="mercator")
# p.add_tile(CARTODBPOSITRON_RETINA)

# colors_dict = {2014: '#008000', 2015: '#FF0000'}

# for year in data['year'].unique():
    
#     coords = data.loc[data['year'] == year, ['price', 'lat', 'long']]
#     coords['merc'] = coords.apply(lambda x: coords_to_plot(x['lat'], x['long']), axis=1).values
    
#     for coord in coords.index[:500]:
        
#         x, y, price = coords.loc[coord]['merc'][0], coords.loc[coord]['merc'][1], coords.loc[coord]['price']

#         p.circle(x = x,
#                  y = y,
#                  size=price/100000,
#                  fill_alpha=.2,
#                  line_color = colors_dict[year],
#                  fill_color = colors_dict[year]
#                 )

# output_notebook()
# show(p)