## This notebook is to build the db for the crash data

In [243]:
#!/usr/bin/env python

import pandas as pd
from sodapy import Socrata
import matplotlib.pyplot as plt
from datetime import datetime
from modules.myfuncs import *

## Get the data from portal using Socrata client

In [244]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:

url = "data.cityofchicago.org"
client = Socrata(url, None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata(data.cityofchicago.org,
#                  MyAppToken,
#                  userame="user@example.com",
#                  password="AFakePassword")



## Grab preliminary data through query

In [245]:
# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
# data website https://data.cityofchicago.org/resource/85ca-t3if.json

crash_data = client.get("85ca-t3if", 
                     #where='violation_date BETWEEN \'2015-01-01\' AND \'2020-12-01\'',
                     where="crash_date BETWEEN \'2015-01-01T00:00:00.000\' AND \'2020-12-31T00:00:00.000\'",
                     limit=1000000,
                    )

# Convert to pandas DataFrame
data = pd.DataFrame.from_records(crash_data)

## Investigate results

In [246]:
# 464k total
all_df = data.copy()

In [247]:
#client_df.dropna(subset=['camera_id']).isna().sum()
all_df.dropna(subset=['location',], inplace=True)
#all_df.isna().sum()

We dropped any crash that did not have lat/long location data.  It represents only 0.5% of data and would make it difficult to determine the intersection in which the crash occurred.

In [248]:
# drop a few columns we don't need, including location (we have lat/long)
dropme = ['statements_taken_i', 'private_property_i', 'photos_taken_i', 'dooring_i', 'date_police_notified','location']

all_df.drop(columns=dropme, inplace=True)

In [249]:
all_df.isna().sum()

crash_record_id                       0
rd_no                              1172
crash_date                            0
posted_speed_limit                    0
traffic_control_device                0
device_condition                      0
weather_condition                     0
lighting_condition                    0
first_crash_type                      0
trafficway_type                       0
alignment                             0
roadway_surface_cond                  0
road_defect                           0
report_type                       11202
crash_type                            0
hit_and_run_i                    328409
damage                                0
prim_contributory_cause               0
sec_contributory_cause                0
street_no                             0
street_direction                      0
street_name                           0
beat_of_occurrence                    3
num_units                             0
most_severe_injury                  938


# What are the categories of 'traffic_control_device' and other columns
We care about 'traffic signal'.  When looking at crash data, these are at intersections.
We also will use intersection_related_i and use it to filter the crashes

In [250]:
# What's in this data?
col_interest = ['traffic_control_device', 'device_condition', 'weather_condition',
       'lighting_condition', 'first_crash_type', 'trafficway_type',
       'alignment', 'roadway_surface_cond', 'road_defect', 'report_type',
       'crash_type', 'hit_and_run_i', 'damage', 'prim_contributory_cause',
       'sec_contributory_cause', 'street_no', 'street_direction',
       'street_name', 'beat_of_occurrence', 'num_units', 'most_severe_injury', 
        'injuries_fatal', 'injuries_incapacitating',
       'injuries_non_incapacitating', 'injuries_reported_not_evident',
       'injuries_no_indication', 'injuries_unknown', 'crash_hour',
       'crash_day_of_week', 'crash_month', 'latitude', 'longitude', 'lane_cnt',
       'private_property_i', 'intersection_related_i', 'crash_date_est_i',
       'statements_taken_i', 'work_zone_i', 'work_zone_type',
       'workers_present_i']

for col in col_interest:
    print(col, all_df[col].unique())

traffic_control_device ['UNKNOWN' 'NO CONTROLS' 'TRAFFIC SIGNAL' 'STOP SIGN/FLASHER'
 'SCHOOL ZONE' 'PEDESTRIAN CROSSING SIGN' 'YIELD'
 'FLASHING CONTROL SIGNAL' 'POLICE/FLAGMAN' 'RR CROSSING SIGN'
 'RAILROAD CROSSING GATE' 'OTHER RAILROAD CROSSING' 'DELINEATORS'
 'NO PASSING' 'BICYCLE CROSSING SIGN']
device_condition ['UNKNOWN' 'NO CONTROLS' 'FUNCTIONING PROPERLY' 'OTHER'
 'FUNCTIONING IMPROPERLY' 'NOT FUNCTIONING' 'WORN REFLECTIVE MATERIAL'
 'MISSING']
weather_condition ['UNKNOWN' 'CLEAR' 'SNOW' 'RAIN' 'CLOUDY/OVERCAST' 'FOG/SMOKE/HAZE'
 'OTHER' 'FREEZING RAIN/DRIZZLE' 'SLEET/HAIL' 'SEVERE CROSS WIND GATE'
 'BLOWING SNOW' 'BLOWING SAND, SOIL, DIRT']
lighting_condition ['DARKNESS, LIGHTED ROAD' 'DAYLIGHT' 'DARKNESS' 'DUSK' 'DAWN' 'UNKNOWN']
first_crash_type ['PARKED MOTOR VEHICLE' 'REAR END' 'ANGLE' 'SIDESWIPE SAME DIRECTION'
 'TURNING' 'REAR TO FRONT' 'PEDESTRIAN' 'PEDALCYCLIST' 'FIXED OBJECT'
 'SIDESWIPE OPPOSITE DIRECTION' 'OTHER NONCOLLISION' 'REAR TO SIDE'
 'OTHER OBJECT' 'HEAD O

KeyError: 'private_property_i'

Let's drop the columns that we don't really care about.

In [251]:
# remove na locations
print(len(all_df))
print(len(all_df.dropna(subset=['latitude'])))  # all have a location
#all_df.dropna(subset=['location'], inplace=True)  # drops about 2800 accidents with no recoreded location

463792
463792


intersection_related_i: A field observation by the police officer whether an intersection played a role in the crash. Does not represent whether or not the crash occurred within the intersection.

In [359]:
# How many were at traffic signals
signal_df = all_df[all_df['traffic_control_device']=='TRAFFIC SIGNAL']
other_df = all_df[all_df['traffic_control_device']!='TRAFFIC SIGNAL']
intrel_df = all_df[(all_df['intersection_related_i']=='Y')]
df = all_df[(all_df['intersection_related_i']=='Y')&(all_df['traffic_control_device']=='TRAFFIC SIGNAL')]

# how many crashes are at signals?
print('{:<25}{:<15}{:20}'.format('', 'n crash', 'Percent of total'))
print('{:<25}{:<15,}{:<20}'.format('All in study', len(all_df), '100%'))
print('{:<25}{:<15,}{:<4.1f}%'.format('At traffic signal', len(signal_df), len(signal_df)/len(all_df)*100))
print('{:<25}{:<15,}{:.1f}%'.format('Intersection-related', 
                                                    len(intrel_df), 
                                                    len(intrel_df)/len(all_df)*100
                                )
     )

print('{:<25}{:<15,}{:.1f}%'.format('Signal AND Intersection', 
                                                    len(df), 
                                                    len(df)/len(all_df)*100
                                )
     )
     


                         n crash        Percent of total    
All in study             463,792        100%                
At traffic signal        129,350        27.9%
Intersection-related     99,843         21.5%
Signal AND Intersection  60,332         13.0%


In [360]:
# what kind of crashes occur at traffic light, and how does it compare to other crashes?

def crash_stats(all_df, df):
    # by percentages
    crash_types = df['first_crash_type'].unique()

    print_list = []
    print('{:30}{:15}{:20}'.format("Type of Crash", 'Total crashes','Percent of total'))
    for crash in crash_types:
        num_crashes = len(df[df['first_crash_type']==crash])
        print_list.append([crash, num_crashes])
    
    print_list = sorted(print_list, key=lambda x: x[1], reverse=True)
    
    for crash, n in print_list: 
        print('{:30}{:<15d}{:>5.1f}%'.format(crash, n, 100 * n / len(df)))

def print_header(message):
    print('--'*20)
    print(message)
    print('--'*20)
        
def print_crash_stats(all_df, signal_df, other_df, signal_int_df):
    '''
    all_df Dataframe: all crashes in study 
    signal_df Dataframe: all signal related crashes
    other_df Dataframe: all non-signal related crahses
    signal_int_df: both signal and intersection related crashes
    '''
    print('CRASH STATS')
    print('Total crashes in study:', len(all_df))
    print('Signal crashes: {:.1f}%'.format((100 * len(signal_df) / len(all_df))))

    print_header('CRASHES NOT AT SIGNAL')
    print('Total crashes:', len(other_df))
    print()
    crash_stats(all_df, other_df)
    print('\n\n')
    
    
    print_header('CRASHES AT SIGNAL')
    print('Total crashes:', len(signal_df))
    print()
    crash_stats(all_df, signal_df)
    print('\n\n')

    
    print_header('CRASHES AT SIGNAL AND INTERSECTION RELATED')
    print('Total crashes:', len(signal_int_df))
    print()
    crash_stats(all_df, signal_int_df)
    

        

        
print_crash_stats(all_df, signal_df, other_df, df)


CRASH STATS
Total crashes in study: 463792
Signal crashes: 27.9%
----------------------------------------
CRASHES NOT AT SIGNAL
----------------------------------------
Total crashes: 334442

Type of Crash                 Total crashes  Percent of total    
PARKED MOTOR VEHICLE          103434          30.9%
REAR END                      61292           18.3%
SIDESWIPE SAME DIRECTION      50229           15.0%
ANGLE                         36606           10.9%
TURNING                       34479           10.3%
FIXED OBJECT                  18347            5.5%
PEDESTRIAN                    6639             2.0%
SIDESWIPE OPPOSITE DIRECTION  5725             1.7%
PEDALCYCLIST                  4854             1.5%
OTHER OBJECT                  3986             1.2%
HEAD ON                       3037             0.9%
REAR TO FRONT                 1995             0.6%
REAR TO SIDE                  1388             0.4%
OTHER NONCOLLISION            1352             0.4%
REAR TO REAR  

In [254]:
#!pip install geopy

In [361]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="https://github.com/sciencelee/chicago_rlc")


location = geolocator.geocode("175 5th Avenue NYC")
print(location.address)
# out: Flatiron Building, 175, 5th Avenue, Flatiron, New York, NYC, New York, ...

print((location.latitude, location.longitude))
# out: (40.7410861, -73.9896297241625)

#print(location.raw)
# out: {'place_id': '9167009604', 'type': 'attraction', ...}


# CAN USE THIS TO FIGURE OUT MY LAT LONG FROM RLC ADDRESS (or crash)


Flatiron Building, 175, 5th Avenue, Flatiron District, Manhattan Community Board 5, Manhattan, New York County, New York, 10010, United States
(40.741059199999995, -73.98964162240998)


In [362]:
# street no, street name, street direction?
for i in range(1):
    print(signal_df.iloc[i, :])
    
#     all_df['street_direction'].head() #N S E W
#     all_df['street_no'].head()  # just a number, so I guess street_direction goes with it.  Like 3700 N Ashland 
#     all_df['street_name'].head()

crash_record_id                  3358dc03b2fe52149c5901afd7221a9ebd509e3dfbaf49...
rd_no                                                                     JC366373
crash_date                                                 2019-07-27T02:16:00.000
posted_speed_limit                                                              30
traffic_control_device                                              TRAFFIC SIGNAL
device_condition                                              FUNCTIONING PROPERLY
weather_condition                                                            CLEAR
lighting_condition                                          DARKNESS, LIGHTED ROAD
first_crash_type                                                             ANGLE
trafficway_type                                                           FOUR WAY
alignment                                                       STRAIGHT AND LEVEL
roadway_surface_cond                                                           DRY
road

In [257]:
# BRINING SOME STUFF IN FROM THE RED CAMERA INVESTIGATION
# this loads up the lat/long of every camera
import pickle
import pandas as pd


with open('pickles/used_ints.pkl', 'rb') as f:
    used_ints = pickle.load(f)

with open('pickles/used_cams.pkl', 'rb') as f:
    used_cams = pickle.load(f)

#print(used_ints)
print(len(used_ints))
print(len(used_cams))

#print(used_ints)

183
363


In [363]:
# make interesections into a dataframe

ints_df = pd.DataFrame()

ints_df['intersection'] = used_ints.keys()
ints_df['location'] = ints_df['intersection'].apply(lambda x: used_ints[x][0])
ints_df['camera_ids'] = ints_df['intersection'].apply(lambda x: list(used_ints[x][1]))


ints_df['n_cams'] = ints_df['intersection'].apply(lambda x: len(used_ints[x][1]))
#cams_df['intersection'] = cams_df['address'].apply(lambda x: used_onts[x][2])


# we have way more cameras than intersections.  We need to consolidate
# some intersections have 3 or 4 cams.  Some have 1.
# we will focus on the intersection, not the individual camera for statistics
#print(len(ints_df.camera_id.unique()))
print(len(ints_df.intersection.unique()))

# NEED TO LOOK AT THE INTERSECTION NAMES TO MAKE SURE THEY MAKE SENSE
# WORRIED ABOUT NAMING CONVENTION 'ASHLAND AND CLARK' same as "CLARK AND ASHLAND"
ints_df.tail(50)
#print(ints_df.intersection.sort_values().unique())


# Data in this form works.  I have everything I need to start finding closest rlc to crash
# Take it back.  I do not have everything.  I really need to know the start and end dates for every intersection.
# This is tricky.  I need go through cam_ids, and find min start and max end for list.
# this would give me the start and end for each intersection.
# I am assuming, but do not know that cameras were installed as groups.  ASSUME??  Uh-oh. Need to know this.

# Rethinking this.  How about I just set up a database and go from there??

183


Unnamed: 0,intersection,location,camera_ids,n_cams
133,CANAL AND ROOSEVELT,"(41.86717512472001, -87.63937581258092)","[2291, 2294]",2
134,119TH AND HALSTED,"(41.677923389134385, -87.64198964584013)","[2404, 2402]",2
135,WESTERN AND 51ST,"(41.801080203554356, -87.68381820888895)","[2172, 2174]",2
136,ROOSEVELT AND HALSTED,"(41.86726309836901, -87.64697268601648)","[2234, 2233]",2
137,PULASKI AND MONTROSE,"(41.96105580370759, -87.72789120598296)","[1294, 1292]",2
138,STONEY ISLAND AND 79TH,"(41.75161152327319, -87.58559228428787)","[2464, 2462, 2461]",3
139,COTTAGE GROVE AND 71ST,"(41.76580035131893, -87.6057291269901)","[2571, 2572]",2
140,HAMLIN AND MADISON,"(41.88084147694413, -87.72090218457842)","[1901, 1903]",2
141,FULLERTON AND NARRAGANSETT,"(41.92378618871128, -87.78556720562165)","[1553, 1554]",2
142,KEDZIE AND ARMITAGE,"(41.91744617191702, -87.70711607187832)","[1832, 1834]",2


In [364]:
# lets split the cameras and lat and long all up into separate columns so we can store it in a db
# we will just make 3 columns for cam1 cam2 cam3

ints_df['latitude'] = ints_df['location'].apply(lambda x: x[0])
ints_df['longitude'] = ints_df['location'].apply(lambda x: x[1])
ints_df['camera_id1'] = ints_df['camera_ids'].apply(lambda x: x[0])
ints_df['camera_id2'] = ints_df['camera_ids'].apply(lambda x: x[1] if len(x)>1 else None)
ints_df['camera_id3'] = ints_df['camera_ids'].apply(lambda x: x[2] if len(x)>2 else None)

In [365]:
ints_df
final_ints_df = ints_df.drop(columns=['camera_ids', 'location'])

In [366]:
final_ints_df

Unnamed: 0,intersection,n_cams,latitude,longitude,camera_id1,camera_id2,camera_id3
0,31ST ST AND MARTIN LUTHER KING DRIVE,2,41.780231,-87.615640,2121,2123,
1,ASHLAND AND DIVERSEY,2,41.932484,-87.668562,1623,1621,
2,IRVING PARK AND LARAMIE,2,41.953866,-87.716609,1533,1534,
3,ELSTON AND IRVING PARK,2,41.953838,-87.720427,1503,1502,
4,IRVING PARK AND KILPATRICK,2,41.953866,-87.716609,2764,2763,
...,...,...,...,...,...,...,...
178,MILWAUKEE AND CENTRAL,1,41.975350,-87.767633,3022,,
179,FOSTER AND NORTHWEST HIGHWAY,1,41.975791,-87.769277,3003,,
180,MICHIGAN AND ONTARIO,2,41.893426,-87.625362,3084,3082,
181,MICHIGAN AND JACKSON,2,41.878217,-87.624456,3041,3043,


In [367]:
# Now put it into the db
# Create a db
conn = create_connection('database/rlc.db')  # function from myfuncs file
c = conn.cursor()
#conn.close()



sqlite3 version: 2.6.0
connected to database/rlc.db


## Create the table for intersection locations and cameras

In [368]:

def create_table(c, mytable, cols):
    '''
    Creates a new table 
    #Sets a constraint on db (UNIQUE) so that if you have camera_id and violation_date the same...
    '''
    
    flat_cols = []
    for col in cols:
        for item in col:
            flat_cols.append(item) 
    
    my_sql = 'CREATE TABLE {} ' + '(' + '{} ' * len(flat_cols) + ');'
    my_sql = my_sql.format(mytable, *flat_cols)  # insert datatypes for cols

    print(my_sql) # just to see what I'm doing

    
    try:     
        c.execute(my_sql) 
    except Exception as e:
        print('\nCREATE TABLE', mytable, 'FAILED!!',  e)


        
cols = [
        ['intersection', 'text'], 
        ['n_cams', 'int'],
        ['address', 'text'],
        ['latitude', 'real'],
        ['longitude', 'real'],
        ['camera_id1', 'int'],
        ['camera_id2', 'int'],
        ['camera_id3', 'int'],
        ]


create_table(c, 'intersection_cams', cols)
print()
conn.commit()

CREATE TABLE intersection_cams (intersection text n_cams int address text latitude real longitude real camera_id1 int camera_id2 int camera_id3 int );

CREATE TABLE intersection_cams FAILED!! table intersection_cams already exists



In [369]:
sql_fetch_tables(c, conn)  # helper function in myfuncs
delete_all_entries(c, conn, 'intersection_cams')
final_ints_df.to_sql('intersection_cams', conn, if_exists='replace', index = False)


[('cam_startend',), ('cam_locations',), ('intersection_locations',), ('daily_violations',), ('intersection_cams',), ('signal_crashes',)]


In [370]:
query = c.execute("SELECT camera_id1 FROM intersection_cams;").fetchall()
print(query[:5])
print(len(query))

[('2121',), ('1623',), ('1533',), ('1503',), ('2764',)]
183


## Add table for all crashes

before finalizing my crash data, I want to tag all of the crashes that occured at my rlc intersections.

This might be difficult.


In [371]:
# what is the closest rlc intersection to each accident.  
# The code takes hours to run on macbook
    


def closest_rlc(camloc_df, lat, long, m):
    # m is meters distance threshold for determining at the correct intersection
    latlong = (lat, long)
    for i in range(len(camloc_df)):
        loc = ((camloc_df.iloc[i]['latitude'], camloc_df.iloc[i]['longitude']))
        if distance.distance(loc, latlong).meters < m:
            return camloc_df.iloc[i]['intersection']
            #print(loc, camloc_df.iloc[i]['intersection'], distance.distance(loc, latlong).meters)

    else:
        return None

    
    
#print(crash_df.iloc[1, :])
closest_rlc(camloc_df, df.iloc[1]['latitude'], df.iloc[1]['longitude'], 30)
#df['intersection'] = 
df[:5].apply(lambda x: closest_rlc(camloc_df, x.latitude, x.longitude, 30), axis=1)


3                 None
6     WESTERN AND 79TH
15                None
16                None
19                None
dtype: object

In [372]:
# The code above works but is a memory HOG.  Let's try something else.
# maybe a simpler function would work??
# chi_lat = 41.8781
deg_lat = 111070 # distance in meters
deg_long = 83000 # distance in meters 

def pythag_latlong(loc1, loc2):
    delta_lat = abs(loc1[0] - loc2[0]) * deg_lat
    delta_long = abs(loc1[1] - loc2[1]) * deg_long
    return (delta_lat**2 + delta_long**2)**0.5

def closest_rlc(camloc_df, lat, long, m):
    # m is meters distance threshold for determining at the correct intersection
    latlong = (lat, long)
    for i in range(len(camloc_df)):
        loc = ((camloc_df.iloc[i]['latitude'], camloc_df.iloc[i]['longitude']))
        if pythag_latlong(loc, latlong) < m:
            return camloc_df.iloc[i]['intersection']
            #print(loc, camloc_df.iloc[i]['intersection'], distance.distance(loc, latlong).meters)

    else:
        return None


intersect_list = []

for i in range(10): #(len(df)):
    intersect = closest_rlc(camloc_df, float(df.iloc[i]['latitude']), float(df.iloc[i]['longitude']), 30)
    intersect_list.append(intersect)




In [373]:
# Now I am desperate.  This takes too long to process.  Let's simplify it and make it a box instead.
box_side = 50  # effectively makes it check for crash being within 25m of interscection
box_lat = box_side / 111070 / 2 # 111070 is meters in deg lat in Chicago
box_long = box_side / 83000 / 2 # 83000 is meters in deg long in Chicago

def box_check(lat, long, ints_df):
    answer = (ints_df[  (ints_df['latitude'] > (lat - box_lat)) & 
                      (ints_df['latitude'] < (lat + box_lat)) &
                      (ints_df['longitude'] > (long - box_long)) &
                      (ints_df['longitude'] < (long + box_long))
                     ])
    if answer.empty: return None
    return answer['intersection'].values[0]
    
# THIS SEEMS TO WORK WITH SPEED AND ELIMINATES MEMORY PROBLEM
for i in range(5): #(len(df)):
    intersect = box_check(float(df.iloc[i]['latitude']), float(df.iloc[i]['longitude']), final_ints_df)
    print(intersect)
    
    
# MOMENT OF TRUTH
df['intersection'] = df.apply(lambda x: box_check(float(x.latitude), float(x.longitude), camloc_df), axis=1)



None
WESTERN AND 79TH
None
None
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[None,
 'WESTERN AND 79TH',
 None,
 None,
 None,
 None,
 None,
 None,
 'LAWRENCE AND WESTERN',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'HALSTED AND 95TH',
 'WESTERN AND FULLERTON',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'WESTERN AND ARMITAGE',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'LAFAYETTE AND 87TH',
 None,
 None,
 None,
 None,
 None,
 None,
 'ASHLAND AND LAWRENCE',
 None,
 None,
 None,
 None,
 None,
 'WESTERN AND FULLERTON',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'HOLLYWOOD AND SHERIDAN',
 None,
 None,
 None,
 None,
 'DIVERSEY AND WESTERN',
 'PULASKI AND MONTROSE',
 None,
 None,
 None,
 'BROADWAY/SHERIDAN AND DEVON',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'CICERO AND 47TH',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'PULASKI AND NORTH',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [374]:
df['intersection'].head()

3                 None
6     WESTERN AND 79TH
15                None
16                None
19                None
Name: intersection, dtype: object

In [375]:
cols = [
        ['crash_record_id', 'text'],
        ['rd_no', 'int'],
        ['crash_date', 'numeric'],
        ['posted_speed_limit', 'int'],
        ['traffic_control_device', 'text'],
        ['device_condition', 'text'],
        ['weather_condition', 'text'],
        ['lighting_condition', 'text'],
        ['first_crash_type', 'text'],
        ['trafficway_type', 'text'],
        ['alignment', 'text'],
        ['roadway_surface_cond', 'text'],
        ['road_defect', 'text'],
        ['report_type', 'text'],
        ['crash_type', 'text'],
        ['hit_and_run_i', 'text'],
        ['damage', 'text'],
        ['prim_contributory_cause', 'text'],
        ['sec_contributory_cause', 'text'],
        ['street_no', 'int'],
        ['street_direction', 'text'],
        ['street_name', 'text'],
        ['beat_of_occurrence', 'int'],
        ['num_units', 'int'],
        ['most_severe_injury', 'text'],
        ['injuries_total', 'int'],
        ['injuries_fatal', 'int'],
        ['injuries_incapacitating', 'int'],
        ['injuries_non_incapacitating', 'int'],
        ['injuries_reported_not_evident', 'int'],
        ['injuries_no_indication', 'int'],
        ['injuries_unknown', 'int'],
        ['crash_hour', 'int'],
        ['crash_day_of_week', 'int'],
        ['crash_month', 'int'],
        ['latitude', 'real'],
        ['longitude', 'real'],
        ['lane_cnt', 'int'],
        ['private_property_i', 'text'],
        ['intersection_related_i', 'text'],
        ['crash_date_est_i', 'text'],
        ['statements_taken_i', 'text'],
        ['work_zone_i', 'text'],
        ['work_zone_type', 'text'],
        ['workers_present_i', 'text'],
        ['intersection', 'text']  # THIS ONE STILL NEEDS TO BE BUILT
        ]


create_table(c, 'signal_crashes', cols)
print()
conn.commit()

CREATE TABLE signal_crashes (crash_record_id text rd_no int crash_date numeric posted_speed_limit int traffic_control_device text device_condition text weather_condition text lighting_condition text first_crash_type text trafficway_type text alignment text roadway_surface_cond text road_defect text report_type text crash_type text hit_and_run_i text damage text prim_contributory_cause text sec_contributory_cause text street_no int street_direction text street_name text beat_of_occurrence int num_units int most_severe_injury text injuries_total int injuries_fatal int injuries_incapacitating int injuries_non_incapacitating int injuries_reported_not_evident int injuries_no_indication int injuries_unknown int crash_hour int crash_day_of_week int crash_month int latitude real longitude real lane_cnt int private_property_i text intersection_related_i text crash_date_est_i text statements_taken_i text work_zone_i text work_zone_type text workers_present_i text intersection text );

CREATE TAB

In [378]:
sql_fetch_tables(c, conn)  # helper function in myfuncs
delete_all_entries(c, conn, 'signal_crashes')
df.to_sql('signal_crashes', conn, if_exists='replace', index = False)


[('cam_startend',), ('cam_locations',), ('intersection_locations',), ('daily_violations',), ('intersection_cams',), ('signal_crashes',)]


In [379]:
query = c.execute("SELECT * FROM signal_crashes;").fetchall()
for q in query[:2]:
    print(q)
    print()
print(len(query))

('3358dc03b2fe52149c5901afd7221a9ebd509e3dfbaf496acad79d6c6a1125ec28f51d4eef6ddbd429a053c1916292640967d7dc007cce3366d861e74fe83548', 'JC366373', '2019-07-27T02:16:00.000', '30', 'TRAFFIC SIGNAL', 'FUNCTIONING PROPERLY', 'CLEAR', 'DARKNESS, LIGHTED ROAD', 'ANGLE', 'FOUR WAY', 'STRAIGHT AND LEVEL', 'DRY', 'NO DEFECTS', 'ON SCENE', 'INJURY AND / OR TOW DUE TO CRASH', None, 'OVER $1,500', 'DISREGARDING TRAFFIC SIGNALS', 'NOT APPLICABLE', '1400', 'S', 'CENTRAL PARK AVE', '1011', '2', 'NO INDICATION OF INJURY', '0', '0', '0', '0', '0', '3', '0', '2', '7', '7', '41.862315114', '-87.715448345', None, 'Y', None, None, None, None, None)

('33597a24ed9ab58921e1f7516cc25b94720fd292923e1f11a432bb369db732bf2c834702a761b4ed5c839f639e806a0779ea3c6dd9632853b47547206157f1b0', 'JA513137', '2017-11-15T16:42:00.000', '35', 'TRAFFIC SIGNAL', 'FUNCTIONING PROPERLY', 'CLEAR', 'DARKNESS', 'SIDESWIPE SAME DIRECTION', 'DIVIDED - W/MEDIAN (NOT RAISED)', 'STRAIGHT AND LEVEL', 'DRY', 'NO DEFECTS', 'ON SCENE', 'NO I

West Irving Park Road, Irving Park, Chicago, Jefferson Township, Cook County, Illinois, 60618, United States
(41.9538663, -87.7166086)


In [380]:
cams_df[cams_df['intersection']=='VAN BUREN AND WESTERN']  
# FIXED mistake here!!!  I HAD two locations. 
# The 2nd location is wrong.  Latlongs were slightly shifted for multiple cams in same intersection

Unnamed: 0,intersection
8,VAN BUREN AND WESTERN


What I currently have:
- all crashes at signals with long/lat postions

What I want
- add a rlc to any crash that happened at traffic signal within x meters of a rlc.  I am assuming that anyting outside of 30m is at another signal.  This thinking makes sense for chicago streets which are typically 50 to 100 m E/W and about 150 to 200 m N/S


Can use with geopy to get distance


In [312]:
# def closest_redlight(point, cams_df):
#     #distances = cams_df['location'].apply(lambda x: distance.distance(x, point).meters)
#     #return distances.min()  # this gives me the closest distance, but I want the closest camera

#     # 
#     distances = cams_df['camera_id'].apply(lambda x: [x, distance.distance(point, cams_df[cams_df['camera_id']==x]['location']).meters])
#     #closest = distances.apply(lambda x: x[1]).min()
#     #cam = distances[distances[0]==closet]
    
#     print(distances)
    


# signal_df2['latlong'][:1].apply(lambda x: closest_redlight(x, cams_df))

# ABOVE IS A CLOSE SOLUTON
# BELOW IS MY NEXT STEP.  I WOULD REALLY LIKE TO JUST RETURN THESE INDIVIDUALLY, BUT UNSURE IF I CAN.


# def close_redlight(point, cams_df):
#     #distances = cams_df['location'].apply(lambda x: distance.distance(x, point).meters)
#     #return distances.min()  # this gives me the closest distance, but I want the closest camera

#     # 
# #     distances = cams_df['camera_id'].apply(lambda x: [x, distance.distance(point, cams_df[cams_df['camera_id']==x]['location']).meters])
# #     my_min = min(distances, key=lambda x: x[1])
# #     #my_cam = distances[distances[0]==my_min]
    
#     # will see if loop is faster
#     for i in range(len(cams_df)):
#         print(cams_df.iloc[i]['location'])
    
#     return(my_min)
    

    
def closest_redlight(point, ints_df):

    # Try 3.  Will do a loop I can break from
    n = len(ints_df)
    threshold = 80
    
    for i in range(n):
        dist = distance.distance(ints_df.iloc[i]['location'], point).meters
        if dist < threshold:
            intersection = ints_df.iloc[i]['intersection'] 
            print(dist)
            break
    else:
        intersection = None 
      
    return intersection 
    
    

signal_df2['intersection'] = signal_df2['latlong'].apply(lambda x: closest_redlight(x, ints_df))

# TIMEIT ^^^^^^
# 100 rows takes 8s  (down from 1minute with original clunky code)
# We have 500k of them!!!!  That's 1000s or ~ 15min or so.

# TOMORROWS TASK
# add lat and long columns instead.  Don't measure distance p2p.  Lets just filter down to get ones less than threshold.
# we will end up checking a square basicaly, but it might be faster.


5.124864323976304
6.420731261493854
8.797828907998701
14.834090163121166
26.645000113056426
3.9160007016191306e-08
3.7313233922712326e-08
1.0783539382488574e-08
4.065226880762699e-08
62.70383513835724
29.505216060474446
76.45260004649793
53.745483087747424
2.8277777741445005e-08
10.855426812832361
28.44239805570159
3.8969473567225035e-08
23.43207235255013
24.383671698679233
50.27660567596613
10.005542137728836
4.7023402905368615e-08
2.3715293054500464
5.4602617183523594e-08
7.504959175819666
23.059687262088996
4.336840952845469e-08
3.4481764729986495e-08
7.626513257479981
3.335747472136621e-08
22.479128937115433
3.8969473567225035e-08
1.862616885007296e-08
7.723887739620867
3.8631921279757258
11.940071243391381
26.37786290497777
9.410992718969595
24.384045388135217
77.13006648140016
42.837995654779704
4.606408392720616e-08
9.272169887497903
4.606408392720616e-08
55.41199254639488
21.398718494078068
19.673348201150077
40.146865372101914
57.372392727053004
55.41199254639488
24.3839675670

26.645000113056426
23.300790763921817
4.7023402905368615e-08
17.53301697261736
8.212065890237638e-09
69.86197996953761
41.34006260409198
35.204185550995724
2.4990489207080733e-08
2.539673813230021e-08
57.43912719164704
4.578422102223326e-08
23.30530368699344
17.45854870154168
20.451915153034737
29.541080461132463
18.323263913038993
3.010657255893809
29.903665681495017
29.480303628500838
2.4990489207080733e-08
2.87017774175932e-08
7.562914454833557
3.247987735013498e-08
5.551989604998888e-08
24.38394751039059
2.4990489207080733e-08
29.270092920167222
71.29594865177812
23.410285022413525
53.89812201146573
3.4481764729986495e-08
24.383619210418125
49.001924113053775
28.4017278179278
16.133519704679852
6.271368474107226e-08
72.82072777551879
16.24911143063327
45.82464773323088
18.597935061451132
14.471932396231995
28.611942266025718
75.96059217701675
26.14281162320725
45.44710749972329
14.595640345536687
74.3931712708054
62.39981802852457
38.529089623721056
4.7023402905368615e-08
33.458020

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [358]:
print(signal_df2['intersection'].count(), len(signal_df2['intersection']))

NameError: name 'signal_df2' is not defined

In [382]:
rlc_crash_df = signal_df2[signal_df2['intersection'].notnull()]
not_rlc_crash_df = signal_df2[signal_df2['intersection'].isnull()]
len(not_rlc_crash_df)

NameError: name 'signal_df2' is not defined

# Add in closest red light camera intersection to my crashes.
If a camera is within the threshold specified, it will recored the intersection.
This is key to being able to link my rlc data and my crash data.

In [242]:
# what is the closest rlc intersection to each accident.  Is it under 50m
from geopy import distance



def closest_rlc(camloc_df, lat, long, m):
    # m is meters distance threshold for determining at the correct intersection
    latlong = (lat, long)
    for i in range(len(camloc_df)):
        loc = ((camloc_df.iloc[i]['latitude'], camloc_df.iloc[i]['longitude']))
        if distance.distance(loc, latlong).meters < m:
            return camloc_df.iloc[i]['intersection']
            #print(loc, camloc_df.iloc[i]['intersection'], distance.distance(loc, latlong).meters)

    else:
        return None

    
sql_fetch_tables(c, conn)  # helper function in myfuncs


camloc_df = pd.read_sql_query("SELECT * from cam_locations", conn)
    
#print(df.iloc[1, :])
closest_rlc(camloc_df, df.iloc[1]['latitude'], df.iloc[1]['longitude'], 50)
df['intersection'] = df.apply(lambda x: closest_rlc(camloc_df, x.latitude, x.longitude, 30), axis=1)



[('cam_startend',), ('cam_locations',), ('intersection_locations',), ('daily_violations',), ('signal_crashes',), ('intersection_cams',)]


KeyboardInterrupt: 