## Imports

In [5]:

import pandas as pd
from sodapy import Socrata

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cityofchicago.org", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata(data.cityofchicago.org,
#                  MyAppToken,
#                  userame="user@example.com",
#                  password="AFakePassword")





## Grab preliminary data through query

In [14]:
# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
# data website https://data.cityofchicago.org/resource/85ca-t3if.json

crash_data = client.get("85ca-t3if", 
                     #where='violation_date BETWEEN \'2015-01-01\' AND \'2020-12-01\'',
                     where='crash_date BETWEEN \'2015-01-01T00:00:00.000\' AND \'2020-12-01T00:00:00.000\'',
                     limit=1000000,
                    )

# Convert to pandas DataFrame
all_df = pd.DataFrame.from_records(crash_data)

In [15]:
# 459k total

all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459400 entries, 0 to 459399
Data columns (total 49 columns):
crash_record_id                  459400 non-null object
rd_no                            459400 non-null object
crash_date                       459400 non-null object
posted_speed_limit               459400 non-null object
traffic_control_device           459400 non-null object
device_condition                 459400 non-null object
weather_condition                459400 non-null object
lighting_condition               459400 non-null object
first_crash_type                 459400 non-null object
trafficway_type                  459400 non-null object
alignment                        459400 non-null object
roadway_surface_cond             459400 non-null object
road_defect                      459400 non-null object
report_type                      448336 non-null object
crash_type                       459400 non-null object
damage                           459400 non-null 

In [22]:
# What's in this data?
for col in ['traffic_control_device', 'intersection_related_i']:
    print(col, all_df[col].unique())

traffic_control_device ['STOP SIGN/FLASHER' 'TRAFFIC SIGNAL' 'NO CONTROLS'
 'PEDESTRIAN CROSSING SIGN' 'OTHER' 'UNKNOWN' 'YIELD' 'OTHER REG. SIGN'
 'LANE USE MARKING' 'POLICE/FLAGMAN' 'RAILROAD CROSSING GATE'
 'SCHOOL ZONE' 'DELINEATORS' 'OTHER RAILROAD CROSSING'
 'RR CROSSING SIGN' 'BICYCLE CROSSING SIGN']
intersection_related_i ['Y' nan 'N']


In [20]:
# Let's clean up our dataset

# remove na locations
print(len(all_df))
print(len(all_df.dropna(subset=['location'])))
all_df.dropna(subset=['location'], inplace=True)  # drops about 2800 accidents with no recoreded location

459400
456899


In [30]:
# remove any accident that did not occur at an intersection (interesction_related_i)
print(len(all_df[all_df['intersection_related_i']=='Y']))  # drops about 80% of all accidents
print(len(all_df[all_df['traffic_control_device']=='TRAFFIC SIGNAL'])) # about the same
len( all_df[(all_df['intersection_related_i']=='Y') & (all_df['traffic_control_device']=='TRAFFIC SIGNAL')] ) 

# 59k have both in common.  Intersection related accident and also a traffic signal
# That's 10% of all accidents in the city occuring at lights over past 5 years

98166
127375


59342

intersection_related_i: A field observation by the police officer whether an intersection played a role in the crash. Does not represent whether or not the crash occurred within the intersection.

In [78]:
# How many were at traffic signals
signal_df = all_df[all_df['traffic_control_device']=='TRAFFIC SIGNAL']
other_df = all_df[all_df['traffic_control_device']!='TRAFFIC SIGNAL']
intrel_df = all_df[(all_df['intersection_related_i']=='Y')]
df = all_df[(all_df['intersection_related_i']=='Y')&(all_df['traffic_control_device']=='TRAFFIC SIGNAL')]

# how many crashes are at signals?
print('{:<25}{:<15}{:20}'.format('', 'n crash', 'Percent of total'))
print('{:<25}{:<15,}{:<20}'.format('All in study', len(all_df), '100%'))
print('{:<25}{:<15,}{:<4.1f}%'.format('At traffic signal', len(signal_df), len(signal_df)/len(all_df)*100))
print('{:<25}{:<15,}{:.1f}%'.format('Intersection-related', 
                                                    len(intrel_df), 
                                                    len(intrel_df)/len(all_df)*100
                                )
     )

print('{:<25}{:<15,}{:.1f}%'.format('Signal AND Intersection', 
                                                    len(df), 
                                                    len(df)/len(all_df)*100
                                )
     )
     


                         n crash        Percent of total    
All in study             456,899        100%                
At traffic signal        127,375        27.9%
Intersection-related     98,166         21.5%
Signal AND Intersection  59,342         13.0%


In [80]:
# what kind of crashes occur at traffic light, and how does it compare to other crashes?

def crash_stats(all_df, df):
    # by percentages
    crash_types = df['first_crash_type'].unique()

    print_list = []
    print('{:30}{:15}{:20}'.format("Type of Crash", 'Total crashes','Percent of total'))
    for crash in crash_types:
        num_crashes = len(df[df['first_crash_type']==crash])
        print_list.append([crash, num_crashes])
    
    print_list = sorted(print_list, key=lambda x: x[1], reverse=True)
    
    for crash, n in print_list: 
        print('{:30}{:<15d}{:>5.1f}%'.format(crash, n, 100 * n / len(df)))

def print_header(message):
    print('--'*20)
    print(message)
    print('--'*20)
        
def print_crash_stats(all_df, signal_df, other_df, signal_int_df):
    print('CRASH STATS')
    print('Total crashes in study:', len(all_df))
    print('Signal crashes: {:.1f}%'.format((100 * len(signal_df) / len(all_df))))

    print_header('CRASHES NOT AT SIGNAL')
    print('Total crashes:', len(other_df))
    print()
    crash_stats(all_df, other_df)
    print('\n\n')
    
    
    print_header('CRASHES AT SIGNAL')
    print('Total crashes:', len(signal_df))
    print()
    crash_stats(all_df, signal_df)
    print('\n\n')

    
    print_header('CRASHES AT SIGNAL AND INTERSECTION RELATED')
    print('Total crashes:', len(signal_int_df))
    print()
    crash_stats(all_df, signal_int_df)
    

        

        
print_crash_stats(all_df, signal_df, other_df, df)


CRASH STATS
Total crashes in study: 456899
Signal crashes: 27.9%
----------------------------------------
CRASHES NOT AT SIGNAL
----------------------------------------
Total crashes: 329524

Type of Crash                 Total crashes  Percent of total    
PARKED MOTOR VEHICLE          101773          30.9%
REAR END                      60579           18.4%
SIDESWIPE SAME DIRECTION      49609           15.1%
ANGLE                         36063           10.9%
TURNING                       33980           10.3%
FIXED OBJECT                  17943            5.4%
PEDESTRIAN                    6551             2.0%
SIDESWIPE OPPOSITE DIRECTION  5662             1.7%
PEDALCYCLIST                  4825             1.5%
OTHER OBJECT                  3931             1.2%
HEAD ON                       2983             0.9%
REAR TO FRONT                 1923             0.6%
OTHER NONCOLLISION            1336             0.4%
REAR TO SIDE                  1320             0.4%
REAR TO REAR  

In [278]:
cams_of_interest = ['2552', '2592', '2593', '2553', '3002', '3022', '3032', '3003', '3041', '3082', '3043', '3084', '3051', '3052']
cam_addresses = ['340 W UPPER WACKER DR', '10300 S HALSTED STREE',
       '11500 S HALSTED STREE', '200 N UPPER WACKER DR',
       '5232 N MILWAUKEE AVE', '300 S MICHIGAN AVE', '100 E ONTARIO ST',
       '628 N MICHIGAN AVE', '100 E JACKSON BLVD', '5616 W FOSTER AVE',
       '5200 N NORTHWEST HWY', '5232 N CENTRAL AVE', '800 W 115TH STREET',
       '800 W 103RD STREET']

cam_with_address =  [['2552', '11500 S HALSTED STREE', '115TH AND HALSTED'],
                     ['2592', '10300 S HALSTED STREE', 'HALSTED AND 103RD'],
                     ['2593', '800 W 103RD STREET', 'HALSTED AND 103RD'],
                     ['2553', '800 W 115TH STREET', '115TH AND HALSTED'],
                     ['3002', '5200 N NORTHWEST HWY', 'NORTHWEST HIGHWAY AND FOSTER'],
                     ['3022', '5232 N MILWAUKEE AVE', 'MILWAUKEE AND CENTRAL'],
                     ['3032', '5232 N CENTRAL AVE', 'CENTRAL AND MILWAUKEE'],
                     ['3003', '5616 W FOSTER AVE', 'FOSTER AND NORTHWEST HIGHWAY'],
                     ['3041', '300 S MICHIGAN AVE', 'MICHIGAN AND JACKSON'],
                     ['3082', '628 N MICHIGAN AVE', 'MICHIGAN AND ONTARIO'],
                     ['3043', '100 E JACKSON BLVD', 'MICHIGAN AND JACKSON'],
                     ['3084', '100 E ONTARIO ST', 'MICHIGAN AND ONTARIO'],
                     ['3051', '200 N UPPER WACKER DR', 'LAKE AND UPPER WACKER'],
                     ['3052', '340 W UPPER WACKER DR', 'LAKE AND UPPER WACKER']]

# this is just to fill in the 5% of data missing a lat long.
# I will use lat long proximity to determine intersection
#all_cams_missing_addresses = 


In [81]:
print(df.columns)  
print(df.street_name)

print(df.latitude.isna().sum())
print(df.location.isna().sum())


Index(['crash_record_id', 'rd_no', 'crash_date', 'posted_speed_limit',
       'traffic_control_device', 'device_condition', 'weather_condition',
       'lighting_condition', 'first_crash_type', 'trafficway_type',
       'alignment', 'roadway_surface_cond', 'road_defect', 'report_type',
       'crash_type', 'damage', 'date_police_notified',
       'prim_contributory_cause', 'sec_contributory_cause', 'street_no',
       'street_direction', 'street_name', 'beat_of_occurrence', 'num_units',
       'most_severe_injury', 'injuries_total', 'injuries_fatal',
       'injuries_incapacitating', 'injuries_non_incapacitating',
       'injuries_reported_not_evident', 'injuries_no_indication',
       'injuries_unknown', 'crash_hour', 'crash_day_of_week', 'crash_month',
       'lane_cnt', 'intersection_related_i', 'latitude', 'longitude',
       'location', 'statements_taken_i', 'hit_and_run_i', 'crash_date_est_i',
       'private_property_i', 'work_zone_i', 'work_zone_type',
       'workers_present_i

In [82]:
df.location.head()

4     {'type': 'Point', 'coordinates': [-87.74095358...
6     {'type': 'Point', 'coordinates': [-87.64328635...
7     {'type': 'Point', 'coordinates': [-87.62382803...
10    {'type': 'Point', 'coordinates': [-87.74095358...
18    {'type': 'Point', 'coordinates': [-87.66590234...
Name: location, dtype: object

In [10]:
#!pip install geopy

In [281]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="https://github.com/sciencelee/chicago_rlc")


location = geolocator.geocode("175 5th Avenue NYC")
print(location.address)
# out: Flatiron Building, 175, 5th Avenue, Flatiron, New York, NYC, New York, ...

print((location.latitude, location.longitude))
# out: (40.7410861, -73.9896297241625)

#print(location.raw)
# out: {'place_id': '9167009604', 'type': 'attraction', ...}


# CAN USE THIS TO FIGURE OUT MY LAT LONG FROM RLC ADDRESS (or crash)


Flatiron Building, 175, 5th Avenue, Flatiron District, Manhattan Community Board 5, Manhattan, New York County, New York, 10010, United States
(40.741059199999995, -73.98964162240998)


In [83]:
for col in all_df.columns:
    print(col)

crash_record_id
rd_no
crash_date
posted_speed_limit
traffic_control_device
device_condition
weather_condition
lighting_condition
first_crash_type
trafficway_type
alignment
roadway_surface_cond
road_defect
report_type
crash_type
damage
date_police_notified
prim_contributory_cause
sec_contributory_cause
street_no
street_direction
street_name
beat_of_occurrence
num_units
most_severe_injury
injuries_total
injuries_fatal
injuries_incapacitating
injuries_non_incapacitating
injuries_reported_not_evident
injuries_no_indication
injuries_unknown
crash_hour
crash_day_of_week
crash_month
lane_cnt
intersection_related_i
latitude
longitude
location
statements_taken_i
hit_and_run_i
crash_date_est_i
private_property_i
work_zone_i
work_zone_type
workers_present_i
photos_taken_i
dooring_i


In [283]:
# street no, street name, street direction?
for i in range(1):
    print(signal_df.iloc[i, :])
    
#     all_df['street_direction'].head() #N S E W
#     all_df['street_no'].head()  # just a number, so I guess street_direction goes with it.  Like 3700 N Ashland 
#     all_df['street_name'].head()

crash_record_id                  9c5243eb61a4234d0d23ba523b07433ece1364378ef43e...
crash_date                                                 2020-12-28T00:20:00.000
posted_speed_limit                                                              35
traffic_control_device                                              TRAFFIC SIGNAL
device_condition                                              FUNCTIONING PROPERLY
weather_condition                                                            CLEAR
lighting_condition                                          DARKNESS, LIGHTED ROAD
first_crash_type                                                      FIXED OBJECT
trafficway_type                                                           FOUR WAY
alignment                                                       STRAIGHT AND LEVEL
roadway_surface_cond                                                           DRY
road_defect                                                             NO DEFECTS
repo

In [91]:
# BRINING SOME STUFF IN FROM THE RED CAMERA INVESTIGATION
# this loads up the lat/long of every camera
import pickle
import pandas as pd


with open('used_ints.pkl', 'rb') as f:
    used_ints = pickle.load(f)

with open('used_cams.pkl', 'rb') as f:
    used_cams = pickle.load(f)

#print(used_ints)
print(len(used_ints))
print(len(used_cams))

#print(used_ints)

183
363


In [105]:


# let'make our cams into a DataFrame

ints_df = pd.DataFrame()

ints_df['intersection'] = used_ints.keys()
ints_df['location'] = ints_df['intersection'].apply(lambda x: used_ints[x][0])
ints_df['camera_ids'] = ints_df['intersection'].apply(lambda x: list(used_ints[x][1]))


ints_df['n_cams'] = ints_df['intersection'].apply(lambda x: len(used_ints[x][1]))
#cams_df['intersection'] = cams_df['address'].apply(lambda x: used_onts[x][2])


# we have way more cameras than intersections.  We need to consolidate
# some intersections have 3 or 4 cams.  Some have 1.
# we will focus on the intersection, not the individual camera for statistics
#print(len(ints_df.camera_id.unique()))
print(len(ints_df.intersection.unique()))

# NEED TO LOOK AT THE INTERSECTION NAMES TO MAKE SURE THEY MAKE SENSE
# WORRIED ABOUT NAMING CONVENTION 'ASHLAND AND CLARK' same as "CLARK AND ASHLAND"
ints_df.tail(50)
#print(ints_df.intersection.sort_values().unique())


# Data in this form works.  I have everything I need to start finding closest rlc to crash
# Take it back.  I do not have everything.  I really need to know the start and end dates for every intersection.
# This is tricky.  I need go through cam_ids, and find min start and max end for list.
# this would give me the start and end for each intersection.
# I am assuming, but do not know that cameras were installed as groups.  ASSUME??  Uh-oh. Need to know this.

# Rethinking this.  How about I just set up a database and go from there??

183


Unnamed: 0,intersection,location,camera_ids,n_cams
133,CANAL AND ROOSEVELT,"(41.86717512472001, -87.63937581258092)","[2291, 2294]",2
134,119TH AND HALSTED,"(41.677923389134385, -87.64198964584013)","[2404, 2402]",2
135,WESTERN AND 51ST,"(41.801080203554356, -87.68381820888895)","[2172, 2174]",2
136,ROOSEVELT AND HALSTED,"(41.86726309836901, -87.64697268601648)","[2234, 2233]",2
137,PULASKI AND MONTROSE,"(41.96105580370759, -87.72789120598296)","[1294, 1292]",2
138,STONEY ISLAND AND 79TH,"(41.75161152327319, -87.58559228428787)","[2464, 2462, 2461]",3
139,COTTAGE GROVE AND 71ST,"(41.76580035131893, -87.6057291269901)","[2571, 2572]",2
140,HAMLIN AND MADISON,"(41.88084147694413, -87.72090218457842)","[1901, 1903]",2
141,FULLERTON AND NARRAGANSETT,"(41.92378618871128, -87.78556720562165)","[1553, 1554]",2
142,KEDZIE AND ARMITAGE,"(41.91744617191702, -87.70711607187832)","[1832, 1834]",2


In [271]:
#HOLD OFF ON THIS.  NOT SURE IF I WILL USE THIS CODE

# Let's make another dataframe that contains intersections with all cams
# these will work as sql dbs too.  POINT FOR FUTURE
intersection_df = pd.DataFrame(columns=['intersection', 
                                        'location', 
                                        'latitude',
                                        'longitude',
                                        'address',
                                        'cameras'])

all_intersections = cams_df.intersection.unique()
intersection_df['intersection'] = all_intersections
#intersection_df['location'] = 
#intersection_df['intersection'].apply(lambda x: cams_df[cams_df['intersection']==x]['location'])

cams_df[cams_df['intersection']=='WESTERN AND 51ST']

#print(intersection_df.head(20))
#
#cams_df[cams_df['intersection']=='VAN BUREN AND WESTERN']
# WELL, THIS IS A PROBLEM.  Some of the camera locations are way off!!  Some are just fine.
# let's geocode the whole thing and go from there.  This will be a big hassle, but needs to be done to link
# the two datasets (cameras and crashes)



Unnamed: 0,address,location,camera_id,intersection
185,2172,"{'latitude': '41.801080203554356', 'longitude'...",2172,WESTERN AND 51ST
336,2174,"{'latitude': '41.801257491998776', 'longitude'...",2174,WESTERN AND 51ST


In [85]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="https://github.com/sciencelee/chicago_rlc")


location = geolocator.geocode("4700 W IRVING PARK ROAD, CHICAGO")  # NS then EW
print(location.address)
# out: Flatiron Building, 175, 5th Avenue, Flatiron, New York, NYC, New York, ...

print((location.latitude, location.longitude))
# out: (40.7410861, -73.9896297241625)

#print(location.raw)
# out: {'place_id': '9167009604', 'type': 'attraction', ...}


# CAN USE THIS TO FIGURE OUT MY LAT LONG FROM RLC ADDRESS (or crash)

West Irving Park Road, Irving Park, Chicago, Jefferson Township, Cook County, Illinois, 60618, United States
(41.9538663, -87.7166086)


In [87]:
# NEED TO FIX THE CAM ID STUFF>. I WANT MULTIPLE CAMS FOR SINGLE INTERSECTION.  AHHHHH!!!

df[df['intersection']=='IRVING PARK AND KILPATRICK']

KeyError: 'intersection'

In [294]:
cams_df[cams_df['intersection']=='VAN BUREN AND WESTERN']  # MISTAKE HERE!!!  I have two locations. The 2nd is wrong

Unnamed: 0,intersection,location,camera_id
8,VAN BUREN AND WESTERN,"(41.876162, -87.686438)",2054


What I currently have:
- all crashes at signals with long/lat postions
What I want
- add a rlc to any crash that happened at traffic signal within 20m of a rlc.  I am assuming this to be at the intersection.  I am assuming that anyting outside of 20m is at another signal.  This thinking makes sense for chicago streets which are typically 50 to 100 m E/W and about 150 to 200 m N/S


signal_df already has locations in Point format.  Can use with geopy to get distance


In [295]:
# lets make a longlat col instead of using location which ia in a point format.
# still want to keep the Point data in case we map it.


print(signal_df['location'].isna().sum())  # only 1% of data is missing. Should I just drop it?
signal_df2 = signal_df.dropna(subset=['location'])

# I may come back later to look up address for this data
print(len(signal_df))
print(len(signal_df2)) # 20 dropped


signal_df2['longlat'] = signal_df2['location'].apply(lambda x: x['coordinates'])
signal_df2['latlong'] = signal_df2['location'].apply(lambda x: [x['coordinates'][1], x['coordinates'][0]])



signal_df2.head()

# Note
#Length in meters of 1° of latitude = always 111.32 km
#Length in meters of 1° of longitude = 40075 km * cos( latitude ) / 360


20
2900
2880


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,crash_record_id,crash_date,posted_speed_limit,traffic_control_device,device_condition,weather_condition,lighting_condition,first_crash_type,trafficway_type,alignment,...,crash_date_est_i,private_property_i,dooring_i,work_zone_i,work_zone_type,workers_present_i,rd_no,lane_cnt,longlat,latlong
0,9c5243eb61a4234d0d23ba523b07433ece1364378ef43e...,2020-12-28T00:20:00.000,35,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,FOUR WAY,STRAIGHT AND LEVEL,...,,,,,,,,,"[-87.634042511085, 41.887040881068]","[41.887040881068, -87.634042511085]"
4,14d4e3963d036fa759cb3f3503bcec173acee575734864...,2020-12-27T22:26:00.000,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,...,,,,,,,,,"[-87.690451220174, 42.01238873061]","[42.01238873061, -87.690451220174]"
5,439da8beb92d7da0152c605589beed39954fb788a43cfb...,2020-12-27T22:00:00.000,45,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,T-INTERSECTION,STRAIGHT AND LEVEL,...,,,,,,,,,"[-87.61865463769, 41.867330715666]","[41.867330715666, -87.61865463769]"
6,8a3432a7ff84ab4a414740eebebbf0816d2313cdabaa04...,2020-12-27T22:00:00.000,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,RAIN,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,...,,,,,,,,,"[-87.722024093476, 41.931742123587]","[41.931742123587, -87.722024093476]"
10,d73adf6ac5fb07bbbc5eed98a9780674881b00136fae21...,2020-12-27T20:30:00.000,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",TURNING,FOUR WAY,STRAIGHT AND LEVEL,...,,,,,,,,,"[-87.701253788355, 41.877302828389]","[41.877302828389, -87.701253788355]"


In [296]:
# Example from geopy docs
from geopy import distance
newport_ri = (41.49008, -71.312796)
cleveland_oh = (41.499498, -81.695391)
print(distance.distance(newport_ri, cleveland_oh).miles)

# my example
rlc1 = signal_df2.iloc[0]['longlat']
print(signal_df2.iloc[0]['street_name'])
rlc2 = signal_df2.iloc[1]['longlat']
print(signal_df2.iloc[1]['street_name'])

print(rlc1, rlc2)
print(distance.distance(signal_df2.iloc[0]['longlat'], signal_df2.iloc[1]['longlat']).meters)


538.3904453677204
WACKER DR
TOUHY AVE
[-87.634042511085, 41.887040881068] [-87.690451220174, 42.01238873061]
6326.232694682955


In [312]:
# def closest_redlight(point, cams_df):
#     #distances = cams_df['location'].apply(lambda x: distance.distance(x, point).meters)
#     #return distances.min()  # this gives me the closest distance, but I want the closest camera

#     # 
#     distances = cams_df['camera_id'].apply(lambda x: [x, distance.distance(point, cams_df[cams_df['camera_id']==x]['location']).meters])
#     #closest = distances.apply(lambda x: x[1]).min()
#     #cam = distances[distances[0]==closet]
    
#     print(distances)
    


# signal_df2['latlong'][:1].apply(lambda x: closest_redlight(x, cams_df))

# ABOVE IS A CLOSE SOLUTON
# BELOW IS MY NEXT STEP.  I WOULD REALLY LIKE TO JUST RETURN THESE INDIVIDUALLY, BUT UNSURE IF I CAN.


# def close_redlight(point, cams_df):
#     #distances = cams_df['location'].apply(lambda x: distance.distance(x, point).meters)
#     #return distances.min()  # this gives me the closest distance, but I want the closest camera

#     # 
# #     distances = cams_df['camera_id'].apply(lambda x: [x, distance.distance(point, cams_df[cams_df['camera_id']==x]['location']).meters])
# #     my_min = min(distances, key=lambda x: x[1])
# #     #my_cam = distances[distances[0]==my_min]
    
#     # will see if loop is faster
#     for i in range(len(cams_df)):
#         print(cams_df.iloc[i]['location'])
    
#     return(my_min)
    

    
def closest_redlight(point, ints_df):

    # Try 3.  Will do a loop I can break from
    n = len(ints_df)
    threshold = 80
    
    for i in range(n):
        dist = distance.distance(ints_df.iloc[i]['location'], point).meters
        if dist < threshold:
            intersection = ints_df.iloc[i]['intersection'] 
            print(dist)
            break
    else:
        intersection = None 
      
    return intersection 
    
    

signal_df2['intersection'] = signal_df2['latlong'].apply(lambda x: closest_redlight(x, ints_df))

# TIMEIT ^^^^^^
# 100 rows takes 8s  (down from 1minute with original clunky code)
# We have 500k of them!!!!  That's 1000s or ~ 15min or so.

# TOMORROWS TASK
# add lat and long columns instead.  Don't measure distance p2p.  Lets just filter down to get ones less than threshold.
# we will end up checking a square basicaly, but it might be faster.


5.124864323976304
6.420731261493854
8.797828907998701
14.834090163121166
26.645000113056426
3.9160007016191306e-08
3.7313233922712326e-08
1.0783539382488574e-08
4.065226880762699e-08
62.70383513835724
29.505216060474446
76.45260004649793
53.745483087747424
2.8277777741445005e-08
10.855426812832361
28.44239805570159
3.8969473567225035e-08
23.43207235255013
24.383671698679233
50.27660567596613
10.005542137728836
4.7023402905368615e-08
2.3715293054500464
5.4602617183523594e-08
7.504959175819666
23.059687262088996
4.336840952845469e-08
3.4481764729986495e-08
7.626513257479981
3.335747472136621e-08
22.479128937115433
3.8969473567225035e-08
1.862616885007296e-08
7.723887739620867
3.8631921279757258
11.940071243391381
26.37786290497777
9.410992718969595
24.384045388135217
77.13006648140016
42.837995654779704
4.606408392720616e-08
9.272169887497903
4.606408392720616e-08
55.41199254639488
21.398718494078068
19.673348201150077
40.146865372101914
57.372392727053004
55.41199254639488
24.3839675670

26.645000113056426
23.300790763921817
4.7023402905368615e-08
17.53301697261736
8.212065890237638e-09
69.86197996953761
41.34006260409198
35.204185550995724
2.4990489207080733e-08
2.539673813230021e-08
57.43912719164704
4.578422102223326e-08
23.30530368699344
17.45854870154168
20.451915153034737
29.541080461132463
18.323263913038993
3.010657255893809
29.903665681495017
29.480303628500838
2.4990489207080733e-08
2.87017774175932e-08
7.562914454833557
3.247987735013498e-08
5.551989604998888e-08
24.38394751039059
2.4990489207080733e-08
29.270092920167222
71.29594865177812
23.410285022413525
53.89812201146573
3.4481764729986495e-08
24.383619210418125
49.001924113053775
28.4017278179278
16.133519704679852
6.271368474107226e-08
72.82072777551879
16.24911143063327
45.82464773323088
18.597935061451132
14.471932396231995
28.611942266025718
75.96059217701675
26.14281162320725
45.44710749972329
14.595640345536687
74.3931712708054
62.39981802852457
38.529089623721056
4.7023402905368615e-08
33.458020

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [313]:
print(signal_df2['intersection'].count(), len(signal_df2['intersection']))

482 2880


In [324]:
rlc_crash_df = signal_df2[signal_df2['intersection'].notnull()]
not_rlc_crash_df = signal_df2[signal_df2['intersection'].isnull()]
len(not_rlc_crash_df)

2398

In [326]:
# FROM ABOVE 
# what kind of crashes occur at traffic light, and how does it compare to other crashes?

def crash_stats(all_df, df):
    # by percentages
    crash_types = df['first_crash_type'].unique()

    print_list = []
    print('{:30}{:15}{:20}'.format("Type of Crash", 'Total crashes','Percent of total'))
    for crash in crash_types:
        num_crashes = len(df[df['first_crash_type']==crash])
        print_list.append([crash, num_crashes])
    
    print_list = sorted(print_list, key=lambda x: x[1], reverse=True)
    
    for crash, n in print_list: 
        print('{:30}{:<15d}{:>5.1f}%'.format(crash, n, 100 * n / len(df)))

def print_header(message):
    print('--'*20)
    print(message)
    print('--'*20)
        
def print_crash_stats(all_df, signal_df, other_df):
    print('CRASH STATS')
    print('Total crashes in study:', len(all_df))
    print('Signal crashes: {:.1f}%'.format((100 * len(signal_df) / len(all_df))))

    print_header('CRASHES NOT AT SIGNAL')

    print('Total crashes:', len(other_df))
    print()
    crash_stats(all_df, other_df)
    print('\n\n')
    print_header('CRASHES AT SIGNAL')

    print('Total crashes:', len(signal_df))
    crash_stats(all_df, signal_df)




print_crash_stats(signal_df2, rlc_crash_df, not_rlc_crash_df)

CRASH STATS
Total crashes in study: 2880
Signal crashes: 16.7%
----------------------------------------
CRASHES NOT AT SIGNAL
----------------------------------------
Total crashes: 2398

Type of Crash                 Total crashes  Percent of total    
REAR END                      846             35.3%
TURNING                       581             24.2%
ANGLE                         366             15.3%
SIDESWIPE SAME DIRECTION      287             12.0%
FIXED OBJECT                  90               3.8%
PEDESTRIAN                    59               2.5%
PARKED MOTOR VEHICLE          54               2.3%
PEDALCYCLIST                  29               1.2%
HEAD ON                       26               1.1%
SIDESWIPE OPPOSITE DIRECTION  21               0.9%
REAR TO FRONT                 21               0.9%
REAR TO SIDE                  7                0.3%
OVERTURNED                    5                0.2%
OTHER OBJECT                  3                0.1%
OTHER NONCOLLISION