# Red Light Camera Natural Experiment

Linear regression analysis of factors affecting red light crashes

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from modules.myfuncs import *
import warnings
import numpy as np
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
import sqlite3
# import dask
# import dask.dataframe as dd
import gc

warnings.filterwarnings('ignore')
pd.options.display.max_rows = 100

In [2]:
conn = create_connection('database/rlc2.db')  # function from myfuncs file
c = conn.cursor()

sqlite3 version: 2.6.0
connected to database/rlc2.db


In [3]:
print(sql_fetch_tables(c, conn))



def table_info(c, conn):
    '''
    prints out all of the columns of every table in db
    c : cursor object
    conn : database connection object
    '''
    tables = c.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
    for table_name in tables:
        table_name = table_name[0]
        table = pd.read_sql_query("SELECT * from {} LIMIT 0".format(table_name), conn)
        print(table_name)
        for col in table.columns:
            print('\t-' + col)
        print()

table_info(c, conn)


[('intersection_chars',), ('cam_locations',), ('cam_startend',), ('daily_violations',), ('all_crashes',), ('hourly_congestion',), ('hourly_weather',), ('region_data',), ('intersection_cams',), ('signal_crashes',), ('all_hours',), ('int_startend',)]
intersection_chars
	-protected_turn
	-total_lanes
	-medians
	-exit
	-split
	-way
	-underpass
	-no_left
	-angled
	-triangle
	-one_way
	-turn_lanes
	-lat
	-long
	-rlc
	-intersection
	-daily_traffic

cam_locations
	-camera_id
	-intersection
	-address
	-violation_date
	-violations
	-latitude
	-longitude
	-month
	-day
	-weekday
	-year

cam_startend
	-camera_id
	-start
	-end

daily_violations
	-intersection
	-camera_id
	-address
	-violation_date
	-violations
	-latitude
	-longitude
	-month
	-day
	-weekday
	-year

all_crashes
	-crash_record_id
	-rd_no
	-crash_date
	-posted_speed_limit
	-traffic_control_device
	-device_condition
	-weather_condition
	-lighting_condition
	-first_crash_type
	-trafficway_type
	-alignment
	-roadway_surface_cond
	-road_def

## Query our data
I would like to get hourly intersection data with the following columns:


signal_crashes
	-crash_date
	-posted_speed_limit
	-device_condition
	-weather_condition
	-lighting_condition
	-first_crash_type
	-trafficway_type
	-alignment
	-roadway_surface_cond
	-road_defect
	-report_type
	-crash_type
	-damage
	-prim_contributory_cause
	-sec_contributory_cause
	-street_no
	-street_direction
	-street_name
	-most_severe_injury
	-injuries_total
	-injuries_fatal
	-injuries_incapacitating
	-injuries_non_incapacitating
	-injuries_reported_not_evident
	-injuries_no_indication
	-injuries_unknown
	-latitude
	-longitude
	-lane_cnt
	-work_zone_i
	-work_zone_type
	-workers_present_i
	-intersection
	-year
	-month
	-day
	-hour
	-region_id
	-time
	-weekday

hourly_weather
	-temp
	-rain_1h
	-rain_3h
	-snow_1h
	-snow_3h
	-time
	-year
	-month
	-day
	-hour
	-weekday

hourly_congestion
	-year
	-month
	-day
	-hour
	-region_id
	-bus_count
	-num_reads
	-speed
	-weekday
    
Tables will be queried to JOIN on year, month, day, hour

WHAT I REALLY NEED HERE
A single dataset with the following:
- daily intersection chars (all the stuff I entered)
- daily intersection crashes (injuries, deaths, number etc)
- daily wx (temp, precip mainly)
- daily congestion (might not need this)
- daily violations (already in the format)

In [9]:
# Approximately how many entries should I expect to be looking at after filtering my data
# 3 years, 4 months x 153 intersections
print((365.25*3 + 30*4) *153)


186009.75


## Query my db to get a DataFrame with crashes and intersection
We will be using the number of crashes and injuries to do some t-tests

In [74]:
# THis is for the times the cams were off

cam_df = pd.read_sql_query('''WITH v AS(SELECT d.year,
                                                d.month,
                                                d.day,
                                                d.intersection,
                                                dv.violations,
                                                dv.violation_date
                                               
                                            FROM all_hours as d
                                            
                                            LEFT JOIN daily_violations as dv
                                                ON d.year = dv.year
                                                AND d.month = dv.month
                                                AND d.day = dv.day
                                                AND d.intersection = dv.intersection
                                                
                                            ), 
                                    se AS(SELECT *
                                         FROM int_startend as se
                                    ),
                                    cr AS(SELECT *
                                         FROM signal_crashes
                                        )
                                        
                                    SELECT  v.*,
                                    
                                            se.start,
                                            se.end
                                    FROM v
                                    LEFT JOIN se
                                        ON se.intersection = v.intersection
                                    ''', conn)

crash_df = pd.read_sql_query('''SELECT * FROM signal_crashes''', conn)
                                        
violations_df = pd.read_sql_query('''SELECT * FROM daily_violations''', conn)  
days_df = pd.read_sql_query('''SELECT * FROM all_hours''', conn)  

In [77]:
days_df[days_df['intersection']=='111TH AND HALSTED']  # missing from all_hours??  Probably missing from int_df

Unnamed: 0,year,month,day,intersection


What does my cam_df look like?

In [78]:
cam_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535102 entries, 0 to 535101
Data columns (total 8 columns):
year              535102 non-null int64
month             535102 non-null int64
day               535102 non-null int64
intersection      535102 non-null object
violations        465430 non-null float64
violation_date    465430 non-null object
start             533305 non-null object
end               533305 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 32.7+ MB


In [79]:
# should be one entry per day per intersection.
crash_df.head()
cam_df.head()


Unnamed: 0,year,month,day,intersection,violations,violation_date,start,end
0,2016,1,1,115TH AND HALSTED,20.0,2016-01-01 00:00:00,2015-01-01 00:00:00,2017-10-26 00:00:00
1,2016,1,2,115TH AND HALSTED,6.0,2016-01-02 00:00:00,2015-01-01 00:00:00,2017-10-26 00:00:00
2,2016,1,2,115TH AND HALSTED,14.0,2016-01-02 00:00:00,2015-01-01 00:00:00,2017-10-26 00:00:00
3,2016,1,3,115TH AND HALSTED,2.0,2016-01-03 00:00:00,2015-01-01 00:00:00,2017-10-26 00:00:00
4,2016,1,3,115TH AND HALSTED,11.0,2016-01-03 00:00:00,2015-01-01 00:00:00,2017-10-26 00:00:00


What does my crash_df look like?

In [80]:
crash_df[crash_df['intersection']=='115TH AND HALSTED'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39 entries, 238 to 59637
Data columns (total 51 columns):
crash_record_id                  39 non-null object
rd_no                            39 non-null object
crash_date                       39 non-null object
posted_speed_limit               39 non-null object
traffic_control_device           39 non-null object
device_condition                 39 non-null object
weather_condition                39 non-null object
lighting_condition               39 non-null object
first_crash_type                 39 non-null object
trafficway_type                  39 non-null object
alignment                        39 non-null object
roadway_surface_cond             39 non-null object
road_defect                      39 non-null object
report_type                      36 non-null object
crash_type                       39 non-null object
damage                           39 non-null object
prim_contributory_cause          39 non-null object
sec_cont

#### change my dates to datetime objects
Need to do this so we can do some date calculations to see which entries had the cameras on or off.

In [81]:
cam_df[['start', 'end']] = cam_df[['start', 'end']].apply(pd.to_datetime)
crash_df['crash_date'] = crash_df['crash_date'].apply(pd.to_datetime)

#### Make sure all my intersections have start-end dates.  I was originally missing a few

In [82]:
cam_df.isna().sum()

year                  0
month                 0
day                   0
intersection          0
violations        69672
violation_date    69672
start              1797
end                1797
dtype: int64

In [83]:
cam_df[cam_df['start'].isna()]['intersection'].unique()
cam_df.dropna(subset=['start'], inplace=True)  # drop the few that don't have start-end.  Just a problem with my handmade dataset.  Can fix later

## Merge my DataFrames
Combine my camera info and crash info into a single df.  
Keep all of the cam_df info since it has all the date info.  Will join on

In [84]:
pd.options.display.max_rows = 200
all_df = cam_df.merge(crash_df, how='left', on=['year', 'month', 'day', 'intersection'])

In [85]:
all_df['date'] = all_df.apply(lambda x: datetime(int(x.year), int(x.month), int(x.day)), axis=1)

In [86]:
print(len(cam_df)) # should be same as all if everything merged properly
print(len(crash_df))
print(len(all_df))

print(len(crash_df.intersection.unique()))  
print(len(cam_df.intersection.unique()))

print(set(crash_df.intersection.unique()) - set(cam_df.intersection.unique()))  # missing very first one ???


533305
60338
533653
183
182
{'111TH AND HALSTED', 'Archer and Central', None}


In [87]:
all_df = all_df[all_df['date']>=datetime(2017,9,1)]

In [88]:
pd.options.display.max_columns = 1000
#all_df[all_df['year'].isna()]
#all_df.iloc[535444]
all_df.year.unique()

array([2017, 2018, 2019, 2020])

In [89]:
# need to determine if crash occurred in or outside of cam on dates

def rlc_state(start, end, my_date):

    if (end - my_date).days >= 0 and (my_date - start).days >= 0:
        return 1
    elif (my_date - end).days > 0:
        return 0
    elif (start - my_date).days > 0:
        return 0
    else:
        return None

all_df['rlc_on'] = all_df.apply(lambda x: rlc_state(x.start, x.end, x.date), axis=1)

In [100]:
all_df[all_df['intersection']=='115TH AND HALSTED']

Unnamed: 0,year,month,day,intersection,violations,violation_date,start,end,crash_record_id,rd_no,crash_date,posted_speed_limit,traffic_control_device,device_condition,weather_condition,lighting_condition,first_crash_type,trafficway_type,alignment,roadway_surface_cond,road_defect,report_type,crash_type,damage,prim_contributory_cause,sec_contributory_cause,street_no,street_direction,street_name,beat_of_occurrence,num_units,most_severe_injury,injuries_total,injuries_fatal,injuries_incapacitating,injuries_non_incapacitating,injuries_reported_not_evident,injuries_no_indication,injuries_unknown,crash_hour,crash_day_of_week,crash_month,latitude,longitude,lane_cnt,intersection_related_i,hit_and_run_i,crash_date_est_i,work_zone_i,work_zone_type,workers_present_i,hour,region_id,time,weekday,date,rlc_on
1154,2017,9,1,115TH AND HALSTED,7.0,2017-09-01 00:00:00,2015-01-01,2017-10-26,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-09-01,1
1155,2017,9,1,115TH AND HALSTED,11.0,2017-09-01 00:00:00,2015-01-01,2017-10-26,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-09-01,1
1156,2017,9,2,115TH AND HALSTED,6.0,2017-09-02 00:00:00,2015-01-01,2017-10-26,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-09-02,1
1157,2017,9,2,115TH AND HALSTED,7.0,2017-09-02 00:00:00,2015-01-01,2017-10-26,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-09-02,1
1158,2017,9,3,115TH AND HALSTED,4.0,2017-09-03 00:00:00,2015-01-01,2017-10-26,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-09-03,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2392,2020,11,27,115TH AND HALSTED,,,2015-01-01,2017-10-26,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-11-27,0
2393,2020,11,28,115TH AND HALSTED,,,2015-01-01,2017-10-26,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-11-28,0
2394,2020,11,29,115TH AND HALSTED,,,2015-01-01,2017-10-26,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-11-29,0
2395,2020,11,30,115TH AND HALSTED,,,2015-01-01,2017-10-26,,,NaT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-11-30,0


In [101]:
all_df[(all_df['intersection']=='115TH AND HALSTED') & (all_df['crash_date'].notnull())][['crash_date', 'start', 'end', 'rlc_on']]

Unnamed: 0,crash_date,start,end,rlc_on
1206,2017-09-27 08:00:00,2015-01-01,2017-10-26,1
1207,2017-09-27 08:00:00,2015-01-01,2017-10-26,1
1284,2017-11-15 07:30:00,2015-01-01,2017-10-26,0
1377,2018-02-16 20:15:00,2015-01-01,2017-10-26,0
1425,2018-04-05 15:24:00,2015-01-01,2017-10-26,0
1439,2018-04-19 13:50:00,2015-01-01,2017-10-26,0
1503,2018-06-22 17:40:00,2015-01-01,2017-10-26,0
1520,2018-07-09 22:21:00,2015-01-01,2017-10-26,0
1568,2018-08-26 12:00:00,2015-01-01,2017-10-26,0
1626,2018-10-23 22:09:00,2015-01-01,2017-10-26,0


In [102]:
all_on = all_df[all_df['rlc_on']==1]

In [103]:
len(all_on)

311104

In [104]:
all_off = all_df[all_df['rlc_on']==0]

In [105]:
len(all_off)
all_off.columns

Index(['year', 'month', 'day', 'intersection', 'violations', 'violation_date',
       'start', 'end', 'crash_record_id', 'rd_no', 'crash_date',
       'posted_speed_limit', 'traffic_control_device', 'device_condition',
       'weather_condition', 'lighting_condition', 'first_crash_type',
       'trafficway_type', 'alignment', 'roadway_surface_cond', 'road_defect',
       'report_type', 'crash_type', 'damage', 'prim_contributory_cause',
       'sec_contributory_cause', 'street_no', 'street_direction',
       'street_name', 'beat_of_occurrence', 'num_units', 'most_severe_injury',
       'injuries_total', 'injuries_fatal', 'injuries_incapacitating',
       'injuries_non_incapacitating', 'injuries_reported_not_evident',
       'injuries_no_indication', 'injuries_unknown', 'crash_hour',
       'crash_day_of_week', 'crash_month', 'latitude', 'longitude', 'lane_cnt',
       'intersection_related_i', 'hit_and_run_i', 'crash_date_est_i',
       'work_zone_i', 'work_zone_type', 'workers_present_

# My first try at t-test

In [106]:
on = all_on['crash_record_id'].notnull()
off = all_off['crash_record_id'].notnull()

In [107]:
on.value_counts()

False    301054
True      10050
Name: crash_record_id, dtype: int64

In [108]:
off.value_counts()

False    37480
True      1130
Name: crash_record_id, dtype: int64

In [109]:
print(on.sum(), len(on))
print(on.sum() / len(on))


print(off.sum(), len(off))
print(off.sum() / len(off))

10050 311104
0.03230430981279572
1130 38610
0.029267029267029267


In [110]:
on_rate = on.mean()
off_rate = off.mean()
expected_off = on_rate * len(off)


print('rlc on crash mean: {:.4f}'.format(on_rate))
print('rlc off crash mean: {:.4f}'.format(off_rate))
print()

print('Actual crashes with rlc off:', off.sum())
print('Expected crashes with rlc off: {:.0f}'.format(expected_off))


rlc on crash mean: 0.0323
rlc off crash mean: 0.0293

Actual crashes with rlc off: 1130
Expected crashes with rlc off: 1247


In [111]:
n = len(off)
p = on_rate
var = n * p * (1-p)
std = np.sqrt(var)

print(std)  # deviation from expected. We are way outside that

34.74157775193792


In [112]:

z_score = (off.sum() - expected_off)/std
print(z_score)

-3.3754771504440804


In [113]:
import scipy.stats as stats
p_val = stats.norm.sf(z_score) #or 1 - stats.norm.cdf(z_score)




print(p_val)


0.9996315609461888


In [114]:
# WHOA.  This went the wrong way. I was testing that turning camera off increases crashes, lets switch it up.

# Second try at t-test

It is possible that the previous AB test was misleading.  Only a small percentage of cameras actually have dates where they were turned on/off during the examined time period.  This time, I would like to test only the cameras that have off periods between Sept 2017 through 2020.

In [115]:
#p = 1 - stats.t.cdf(t, df)


In [116]:
all_df.columns

Index(['year', 'month', 'day', 'intersection', 'violations', 'violation_date',
       'start', 'end', 'crash_record_id', 'rd_no', 'crash_date',
       'posted_speed_limit', 'traffic_control_device', 'device_condition',
       'weather_condition', 'lighting_condition', 'first_crash_type',
       'trafficway_type', 'alignment', 'roadway_surface_cond', 'road_defect',
       'report_type', 'crash_type', 'damage', 'prim_contributory_cause',
       'sec_contributory_cause', 'street_no', 'street_direction',
       'street_name', 'beat_of_occurrence', 'num_units', 'most_severe_injury',
       'injuries_total', 'injuries_fatal', 'injuries_incapacitating',
       'injuries_non_incapacitating', 'injuries_reported_not_evident',
       'injuries_no_indication', 'injuries_unknown', 'crash_hour',
       'crash_day_of_week', 'crash_month', 'latitude', 'longitude', 'lane_cnt',
       'intersection_related_i', 'hit_and_run_i', 'crash_date_est_i',
       'work_zone_i', 'work_zone_type', 'workers_present_

In [117]:
select_cams = all_df[(all_df['start'] > datetime(2017,9,1)) | (all_df['end'] < datetime(2020,12,31))]

In [118]:
select_cams.isna().sum()

year                                 0
month                                0
day                                  0
intersection                         0
violations                       40080
violation_date                   40080
start                                0
end                                  0
crash_record_id                  63156
rd_no                            63156
crash_date                       63156
posted_speed_limit               63156
traffic_control_device           63156
device_condition                 63156
weather_condition                63156
lighting_condition               63156
first_crash_type                 63156
trafficway_type                  63156
alignment                        63156
roadway_surface_cond             63156
road_defect                      63156
report_type                      63212
crash_type                       63156
damage                           63156
prim_contributory_cause          63156
sec_contributory_cause   

In [119]:
experiment = select_cams[select_cams['rlc_on']==0]
control = select_cams[select_cams['rlc_on']==1]

In [120]:
print(len(control), len(experiment))

26273 38610


In [121]:
n_experiment = len(experiment)
crash_experiment = experiment['crash_record_id'].notnull()

n_control = len(control)
crash_control = control['crash_record_id'].notnull()

In [122]:
print(crash_experiment.sum(), n_experiment)
print(crash_control.sum(), n_control)

1130 38610
597 26273


In [123]:
on_rate = crash_control.mean()
off_rate = crash_experiment.mean()
expected_off = on_rate * len(experiment)


print('rlc on crash mean: {:.4f}'.format(on_rate))
print('rlc off crash mean: {:.4f}'.format(off_rate))
print()

print('Actual crashes with rlc off:', off.sum())
print('Expected crashes with rlc off: {:.0f}'.format(expected_off))

rlc on crash mean: 0.0227
rlc off crash mean: 0.0293

Actual crashes with rlc off: 1130
Expected crashes with rlc off: 877


In [124]:
n = len(off)
p = on_rate
var = n * p * (1-p)
std = np.sqrt(var)

print(std)  # deviation from expected. We are way outside that

29.28134921199512


In [125]:
z_score = (off.sum() - expected_off)/std
print(z_score)

8.628939694925773


In [126]:
import scipy.stats as stats
p_val = stats.norm.sf(z_score) #or 1 - stats.norm.cdf(z_score)




print(p_val)

3.0961664253679065e-18


### This gives a very different result.  Comparing only cams switched on/off.
Having a cam on is statistically significant for this group.

There could be another effect.  What if the camera that had the most down time also had more crashes.  That would skew the results dramatically

I believe the only real way to test this is by going through every camera of interest.



In [127]:
select_cams.intersection.unique()


array(['115TH AND HALSTED', '31ST AND CALIFORNIA', '75TH AND STATE',
       '83RD AND STONY ISLAND', '95TH AND STONEY ISLAND',
       'ARCHER/NARRAGANSETT AND 55TH', 'ASHLAND AND 47TH',
       'ASHLAND AND 63RD', 'ASHLAND AND ARCHER', 'ASHLAND AND DIVERSEY',
       'AUSTIN AND ADDISON', 'BLUE ISLAND AND DAMEN',
       'CENTRAL AND BELMONT', 'CENTRAL AND DIVERSEY',
       'CENTRAL AND FULLERTON', 'CENTRAL AND MILWAUKEE',
       'CORNELL DRIVE AND 57TH', 'COTTAGE GROVE AND 95TH',
       'ELSTON AND FOSTER', 'FOSTER AND NORTHWEST HIGHWAY',
       'FULLERTON AND NARRAGANSETT', 'GARFIELD AND ASHLAND',
       'GRAND AND OAK PARK', 'HALSTED AND 103RD', 'HALSTED AND 63RD',
       'HALSTED AND 83RD', 'HARLEM AND BELMONT',
       'HARLEM AND NORTHWEST HWY', 'IRVING PARK AND KEDZIE',
       'JEFFERY AND 79TH', 'KIMBALL AND LINCOLN', 'LAKE AND UPPER WACKER',
       'LINCOLN AND MCCORMICK', 'MADISON AND CENTRAL',
       'MICHIGAN AND JACKSON', 'MICHIGAN AND ONTARIO',
       'MILWAUKEE AND CENTRAL',

In [128]:
# Above is a list of the cams in question.

In [129]:
def calculate_pval(intersection, control, experiment):
    # control is on, experiment is off
    control = control[control['intersection']==intersection]
    experiment = experiment[experiment['intersection']==intersection]

    
    n_experiment = len(experiment)
    crash_experiment = experiment['crash_record_id'].notnull()

    n_control = len(control)
    crash_control = control['crash_record_id'].notnull()
    
    on_rate = crash_control.mean()
    off_rate = crash_experiment.mean()
    expected_off = on_rate * len(experiment)
    
    print()
    print('*'*20)
    print(intersection)
    print('rlc on crash mean: {:.4f}'.format(on_rate))
    print('rlc off crash mean: {:.4f}'.format(off_rate))
    print()

    print('Actual crashes with rlc off:', crash_experiment.sum())
    print('Expected crashes with rlc off: {:.0f}'.format(expected_off))
    
    n = len(off)
    p = on_rate
    var = n * p * (1-p)
    std = np.sqrt(var)
    
    print()
    print('SD: {:.2f}'.format(std))  # deviation from expected. We are way outside that
    z_score = (off.sum() - expected_off)/std
    print('z score: {:.2f}'.format(z_score))
    
    p_val = stats.norm.sf(z_score) #or 1 - stats.norm.cdf(z_score)
    print(p_val)
    
    if p_val < 0.05: 
        print('Null Hypothesis Rejected')
    else:
        print("Null Hypothesis Not Rejected")
    
for intersection in select_cams.intersection.unique():
    calculate_pval(intersection, control, experiment)


********************
115TH AND HALSTED
rlc on crash mean: 0.0180
rlc off crash mean: 0.0300

Actual crashes with rlc off: 34
Expected crashes with rlc off: 20

SD: 26.14
z score: 42.45
0.0
Null Hypothesis Rejected

********************
31ST AND CALIFORNIA
rlc on crash mean: nan
rlc off crash mean: 0.0185

Actual crashes with rlc off: 22
Expected crashes with rlc off: nan

SD: nan
z score: nan
nan
Null Hypothesis Not Rejected

********************
75TH AND STATE
rlc on crash mean: 0.0430
rlc off crash mean: nan

Actual crashes with rlc off: 0
Expected crashes with rlc off: 0

SD: 39.87
z score: 28.34
4.931961902046246e-177
Null Hypothesis Rejected

********************
83RD AND STONY ISLAND
rlc on crash mean: nan
rlc off crash mean: 0.0579

Actual crashes with rlc off: 69
Expected crashes with rlc off: nan

SD: nan
z score: nan
nan
Null Hypothesis Not Rejected

********************
95TH AND STONEY ISLAND
rlc on crash mean: nan
rlc off crash mean: 0.1134

Actual crashes with rlc off: 13

## Build out a balanced experiment and control
We have an imbalance. Each intersection is imbalanced cam_on/cam_off.  We are unable to use all of the data for this reason. If we lump all of the intersections together, the cams that were primarily on or primarily off will be overrepresented in their respective groups.  

I propose balancing every intersection.  If I only have 100 days with the camera off, I will only take 100 days of data from the camera on data.  We should also control for time of year if possible.  We could take random samples perhaps to minimize this effect. 

In [130]:
np.random.seed(42)

all_ints_on = pd.Series()
all_ints_off = pd.Series()


for myint in select_cams['intersection'].unique():
    myint_df = select_cams[select_cams['intersection']==myint]  # get all days regardless of on/off status
    cam_on = myint_df[myint_df.rlc_on==1]
    cam_off = myint_df[myint_df.rlc_on==0]
    cam_on = cam_on.sample(frac=1).reset_index(drop=True)
    cam_off = cam_off.sample(frac=1).reset_index(drop=True)

    
    balanced_n = min(cam_on.year.count(), cam_off.year.count())  # get the smaller size (control or experiment)
    if balanced_n:
        print('{:>30}: {}'.format(myint, balanced_n))
    
    # use balanced_n to add same amount to each series
    all_ints_on = all_ints_on.append(cam_on.iloc[:balanced_n]['crash_record_id'])
    all_ints_off = all_ints_off.append(cam_off.iloc[:balanced_n]['crash_record_id'])

print()
print(len(all_ints_on), 'days of cam on, and', len(all_ints_off), 'days of cam off.')
print(all_ints_on.count(), all_ints_off.count())

             115TH AND HALSTED: 111
         CENTRAL AND MILWAUKEE: 54
  FOSTER AND NORTHWEST HIGHWAY: 55
             HALSTED AND 103RD: 107
         LAKE AND UPPER WACKER: 173
          MICHIGAN AND JACKSON: 157
          MICHIGAN AND ONTARIO: 157
         MILWAUKEE AND CENTRAL: 54
  NORTHWEST HIGHWAY AND FOSTER: 26

894 days of cam on, and 894 days of cam off.
17 30


In [132]:
# after all that, we only have 894 in each dataset for the win.  We only have 9 cameras represented in the set.

In [179]:
# pulled info from coin flip analysis lab in mod 2 t-test


def calc_pval(control, experiment):
    # control is on, experiment is off
    #control = control[control['intersection']==intersection]
    #experiment = experiment[experiment['intersection']==intersection]

    n_experiment = len(experiment)
    crash_experiment = experiment.notnull()

    n_control = len(control)
    crash_control = control.notnull()
    
    on_rate = crash_control.mean()
    off_rate = crash_experiment.mean()
    expected_off = on_rate * n_experiment
    
    print('n days total: {}'.format(n_control + n_experiment))
    print('Total crashes in set: {}\n'.format(crash_control.sum() + crash_experiment.sum()))
    
    print('Camera ON mean crashes per day: {:.4f}'.format(on_rate))
    print('Camera OFF mean crashes per day: {:.4f}'.format(off_rate))
    print()

    print('Actual crashes with rlc off:', crash_experiment.sum())
    print('Expected crashes with rlc off: {:.0f}'.format(expected_off))
    
    n = len(off) # n for my experiment
    p = on_rate # mean for my control (expected mean for my experiment)
    var = n * p * (1-p)
    std = np.sqrt(var)
    
    print()
    print('std: {:.2f}'.format(std))  # deviation from expected. 
    z_score = (off.sum() - expected_off)/std
    print('z score: {:.2f}'.format(z_score))
    
    p_val = stats.norm.sf(z_score) # calc the pval from the zscore.
    print('p: {:.10e}'.format(p_val))
    print()
    
    if p_val < 0.05: 
        print('Null Hypothesis Rejected')
    else:
        print("Null Hypothesis Not Rejected")
        

calc_pval(all_ints_on, all_ints_off)
print('alpha:', 0.05)


n days total: 1788
Total crashes in set: 47

Camera ON mean crashes per day: 0.0190
Camera OFF mean crashes per day: 0.0336

Actual crashes with rlc off: 30
Expected crashes with rlc off: 17

std: 26.84
z score: 41.47
p: 0.0000000000e+00

Null Hypothesis Rejected
alpha: 0.05


# Bootstrap resampling method with t-test

### Make my two balanced date series

In [187]:
# bootstrap resampling
np.random.seed(42)

all_ints_on = pd.Series()
all_ints_off = pd.Series()


for myint in select_cams['intersection'].unique():
    myint_df = select_cams[select_cams['intersection']==myint]  # get all days regardless of on/off status
    cam_on = myint_df[myint_df.rlc_on==1]
    cam_off = myint_df[myint_df.rlc_on==0]
    cam_on = cam_on.sample(frac=1).reset_index(drop=True)
    cam_off = cam_off.sample(frac=1).reset_index(drop=True)

    
    balanced_n = min(cam_on.year.count(), cam_off.year.count())  # get the smaller size (control or experiment)
    if balanced_n:
        print('{:>30}: {}'.format(myint, balanced_n))
    
    # use balanced_n to add same amount to each series
    all_ints_on = all_ints_on.append(cam_on.iloc[:balanced_n]['crash_record_id'])
    all_ints_off = all_ints_off.append(cam_off.iloc[:balanced_n]['crash_record_id'])

print()
print(len(all_ints_on), 'days of cam on, and', len(all_ints_off), 'days of cam off.')
print(all_ints_on.count(), all_ints_off.count())

             115TH AND HALSTED: 111
         CENTRAL AND MILWAUKEE: 54
  FOSTER AND NORTHWEST HIGHWAY: 55
             HALSTED AND 103RD: 107
         LAKE AND UPPER WACKER: 173
          MICHIGAN AND JACKSON: 157
          MICHIGAN AND ONTARIO: 157
         MILWAUKEE AND CENTRAL: 54
  NORTHWEST HIGHWAY AND FOSTER: 26

894 days of cam on, and 894 days of cam off.
17 30


In [188]:
def bootstrap(A, B):
    universe = list(A) + list(B)
    universe_shuffled = np.random.choice(universe, size=len(universe), replace=True)
    new_a = universe_shuffled[:len(A)]
    new_b = universe_shuffled[len(A):]
    return new_a, new_b

In [212]:
# bootstrap resampling
np.random.seed(42)

all_ints_on = pd.Series()
all_ints_off = pd.Series()


for myint in select_cams['intersection'].unique():
    myint_df = select_cams[select_cams['intersection']==myint]  # get all days regardless of on/off status
    cam_on = myint_df[myint_df.rlc_on==1]
    cam_off = myint_df[myint_df.rlc_on==0]
    cam_on = cam_on.sample(frac=1).reset_index(drop=True)
    cam_off = cam_off.sample(frac=1).reset_index(drop=True)

    
    balanced_n = min(cam_on.year.count(), cam_off.year.count())  # get the smaller size (control or experiment)
    if balanced_n:
        print('{:>30}: {}'.format(myint, balanced_n))
    
    # use balanced_n to add same amount to each series
    all_ints_on = all_ints_on.append(cam_on.iloc[:balanced_n]['crash_record_id'])
    all_ints_off = all_ints_off.append(cam_off.iloc[:balanced_n]['crash_record_id'])

print()
print(len(all_ints_on), 'days of cam on, and', len(all_ints_off), 'days of cam off.')
print(all_ints_on.count(), all_ints_off.count())

             115TH AND HALSTED: 111
         CENTRAL AND MILWAUKEE: 54
  FOSTER AND NORTHWEST HIGHWAY: 55
             HALSTED AND 103RD: 107
         LAKE AND UPPER WACKER: 173
          MICHIGAN AND JACKSON: 157
          MICHIGAN AND ONTARIO: 157
         MILWAUKEE AND CENTRAL: 54
  NORTHWEST HIGHWAY AND FOSTER: 26

894 days of cam on, and 894 days of cam off.
17 30


In [213]:
exp = all_ints_off.notnull()  # series of True False, did crash occur
con = all_ints_on.notnull()


# bootstrap iterations
iterations = 10**5
diff_mu_a_b = np.mean(exp) - np.mean(con)
num = 0 #Initialize numerator



for n in range(iterations):
    ai, bi = bootstrap(exp, con)
    diff_mu_ai_bi = np.mean(ai) - np.mean(bi)
    if diff_mu_ai_bi >= diff_mu_a_b:
        num +=1
p_value = num / iterations
print('P-value: {}'.format(p_value))

P-value: 0.02767


Using the bootstrap resampling method with 10k iterations, we are able to get a very high confidence that the presence of red light cameras reduced crashes at the 9 intersections that were turned on/off.

In [214]:
len(exp)

894

# Same bootstrap resampling t-test with injuries 

In [215]:
# bootstrap resampling
np.random.seed(42)

all_ints_on = pd.Series()
all_ints_off = pd.Series()


for myint in select_cams['intersection'].unique():
    myint_df = select_cams[select_cams['intersection']==myint]  # get all days regardless of on/off status
    cam_on = myint_df[myint_df.rlc_on==1]
    cam_off = myint_df[myint_df.rlc_on==0]
    cam_on = cam_on.sample(frac=1).reset_index(drop=True)
    cam_off = cam_off.sample(frac=1).reset_index(drop=True)

    
    balanced_n = min(cam_on.year.count(), cam_off.year.count())  # get the smaller size (control or experiment)
    if balanced_n:
        print('{:>30}: {}'.format(myint, balanced_n))
    
    # use balanced_n to add same amount to each series
    all_ints_on = all_ints_on.append(cam_on.iloc[:balanced_n]['injuries_total'])
    all_ints_off = all_ints_off.append(cam_off.iloc[:balanced_n]['injuries_total'])

print()
print(len(all_ints_on), 'days of cam on, and', len(all_ints_off), 'days of cam off.')
print(all_ints_on.count(), all_ints_off.count())

             115TH AND HALSTED: 111
         CENTRAL AND MILWAUKEE: 54
  FOSTER AND NORTHWEST HIGHWAY: 55
             HALSTED AND 103RD: 107
         LAKE AND UPPER WACKER: 173
          MICHIGAN AND JACKSON: 157
          MICHIGAN AND ONTARIO: 157
         MILWAUKEE AND CENTRAL: 54
  NORTHWEST HIGHWAY AND FOSTER: 26

894 days of cam on, and 894 days of cam off.
17 30


In [217]:
exp = all_ints_off.fillna(0)
exp = exp.astype(int)
con = all_ints_on.fillna(0)
con= con.astype(int)

In [218]:
print('Injuries with cams on:  {}'.format(con.sum()))
print('Injuries with cams off: {}'.format(exp.sum()))

Injuries with cams on:  10
Injuries with cams off: 9


In [219]:
# bootstrap iterations
iterations = 10**3
diff_mu_a_b = np.mean(exp) - np.mean(con)
num = 0 #Initialize numerator



for n in range(iterations):
    ai, bi = bootstrap(con, exp)
    diff_mu_ai_bi = np.mean(ai) - np.mean(bi)
    if diff_mu_ai_bi >= diff_mu_a_b:
        num +=1
p_value = num / iterations
print('P-value: {}'.format(p_value))

P-value: 0.587


Results here are inconclusisive.  We cannot reject the null hypothesis that red light cameras have no affect on the number of injuries.
It should be noted that there are only 10 and 9 injuries in the dataset.  More data might yield a different result.

## T-test for dangerous crash types
#### Same thing now with types of crashes we are concerned with.  


In [238]:
# bootstrap resampling
np.random.seed(337)

all_ints_on = pd.Series()
all_ints_off = pd.Series()


for myint in select_cams['intersection'].unique():
    myint_df = select_cams[select_cams['intersection']==myint]  # get all days regardless of on/off status
    cam_on = myint_df[myint_df.rlc_on==1]
    cam_off = myint_df[myint_df.rlc_on==0]
    cam_on = cam_on.sample(frac=1).reset_index(drop=True)
    cam_off = cam_off.sample(frac=1).reset_index(drop=True)

    
    balanced_n = min(cam_on.year.count(), cam_off.year.count())  # get the smaller size (control or experiment)
    if balanced_n:
        print('{:>30}: {}'.format(myint, balanced_n))
    
    # use balanced_n to add same amount to each series
    all_ints_on = all_ints_on.append(cam_on.iloc[:balanced_n]['first_crash_type'])
    all_ints_off = all_ints_off.append(cam_off.iloc[:balanced_n]['first_crash_type'])

print()
print(len(all_ints_on), 'days of cam on, and', len(all_ints_off), 'days of cam off.')
print(all_ints_on.count(), all_ints_off.count())

             115TH AND HALSTED: 111
         CENTRAL AND MILWAUKEE: 54
  FOSTER AND NORTHWEST HIGHWAY: 55
             HALSTED AND 103RD: 107
         LAKE AND UPPER WACKER: 173
          MICHIGAN AND JACKSON: 157
          MICHIGAN AND ONTARIO: 157
         MILWAUKEE AND CENTRAL: 54
  NORTHWEST HIGHWAY AND FOSTER: 26

894 days of cam on, and 894 days of cam off.
17 31


In [239]:
all_ints_off.unique()

array([nan, 'REAR END', 'TURNING', 'PEDESTRIAN', 'ANGLE',
       'SIDESWIPE SAME DIRECTION'], dtype=object)

In [240]:
dangerous = ['ANGLE', 'TURNING', 'PEDESTRIAN']

def targeted_crash_count(crash_type):
    if crash_type in dangerous:
        return 1
    return 0

con = all_ints_on.apply(targeted_crash_count)
exp = all_ints_off.apply(targeted_crash_count)

In [243]:
print('Targeted crashes with cams on:  {}'.format(con.sum()))
print('Targeted crashes with cams off: {}'.format(exp.sum()))

Targeted crashes with cams on:  12
Targeted crashes with cams off: 17


In [244]:
# bootstrap iterations
iterations = 10**4
diff_mu_a_b = np.mean(exp) - np.mean(con)
num = 0 #Initialize numerator



for n in range(iterations):
    ai, bi = bootstrap(con, exp)
    diff_mu_ai_bi = np.mean(ai) - np.mean(bi)
    if diff_mu_ai_bi >= diff_mu_a_b:
        num +=1
p_value = num / iterations
print('P-value: {}'.format(p_value))

P-value: 0.1923


Small sample size.  The event was infrequent at best.

In [None]:
print(153*365*6)
cam_df.rlc_on.count()

In [None]:
crash_df.head()

In [None]:
cam_df.head()


In [None]:
def rlc_state(start, end, crash):
    if (end - crash).days > 0 and (crash - start).days > 0:
        return 1
    elif (crash - end).days > 0:
        return 0
    elif (start - crash).days > 0:
        return 0
    else:
        return None

cam_df['rlc_on'] = cam_df.apply(lambda x: rlc_state(x.start, x.end, x.crash_date), axis=1)

In [None]:
cam_df.crash_date.unique()
print(nocam_df.crash_date.notnull().sum())
print(nocam_df.crash_date.isnull().sum())

In [None]:
pd.options.display.max_rows = 1000
nocam_df[nocam_df.crash_date.notnull()]['crash_date'].describe()

In [None]:
print(365*5*153)

In [None]:
nocam_df.head()