# Need Data Dictionary for TAMS

In [2]:
import pandas as pd
import ris
from ris import db2
from ris.db2 import pg_shp as shp
import os
from tqdm import tqdm
from IPython.display import clear_output
import getpass
import datetime
from IPython.display import Markdown
from dateutil.relativedelta import relativedelta
# Stamp the notebook with the run time, the current user and the ris version.
# The <strong> tag is closed so the bold styling cannot leak into later output.
Markdown('<strong>Notebook run on: {} | by {} | Using ris library version: {}</strong>'.format(
    datetime.datetime.now().strftime('%Y-%m-%d %H:%M'), getpass.getuser(), ris.__version__
))

<strong>Notebook run on: 2019-08-15 15:27 | by bfeng | Using ris library version: 1.3.10

In [4]:
# Open the CRASHDATA Postgres connection; in the recorded run this call
# prompted for a user name and password.
db = db2.PostgresDb('dotdevrhpgsql01','CRASHDATA')

User name (CRASHDATA):bfeng
Password (CRASHDATA)········


In [5]:
# Connection to the FORMS SQL Server database used by all_crashes().
# Credentials are read from the environment (FORMS_DB_USER / FORMS_DB_PASS) and
# fall back to an interactive prompt, so no user name or password is stored in
# the notebook itself.
forms = db2.SqlDb(
    'dot55sql01', 'forms',
    user=os.environ.get('FORMS_DB_USER') or input('User name (forms): '),
    db_pass=os.environ.get('FORMS_DB_PASS') or getpass.getpass('Password (forms): '),
    quiet=True,
)

In [6]:
# nodes = db2.query_to_table(db,'select nodeid from node where is_int = true').nodeid.tolist()

In [7]:
# Query window: the three calendar years ending today, as 'YYYY-MM-DD' strings.
# Both bounds are derived from a single timestamp so they cannot disagree if the
# cell happens to run across midnight.
_run_time = datetime.datetime.now()
end = _run_time.strftime('%Y-%m-%d')
start = (_run_time - relativedelta(years=3)).strftime('%Y-%m-%d')

# Leap Years?

In [8]:
@db2.timeDec
def all_crashes():
    """Query FORMS for crashes between the module-level ``start`` and ``end``.

    The SQL is the union of two selects over ``forms.dbo.wc_accident_f``, both
    limited to ``year(accident_dt) >= 2017`` and the ``start``/``end`` range:

    * rows that already carry ``nodeid`` or ``lion_node_number``;
    * rows with neither, matched to ``v_IntersectionStreetNames_Gen`` on the
      trimmed, lower-cased on/cross street names, where the police precinct
      range (mapped to 1-5) equals the first character of ``b7sc_1`` or
      ``b7sc_2``; the smallest matching ``nodeid`` is kept per crash.

    Rows are ordered by ``nodeid`` and then by accident date, newest first.

    Returns whatever ``db2.query_to_table`` returns for that query; the
    selected columns are ``nodeid``, ``integration_id`` and ``accident_dt``.
    """
    crash_sql = """select coalesce(c.nodeid, c.lion_node_number) nodeid, 
    integration_id, cast(c.accident_dt as date) accident_dt
    from forms.dbo.wc_accident_f c
    where year(c.accident_dt) >= 2017
    and accident_dt between '{start}' and '{end}'
    and coalesce(c.nodeid, c.lion_node_number) is not null
    union select min(s.nodeid) nodeid, integration_id, cast(c.accident_dt as date) accident_dt
    from forms.dbo.wc_accident_f c
    join [FORMS].[dbo].[v_IntersectionStreetNames_Gen] s
    on 
    lower(ltrim(rtrim(c.SRC_ON_STREET))) = lower(ltrim(rtrim(s.street_1))) 
    and
    lower(ltrim(rtrim(c.SRC_cross_STREET))) = lower(ltrim(rtrim(s.street_2)))
    and 
    case WHEN C.[SRC_POLICE_PRECINCT] BETWEEN 1 AND 34 THEN 1
        WHEN C.[SRC_POLICE_PRECINCT] BETWEEN 40 AND 52 THEN 2
        WHEN C.[SRC_POLICE_PRECINCT] BETWEEN 60 AND 94 THEN 3
        WHEN C.[SRC_POLICE_PRECINCT] BETWEEN 100 AND 115 THEN 4
        WHEN C.[SRC_POLICE_PRECINCT] BETWEEN 120 AND 123 THEN 5 end 
    in (left(b7sc_2, 1), left(b7sc_1, 1))
    where year(c.accident_dt) >= 2017
    and accident_dt between '{start}' and '{end}'
    and coalesce(c.nodeid, c.lion_node_number) is null
    group by integration_id, cast(c.accident_dt as date)
    order by nodeid, cast(c.accident_dt as date) desc
        """
    # Fill in the date window taken from the notebook-level start/end strings.
    dated_sql = crash_sql.format(start=start, end=end)
    return db2.query_to_table(forms, dated_sql)

In [29]:
# Run the FORMS crash query (about 55 s in the recorded run).
crashes = all_crashes()

'all_crashes' 55.23 sec


In [30]:
# Preview the first rows of the query result.
crashes.head()

Unnamed: 0,nodeid,integration_id,accident_dt
0,25,103718123,2018-07-20
1,26,6617123,2017-01-11
2,27,127318123,2018-09-04
3,27,143317123,2017-09-26
4,51,73718123,2018-05-22


# Need Preventable Crashes

In [31]:
# @db2.timeDec

# def preventable_crashes(crashes):
    

In [33]:
def crashes_5(crashes, min_crashes=5):
    """Keep only the crashes at nodes that have at least ``min_crashes`` crashes.

    Parameters
    ----------
    crashes : pandas.DataFrame
        Crash rows with at least ``nodeid`` and ``integration_id`` columns.
    min_crashes : int, default 5
        Minimum number of crashes (non-null ``integration_id`` values) a node
        needs for its rows to be kept.

    Returns
    -------
    pandas.DataFrame
        The qualifying rows plus a ``crashes`` column holding the per-node
        count. The result is an independent copy, so later cells can add or
        overwrite columns on it without a SettingWithCopyWarning; the input
        frame is not modified.
    """
    per_node = (
        crashes.groupby('nodeid', as_index=False)
        .integration_id.count()
        .rename(columns={'integration_id': 'crashes'})
    )
    # Merge on the shared column(s) so every crash row carries its node's count.
    with_counts = crashes.merge(per_node)
    return with_counts.loc[with_counts.crashes >= min_crashes].copy()

In [48]:
# Keep only nodes with at least five crashes. This rebinds `crashes`, so the
# unfiltered query result is no longer available under that name.
crashes = crashes_5(crashes)

In [49]:
@db2.timeDec
def crash_diff(preventables):
    """Add a ``dif`` column: each crash date minus the date four rows later at the same node.

    ``accident_dt`` is converted to datetimes first. Within each ``nodeid``
    group, ``dif`` is ``accident_dt`` minus the ``accident_dt`` four rows
    further down; the last four rows of every group have no such partner and
    get ``NaT``.

    The frame passed in is modified in place and also returned.
    """
    preventables['accident_dt'] = pd.to_datetime(preventables['accident_dt'])
    dates_by_node = preventables.groupby('nodeid')['accident_dt']
    preventables['dif'] = dates_by_node.diff(periods=-4)
    return preventables

In [50]:
# Adds the `dif` column to `crashes` and displays the returned frame.
crash_diff(crashes)

'crash_diff' 5.51 sec


Unnamed: 0,nodeid,integration_id,accident_dt,dif,crashes
44,129,0169318123,2018-11-18,227 days,11
45,129,0168618123,2018-11-15,224 days,11
46,129,0076818123,2018-05-29,103 days,11
47,129,0067818123,2018-05-12,182 days,11
48,129,0050118123,2018-04-05,297 days,11
49,129,0050318123,2018-04-05,385 days,11
50,129,0026518123,2018-02-15,383 days,11
51,129,0169817123,2017-11-11,NaT,11
52,129,0087817123,2017-06-12,NaT,11
53,129,0039517123,2017-03-16,NaT,11


In [9]:
@db2.timeDec
def five_crash_window(preventables):
    """Flag crashes that complete a run of five crashes at one node within 365 days.

    Rows are expected to be ordered newest-first within each ``nodeid`` (the
    order produced by the crash query), so the date gap to the crash four rows
    earlier is zero or negative.

    Columns written (the input frame is modified in place and returned):

    * ``accident_dt``  - converted to datetimes.
    * ``dif``          - whole days between a crash and the crash four rows
      earlier at the same node (``NaN`` for the first four rows of a node).
    * ``five_yr``      - 1 when ``dif >= -365``, i.e. this crash and the four
      preceding rows span at most 365 days; otherwise 0.
    * ``window_start`` - for flagged rows, the date of the crash four rows
      earlier (the most recent crash of the five); ``NaT`` otherwise.
    * ``window_end``   - ``window_start`` minus one calendar year; ``NaT``
      otherwise.

    The lookup of the crash four rows earlier is positional within each node,
    so it does not depend on the frame having consecutive integer index
    labels, and both window columns are always created even when no row
    qualifies.
    """
    preventables['accident_dt'] = pd.to_datetime(preventables['accident_dt'])

    # Derive everything from the per-node dates before adding new columns.
    dates_by_node = preventables.groupby('nodeid')['accident_dt']
    day_gap = dates_by_node.diff(periods=4).dt.days
    fourth_prior = dates_by_node.shift(4)

    # NOTE(review): the flag uses a fixed 365 days while window_end uses one
    # calendar year, so a run spanning Feb 29 can be 366 days long, fall inside
    # the calendar-year window, and still be left unflagged.
    closes_window = day_gap >= -365

    preventables['dif'] = day_gap
    preventables['five_yr'] = closes_window.astype('int64')

    window_anchor = fourth_prior.where(closes_window)
    preventables['window_start'] = window_anchor
    preventables['window_end'] = window_anchor - pd.DateOffset(years=1)
    return preventables

In [10]:
# Annotate `crashes` with the five-crash window columns (about 31 s in the recorded run).
crash5 = five_crash_window(crashes)

'five_crash_window' 30.92 sec


In [11]:
# Show every crash at the nodes that have at least one row flagged five_yr == 1.
crash5.loc[crash5.nodeid.isin(crash5.loc[crash5.five_yr == 1].nodeid.tolist())]

Unnamed: 0,nodeid,integration_id,accident_dt,dif,five_yr,window_start,window_end
43,129,0169318123,2018-11-18,,0,NaT,NaT
44,129,0168618123,2018-11-15,,0,NaT,NaT
45,129,0076818123,2018-05-29,,0,NaT,NaT
46,129,0067818123,2018-05-12,,0,NaT,NaT
47,129,0050118123,2018-04-05,-227.0,1,2018-11-18,2017-11-18
48,129,0050318123,2018-04-05,-224.0,1,2018-11-15,2017-11-15
49,129,0026518123,2018-02-15,-103.0,1,2018-05-29,2017-05-29
50,129,0169817123,2017-11-11,-182.0,1,2018-05-12,2017-05-12
51,129,0087817123,2017-06-12,-297.0,1,2018-04-05,2017-04-05
52,129,0039517123,2017-03-16,-385.0,0,NaT,NaT


In [20]:
@db2.timeDec
def last_window(preventables_5):
    """Attach each node's latest window bounds and flag the crashes inside them.

    For every ``nodeid`` the maximum ``window_start`` and the maximum
    ``window_end`` are computed and left-merged back onto the crash rows as
    ``latest_start`` and ``latest_end``. Rows whose ``accident_dt`` lies
    between ``latest_end`` and ``latest_start`` (both inclusive) get
    ``last_in = 1``; all other rows are left without a value in that column.

    Returns the merged frame; the frame passed in is not modified.
    """
    def _node_max(column, alias):
        # Per-node maximum of `column`, returned under the name `alias`.
        per_node = preventables_5.groupby('nodeid', as_index=False)[column].max()
        return per_node.rename(index=str, columns={column: alias})

    bounds = _node_max('window_start', 'latest_start').merge(
        _node_max('window_end', 'latest_end'))
    flagged = preventables_5.merge(bounds, how='left')

    on_or_after_end = flagged.accident_dt >= flagged.latest_end
    on_or_before_start = flagged.accident_dt <= flagged.latest_start
    flagged.loc[on_or_after_end & on_or_before_start, 'last_in'] = 1
    return flagged

In [21]:
# Add latest_start / latest_end per node and the last_in flag.
last_window_crashes = last_window(crash5)

'last_window' 0.26 sec


In [146]:
# Same condition and same target column as the flagging done inside
# last_window(), applied again here to its result.
last_window_crashes.loc[(last_window_crashes.accident_dt >= last_window_crashes.latest_end) & \
                       (last_window_crashes.accident_dt <= last_window_crashes.latest_start), 'last_in'] = 1