In [1]:
import datetime
import os

import numpy as np
import pandas as pd
import pyreadr
# Raise pandas' display limits so frames with up to 1000 rows / 100 columns
# are rendered without being truncated.
for option_name, option_value in (('display.max_rows', 1000), ('display.max_columns', 100)):
    pd.set_option(option_name, option_value)

In [2]:
# define paths:
# Root of the shared project data folder. The default is the original
# hard-coded location; set the ETRE_DATA_PATH environment variable to run the
# notebook against a copy of the data stored elsewhere without editing code.
data_path = os.environ.get(
    'ETRE_DATA_PATH',
    '/Users/simonneumeyer/Dropbox/Ethiopia IE - Road Safety/Data/',
)
# File locations are built by plain string concatenation
# (data_path + <folder> + <file>), so the root must end with a separator.
if not data_path.endswith('/'):
    data_path += '/'

# Each dataset has its own folder (relative to data_path) and a file inside it.
crashes_path = 'ETRE - Crashes/'
crash_file = 'FinalData/crashes.csv'

traffic_path = 'ETRE - Traffic/'
traffic_file = 'FinalData/traffic.pq'

precipitation_path = 'Precipitation/'
precipitation_file = 'FinalData/precipitation.csv'

In [3]:
# load data
# Every input file lives at data_path + <dataset folder> + <file name>.
traffic_source = data_path + traffic_path + traffic_file
precipitation_source = data_path + precipitation_path + precipitation_file
crashes_source = data_path + crashes_path + crash_file

traffic_final = pd.read_parquet(traffic_source, engine='pyarrow')
precipitation_final = pd.read_csv(precipitation_source)

# Give the crash date and hour columns the short names used as merge keys
# ('date', 'hour') in the rest of the notebook.
crash_column_names = {'accident_date':'date', 'time_of_accident_hour':'hour'}
crashes_final = pd.read_csv(crashes_source).rename(columns=crash_column_names)


In [4]:
#aae_points_path = data_path + 'Addis Adama Expressway/Data/expressway/aae_points.Rds'
#aae_points = pyreadr.read_r(aae_points_path)
#aae_points = aae_points[None]
#aae_points

In [5]:
%%time
# create rectangularized segment-time dataset:
# Parse the date columns once, up front, so that the min/max, the group-by and
# every merge below work on real timestamps instead of raw CSV values.
precipitation_final.date = pd.to_datetime(precipitation_final.date)
crashes_final.date = pd.to_datetime(crashes_final.date)

# The grid spans the full precipitation date range, all 24 hours of the day
# and positions 0, 1000, ..., 78000 (same scale as distance_from_addis rounded
# to the nearest 1000).
date_range = list(pd.date_range(precipitation_final.date.min(), precipitation_final.date.max()))
hour_range = list(range(0,24))
km_range = [1000*x for x in range(0,79)]
# Cartesian product built by pandas rather than a Python-level triple loop;
# rows are ordered with date varying slowest and km_from_addis fastest.
segment_time = pd.MultiIndex.from_product(
    [date_range, hour_range, km_range],
    names=['date', 'hour', 'km_from_addis'],
).to_frame(index=False)

# create accident counts and add them to segment-time dataset:
crashes_final['accidents'] = 1
# Snap every crash to the nearest multiple of 1000 so it lands on a grid position.
crashes_final['km_from_addis'] = crashes_final.distance_from_addis.round(-3)
accid_count = (
    crashes_final
    .groupby(['date', 'hour', 'km_from_addis'])['accidents']
    .count()
    .reset_index()
)
segment_time = segment_time.merge(accid_count, how='left', on=['date', 'hour', 'km_from_addis'])

# add precipitation:
segment_time = segment_time.merge(precipitation_final[['date', 'precip_mm']], on='date', how='left')

# fill missing data with 0 only for the date range where we have crashes data (2015-2017)
# Outside that window `accidents` stays NaN (unknown) rather than 0 (no crash).
crashes_start_date = pd.to_datetime('2015-01-01')
crashes_end_date = pd.to_datetime(crashes_final.date).max()
condition = (segment_time['date'] >= crashes_start_date) & (segment_time['date'] <= crashes_end_date)
segment_time.loc[condition, 'accidents'] = segment_time.loc[condition, 'accidents'].fillna(0)

# merging accident data into it:
relevant_crash_vars = ['distance_from_addis', 'case_no', 'date', 'day', 'hashed_plate_number',
       'vehicle_type', 'vehicle_brand', 'direction',
       'road_geometry', 'road', 'weather', 'fatality', 'serious_injury',
       'slight_injury', 'cause_of_accident', 'type_of_accident',
       'year_of_production', 'owner', 'driver_age', 'gender', 'address',
       'drivers_license_level', 'license_year', 'license_region', 'experience',
       'relation_with_vehicle', 'etre_asset_damage', 'ownership',
       'extent_of_damage', 'year', 'hour',
       'time_of_accident_minute', 'accident_datetime',
       'accident_location_original', 'accident_location_text', 'latitude',
       'longitude', 'axle_number', 'cause_of_accident_simple',
       'type_of_accident_simple', 'accident_cause_vehicle_human',
       'km_from_addis']
# NOTE(review): a left merge repeats a grid row once per matching crash record,
# so after this step segment_time is no longer guaranteed to hold exactly one
# row per (date, hour, km_from_addis); the `accidents` count is repeated on
# each copy. Confirm this is intended before summing `accidents`.
segment_time = segment_time.merge(crashes_final[relevant_crash_vars], how='left', on=['date', 'hour', 'km_from_addis'])

CPU times: user 16.4 s, sys: 4.82 s, total: 21.2 s
Wall time: 21.8 s


In [6]:
segment_time

Unnamed: 0,date,hour,km_from_addis,accidents,precip_mm,distance_from_addis,case_no,day,hashed_plate_number,vehicle_type,vehicle_brand,direction,road_geometry,road,weather,fatality,serious_injury,slight_injury,cause_of_accident,type_of_accident,year_of_production,owner,driver_age,gender,address,drivers_license_level,license_year,license_region,experience,relation_with_vehicle,etre_asset_damage,ownership,extent_of_damage,year,time_of_accident_minute,accident_datetime,accident_location_original,accident_location_text,latitude,longitude,axle_number,cause_of_accident_simple,type_of_accident_simple,accident_cause_vehicle_human
0,2014-01-01,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2014-01-01,0,1000,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2014-01-01,0,2000,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2014-01-01,0,3000,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2014-01-01,0,4000,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4154226,2019-12-31,23,74000,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4154227,2019-12-31,23,75000,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4154228,2019-12-31,23,76000,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4154229,2019-12-31,23,77000,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Adding traffic counts to the segment-time dataset

In [9]:
# Parse the two trip timestamp columns (ent_occur_time, trans_occur_time)
# into pandas datetimes.
for time_column in ('ent_occur_time', 'trans_occur_time'):
    traffic_final[time_column] = pd.to_datetime(traffic_final[time_column])

In [100]:
# Grid of [timestamp, position] pairs: one timestamp per hour of every day
# from 2014-08-28 up to the latest trans_occur_time, crossed with positions
# 0, 1000, ..., 78000.
grid_first_day = pd.to_datetime('2014-08-28')
grid_last_stamp = traffic_final.trans_occur_time.max()
date_range = list(pd.date_range(grid_first_day, grid_last_stamp))
hour_range = list(range(0,24))
km_range = [1000*x for x in range(0,79)]

day_hour_km = []
for grid_day in date_range:
    for grid_hour in hour_range:
        grid_stamp = grid_day + datetime.timedelta(hours=grid_hour)
        for grid_km in km_range:
            day_hour_km.append([grid_stamp, grid_km])

In [105]:
# Mirror the entrance/exit positions as (78 - value), computed as one
# vectorised subtraction per column instead of a Python call per row.
# NOTE(review): 78.0 is presumably the expressway length in km, so this turns
# the original km values into distance from Addis — confirm.
traffic_final['entr_km_from_addis'] = 78.0 - traffic_final.entrance_km
traffic_final['exit_km_from_addis'] = 78.0 - traffic_final.exit_km

In [136]:
# Small random subset of 100 trips with a fresh 0..99 index, handy for trying
# out code cheaply. The fixed random_state makes the draw reproducible across
# kernel restarts.
t = traffic_final.sample(n=100, random_state=0).reset_index(drop=True)

In [137]:
#traffic_final[traffic_final.exit_km>traffic_final.entrance_km]

In [141]:
traffic_final

Unnamed: 0,plaza_id,trans_occur_time,car_license,veh_type,up_down,total_weight,distance,ent_plaza_id,ent_occur_time,time_on_road,speed_km_hr,entrance_km,exit_km,direction,entr_km_from_addis,exit_km_from_addis
0,102,2014-09-01 08:35:30,0x92AD56AB29E347D1779A07879D38C67A,2,0,0,16930.0,101,2014-08-31 10:07:42,1347.800000,,2.0,2.0,to addis,76.0,76.0
1,102,2014-09-01 08:36:24,0x2CD713BB239103A0B783F539E3BEEF32,3,0,0,16930.0,101,2014-08-31 10:22:29,1333.916667,,2.0,2.0,to addis,76.0,76.0
2,102,2014-09-01 08:37:21,0xBC6CE93C7F7B823BF30E9C741AFD5E1E,5,0,0,16930.0,101,2014-08-30 15:19:22,,,2.0,2.0,to addis,76.0,76.0
3,102,2014-09-01 08:38:17,0x99AF7C40091A3325B24D7E451316AA9F,2,0,0,16930.0,101,2014-08-31 10:20:53,1337.400000,,2.0,2.0,to addis,76.0,76.0
4,102,2014-09-01 10:38:38,0xCFCD208495D565EF66E7DFF9F98764DA,1,0,0,,0,NaT,,,2.0,2.0,to addis,76.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31844671,602,2019-08-31 23:50:28,0x0B3C0CD1FDE274BEE444E1B4AD22F3C0,3,0,0,73310.0,101,2019-08-31 22:50:50,59.633333,73.760760,2.0,64.0,to addis,76.0,14.0
31844672,602,2019-08-31 23:52:00,0xD740B5BD8DB8A86A3B10FE7FCF675FE2,3,0,2600,73310.0,101,2019-08-31 23:04:51,47.150000,93.289502,2.0,64.0,to addis,76.0,14.0
31844673,602,2019-08-31 23:52:38,0x0A5E88CC182B9D75487392C0661CBA6A,7,0,20350,73310.0,101,2019-08-31 22:45:44,66.900000,65.748879,2.0,64.0,to addis,76.0,14.0
31844674,602,2019-08-31 23:53:18,0xEC0ECBF72F19F3F9400CE5648FFB5D90,7,0,0,73310.0,101,2019-08-31 22:40:18,73.000000,60.254795,2.0,64.0,to addis,76.0,14.0


In [None]:
%%time
def get_traffic(traffic_dataset, day_hour_km, verbose=False):
    """Count the trips that are on the road at each (timestamp, position) grid point.

    A trip (one row of ``traffic_dataset``) is counted for a grid point when
    both of the following hold:

    * the grid timestamp lies strictly between the trip's ``ent_occur_time``
      and ``trans_occur_time`` (a missing time never matches);
    * the grid position divided by 1000 is a whole number ``k`` with
      ``lower <= k < upper``, where for ``direction == 'to adama'``
      ``lower = int(entr_km_from_addis)`` and ``upper`` is the larger of
      ``int(exit_km_from_addis + 1)`` and
      ``int(entr_km_from_addis + round(distance / 1000))``; for any other
      direction ``lower = int(exit_km_from_addis)`` and ``upper`` is the
      larger of ``int(entr_km_from_addis + 1)`` and
      ``int(entr_km_from_addis + round(distance / 1000))``.
      A missing ``distance`` counts as 0.

    Parameters
    ----------
    traffic_dataset : pandas.DataFrame
        Needs the columns ``ent_occur_time``, ``trans_occur_time``,
        ``distance``, ``entr_km_from_addis``, ``exit_km_from_addis`` and
        ``direction``. Rows are processed by position, so any index works.
        Trips with a missing entrance or exit position match nothing.
    day_hour_km : iterable of [timestamp, position]
        Grid points. ``position`` is divided by 1000 before it is compared
        with the km columns. The time filter is recomputed only when the
        timestamp differs from the previous grid point, so keeping equal
        timestamps adjacent is fastest.
    verbose : bool, default False
        Print every grid point as it is processed. Off by default because a
        full grid has millions of points.

    Returns
    -------
    list of int
        One count per element of ``day_hour_km``, in the same order.
    """
    # Loop-invariant per-trip quantities, extracted once as positional arrays.
    entered = traffic_dataset.ent_occur_time
    left = traffic_dataset.trans_occur_time
    entr_km = traffic_dataset.entr_km_from_addis.to_numpy(dtype=float)
    exit_km = traffic_dataset.exit_km_from_addis.to_numpy(dtype=float)
    # np.rint rounds halves to even, exactly like Python's round().
    rounded_km_travelled = np.rint(
        traffic_dataset.distance.fillna(0).to_numpy(dtype=float) / 1000
    )
    to_adama = traffic_dataset.direction.eq('to adama').fillna(False).to_numpy(dtype=bool)

    # Both candidate km ranges of a trip start at the same lower bound, so
    # "in range 1 or in range 2" collapses to one half-open interval whose
    # upper bound is the larger of the two. np.trunc mirrors int() truncation.
    # NaN positions propagate into the bounds and then fail every comparison.
    lower = np.trunc(np.where(to_adama, entr_km, exit_km))
    upper = np.maximum(
        np.trunc(np.where(to_adama, exit_km, entr_km) + 1),
        np.trunc(entr_km + rounded_km_travelled),
    )

    traffic = []
    have_cache = False
    cached_stamp = None
    active_lower = active_upper = None
    for point in day_hour_km:
        if verbose:
            print(point)
        timestamp, position = point[0], point[1]

        # The time filter only depends on the timestamp: reuse it for all
        # consecutive grid points that share the same timestamp.
        if not have_cache or timestamp != cached_stamp:
            on_road = (
                (entered < timestamp).to_numpy(dtype=bool)
                & (left > timestamp).to_numpy(dtype=bool)
            )
            active_lower = lower[on_road]
            active_upper = upper[on_road]
            cached_stamp = timestamp
            have_cache = True

        km = position / 1000
        if not float(km).is_integer():
            # A fractional km value is never a member of an integer range.
            traffic.append(0)
            continue
        matches = (active_lower <= km) & (km < active_upper)
        traffic.append(int(np.count_nonzero(matches)))
    return traffic

traffic = get_traffic(traffic_final, day_hour_km)

[Timestamp('2014-08-28 00:00:00', freq='D'), 0]


In [51]:
# Pair every grid point with its traffic count; `traffic` holds one value per
# element of day_hour_km, in the same order.
df_traffic = pd.DataFrame(day_hour_km, columns=['date_hour', 'km']).assign(traffic=traffic)

In [None]:
segment_time['traffic']

In [47]:
len(segment_time)

4154231

In [78]:
df_traffic.date

0          2014-01-01
1          2014-01-01
2          2014-01-01
3          2014-01-01
4          2014-01-01
              ...    
4154131    2019-12-31
4154132    2019-12-31
4154133    2019-12-31
4154134    2019-12-31
4154135    2019-12-31
Name: date, Length: 4154136, dtype: object

In [79]:
# Split the grid timestamp into the two merge keys used by segment_time:
# the hour of day and the calendar date (timestamp truncated to midnight).
# The vectorised .dt accessor replaces a Python-level call per row; the cast
# keeps `hour` as int64 regardless of the pandas version.
df_traffic['hour'] = df_traffic.date_hour.dt.hour.astype('int64')
df_traffic['date'] = df_traffic.date_hour.dt.normalize()

In [80]:
# Rename the position column to 'km_from_addis', the key name used when
# merging with segment_time.
df_traffic = df_traffic.rename(columns={'km':'km_from_addis'})

In [82]:
# Attach the traffic columns to every matching (date, hour, km_from_addis) row.
merge_keys = ['date', 'hour', 'km_from_addis']
# Columns contributed by df_traffic. Dropping any copies left over from an
# earlier run of this cell keeps it idempotent; without this a re-run would
# produce suffixed duplicates such as traffic_x / traffic_y.
traffic_columns = [column for column in df_traffic.columns if column not in merge_keys]
segment_time = (
    segment_time
    .drop(columns=traffic_columns, errors='ignore')
    .merge(df_traffic, how='left', on=merge_keys)
)

In [93]:
traffic_final[(traffic_final.entrance_km == 2) & (traffic_final.ent_occur_time < pd.to_datetime('2014-08-30 16:00:00')) & (traffic_final.trans_occur_time > pd.to_datetime('2014-08-30 16:00:00'))]

Unnamed: 0,plaza_id,trans_occur_time,car_license,veh_type,up_down,total_weight,distance,ent_plaza_id,ent_occur_time,time_on_road,speed_km_hr,entrance_km,exit_km,direction
2,102,2014-09-01 08:37:21,0xBC6CE93C7F7B823BF30E9C741AFD5E1E,5,0,0,16930.0,101,2014-08-30 15:19:22,,,2.0,2.0,to addis
10,102,2014-09-01 10:08:48,0x748C5881461367AB9EED5AF82966A480,2,0,0,16930.0,101,2014-08-29 09:32:11,,,2.0,2.0,to addis
29,102,2014-09-01 10:39:25,0xE99EC12BEE11AAFC744800512EAF2702,2,0,0,16930.0,101,2014-08-29 10:29:40,,,2.0,2.0,to addis
36,102,2014-09-01 10:46:02,0xE99EC12BEE11AAFC744800512EAF2702,2,0,0,16930.0,101,2014-08-29 10:29:40,,,2.0,2.0,to addis
39,102,2014-09-01 10:50:28,0xE99EC12BEE11AAFC744800512EAF2702,2,0,0,16930.0,101,2014-08-29 10:29:40,,,2.0,2.0,to addis
42,102,2014-09-01 10:52:04,0xE99EC12BEE11AAFC744800512EAF2702,2,0,0,16930.0,101,2014-08-29 10:29:40,,,2.0,2.0,to addis
44,102,2014-09-01 10:53:13,0x37499BFF6CD99349DA630045E06DC0A3,2,0,0,16930.0,101,2014-08-28 14:51:24,,,2.0,2.0,to addis
45,102,2014-09-01 10:54:13,0x37499BFF6CD99349DA630045E06DC0A3,2,0,0,16930.0,101,2014-08-28 14:51:24,,,2.0,2.0,to addis
49,102,2014-09-01 10:55:40,0x0423EBD4696DD24D87A9E094527F9C66,2,0,0,16930.0,101,2014-08-28 14:51:46,,,2.0,2.0,to addis
50,102,2014-09-01 10:56:51,0x0423EBD4696DD24D87A9E094527F9C66,2,0,0,16930.0,101,2014-08-28 14:51:46,,,2.0,2.0,to addis


In [90]:
#df_traffic[(df_traffic.date == '2014-08-30') & (df_traffic.hour==16)]

In [107]:
#segment_time[segment_time.traffic>=2]