## Imports

In [38]:
from datetime import datetime

## Load data

In [39]:
from dask.distributed import Client, LocalCluster
cluster = LocalCluster(n_workers=4, memory_limit="2GB")
client  = Client(cluster)       # opens a dashboard at http://127.0.0.1:8787

In [43]:
from dask_mongo import read_mongo

# Connection
mongo_uri   = "mongodb://localhost:27017"
database    = "mongo_db"
traffic_coll  = "traffic_collection"
collision_coll = "collision_collection"


tr_bag = read_mongo(
    connection_kwargs={"host": mongo_uri},
    database=database,
    collection=traffic_coll,
    chunksize=6000,
)

col_bag = read_mongo(
    connection_kwargs={"host": mongo_uri},
    database=database,
    collection=collision_coll,
    chunksize=2000,
)

# # Pull everything (you can pass a query / projection to cut size)
# tr_df = read_mongo(
#         connection_string=mongo_uri,
#         database=database,
#         collection=traffic_coll,
#         partition_field="_id",     # how data are chunked; any indexed field works
#         partition_size=2000      # ≈ docs per partition
#       ).persist()

# col_df = read_mongo(
#         connection_string=mongo_uri,
#         database=database,
#         collection=collision_coll,
#         partition_field="_id",     # how data are chunked; any indexed field works
#         partition_size=2000      # ≈ docs per partition
#       ).persist()


In [44]:
tr_df = tr_bag.to_dataframe().persist()
col_df = col_bag.to_dataframe().persist()

In [45]:
tr_df.head()

Unnamed: 0,_id,id,speed,travel_time,status,data_as_of,link_id,link_points,encoded_poly_line,encoded_poly_line_lvls,owner,transcom_id,borough,link_name
0,681ba9386aa59990a8eab101,3,0.0,0,-101,2025-04-28T21:29:03.000,4616324,"40.76375,-73.999191 40.763521,-73.99935 40.762...",mtxwF\\|}sbMl@^~GpK\\|LrIbLlH??lK~G\\|FtD`C~@}...,BBBBBBBBBBBBBBB,NYC_DOT_LIC,4616324,Manhattan,12th ave @ 45th - 11 ave ganservoort st
1,681ba9386aa59990a8eab102,106,0.0,0,-101,2025-04-28T21:29:03.000,4616323,"40.77158,-73.994441 40.7713004,-73.99455 40.77...",kezwFf`sbMv@TxAVnDZe@Gz@J~@Xf@VlEnC??~KpH??vCp...,BBBBBBBBBBBBBBBBB,NYC_DOT_LIC,4616323,Manhattan,12th Ave S 57th St - 45th St
2,681ba9386aa59990a8eab103,4,0.0,0,-101,2025-04-28T21:28:10.000,4616338,"40.7607,-74.002141 40.76212,-74.91 40.76335,-7...",kaxwFjptbM{GuFuFgIwHsFmJqG_OwJcCw@qE[cBc@,BBBBBBBBB,NYC_DOT_LIC,4616338,Manhattan,12th Ave N 40th - 57th St
3,681ba9386aa59990a8eab104,376,2.48,1927,0,2025-04-28T21:28:10.000,4616192,"40.61052,-74.09769 40.610561,-74.09586 40.6102...",wvzvFnegcMGmJ~@kKpAoGxPki@~AyJbA}Kf@ac@`BeOzHgY,BBBBBBBBBB,NYC_DOT_LIC,4616192,Staten Island,SIE E CLOVE ROAD - FINGERBOARD ROAD
4,681ba9386aa59990a8eab105,351,47.84,112,0,2025-04-28T21:28:10.000,4616210,"40.63092,-74.14592 40.62975,-74.14593 40.62877...",gv~vF~rpcMhF@bE[bDWbD@vCb@\\|D~@dCf@`Cv@jCjA~C...,BBBBBBBBBBBBBBBBBBBBBBBB,NYC_DOT_LIC,4616210,Staten Island,MLK S - SIE W WALKER STREET - RICHMOND AVENUE


In [46]:
tr_df.tail()

Unnamed: 0,_id,id,speed,travel_time,status,data_as_of,link_id,link_points,encoded_poly_line,encoded_poly_line_lvls,owner,transcom_id,borough,link_name
5995,681ba9386aa59990a8eac86c,347,41.01,170,0,2025-04-28T17:23:02.000,4456477,"40.77223,-73.919941 40.77367,-73.92198 40.7744...",mizwFrndbM_HvKuC~C_PhLqWpQwKrHwBxAeBh@mBPoCKmB...,BBBBBBBBBBBBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456477,Queens,MDE S TBB EXIT RAMP - QUEENS ANCHORAGE
5996,681ba9386aa59990a8eac86d,451,16.77,384,0,2025-04-28T17:23:02.000,4456500,"40.7712605,-73.833311 40.7727804,-73.83087 40....",kczwFdqsaMoHgNmBoCoDoDeIaFwPeIuLcFkEuAyE_AoXeD...,BBBBBBBBBBBBBBBBBBB,NYC_DOT_LIC,4456500,Queens,Whitestone Expwy N Exit 14 (Linden Pl) - BWB N...
5997,681ba9386aa59990a8eac86e,124,13.67,520,0,2025-04-28T17:23:02.000,4456501,"40.68036,-74.00441001 40.6822,-74.0057201 40.6...",gkhwFp~tbMoJdGkHnCaK`Fat@d`@oe@`NyD\\|@{L@wHsA...,BBBBBBBBBB,MTA Bridges & Tunnels,4456501,Manhattan,BBT W Toll Plaza - Manhattan Portal
5998,681ba9386aa59990a8eac86f,213,9.94,228,0,2025-04-28T17:23:02.000,4456450,"40.80069,-73.92878 40.8013005,-73.930181 40.80...",i{_xFzefbMyBvGUlACt@Rj@d@f@z@@`@W\g@bA_DTk@b@i...,BBBBBBBBBBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456450,Manhattan,FDR N - TBB E 116TH STREET - MANHATTAN TRUSS
5999,681ba9386aa59990a8eac870,164,43.49,100,0,2025-04-28T17:23:02.000,4456497,"40.79932,-73.82809 40.8006,-73.82895 40.804700...",wr_xFppraM_GjDsXtNci@pYsB~@}Ab@gAHiKD,BBBBBBBB,NYC_DOT_LIC,4456497,Queens,BWB N Queens Anchorage - Toll Plaza


In [47]:
tr_df.nunique('data_as_of').compute()

_id                       6000
id                         120
speed                      104
travel_time               1040
status                       2
data_as_of                 274
link_id                    120
link_points                118
encoded_poly_line          236
encoded_poly_line_lvls      37
owner                        7
transcom_id                120
borough                      5
link_name                  120
dtype: int64

In [48]:
tr_df.min().compute()

_id                                                681ba9386aa59990a8eab101
id                                                                        1
speed                                                                     0
travel_time                                                               0
status                                                                 -101
data_as_of                                          2025-04-28T17:23:02.000
link_id                                                             4329472
link_points               40.52561,-74.23039 40.5258705,-74.22618 40.526...
encoded_poly_line         _mbxFnufbMnE~@xBP\\\\\\|BOpBeA\\\\\\|A}ArLaQjB...
encoded_poly_line_lvls                                                   BB
owner                                                 MTA Bridges & Tunnels
transcom_id                                                         4329472
borough                                                               Bronx
link_name   

In [49]:
tr_df.max().compute()

_id                                                681ba9386aa59990a8eac870
id                                                                      453
speed                                                                  9.94
travel_time                                                             999
status                                                                    0
data_as_of                                          2025-04-28T21:29:03.000
link_id                                                             4763657
link_points               40.8859405,-73.89676 40.88698,-73.89528 40.887...
encoded_poly_line                   }zhxFryfbMfC_J~BqOlAgIZkFC_Hq@sZ[wb@ToV
encoded_poly_line_lvls    BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB...
owner                                              Verrazano-Narrows-Bridge
transcom_id                                                         4763657
borough                                                       Staten Island
link_name   

In [50]:
print(tr_bag.take(1))

({'_id': ObjectId('681ba9386aa59990a8eab101'), 'id': '3', 'speed': '0', 'travel_time': '0', 'status': '-101', 'data_as_of': '2025-04-28T21:29:03.000', 'link_id': '4616324', 'link_points': '40.76375,-73.999191 40.763521,-73.99935 40.7620804,-74.00136 40.75985,-74.00306 40.75775,-74.00457 40.75775,-74.00457 40.75576,-74.00601 40.7544904,-74.006921 40.7538404,-74.007241 40.75415,-74.00712 40.7502804,-74.00848 40.74833,-74.007771 40.74114,-74.0', 'encoded_poly_line': 'mtxwF\\\\|}sbMl@^~GpK\\\\|LrIbLlH??lK~G\\\\|FtD`C~@}@WdWnGdKmC\\\\|k@~G`CRzElC', 'encoded_poly_line_lvls': 'BBBBBBBBBBBBBBB', 'owner': 'NYC_DOT_LIC', 'transcom_id': '4616324', 'borough': 'Manhattan', 'link_name': '12th ave @ 45th - 11 ave ganservoort st'},)


In [51]:
col_df.head()

Unnamed: 0,_id,crash_date,crash_time,on_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle_1,collision_id,vehicle_type_code1
0,681ba907b75cfbd89a629965,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,1,0,0,0,0,0,1,0,Pavement Slippery,4513547,Sedan
1,681ba907b75cfbd89a629966,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage,4455765,Sedan
2,681ba907b75cfbd89a629967,2023-04-26T00:00:00.000,22:20,,0,0,0,0,0,0,0,0,Unspecified,4623865,Sedan
3,681ba907b75cfbd89a629968,2022-06-29T00:00:00.000,6:55,THROGS NECK BRIDGE,0,0,0,0,0,0,0,0,Following Too Closely,4541903,Sedan
4,681ba907b75cfbd89a629969,2023-11-01T00:00:00.000,1:29,OCEAN PARKWAY,1,0,0,0,0,0,1,0,Unspecified,4675373,Moped


In [3]:
import pandas as pd

In [41]:
tr_df = pd.read_json("../../../dummy_data/traffic_data.jsonl", lines=True)
tr_df.head()

Unnamed: 0,id,speed,travel_time,status,data_as_of,link_id,link_points,encoded_poly_line,encoded_poly_line_lvls,owner,transcom_id,borough,link_name
0,159,44.11,125,0,2025-04-28T21:29:03.000,4616252,"40.8563506,-73.87233 40.85219,-73.871371 40.85...",ewjxF`e{aM~X_EfLs@pRFbE^fUlClPlC`TdE`Gb@\\|HMt...,BBBBBBBBBBBBB,NYC_DOT_LIC,4616252,Bronx,BRP N WATSON AVENUE - FORDHAM ROAD
1,3,0.0,0,-101,2025-04-28T21:29:03.000,4616324,"40.76375,-73.999191 40.763521,-73.99935 40.762...",mtxwF\\|}sbMl@^~GpK\\|LrIbLlH??lK~G\\|FtD`C~@}...,BBBBBBBBBBBBBBB,NYC_DOT_LIC,4616324,Manhattan,12th ave @ 45th - 11 ave ganservoort st
2,450,0.0,0,-101,2025-04-28T21:29:03.000,4616346,"40.8500304,-73.944831 40.8492,-73.945241 40.84...",uoixFdjibMdDpAhDDnWk@rDJvD^hG\\|AzEpB~BzAbEfD\...,BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB...,NYC_DOT_LIC,4616346,Manhattan,Westside Hwy S GWB - 57th St
3,2,0.0,0,-101,2025-04-28T21:29:03.000,4616325,"40.73933,-74.01004 40.73895,-74.01012 40.7376,...",y{swFvavbMjANlGSvQn@fa@fBhQdA,BBBBBB,NYC_DOT_LIC,4616325,Manhattan,11th ave s ganservoort - west st @ spring st
4,433,0.0,0,-101,2025-04-28T21:29:03.000,4616215,"40.52561,-74.23039 40.5258705,-74.22618 40.526...",adjvF\\|badMs@iYaBsPcEB_\\|@vDyLWeHg@mUeF}L}EuIaF,BBBBBBBBBB,NYC_DOT_LIC,4616215,Staten Island,WSE N TYRELLAN AVENUE - BLOOMINGDALE ROAD


In [42]:
tr_df

Unnamed: 0,id,speed,travel_time,status,data_as_of,link_id,link_points,encoded_poly_line,encoded_poly_line_lvls,owner,transcom_id,borough,link_name
0,159,44.11,125,0,2025-04-28T21:29:03.000,4616252,"40.8563506,-73.87233 40.85219,-73.871371 40.85...",ewjxF`e{aM~X_EfLs@pRFbE^fUlClPlC`TdE`Gb@\\|HMt...,BBBBBBBBBBBBB,NYC_DOT_LIC,4616252,Bronx,BRP N WATSON AVENUE - FORDHAM ROAD
1,3,0.00,0,-101,2025-04-28T21:29:03.000,4616324,"40.76375,-73.999191 40.763521,-73.99935 40.762...",mtxwF\\|}sbMl@^~GpK\\|LrIbLlH??lK~G\\|FtD`C~@}...,BBBBBBBBBBBBBBB,NYC_DOT_LIC,4616324,Manhattan,12th ave @ 45th - 11 ave ganservoort st
2,450,0.00,0,-101,2025-04-28T21:29:03.000,4616346,"40.8500304,-73.944831 40.8492,-73.945241 40.84...",uoixFdjibMdDpAhDDnWk@rDJvD^hG\\|AzEpB~BzAbEfD\...,BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB...,NYC_DOT_LIC,4616346,Manhattan,Westside Hwy S GWB - 57th St
3,2,0.00,0,-101,2025-04-28T21:29:03.000,4616325,"40.73933,-74.01004 40.73895,-74.01012 40.7376,...",y{swFvavbMjANlGSvQn@fa@fBhQdA,BBBBBB,NYC_DOT_LIC,4616325,Manhattan,11th ave s ganservoort - west st @ spring st
4,433,0.00,0,-101,2025-04-28T21:29:03.000,4616215,"40.52561,-74.23039 40.5258705,-74.22618 40.526...",adjvF\\|badMs@iYaBsPcEB_\\|@vDyLWeHg@mUeF}L}EuIaF,BBBBBBBBBB,NYC_DOT_LIC,4616215,Staten Island,WSE N TYRELLAN AVENUE - BLOOMINGDALE ROAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,213,9.94,228,0,2025-04-28T17:23:02.000,4456450,"40.80069,-73.92878 40.8013005,-73.930181 40.80...",i{_xFzefbMyBvGUlACt@Rj@d@f@z@@`@W\g@bA_DTk@b@i...,BBBBBBBBBBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456450,Manhattan,FDR N - TBB E 116TH STREET - MANHATTAN TRUSS
5996,141,43.49,160,0,2025-04-28T17:23:02.000,4456478,"40.772251,-73.919891 40.77391,-73.9222 40.7747...",qizwFhndbMkIlMeD`DyIbGyJ`HsOnK{OzKcBf@mBPoCKkB...,BBBBBBBBBBBBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456478,Queens,BE S TBB EXIT RAMP - QUEENS ANCHORAGE
5997,140,38.52,72,0,2025-04-28T17:23:02.000,4456479,"40.79789,-73.91988 40.79771,-73.92004 40.79758...",yi_xFfndbMb@^Xb@ThAEbB_@nByAbEm@fAkAbDiAlDo@nB...,BBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456479,Queens,BE S TBB EXIT RAMP - MANHATTAN LIFT SPAN
5998,202,46.60,47,0,2025-04-28T17:23:02.000,4456483,"40.789536,-73.78631 40.7894,-73.78765 40.78897...",qu}wFlkjaMXjGtAzJ@nB_@tC]~@s@lAuAlAuCbB??mAh@m...,BBBBBBBBBBBBBBBB,NYC_DOT_LIC,4456483,Queens,CIP N ramp to TNB - TNB Queens Anchorage


In [5]:
col_df = pd.read_json("../../../dummy_data/collision_data.jsonl", lines=True)
col_df.tail()

Unnamed: 0,crash_date,crash_time,on_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,cross_street_name,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
5995,2021-04-24T00:00:00.000,2025-05-11 14:40:00,MADISON AVENUE,EAST 83 STREET,0,0,0,0,0,0,...,40.778904,-73.96024,"{'latitude': '40.778904', 'longitude': '-73.96...",,,,,,,
5996,2021-04-23T00:00:00.000,2025-05-11 10:19:00,SHEFFIELD AVENUE,BLAKE AVENUE,0,0,0,0,0,0,...,,,,,,,,,,
5997,2021-04-24T00:00:00.000,2025-05-11 23:10:00,SPENCER AVENUE,218 STREET,4,0,0,0,0,0,...,40.729355,-73.747665,"{'latitude': '40.729355', 'longitude': '-73.74...",Unspecified,Sedan,,,,,
5998,2021-04-24T00:00:00.000,2025-05-11 01:27:00,CROSS BRONX EXPY,,0,0,0,0,0,0,...,,,,Unspecified,Sedan,,,,,
5999,2021-04-24T00:00:00.000,2025-05-11 03:18:00,ASHFORD STREET,ARLINGTON AVENUE,0,0,0,0,0,0,...,40.68054,-73.88674,"{'latitude': '40.68054', 'longitude': '-73.886...",,,,,,,


## Data Preprocessing

### Collision dataframe

In [7]:
col_df['crash_score'] = col_df['number_of_persons_injured'].map(int) + 3*(col_df['number_of_persons_killed'].map(int))
col_df['crash_score']

0       2
1       1
2       1
3       0
4       0
       ..
5995    0
5996    0
5997    4
5998    0
5999    0
Name: crash_score, Length: 6000, dtype: int64

In [8]:
col_df['crash_date'] = col_df['crash_date'].map(lambda x : str(x).split("T")[0] + "T")
col_df["crash_time"] = col_df['crash_date'] + col_df['crash_time'].map(lambda x : str(x).split(" ")[-1])
col_df['crash_time'] = col_df['crash_time'].map(lambda x : datetime.strptime(x, "%Y-%m-%dT%H:%M:%S"))

In [9]:
col_df

Unnamed: 0,crash_date,crash_time,on_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,cross_street_name,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5,crash_score
0,2021-09-11T,2021-09-11 02:39:00,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,2
1,2022-03-26T,2022-03-26 11:45:00,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,1
2,2023-11-01T,2023-11-01 01:29:00,OCEAN PARKWAY,AVENUE K,1,0,0,0,0,0,...,-73.970024,"{'latitude': '40.62179', 'longitude': '-73.970...",Unspecified,Sedan,,,,,,1
3,2022-06-29T,2022-06-29 06:55:00,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,0
4,2022-09-21T,2022-09-21 13:21:00,BROOKLYN BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,2021-04-24T,2021-04-24 14:40:00,MADISON AVENUE,EAST 83 STREET,0,0,0,0,0,0,...,-73.960240,"{'latitude': '40.778904', 'longitude': '-73.96...",,,,,,,,0
5996,2021-04-23T,2021-04-23 10:19:00,SHEFFIELD AVENUE,BLAKE AVENUE,0,0,0,0,0,0,...,,,,,,,,,,0
5997,2021-04-24T,2021-04-24 23:10:00,SPENCER AVENUE,218 STREET,4,0,0,0,0,0,...,-73.747665,"{'latitude': '40.729355', 'longitude': '-73.74...",Unspecified,Sedan,,,,,,4
5998,2021-04-24T,2021-04-24 01:27:00,CROSS BRONX EXPY,,0,0,0,0,0,0,...,,,Unspecified,Sedan,,,,,,0


In [10]:
col_columns = ['crash_time','latitude','longitude','crash_score']

In [11]:
col_df = col_df[col_columns]

In [None]:
col_df.dropna(inplace = True)
col_df.reset_index(inplace=True)
col_df.drop('index',axis=1, inplace=True)

In [14]:
col_df

Unnamed: 0,crash_time,latitude,longitude,crash_score
0,2023-11-01 01:29:00,40.621790,-73.970024,1
1,2021-09-11 09:35:00,40.667202,-73.866500,0
2,2021-12-14 08:13:00,40.683304,-73.917274,0
3,2021-12-14 17:05:00,40.709183,-73.956825,0
4,2021-12-14 08:17:00,40.868160,-73.831480,2
...,...,...,...,...
5494,2021-04-24 12:20:00,40.898815,-73.862230,1
5495,2021-04-24 12:00:00,40.741493,-73.875030,0
5496,2021-04-24 14:40:00,40.778904,-73.960240,0
5497,2021-04-24 23:10:00,40.729355,-73.747665,4


### Traffic dataframe

In [19]:
tr_df['data_as_of'] = tr_df['data_as_of'].map(lambda x : datetime.strptime(str(x).split('.')[0], "%Y-%m-%dT%H:%M:%S"))
tr_df

Unnamed: 0,id,speed,travel_time,status,data_as_of,link_id,link_points,encoded_poly_line,encoded_poly_line_lvls,owner,transcom_id,borough,link_name
0,159,44.11,125,0,2025-04-28 21:29:03,4616252,"40.8563506,-73.87233 40.85219,-73.871371 40.85...",ewjxF`e{aM~X_EfLs@pRFbE^fUlClPlC`TdE`Gb@\\|HMt...,BBBBBBBBBBBBB,NYC_DOT_LIC,4616252,Bronx,BRP N WATSON AVENUE - FORDHAM ROAD
1,3,0.00,0,-101,2025-04-28 21:29:03,4616324,"40.76375,-73.999191 40.763521,-73.99935 40.762...",mtxwF\\|}sbMl@^~GpK\\|LrIbLlH??lK~G\\|FtD`C~@}...,BBBBBBBBBBBBBBB,NYC_DOT_LIC,4616324,Manhattan,12th ave @ 45th - 11 ave ganservoort st
2,450,0.00,0,-101,2025-04-28 21:29:03,4616346,"40.8500304,-73.944831 40.8492,-73.945241 40.84...",uoixFdjibMdDpAhDDnWk@rDJvD^hG\\|AzEpB~BzAbEfD\...,BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB...,NYC_DOT_LIC,4616346,Manhattan,Westside Hwy S GWB - 57th St
3,2,0.00,0,-101,2025-04-28 21:29:03,4616325,"40.73933,-74.01004 40.73895,-74.01012 40.7376,...",y{swFvavbMjANlGSvQn@fa@fBhQdA,BBBBBB,NYC_DOT_LIC,4616325,Manhattan,11th ave s ganservoort - west st @ spring st
4,433,0.00,0,-101,2025-04-28 21:29:03,4616215,"40.52561,-74.23039 40.5258705,-74.22618 40.526...",adjvF\\|badMs@iYaBsPcEB_\\|@vDyLWeHg@mUeF}L}EuIaF,BBBBBBBBBB,NYC_DOT_LIC,4616215,Staten Island,WSE N TYRELLAN AVENUE - BLOOMINGDALE ROAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,213,9.94,228,0,2025-04-28 17:23:02,4456450,"40.80069,-73.92878 40.8013005,-73.930181 40.80...",i{_xFzefbMyBvGUlACt@Rj@d@f@z@@`@W\g@bA_DTk@b@i...,BBBBBBBBBBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456450,Manhattan,FDR N - TBB E 116TH STREET - MANHATTAN TRUSS
5996,141,43.49,160,0,2025-04-28 17:23:02,4456478,"40.772251,-73.919891 40.77391,-73.9222 40.7747...",qizwFhndbMkIlMeD`DyIbGyJ`HsOnK{OzKcBf@mBPoCKkB...,BBBBBBBBBBBBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456478,Queens,BE S TBB EXIT RAMP - QUEENS ANCHORAGE
5997,140,38.52,72,0,2025-04-28 17:23:02,4456479,"40.79789,-73.91988 40.79771,-73.92004 40.79758...",yi_xFfndbMb@^Xb@ThAEbB_@nByAbEm@fAkAbDiAlDo@nB...,BBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456479,Queens,BE S TBB EXIT RAMP - MANHATTAN LIFT SPAN
5998,202,46.60,47,0,2025-04-28 17:23:02,4456483,"40.789536,-73.78631 40.7894,-73.78765 40.78897...",qu}wFlkjaMXjGtAzJ@nB_@tC]~@s@lAuAlAuCbB??mAh@m...,BBBBBBBBBBBBBBBB,NYC_DOT_LIC,4456483,Queens,CIP N ramp to TNB - TNB Queens Anchorage


In [20]:
tr_df.loc[0,'link_points']

'40.8563506,-73.87233 40.85219,-73.871371 40.85007,-73.87111 40.8469404,-73.87115 40.8459605,-73.871311 40.8424005,-73.87202 40.83961,-73.87273 40.8362404,-73.87372 40.8349506,-73.8739 40.8333606,-73.873831 40.8319705,-73.873681 40.82985,-73.87313 40.82683'

In [24]:
def compute_centroid(link_points_str):
    try:
        points = [tuple(map(float, pair.split(','))) for pair in link_points_str.strip().split()]
        if not points:
            return (None, None)
        lats, lons = zip(*points)
        return sum(lats) / len(lats), sum(lons) / len(lons)
    except Exception:
        return (None, None)

In [27]:
# Change for Dask

tr_df[["c_lat", "c_long"]] = tr_df["link_points"].apply(
    lambda s: pd.Series(compute_centroid(s)),
    # meta={"c_lat": "f8", "c_long": "f8"}
)

In [29]:
tr_df

Unnamed: 0,id,speed,travel_time,status,data_as_of,link_id,link_points,encoded_poly_line,encoded_poly_line_lvls,owner,transcom_id,borough,link_name,c_lat,c_long
0,159,44.11,125,0,2025-04-28 21:29:03,4616252,"40.8563506,-73.87233 40.85219,-73.871371 40.85...",ewjxF`e{aM~X_EfLs@pRFbE^fUlClPlC`TdE`Gb@\\|HMt...,BBBBBBBBBBBBB,NYC_DOT_LIC,4616252,Bronx,BRP N WATSON AVENUE - FORDHAM ROAD,,
1,3,0.00,0,-101,2025-04-28 21:29:03,4616324,"40.76375,-73.999191 40.763521,-73.99935 40.762...",mtxwF\\|}sbMl@^~GpK\\|LrIbLlH??lK~G\\|FtD`C~@}...,BBBBBBBBBBBBBBB,NYC_DOT_LIC,4616324,Manhattan,12th ave @ 45th - 11 ave ganservoort st,40.755592,-74.004280
2,450,0.00,0,-101,2025-04-28 21:29:03,4616346,"40.8500304,-73.944831 40.8492,-73.945241 40.84...",uoixFdjibMdDpAhDDnWk@rDJvD^hG\\|AzEpB~BzAbEfD\...,BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB...,NYC_DOT_LIC,4616346,Manhattan,Westside Hwy S GWB - 57th St,,
3,2,0.00,0,-101,2025-04-28 21:29:03,4616325,"40.73933,-74.01004 40.73895,-74.01012 40.7376,...",y{swFvavbMjANlGSvQn@fa@fBhQdA,BBBBBB,NYC_DOT_LIC,4616325,Manhattan,11th ave s ganservoort - west st @ spring st,40.734298,-74.010392
4,433,0.00,0,-101,2025-04-28 21:29:03,4616215,"40.52561,-74.23039 40.5258705,-74.22618 40.526...",adjvF\\|badMs@iYaBsPcEB_\\|@vDyLWeHg@mUeF}L}EuIaF,BBBBBBBBBB,NYC_DOT_LIC,4616215,Staten Island,WSE N TYRELLAN AVENUE - BLOOMINGDALE ROAD,40.536165,-74.224091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,213,9.94,228,0,2025-04-28 17:23:02,4456450,"40.80069,-73.92878 40.8013005,-73.930181 40.80...",i{_xFzefbMyBvGUlACt@Rj@d@f@z@@`@W\g@bA_DTk@b@i...,BBBBBBBBBBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456450,Manhattan,FDR N - TBB E 116TH STREET - MANHATTAN TRUSS,,
5996,141,43.49,160,0,2025-04-28 17:23:02,4456478,"40.772251,-73.919891 40.77391,-73.9222 40.7747...",qizwFhndbMkIlMeD`DyIbGyJ`HsOnK{OzKcBf@mBPoCKkB...,BBBBBBBBBBBBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456478,Queens,BE S TBB EXIT RAMP - QUEENS ANCHORAGE,40.781225,-68.778638
5997,140,38.52,72,0,2025-04-28 17:23:02,4456479,"40.79789,-73.91988 40.79771,-73.92004 40.79758...",yi_xFfndbMb@^Xb@ThAEbB_@nByAbEm@fAkAbDiAlDo@nB...,BBBBBBBBBBBBBBB,MTA Bridges & Tunnels,4456479,Queens,BE S TBB EXIT RAMP - MANHATTAN LIFT SPAN,40.798488,-73.922285
5998,202,46.60,47,0,2025-04-28 17:23:02,4456483,"40.789536,-73.78631 40.7894,-73.78765 40.78897...",qu}wFlkjaMXjGtAzJ@nB_@tC]~@s@lAuAlAuCbB??mAh@m...,BBBBBBBBBBBBBBBB,NYC_DOT_LIC,4456483,Queens,CIP N ramp to TNB - TNB Queens Anchorage,40.790168,-68.653060


In [28]:
tr_df.columns

Index(['id', 'speed', 'travel_time', 'status', 'data_as_of', 'link_id',
       'link_points', 'encoded_poly_line', 'encoded_poly_line_lvls', 'owner',
       'transcom_id', 'borough', 'link_name', 'c_lat', 'c_long'],
      dtype='object')

In [30]:
tr_cols = ['speed','data_as_of','c_lat','c_long','borough']

In [31]:
tr_df = tr_df[tr_cols]

In [32]:
tr_df

Unnamed: 0,speed,data_as_of,c_lat,c_long,borough
0,44.11,2025-04-28 21:29:03,,,Bronx
1,0.00,2025-04-28 21:29:03,40.755592,-74.004280,Manhattan
2,0.00,2025-04-28 21:29:03,,,Manhattan
3,0.00,2025-04-28 21:29:03,40.734298,-74.010392,Manhattan
4,0.00,2025-04-28 21:29:03,40.536165,-74.224091,Staten Island
...,...,...,...,...,...
5995,9.94,2025-04-28 17:23:02,,,Manhattan
5996,43.49,2025-04-28 17:23:02,40.781225,-68.778638,Queens
5997,38.52,2025-04-28 17:23:02,40.798488,-73.922285,Queens
5998,46.60,2025-04-28 17:23:02,40.790168,-68.653060,Queens


In [33]:
tr_df.dropna(inplace = True)
tr_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tr_df.dropna(inplace = True)


Unnamed: 0,speed,data_as_of,c_lat,c_long,borough
1,0.00,2025-04-28 21:29:03,40.755592,-74.004280,Manhattan
3,0.00,2025-04-28 21:29:03,40.734298,-74.010392,Manhattan
4,0.00,2025-04-28 21:29:03,40.536165,-74.224091,Staten Island
6,0.00,2025-04-28 21:29:03,40.533127,-74.225573,Staten Island
7,0.00,2025-04-28 21:29:03,40.705254,-73.911048,Manhattan
...,...,...,...,...,...
5994,41.01,2025-04-28 17:23:02,40.781711,-73.927122,Queens
5996,43.49,2025-04-28 17:23:02,40.781225,-68.778638,Queens
5997,38.52,2025-04-28 17:23:02,40.798488,-73.922285,Queens
5998,46.60,2025-04-28 17:23:02,40.790168,-68.653060,Queens


### Joining

In [34]:
tr_df.max()

speed                        64.0
data_as_of    2025-04-28 21:29:03
c_lat                    40.85079
c_long                 -68.649072
borough             Staten Island
dtype: object

In [35]:
col_df.max()

crash_time     2024-10-05 12:40:00
latitude                 40.905174
longitude                      0.0
crash_score                      9
dtype: object

In [36]:
tr_df.min()

speed                         0.0
data_as_of    2025-04-28 17:23:02
c_lat                   40.533127
c_long                 -74.225573
borough                     Bronx
dtype: object

In [37]:
col_df.min()

crash_time     2016-04-16 14:20:00
latitude                       0.0
longitude                -74.24484
crash_score                      0
dtype: object