In [50]:
import random
import re
import pandas as pd
from tqdm import tqdm
import numpy as np
from pathlib import Path
from typing import Callable

from utils import load_edges, proba_to_logit, get_weights_from_class_fractions, load_labels_core, load_supersegments

In [51]:
# Config
# Root of the local dataset; used to build the road-graph and feature paths in the cells below.
# NOTE(review): machine-specific absolute path — point it at your own data directory before running.
# Alternative location (another machine), kept for reference:
# data_dir = Path("/Users/martin/PycharmProjects/traffic4cast/data/")
data_dir = Path("/Users/andrei/Desktop/data4cast/data/")

# City to process; it selects the per-city sub-directory in every path built from data_dir.
city_name = "melbourne"

In [3]:
# Road-graph node table of the selected city.
nodes = pd.read_parquet(data_dir / f"road_graph/{city_name}/road_graph_nodes.parquet")

# node_id -> {"x": ..., "y": ...}, using the first x/y values found for each node_id.
node_coordinates = (
    nodes
    .groupby("node_id")[["x", "y"]]
    .first()
    .to_dict(orient="index")
)

# Second coordinate lookup, built by transposing the node_id-indexed x/y frame.
node_to_lat_lng = (
    nodes
    .set_index("node_id")[["x", "y"]]
    .T
    .to_dict()
)

# Project helper from utils: returns the supersegment table plus two lookup objects
# (the names suggest supersegment <-> id maps — confirm in utils).
supersegments, supersegment_to_id, id_to_supersegment = load_supersegments(
    city_name,
    node_coordinates,
    del_segment_feats=False,
)

  0%|          | 0/4012 [00:00<?, ?it/s]

  0%|          | 0/4012 [00:00<?, ?it/s]

In [4]:
# Project helper from utils: returns the edge table and two lookups; edge_id_to_int is
# indexed with "u_v" strings further down (edge_int_to_id is presumably its inverse — confirm).
edges, edge_id_to_int, edge_int_to_id = load_edges(city_name)

In [5]:
# Per-edge speed records of the selected city.
# The path is derived from data_dir so this cell follows the configured data root
# instead of repeating a machine-specific absolute path.
speeds = pd.read_parquet(data_dir / "speed_classes" / city_name)

In [6]:
# For every supersegment, pair each node with its successor to obtain the ordered (u, v) edges.
supersegment_to_edges_mapping = [
    list(zip(node_path, node_path[1:]))
    for node_path in supersegments["nodes"]
]

In [7]:
# Remove the identifier/geometry columns; the node paths were already turned into
# supersegment_to_edges_mapping. The drop is in place, so running this cell a second
# time raises KeyError because the columns are gone.
supersegments.drop(columns=["identifier", "nodes", "coord_list", "representative_point"], inplace=True)

In [8]:
# Attach the per-supersegment (u, v) edge lists; the list is assigned positionally, so it
# must still be in the same row order as the frame it was built from.
supersegments["edges"] = supersegment_to_edges_mapping

In [9]:
# Preview the supersegment table with its "edges" column.
supersegments

Unnamed: 0,supersegment_id,x,y,edges
0,0,-0.292401,51.528028,"[(9596340681, 9241203), (9241203, 9596340679),..."
1,1,-0.160380,51.501745,"[(1504500003, 110060), (110060, 236894314), (2..."
2,2,-0.035030,51.474974,"[(6245856720, 6245836252), (6245836252, 171857..."
3,3,-0.124077,51.501483,"[(4765087519292221207, 3237371488848642588), (..."
4,4,-0.128442,51.614820,"[(1745361197, 1745361160), (1745361160, 196400..."
...,...,...,...,...
4007,4007,-0.017609,51.444752,"[(620089, 620075), (620075, 432825), (432825, ..."
4008,4008,-0.204618,51.588884,"[(196143, 2453134652), (2453134652, 1692087081..."
4009,4009,-0.123329,51.509656,"[(32925453, 107738), (107738, 107733), (107733..."
4010,4010,-0.162744,51.467241,"[(1155388513, 1155388654), (1155388654, 264866..."


In [10]:
# Preview the edge attribute table returned by load_edges.
edges

Unnamed: 0,u,v,parsed_maxspeed,speed_kph,importance,highway,oneway,lanes,tunnel,length_meters,counter_distance,edge_id,edge_int
0,78112,25508583,32.2,32.2,0,unclassified,False,,,19.402386,6,78112_25508583,0
1,78112,25508584,32.2,32.2,0,unclassified,False,,,63.881347,4,78112_25508584,1
2,78112,3257621681005534125,32.2,32.2,0,residential,True,,,82.385612,5,78112_3257621681005534125,2
3,99936,2146383887,32.2,32.2,0,unclassified,False,,,8.076410,1,99936_2146383887,3
4,99936,4544836433,32.2,32.2,0,unclassified,True,,,46.803240,0,99936_4544836433,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
132409,4595139612105786518,8842311879,35.1,35.1,0,residential,False,,,13.486853,4,4595139612105786518_8842311879,132409
132410,8230831116681660864,1149426165,35.1,35.1,0,residential,False,,,60.246550,9,8230831116681660864_1149426165,132410
132411,1688447984145568529,26559620,32.2,32.2,3,primary,True,2,,4.845324,1,1688447984145568529_26559620,132411
132412,3771856370570656347,9402106041,20.0,20.0,0,unclassified,True,,,8.118341,2,3771856370570656347_9402106041,132412


In [11]:
# One row per (supersegment, edge) pair: explode repeats the supersegment row for every
# tuple in its "edges" list and keeps the original index label on each copy.
supersegments_edges = supersegments.explode("edges")

In [12]:
# Build the "u_v" string key for each edge tuple; this is the key format looked up in
# edge_id_to_int later in the notebook.
supersegments_edges["edge_id"] = [f"{x[0]}_{x[1]}" for x in supersegments_edges["edges"]]

In [13]:
# Reload the raw supersegment table from parquet. This rebinds `supersegments`, replacing
# the trimmed frame (with its "edges" column) that earlier cells worked on.
supersegments = pd.read_parquet(data_dir / f"road_graph/{city_name}/road_graph_supersegments.parquet")

In [14]:
# Inspect the node sequence stored for the first supersegment.
supersegments.iloc[0]["nodes"]

array([9596340681,    9241203, 9596340679,    9241200,   32553207,
           197271,     197240,   36679106,   36679116,     197222,
       1016971521,     197215,     197210,     197209,     197207,
           197206,     197298,     197211, 3146836855,     197290,
         71046904,     197292,   71046378,  324012957,     234219,
           234222,     234223,  774483306,     234224,   71051657,
           234225,     234227,     234237,     224790,  254682567])

In [15]:
# Peek at the first few speed records.
speeds.head()

Unnamed: 0,u,v,day,t,volume_class,median_speed_kph,free_flow_kph
0,78112,25508583,2019-07-01,9,3,19.764706,36.352941
1,78112,25508583,2019-07-01,28,5,27.882353,36.352941
2,78112,25508583,2019-07-01,29,5,46.823529,36.352941
3,78112,25508583,2019-07-01,30,3,24.0,36.352941
4,78112,25508583,2019-07-01,34,5,38.470588,36.352941


In [16]:
# Key every speed record by its "u_v" edge id, then translate that id into the integer edge index.
# Both statements are row-by-row Python loops over the whole speed table, so this cell is slow
# on large inputs. Assuming edge_id_to_int is a plain dict, an unknown edge id raises KeyError.
speeds["edge_id"] = [f"{u}_{v}" for u, v in tqdm(zip(speeds["u"], speeds["v"]))]
speeds["edge_int"] = [edge_id_to_int[eid] for eid in tqdm(speeds["edge_id"])]
# Left disabled: removing the string key once edge_int exists.
# del speeds["edge_id"]

337145078it [04:10, 1345293.65it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 337145078/337145078 [02:31<00:00, 2231743.15it/s]


In [17]:
# Average the three speed columns over all records of each edge (one row per edge_int).
mean_speeds = speeds.groupby(["edge_int"])[["volume_class", "median_speed_kph", "free_flow_kph"]].mean()

In [18]:
# Index the edge attributes by edge_int and join the per-edge mean speeds onto them.
# DataFrame.join defaults to a left join, so edges without speed records get NaN speed columns.
edges_with_speeds = edges.drop(columns=["u", "v", "edge_id"]).set_index("edge_int").join(mean_speeds)

In [19]:
# Fill missing speed values with the median of the same column taken over all edges.
edges_with_speeds[["volume_class", "median_speed_kph", "free_flow_kph"]] = (
    edges_with_speeds[["volume_class", "median_speed_kph", "free_flow_kph"]]
    .fillna(edges_with_speeds[["volume_class", "median_speed_kph", "free_flow_kph"]].median())
)

In [20]:
# Translate each "u_v" key of the exploded supersegment edges into its integer edge index.
supersegments_edges["edge_int"] = [
    edge_id_to_int[edge_key]
    for edge_key in supersegments_edges["edge_id"]
]

In [21]:
# Attach edge attributes and speed features to every supersegment edge (merge on edge_int
# against the edge_int index), then drop the helper key columns that are no longer needed.
supersegments_edges_speeds = (
    supersegments_edges
    .merge(edges_with_speeds, left_on="edge_int", right_index=True)
    .drop(columns=["edges", "edge_id"])
)

In [22]:
# Per-edge travel time at the mean observed speed. Going by the column names:
# metres / (km/h * 1000) is hours, and * 3600 converts that to seconds.
supersegments_edges_speeds["dummy_eta"] = (
    supersegments_edges_speeds["length_meters"]
    / (supersegments_edges_speeds["median_speed_kph"] * 1000)
    * 3600
)

In [23]:
# Per-edge travel time at the free-flow speed; same formula as dummy_eta with free_flow_kph
# in place of median_speed_kph.
supersegments_edges_speeds["dummy_eta_freeflow"] = (
    supersegments_edges_speeds["length_meters"]
    / (supersegments_edges_speeds["free_flow_kph"] * 1000)
    * 3600
)

In [24]:
# Supersegment travel time: sum of the per-edge dummy_eta values of its edges.
supersegment_dummy_eta = supersegments_edges_speeds.groupby("supersegment_id")["dummy_eta"].sum()

In [25]:
# Supersegment free-flow travel time: sum of the per-edge dummy_eta_freeflow values.
supersegment_dummy_eta_freeflow = supersegments_edges_speeds.groupby("supersegment_id")["dummy_eta_freeflow"].sum()

In [26]:
# Number of edge rows per supersegment, counted as the non-null "x" values in each group.
supersegment_segment_counts = supersegments_edges_speeds.groupby("supersegment_id")["x"].count()

In [27]:
# Total length of each supersegment: sum of length_meters over its edges.
supersegment_lengths = supersegments_edges_speeds.groupby("supersegment_id")["length_meters"].sum()

In [28]:
# Start an empty feature frame that shares the supersegment_id index of the aggregated ETA series.
supersegments_features = pd.DataFrame(index = supersegment_dummy_eta.index)

In [29]:
def _parse_lane_counts(raw_lanes, default_lanes=2):
    """Convert one raw ``lanes`` cell into a list of lane counts.

    Args:
        raw_lanes: Cell value; expected to be a string holding a Python literal
            such as ``"2"``, ``"2.5"`` or ``"['2', '3']"``. Falsy values (None,
            empty string) and anything that is not a parseable literal fall back
            to the default.
        default_lanes: Lane count used when the cell is missing or unusable.

    Returns:
        A list of lane counts: ``[value]`` for a single int/float literal,
        integer-converted items for a list/tuple literal, otherwise
        ``[default_lanes]``.
    """
    import ast  # local import keeps this cell self-contained

    if not raw_lanes:
        return [default_lanes]
    try:
        # literal_eval only accepts literals, so text coming from the data file can
        # never execute code (the previous eval() call could).
        parsed = ast.literal_eval(raw_lanes)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a literal (e.g. "2;3"), or not a string at all (e.g. NaN).
        return [default_lanes]

    if isinstance(parsed, (int, float)):
        return [parsed]
    if isinstance(parsed, (list, tuple)):
        try:
            return [int(count) for count in parsed]
        except (ValueError, TypeError):
            return [default_lanes]
    # Any other literal (str, dict, None, ...) carries no usable lane count.
    return [default_lanes]


def unpack_lanes(data, default_lanes=2):
    """Add ``lane_length`` and ``avg_lanes`` columns derived from ``data["lanes"]``.

    Each ``lanes`` cell is parsed into a list of lane counts; ``lane_length`` is
    the number of counts found and ``avg_lanes`` is their mean. Missing or
    unparseable cells count as a single entry of ``default_lanes``. An empty
    list literal yields ``lane_length`` 0 and ``avg_lanes`` NaN.

    Args:
        data: DataFrame with a ``lanes`` column. It is modified in place.
        default_lanes: Lane count assumed when a cell is missing or unparseable.

    Returns:
        The same ``data`` object, with the two columns added.
    """
    lane_length = []
    avg_lanes = []
    for raw_lanes in tqdm(data["lanes"]):
        lane_counts = _parse_lane_counts(raw_lanes, default_lanes)
        lane_length.append(len(lane_counts))
        avg_lanes.append(np.mean(lane_counts))

    data["lane_length"] = lane_length
    data["avg_lanes"] = avg_lanes

    return data

# Adds "lane_length" and "avg_lanes" to the frame in place; the returned frame is rendered
# as this cell's output.
unpack_lanes(supersegments_edges_speeds)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 114987/114987 [00:01<00:00, 58141.22it/s]


Unnamed: 0,supersegment_id,x,y,edge_int,parsed_maxspeed,speed_kph,importance,highway,oneway,lanes,tunnel,length_meters,counter_distance,volume_class,median_speed_kph,free_flow_kph,dummy_eta,dummy_eta_freeflow,lane_length,avg_lanes
0,0,-0.292401,51.528028,130710,48.3,48.3,3,primary,True,,,21.874808,0,4.186018,36.480361,39.254902,2.158677,2.006101,1,2.0
84,84,-0.342000,51.535227,130710,48.3,48.3,3,primary,True,,,21.874808,0,4.186018,36.480361,39.254902,2.158677,2.006101,1,2.0
239,239,-0.292251,51.526640,130710,48.3,48.3,3,primary,True,,,21.874808,0,4.186018,36.480361,39.254902,2.158677,2.006101,1,2.0
646,646,-0.351475,51.529245,130710,48.3,48.3,3,primary,True,,,21.874808,0,4.186018,36.480361,39.254902,2.158677,2.006101,1,2.0
772,772,-0.352310,51.527485,130710,48.3,48.3,3,primary,True,,,21.874808,0,4.186018,36.480361,39.254902,2.158677,2.006101,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3974,3974,-0.081726,51.364077,22137,48.3,48.3,3,primary,False,,,134.444440,1,3.960736,25.342590,35.882353,19.098285,13.488524,1,2.0
3974,3974,-0.081726,51.364077,82289,48.3,48.3,3,primary,False,,,71.554435,2,2.889417,35.988074,40.941176,7.157815,6.291855,1,2.0
3974,3974,-0.081726,51.364077,82292,48.3,48.3,3,primary,False,,,80.193630,3,4.118453,24.747924,36.705882,11.665507,7.865144,1,2.0
3974,3974,-0.081726,51.364077,113346,48.3,48.3,3,primary,True,,,76.955057,2,4.099010,21.564692,36.705882,12.846842,7.547515,1,2.0


In [30]:
# Mean of the per-edge avg_lanes values within each supersegment (not weighted by edge length).
supersegment_lanes = supersegments_edges_speeds.groupby("supersegment_id")["avg_lanes"].mean()

In [31]:
# Assemble the feature table; each Series is aligned to the frame on the supersegment_id index.
supersegments_features["dummy_eta"] = supersegment_dummy_eta
supersegments_features["dummy_eta_freeflow"] = supersegment_dummy_eta_freeflow
supersegments_features["segment_count"] = supersegment_segment_counts
supersegments_features["length"] = supersegment_lengths
supersegments_features["lanes"] = supersegment_lanes

In [32]:
# Preview the assembled per-supersegment feature table.
supersegments_features

Unnamed: 0_level_0,dummy_eta,dummy_eta_freeflow,segment_count,length,lanes
supersegment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,476.998710,372.944764,34,5729.235729,2.965686
1,402.873689,263.324157,49,2660.969896,2.887755
2,242.885249,177.649159,25,1653.841867,2.360000
3,210.902577,148.598402,14,1112.990334,2.642857
4,66.842443,39.371535,5,600.552555,2.200000
...,...,...,...,...,...
4007,375.102890,233.569569,27,2382.413855,2.018519
4008,105.165501,89.222790,7,1463.985366,2.714286
4009,289.957470,225.483505,16,1659.607841,2.312500
4010,361.711821,256.445335,40,2802.584089,2.025000


In [33]:
# Persist the features to <data_dir>/traffic/<city_name>/ss_speeds.parquet for the current city_name.
supersegments_features.to_parquet(data_dir / "traffic" / city_name / "ss_speeds.parquet")

<h3> Tests below: check that all artifacts contain the most up to date columns </h3>

In [39]:
# Same data root as in the config cell, repeated here ahead of the per-city checks.
data_dir = Path("/Users/andrei/Desktop/data4cast/data/")

In [47]:
# Check the saved London feature file: show its columns and fail loudly if any expected
# feature column is absent (printing alone would let a stale artifact pass unnoticed).
# NOTE: this rebinds the notebook-wide `city_name`.
city_name = "london"
supersegment_speed_features = pd.read_parquet(data_dir / "traffic" / city_name / "ss_speeds.parquet")
print(supersegment_speed_features.columns)
assert {"dummy_eta", "dummy_eta_freeflow", "segment_count", "length", "lanes"} <= set(supersegment_speed_features.columns), (
    f"{city_name}: ss_speeds.parquet is missing expected feature columns"
)

Index(['dummy_eta', 'dummy_eta_freeflow', 'segment_count', 'length', 'lanes'], dtype='object')


In [48]:
# Check the saved Melbourne feature file: show its columns and fail loudly if any expected
# feature column is absent (printing alone would let a stale artifact pass unnoticed).
# NOTE: this rebinds the notebook-wide `city_name`.
city_name = "melbourne"
supersegment_speed_features = pd.read_parquet(data_dir / "traffic" / city_name / "ss_speeds.parquet")
print(supersegment_speed_features.columns)
assert {"dummy_eta", "dummy_eta_freeflow", "segment_count", "length", "lanes"} <= set(supersegment_speed_features.columns), (
    f"{city_name}: ss_speeds.parquet is missing expected feature columns"
)

Index(['dummy_eta', 'dummy_eta_freeflow', 'segment_count', 'length', 'lanes'], dtype='object')


In [49]:
# Check the saved Madrid feature file: show its columns and fail loudly if any expected
# feature column is absent (printing alone would let a stale artifact pass unnoticed).
# NOTE: this rebinds the notebook-wide `city_name`.
city_name = "madrid"
supersegment_speed_features = pd.read_parquet(data_dir / "traffic" / city_name / "ss_speeds.parquet")
print(supersegment_speed_features.columns)
assert {"dummy_eta", "dummy_eta_freeflow", "segment_count", "length", "lanes"} <= set(supersegment_speed_features.columns), (
    f"{city_name}: ss_speeds.parquet is missing expected feature columns"
)

Index(['dummy_eta', 'dummy_eta_freeflow', 'segment_count', 'length', 'lanes'], dtype='object')
