In [None]:
import os
import sys

import json
import torch

import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict, Counter
from tqdm import tqdm
from datetime import datetime

# Add the directory four levels up (relative to the working directory) to the
# import path so the project-local `pipelines` package can be imported.
# NOTE(review): presumably this is the repository root — confirm.
sys.path.append("../../../..")
from pipelines.utils import ROOT_DIR, load_road_network

# Prefer the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# City to process; also used as a path component for the dataset files below.
city = "porto"
# load_road_network returns four values; only the first (edge table) and the
# last (LG) are kept here.
# NOTE(review): LG is presumably the line graph of the road network — confirm.
edge_df, _, _, LG = load_road_network(city_name=city)

In [None]:
# Load the train and validation trajectory splits for the selected city (files
# train_123.parquet / val_123.parquet) and stack them into a single frame so
# that the traffic features are computed over both splits.
# NOTE(review): 123 is presumably a seed or split id — confirm.
train = pd.read_parquet(os.path.join(ROOT_DIR,"datasets/trajectory",city,"train/train_{}.parquet".format(123),))
val = pd.read_parquet(os.path.join(ROOT_DIR,"datasets/trajectory",city,"val/val_{}.parquet".format(123),))
# pd.concat keeps each split's own row labels (no ignore_index), so the index
# of `data` may contain duplicates.
data = pd.concat([train, val])

In [None]:
def generate_speed_features(df, edge_df) -> pd.DataFrame:
    """
    Build per-edge traffic features: utilization and hourly average speed.

    Args:
        df: trajectory frame. Must contain a ``cpath`` column holding, for each
            trajectory, the sequence of traversed edge ids, plus the columns
            read by ``calc_avg_speed``.
        edge_df: edge table of the road network with an ``fid`` column
            containing the edge ids.

    Returns:
        pd.DataFrame: one row per edge, indexed like ``edge_df``, with columns
        ``id``, ``util`` and ``avg_speed_1`` ... ``avg_speed_24``.
        ``util`` counts how often an edge occurs in any ``cpath`` (0 if never).
        ``avg_speed_j`` is the mean observed speed on the edge during hour
        ``j`` (column 24 holds hour 0, i.e. 00:00-00:59), scaled by
        ``111000 * 3.6``; it is NaN where the edge has no observation in that
        hour.
    """
    rdf = pd.DataFrame({"id": edge_df.fid}, index=edge_df.index)

    # Utilization: number of times an edge is traversed over all trajectories.
    counter = Counter()
    for seq in df["cpath"].values:
        counter.update(seq)

    # Counter defines __missing__, so Series.map yields 0 (not NaN) for edges
    # that never occur in any trajectory.
    rdf["util"] = rdf.id.map(counter)

    speed_counter, count_counter = calc_avg_speed(df)
    for j in range(1, 25):
        # calc_avg_speed buckets observations by datetime.hour, i.e. 0..23.
        # Reading buckets 1..24 directly would drop hour 0 and leave column 24
        # permanently empty, so column 24 is mapped to hour 0; columns 1..23
        # keep their hour.
        hour = j % 24
        speed_sums = speed_counter.get(hour, {})
        obs_counts = count_counter.get(hour, {})
        rdf[f"avg_speed_{j}"] = rdf.id.map(
            {
                # mean raw speed, converted with the factor 111000 * 3.6
                # NOTE(review): presumably degrees/second -> km/h — confirm.
                k: (float(speed_sums[k]) / obs_counts[k]) * 111000 * 3.6
                for k in speed_sums
            }
        )
    return rdf


def calc_avg_speed(data: pd.DataFrame):
    """
    Accumulate, per hour of day and per edge, the sum of observed speeds and
    the number of observations.

    Each trajectory row provides ``opath`` (the edge matched to every
    observation), ``cpath`` (the full sequence of traversed edges), ``speed``
    and ``timestamps``. Every consecutive pair of ``opath`` entries is located
    inside ``cpath``; all edges between the two positions (inclusive) receive
    the speed of that observation.

    Args:
        data: frame with the columns ``cpath``, ``opath``, ``speed`` and
            ``timestamps``; ``cpath`` entries must support element-wise
            comparison (``cpath == edge``).

    Returns:
        tuple: ``(speed_counters, count_counters)``, two
        ``defaultdict(Counter)`` keyed by hour (``datetime.hour``, 0..23) and
        then by edge id, holding the summed raw speeds and the observation
        counts respectively.

    Raises:
        IndexError: if an ``opath`` edge cannot be found in ``cpath`` at or
            after the previously matched position.
    """
    cpaths = data["cpath"].values
    opaths = data["opath"].values
    speeds = data["speed"].values
    times = data["timestamps"].values

    # hour -> Counter(edge id -> accumulated value)
    speed_counters = defaultdict(Counter)
    count_counters = defaultdict(Counter)

    trajectories = zip(opaths, cpaths, speeds, times)
    for opath, cpath, speed, time_stamps in tqdm(trajectories, total=len(speeds)):
        # Positions in cpath matched by the previous observation pair; the
        # search only moves forward so revisited edges resolve in order.
        prev_start, prev_end = 0, 0
        for src_edge, dst_edge, obs_speed, ts in zip(opath, opath[1:], speed, time_stamps):
            # NOTE(review): fromtimestamp uses the machine's local time zone,
            # so the hour bucket depends on where this runs — confirm intent.
            hour = datetime.fromtimestamp(ts).hour

            # Drop implausible observations (scaled speed of 200 or more).
            if obs_speed * 111000 * 3.6 >= 200:
                continue

            src_hits = np.where(cpath == src_edge)[0]
            dst_hits = np.where(cpath == dst_edge)[0]
            start_idx = src_hits[src_hits >= prev_start][0]
            # The end position may lie neither before the previous end nor
            # before the current start.
            end_idx = dst_hits[dst_hits >= max(prev_end, start_idx)][0]

            assert start_idx <= end_idx
            segment_edges = cpath[start_idx : end_idx + 1]

            # Each distinct edge of the segment is credited once per observation.
            speed_counters[hour].update(dict.fromkeys(segment_edges, obs_speed))
            count_counters[hour].update(dict.fromkeys(segment_edges, 1))

            prev_start, prev_end = start_idx, end_idx

    return speed_counters, count_counters

In [None]:

# Compute per-edge utilization and hourly average speeds over the combined
# train + validation trajectories.
out = generate_speed_features(data, edge_df)

In [None]:
# Fill hours without observations from the other hours of the same edge:
# linear interpolation along the hour axis (axis=1), extended in both
# directions so leading and trailing gaps are filled as well.
avg_speed_columns = [f'avg_speed_{i}' for i in range(1, 25)]
out[avg_speed_columns] = out[avg_speed_columns].interpolate(axis=1, method='linear', limit_direction='both')
# Safety net for gaps left by the interpolation. DataFrame.fillna(method=...)
# is deprecated in pandas, so the dedicated bfill()/ffill() methods are used.
out[avg_speed_columns] = out[avg_speed_columns].bfill(axis=1).ffill(axis=1)

In [None]:
# Values that are still NaN at this point get the default speed 40; afterwards
# every speed is floored at 10.
# NOTE(review): units are presumably km/h — confirm.
out[avg_speed_columns] = out[avg_speed_columns].fillna(40).clip(lower=10)

In [None]:
# Persist the per-edge feature table (id, util, avg_speed_1 .. avg_speed_24) as
# parquet at datasets/transition/<city>/traffic_mx.parquet below ROOT_DIR.
out.to_parquet(os.path.join(ROOT_DIR,"datasets/transition",city,"traffic_mx.parquet",))