In [None]:
import os
import pickle
import random
import re
from pathlib import Path

import hdmedians as hd
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

from utils import create_nodes_with_counters, load_edges, load_supersegments, load_labels_core

In [None]:
# City whose data is processed; used as a path component by the cells below.
city_name = "london"

# Root of the data tree. Set the T4C_DATA_DIR environment variable to point the
# notebook at another location; the fallback is the original author's local path.
data_dir = Path(os.environ.get("T4C_DATA_DIR", "/Users/martin/PycharmProjects/traffic4cast/data/"))
# data_dir = Path("traffic4cast/data/")

# Directory holding the pickled per-city volume matrices that are loaded further down.
traffic_path = data_dir / "traffic"

In [None]:
# Selects which branch the cells below take: "core" uses the edge-level loaders,
# any other value uses the supersegment loaders.
TRACK = "core"

In [None]:
def load_labels_extended(supersegment_to_id):
    """Read all ``eta_labels_*.parquet`` files of the current city into one frame.

    A ``supersegment_id`` column is added by looking every ``identifier`` up in
    ``supersegment_to_id``; an identifier that is missing from the mapping raises
    ``KeyError``. The shape of the result is printed before it is returned.
    """
    labels_dir = data_dir / 'train' / city_name / 'labels'
    label_files = sorted(labels_dir.glob('eta_labels_*.parquet'))
    frame = pd.read_parquet(label_files)

    segment_ids = []
    for identifier in frame["identifier"]:
        segment_ids.append(supersegment_to_id[identifier])
    frame["supersegment_id"] = segment_ids

    print(frame.shape)
    return frame

In [None]:
# Columns of `entities` that are carried over into the label / test frames.
engineered_edge_features_to_keep = [
    "edge_int",
    "nearest_counter_id",
    "counter_distance_euclidean",
    "counter_distance_euclidean_mean_all",
]

# Counter nodes for this city; their x/y columns feed the KD-tree below.
nodes_with_counters = create_nodes_with_counters(city_name, blacklist=True)

# Road-graph node table (provides node_id, x, y for the edge merge below).
nodes_path = data_dir / f"road_graph/{city_name}/road_graph_nodes.parquet"
nodes = pd.read_parquet(nodes_path)

In [None]:
# Load the entity table of the active track together with its id lookups.
if TRACK != "core":
    entities, entity_to_id, id_to_entity = load_supersegments(city_name)
else:
    entities, entity_to_id, id_to_entity = load_edges(city_name)
    # Give every edge the x/y of the node referenced by its `u` column; a left join
    # keeps edges whose node is not found (their coordinates become NaN).
    node_coordinates = nodes[["node_id", "x", "y"]]
    entities = pd.merge(entities, node_coordinates, left_on="u", right_on="node_id", how="left")

In [None]:
# Fit a KD-tree on the x/y coordinates of the counter nodes.
# NOTE(review): BallTree is imported here but not used in this cell.
from sklearn.neighbors import KDTree, BallTree
tree = KDTree(nodes_with_counters[["x", "y"]], metric="euclidean")
print(nodes_with_counters.shape)

# Query the tree for the 10 nearest counters of every entity.
# `ind` holds row positions within nodes_with_counters (not node ids); column 0 is
# the closest counter and `dist` holds the matching euclidean distances.
dist, ind = tree.query(entities[["x", "y"]], k=10)
entities["nearest_counter_id"] = ind[:,0]
entities["counter_distance_euclidean"] = dist[:,0]
# Mean distance over all 10 neighbours returned by the query.
entities["counter_distance_euclidean_mean_all"] = dist.mean(axis=1)

In [None]:
# Load volume matrixes
# The pickle holds a pair: `matrixes` (indexed by feature name below) and
# `row_mapping` (passed to get_weighted_values, which looks up (day, t) keys in it).
# NOTE(review): pickle.load can execute arbitrary code - only load files you produced yourself.
with open(traffic_path / city_name / "volume_matrix.pkl", "rb") as f:
    matrixes, row_mapping = pickle.load(f)

In [None]:
# Load the labels of the active track and attach the engineered edge features.
if TRACK == "core":
    labels = load_labels_core(city_name, entity_to_id)
    # merge() defaults to an inner join, so labels whose edge_int is absent from
    # `entities` are dropped here.
    labels = labels.merge(entities[engineered_edge_features_to_keep], on="edge_int")
else:
    # Any non-"core" track stops here.
    raise NotImplementedError
    # NOTE(review): the two statements below are unreachable because of the raise above.
    labels = load_labels_extended(entity_to_id)
    labels = labels.merge(entities, on="supersegment_id")
    
# Non-null count per column, as a quick check for missing values.
labels.count()

In [None]:
# labels = labels.merge(entities[engineered_edge_features_to_keep], on="edge_int")

In [None]:
# Sanity check: the volume matrix needs one column per row of nodes_with_counters,
# because nearest_counter_id (a row position from the KD-tree query) is later used
# as the column index into these matrices.
assert matrixes["B_30_volumes_last"].shape[1] == nodes_with_counters.shape[0]

In [None]:
def get_weighted_values(df: pd.DataFrame, feature_name: str, weight_matrix: np.ndarray, row_mapping: dict, mode="train"):
    """Add the nearest counter's value from ``weight_matrix`` to every row of ``df``.

    For each row, the matrix row is looked up in ``row_mapping`` and the matrix
    column is the row's ``nearest_counter_id``. Rows whose key is missing from
    ``row_mapping`` get NaN, so periods without counter data stay visible as missing
    values (original author's note: Melbourne has labels for 2020-06-03 but no
    counters).

    Args:
        df: Frame with a ``nearest_counter_id`` column plus ``day`` and ``t`` columns
            (``mode="train"``) or a ``test_idx`` column (any other mode).
        feature_name: Suffix of the output column ``euclidean_<feature_name>``.
        weight_matrix: Indexed as ``weight_matrix[row, counter]``.
        row_mapping: Maps ``(day, t)`` tuples (train) or ``test_idx`` values (test)
            to row positions of ``weight_matrix``.
        mode: ``"train"`` keys rows by ``(day, t)``; every other value keys them by
            ``test_idx``.

    Returns:
        The same ``df`` object, modified in place with the new column.
    """
    # Only the lookup key differs between the two modes; the lookup itself is shared.
    if mode == "train":
        keys = zip(df["day"], df["t"])
    else:
        keys = df["test_idx"]

    vals = []
    # `total` lets tqdm render real progress; a bare zip has no length.
    for key, c in tqdm(zip(keys, df["nearest_counter_id"]), total=len(df)):
        row = row_mapping.get(key)
        vals.append(weight_matrix[row, c] if row is not None else np.nan)

    df[f"euclidean_{feature_name}"] = vals
    return df

In [None]:
# Add one euclidean_<feature> column to the labels per volume matrix.
for f, volume_matrix in matrixes.items():
    print(f)
    labels = get_weighted_values(labels, f, volume_matrix, row_mapping)

In [None]:
# Non-null count per column; for the new euclidean_* columns this shows how many
# rows had no entry in row_mapping (those rows were filled with NaN).
labels.count()

In [None]:
# The counter index was only needed for the matrix lookups above; drop it before saving.
# Not idempotent: re-running this cell raises KeyError because the column is already gone.
del labels["nearest_counter_id"]

In [None]:
# Output file name depends on the track; `filename` is reused when the test set is saved.
if TRACK == "core":
    filename = "cc_all_labels.parquet"
else:
    filename = "eta_all_labels.parquet"

train_output_path = data_dir / "train" / city_name / "labels" / filename
labels.to_parquet(train_output_path)

In [None]:
# raise ValueError

## Test set

In [None]:
def create_test_frame_core(test_periods=100):
    """Build the test-set frame for the core track: one row per (edge, test_idx).

    For the test set the label structure has to be generated here: the edge columns
    of the notebook-level ``entities`` frame are repeated once per test period and
    each copy is tagged with its ``test_idx``.

    Args:
        test_periods: Number of test periods to generate; ``test_idx`` runs from 0 to
            ``test_periods - 1``. Defaults to 100, the value that used to be
            hard-coded.

    Returns:
        A DataFrame with the columns ``u``, ``v``, the entries of
        ``engineered_edge_features_to_keep`` and ``test_idx``, containing
        ``len(entities) * test_periods`` rows. The index of ``entities`` is repeated
        once per period (it is not reset).

    Raises:
        ValueError: If ``test_periods`` is smaller than 1.

    Side effects:
        Sets pandas' ``mode.chained_assignment`` option to ``None`` for the rest of
        the session and prints the shape of the result.
    """
    if test_periods < 1:
        raise ValueError("test_periods must be at least 1")

    pd.options.mode.chained_assignment = None  # default='warn'

    edge_columns = entities[["u", "v"] + engineered_edge_features_to_keep]

    # Built one period at a time and concatenated, as a direct join returned a weird
    # DF shape (original author's note). assign() returns a fresh copy per period, so
    # `entities` itself is never modified.
    full_test = pd.concat(
        [edge_columns.assign(test_idx=t) for t in tqdm(range(test_periods))]
    )
    print(full_test.shape)
    return full_test

def create_test_frame_extended(supersegment_to_id, test_periods=100):
    """Build the test-set frame for the extended track: one row per (supersegment, test_idx).

    Every key of ``supersegment_to_id`` is repeated once per test period, tagged with
    its ``test_idx`` and ``supersegment_id``, and then joined with the notebook-level
    ``entities`` frame on ``supersegment_id``.

    Args:
        supersegment_to_id: Mapping from supersegment identifier to its integer id.
        test_periods: Number of test periods to generate; ``test_idx`` runs from 0 to
            ``test_periods - 1``. Defaults to 100, the value that used to be
            hard-coded.

    Returns:
        The merged DataFrame. ``merge`` defaults to an inner join, so identifiers
        whose id is absent from ``entities`` do not appear in the result.

    Raises:
        ValueError: If ``test_periods`` is smaller than 1.

    Side effects:
        Prints the number of unique identifiers.
    """
    if test_periods < 1:
        raise ValueError("test_periods must be at least 1")

    base = pd.DataFrame({"identifier": list(supersegment_to_id.keys())})
    unique_segments = base["identifier"].nunique()
    print(unique_segments)

    # Stack one copy of the identifier list per period; test_idx follows the same
    # block layout (all segments of period 0, then all of period 1, ...).
    test_left = pd.concat([base] * test_periods)
    test_idx = [t for t in range(test_periods) for _ in range(unique_segments)]
    test_left["test_idx"] = test_idx

    # The period must change exactly at the block boundary. The check only makes
    # sense when a second block exists, so it is skipped for a single period or an
    # empty mapping (where the indices would be out of range).
    if test_periods > 1 and unique_segments > 0:
        assert test_idx[unique_segments-1] != test_idx[unique_segments]

    test_left["supersegment_id"] = [supersegment_to_id[s] for s in test_left["identifier"]]

    return test_left.merge(entities, on="supersegment_id")

In [None]:
# Build the empty test frame for the active track.
full_test = create_test_frame_core() if TRACK == "core" else create_test_frame_extended(entity_to_id)

In [None]:
# Display the generated test frame to eyeball its shape and columns.
full_test

In [None]:
# Test set: load the test-period volume matrices and their row lookup
test_matrix_path = traffic_path / city_name / "volume_matrix_test.pkl"
with open(test_matrix_path, "rb") as f:
    matrixes_test, row_mapping_test = pickle.load(f)

# Add one euclidean_<feature> column per matrix, keyed by test_idx ("test" mode)
for f, volume_matrix in matrixes_test.items():
    print(f)
    full_test = get_weighted_values(full_test, f, volume_matrix, row_mapping_test, "test")

In [None]:
# The counter index was only needed for the matrix lookups above; drop it before saving.
# Not idempotent: re-running this cell raises KeyError because the column is already gone.
del full_test["nearest_counter_id"]
# Non-null count per column, to see how many rows had no counter value.
full_test.count()

In [None]:
# Persist the test-set features under the same file name as the training features.
# The directory is created first: none of the visible cells reads from or writes to
# the test "labels" folder, so it may not exist yet and to_parquet would then fail.
test_labels_dir = data_dir / "test" / city_name / "labels"
test_labels_dir.mkdir(parents=True, exist_ok=True)
full_test.to_parquet(test_labels_dir / filename)