Find nearest nodes here and convert to weighted counter data

In [None]:
!pip install lightgbm shap

In [None]:
import random
import re
import pandas as pd
from tqdm import tqdm
import pickle
import numpy as np
from pathlib import Path
from joblib import Parallel, delayed

from utils import create_nodes_with_counters, load_preprocessed_counters, load_labels_core

In [None]:
# City whose data this notebook processes; it selects the road-graph file
# that is read and the output sub-directory that is written further down.
city_name = "london"

# Root of the local data checkout. This is a machine-specific absolute path:
# switch to the relative alternative below (or edit it) when running elsewhere.
data_dir = Path("/Users/martin/PycharmProjects/traffic4cast/data/")
# data_dir = Path("traffic4cast/data/")

# Parent directory of the pickled volume matrices (one sub-directory per city).
traffic_path = data_dir / "traffic"

In [None]:
# Road-graph node table for the selected city.
# NOTE(review): `nodes` is not referenced again in the visible cells - confirm
# whether a later cell still needs it before removing this load.
nodes = pd.read_parquet(data_dir / f"road_graph/{city_name}/road_graph_nodes.parquet")

In [None]:
# The `blacklist` flag is city-specific: Melbourne False, London True.
# The result is used below for its "x"/"y" coordinates (distance matrix) and
# its "node_id" column (filtering the counter readings).
nodes_with_counters = create_nodes_with_counters(city_name, blacklist=True)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
# Node pairwise distance matrix D (counter_dim x counter_dim)
# Dense all-pairs Euclidean distances between the rows of nodes_with_counters,
# so both run time and memory grow quadratically with the number of rows.
import time
stm = time.time()
D = euclidean_distances(nodes_with_counters[["x", "y"]])
print(f"Took {time.time() - stm} seconds")

# Measured wall-clock times by number of rows; too slow for 90K+ edges:
# 10k - 1.2s
# 20k - 5s
# 25k - 8.12s

In [None]:
def create_volume_matrix(counts: pd.DataFrame, feature: str, mode="train"):
    """Pivot long-format counter readings into a dense time-by-counter matrix.

    Parameters
    ----------
    counts:
        Long-format readings with a ``node_id`` column, the ``feature`` column
        and the time-key columns: ``day`` and ``t`` when ``mode == "train"``,
        ``test_idx`` for any other mode. Every (time key, node_id) pair must be
        unique, as required by ``DataFrame.pivot``.
    feature:
        Name of the column whose values fill the matrix.
    mode:
        ``"train"`` keys the rows by ``(day, t)``; anything else keys them by
        ``test_idx``.

    Returns
    -------
    dict
        ``matrix``                 2-D array, one row per time key and one
                                   column per ``node_id``.
        ``row_mapping``            time key -> row position.
        ``row_mapping_inverse``    row position -> time key.
        ``column_mapping``         column position -> ``node_id``.
        ``column_mapping_inverse`` ``node_id`` -> column position.

        Note that the row and column mappings point in opposite directions;
        this is kept as-is because callers index them that way.
    """
    grouper = ["day", "t"] if mode == "train" else ["test_idx"]

    # Create volume matrix V (t x counter_dim)
    volume_matrix = counts.pivot(index=grouper, columns="node_id", values=feature)

    row_mapping = {key: pos for pos, key in enumerate(volume_matrix.index)}
    row_mapping_inverse = dict(enumerate(volume_matrix.index))
    column_mapping = dict(enumerate(volume_matrix.columns))
    column_mapping_inverse = {
        node_id: pos for pos, node_id in enumerate(volume_matrix.columns)
    }

    # Get rid of NaNs: carry the last reading forward, then back-fill any
    # leading gap. This is applied to every column. Filling only the columns
    # with fewer observations than the best-covered one misses the case where
    # *all* columns have gaps, which would leave NaNs in the matrix.
    # Columns with no reading at all remain NaN.
    volume_matrix = volume_matrix.ffill().bfill()

    V = volume_matrix.to_numpy()
    print(V.shape)

    return {
        "matrix": V,
        "row_mapping": row_mapping,
        "column_mapping": column_mapping,
        "row_mapping_inverse": row_mapping_inverse,
        "column_mapping_inverse": column_mapping_inverse,
    }

In [None]:
def calculate_softmax_inverse_distance_weighted(D: np.ndarray, k: int, denum_factor: float):
    """
    Create (sparse) weight matrix B (edge_dim x counter_dim).
    For simplification, we'll instead map edges to nearest counters, so B becomes (counter_dim x counter_dim)

    For every row of ``D`` the ``k`` smallest distances are selected, mapped to
    scores ``1 / (distance + denum_factor) / 1000`` and normalised with a
    softmax, so each row of the result sums to 1 over its ``k`` selected
    columns and is 0 elsewhere.

    Parameters
    ----------
    D:
        2-D array of pairwise distances; row ``i`` holds the distances from
        item ``i`` to every candidate neighbour.
    k:
        Number of nearest neighbours that receive weight. Values larger than
        the number of columns use all columns.
    denum_factor:
        Offset added to each distance before inversion. It keeps zero distances
        finite; smaller values concentrate more weight on the closest
        neighbours.

    Returns
    -------
    np.ndarray
        Floating-point array with the same shape as ``D``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # A negative k would silently slice "all but the last |k|" columns.
        raise ValueError(f"k must be at least 1, got {k}")

    # Take a K column submatrix with the nearest neighbours
    argsorted = np.argsort(D, axis=1)[:, :k]
    row_index = np.arange(len(D))
    nearest = D[row_index[:, None], argsorted]

    # Mapping distances to weights
    D_w = (1 / (nearest + denum_factor)) / 1000
    print(D_w.shape)

    # Softmax over rows (shifted by the row maximum for numerical stability)
    mx = np.max(D_w, axis=-1, keepdims=True)
    numerator = np.exp(D_w - mx)
    denominator = np.sum(numerator, axis=-1, keepdims=True)
    S = numerator / denominator

    # Finally, create a sparse matrix with only softmax values over K neighbours filled in.
    # Allocate with the weights' dtype: inheriting D's dtype would truncate the
    # weights to 0 whenever D holds integer distances.
    B = np.zeros(D.shape, dtype=S.dtype)
    B[row_index[:, None], argsorted] = S

    return B

In [None]:
def create_test_labels():
    """Build the test-set label frame: one copy of the edge table per test period.

    Reads the notebook globals ``edges`` and ``engineered_edge_features_to_keep``.
    Each copy keeps the ``u``/``v`` columns plus the engineered edge features
    and is tagged with its period number in ``test_idx``. The copies are
    stacked, the stacked shape is printed and the frame is returned.

    Side effect: disables pandas' chained-assignment warning globally.
    """
    # For test, we need to generate the labels structure ourselves
    pd.options.mode.chained_assignment = None  # default='warn'

    test_periods = 100

    # The submission set has len(edges) * test_periods rows. It is assembled
    # period by period because a direct join returned a frame of unexpected shape.
    per_period = [
        edges[["u", "v"] + engineered_edge_features_to_keep].assign(test_idx=period)
        for period in tqdm(range(test_periods))
    ]

    stacked = pd.concat(per_period)
    print(stacked.shape)
    return stacked

In [None]:
# # Just nearest neighbour volumes
# B_1 = calculate_softmax_inverse_distance_weighted(k=1, denum_factor=0.0005)

In [None]:
# (row, column) index arrays of every zero-distance pair in D: each node paired
# with itself, plus any pairs of nodes sharing identical coordinates. Used
# below to inspect how much weight each scheme leaves on those pairs.
where_zero = np.where(D == 0)

In [None]:
# Softmax weights restricted to the 30 nearest counters of each node.
B_30 = calculate_softmax_inverse_distance_weighted(D, k=30, denum_factor=0.0005)
# Sanity check shown as cell output: median weight on the zero-distance pairs.
np.median(B_30[where_zero[0], where_zero[1]])

In [None]:
# Softmax weights restricted to the 50 nearest counters of each node.
B_50 = calculate_softmax_inverse_distance_weighted(D, k=50, denum_factor=0.0005)
# Sanity check shown as cell output: median weight on the zero-distance pairs.
np.median(B_50[where_zero[0], where_zero[1]])

In [None]:
# Softmax weights restricted to the 100 nearest counters of each node; this
# uses a smaller denum_factor (0.0004) than the k=30 and k=50 variants.
B_100 = calculate_softmax_inverse_distance_weighted(D, k=100, denum_factor=0.0004)
# Sanity check shown as cell output: median weight on the zero-distance pairs.
np.median(B_100[where_zero[0], where_zero[1]])

In [None]:
# Softmax weights restricted to the 300 nearest counters of each node, with
# denum_factor lowered to 0.0003.
B_300 = calculate_softmax_inverse_distance_weighted(D, k=300, denum_factor=0.0003)
# Sanity check shown as cell output: median weight on the zero-distance pairs.
np.median(B_300[where_zero[0], where_zero[1]])

In [None]:
# Softmax weights restricted to the 500 nearest counters of each node, with
# denum_factor lowered to 0.00025.
B_500 = calculate_softmax_inverse_distance_weighted(D, k=500, denum_factor=0.00025)
# Sanity check shown as cell output: median weight on the zero-distance pairs.
np.median(B_500[where_zero[0], where_zero[1]])

In [None]:
# Arbitrary weighting over the whole city: k=len(D) lets every counter contribute.
# The smaller denum_factor is, the heavier the weight on the closest observations.
B_city_0008 = calculate_softmax_inverse_distance_weighted(D, k=len(D), denum_factor=0.0008)
# Sanity check shown as cell output: median weight on the zero-distance pairs.
np.median(B_city_0008[where_zero[0], where_zero[1]])

In [None]:
# B_city_0005 = calculate_softmax_inverse_distance_weighted(k=len(D), denum_factor=0.0005)

In [None]:
# B_city_0002 = calculate_softmax_inverse_distance_weighted(k=len(D), denum_factor=0.0002)

In [None]:
# City-wide weighting (k=len(D)) with denum_factor=0.00015, i.e. more sharply
# peaked on the closest counters than the 0.0008 variant.
B_city_00015 = calculate_softmax_inverse_distance_weighted(D, k=len(D), denum_factor=0.00015)
# Sanity check shown as cell output: median weight on the zero-distance pairs.
np.median(B_city_00015[where_zero[0], where_zero[1]])

In [None]:
# B_city_000125 = calculate_softmax_inverse_distance_weighted(k=len(D), denum_factor=0.000125)

In [None]:
# B_city_0001 = calculate_softmax_inverse_distance_weighted(k=len(D), denum_factor=0.0001)

In [None]:
# Registry of the weight matrices applied to the volume matrices below.
# Each key becomes the prefix of the resulting feature name
# ("<key>_<volume feature>"). Commented-out entries are variants that are
# not computed in this run.
weighted_features = {
    # "B_1": B_1,
    # "B_10": B_10,
    "B_30": B_30,
    "B_50": B_50,
    "B_100": B_100,
    "B_300": B_300,
    "B_500": B_500,
    "B_city_0008": B_city_0008,
    # "B_city_0005": B_city_0005,
    # "B_city_0002": B_city_0002,
    "B_city_00015": B_city_00015,
    # "B_city_000125": B_city_000125,
    # "B_city_0001": B_city_0001
}

In [None]:
# raise ValueError

### Train set

In [None]:
# engineered_edge_features_to_keep = ["edge_int", "nearest_counter_id", "counter_distance_euclidean", "counter_distance_euclidean_mean_all"]

In [None]:
# # Hacky McHack
# sample_counters = {
#     "melbourne": "2020-06-01",
#     "london": "2019-07-01",
#     "madrid": "2021-06-01"
# }

# # Use one counter slice to find nearest counters
# # TODO - annoyingly, the counters seem to be a (slightly) changing set, need to handle this
# # TODO calculate superset of all counters, looping over files
# counters = pd.read_parquet(data_dir / f"train/{city_name}/input/counters_{sample_counters[city_name]}.parquet")
# counters = counters[counters.t == 4]
# del counters["volumes_1h"]

# print(counters["node_id"].nunique())

# nodes_with_counters["node_id"].nunique()

In [None]:
# Load the preprocessed training counter readings and keep only the counters
# present in nodes_with_counters. The two prints show the number of distinct
# counters before and after filtering.
counts_train = load_preprocessed_counters(city_name, mode="train")
print(counts_train["node_id"].nunique())
counts_train = counts_train[counts_train["node_id"].isin(nodes_with_counters["node_id"])]
print(counts_train["node_id"].nunique())
# Bare expression: displays the filtered frame as the cell output.
counts_train

In [None]:
# Counter feature columns to pivot into volume matrices; each one is combined
# with every entry of weighted_features below.
engineered_volume_features = ["volumes_last"]

In [None]:
# Weighted training matrices, keyed "<weight name>_<volume feature>".
matrixes = {}
for f in engineered_volume_features:
    # Pivot the training readings into a dense (time step x counter) array.
    vol_matrix_output = create_volume_matrix(counts_train, f, "train")
    V = vol_matrix_output["matrix"]
    
    # Weighted counter observations W = VB^T (t x counter_dim), where row
    # w_{t, i} = \sum_{j}^{counter_dim} v_{t, j} * b_{i, j}
    # NOTE(review): np.dot(V, w_mat) below evaluates V B, i.e.
    # w_{t, i} = \sum_j v_{t, j} * b_{j, i}, not the V B^T written above. The
    # rows of B are normalised, its columns are not, so the two differ.
    # Confirm which is intended and keep the train and test cells identical.
    # NOTE(review): the product also assumes the columns of V (ordered by the
    # pivot on node_id) line up with the row order of nodes_with_counters that
    # D and B were built from, and that both have the same length - verify.
    for w_feat in weighted_features:
        print(f, w_feat)
        w_mat = weighted_features[w_feat]
        # Stored as float32 to halve the size of the pickled output.
        matrixes[f"{w_feat}_{f}"] = np.dot(V, w_mat).astype("float32")

In [None]:
# Save volume matrixes
# The payload is a 2-tuple: the dict of weighted matrices and the row_mapping
# (time key -> row position) of the last feature processed in the loop above.
# The directory traffic_path / city_name must already exist; open() will not
# create it. Note that `f` here rebinds the loop variable used earlier.
with open(traffic_path / city_name / "volume_matrix.pkl", "wb") as f:
    pickle.dump((matrixes, vol_matrix_output["row_mapping"]) , f)

In [None]:
# del matrixes

In [None]:
def get_weighted_values(df: pd.DataFrame, feature_name: str, weight_matrix: np.array, row_mapping: dict, mode="train"):
    """Attach a weighted counter value to every row of ``df``.

    Each row's time key is translated to a row of ``weight_matrix`` through
    ``row_mapping``; the row's ``nearest_counter_id`` selects the column.
    The time key is ``(day, t)`` when ``mode == "train"`` and ``test_idx``
    otherwise. Rows whose time key is absent from ``row_mapping`` get NaN,
    e.g. Melbourne has labels for 2020-06-03 but no counter data for that day.

    The values are written to a new column ``euclidean_<feature_name>``.
    ``df`` is modified in place and also returned.
    """
    if mode == "train":
        time_keys = zip(df["day"], df["t"])
    else:
        time_keys = df["test_idx"]

    looked_up = []
    for key, counter in tqdm(zip(time_keys, df["nearest_counter_id"])):
        matrix_row = row_mapping.get(key)
        # Keep NaN where no counter data exists for this time key.
        looked_up.append(
            np.nan if matrix_row is None else weight_matrix[matrix_row, counter]
        )

    df[f"euclidean_{feature_name}"] = looked_up
    return df

### Test set

In [None]:
# Load the preprocessed test counter readings and keep only the counters
# present in nodes_with_counters. The two prints show the number of distinct
# counters before and after filtering.
counts_test = load_preprocessed_counters(city_name, mode="test")
print(counts_test["node_id"].nunique())
counts_test = counts_test[counts_test["node_id"].isin(nodes_with_counters["node_id"])]
print(counts_test["node_id"].nunique())
# Bare expression: displays the filtered frame as the cell output.
counts_test

In [None]:
# Weighted test matrices, keyed "<weight name>_<volume feature>".
matrixes_test = {}
for f in engineered_volume_features:
    # Pivot the test readings into a dense (test_idx x counter) array.
    vol_matrix_output_test = create_volume_matrix(counts_test, f, "test")
    V_test = vol_matrix_output_test["matrix"]
    
    # Weighted counter observations W = VB^T (t x counter_dim), where row
    # w_{t, i} = \sum_{j}^{counter_dim} v_{t, j} * b_{i, j}
    # NOTE(review): np.dot(V_test, w_mat) below evaluates V B, i.e.
    # w_{t, i} = \sum_j v_{t, j} * b_{j, i}, not the V B^T written above. The
    # rows of B are normalised, its columns are not, so the two differ.
    # Confirm which is intended and keep the train and test cells identical.
    # NOTE(review): the product also assumes the test counters cover the same
    # node_ids, in the same order, as the rows of nodes_with_counters that D
    # and B were built from - verify.
    # if f == "volumes_gr":
    #     w_feat = "B_1" # Save time
    #     w_mat = weighted_features[w_feat]
    #     matrixes_test[f"{w_feat}_{f}"] = np.dot(V, w_mat)
    # else:
    for w_feat in weighted_features:
        print(f, w_feat)
        w_mat = weighted_features[w_feat]
        # Stored as float32 to halve the size of the pickled output.
        matrixes_test[f"{w_feat}_{f}"] = np.dot(V_test, w_mat).astype("float32")

In [None]:
# Save volume matrixes
# The payload is a 2-tuple: the dict of weighted test matrices and the
# row_mapping (test_idx -> row position) of the last feature processed in the
# loop above. The directory traffic_path / city_name must already exist;
# open() will not create it. Note that `f` here rebinds the loop variable.
with open(traffic_path / city_name / "volume_matrix_test.pkl", "wb") as f:
    pickle.dump((matrixes_test, vol_matrix_output_test["row_mapping"]) , f)