In [None]:
import random
import re
import pandas as pd
from tqdm import tqdm
import pickle
import numpy as np
from pathlib import Path

from utils import load_edges, split_train_valid, proba_to_logit, load_labels_core, load_preprocessed_counters, get_weights_from_class_fractions

In [None]:
# Config
# data_dir is the project's data root (combined with "/" below, so it is path-like).
from conf import data_dir
# City whose edges, counters, speed classes and labels are processed in this notebook.
city_name = "melbourne"

In [None]:
# How many observations per class we require; edges with fewer observations in
# any class have their encodings overwritten with fallback values further below.
sparse_threshold = 2
# Use validation set?
# When True, the data is passed through split_train_valid and only the train part
# feeds the encodings; when False, all rows are used without any split.
# NOTE(review): the name FULL_TRAIN reads the other way round - confirm this is intended.
FULL_TRAIN = False

In [None]:
# Road-graph edges plus the two lookup tables between string edge ids and integer edge ids.
# edge_id_to_int is keyed by "<u>_<v>" strings (see how it is used on the speed data below).
edges, edge_id_to_int, edge_int_to_id = load_edges(city_name)

In [None]:
# Create mapping of times to traffic regimes
# The preprocessed counter data is only needed to derive the per-(day, t) regime flag
# and is deleted again right after that.
counts = load_preprocessed_counters(city_name, "train")

In [None]:
# Median counter volume for every (day, t) slot, as a flat frame.
traffic = (
    counts
    .groupby(["day", "t"])["volumes_last"]
    .median()
    .reset_index()
)
# A slot counts as "high traffic" when its median volume lies strictly above
# the median taken over all slots.
high_traffic_threshold = traffic["volumes_last"].median()
traffic["high_traffic"] = (traffic["volumes_last"] > high_traffic_threshold).astype("int64")
# The raw counter frame is no longer needed.
del counts
traffic

## Create traffic-conditional features from the speed_classes data (the raw source for all labels)

In [None]:
# Raw speed-class observations for the city.
speeds = pd.read_parquet(data_dir / "speed_classes" / city_name)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
speeds.info()
print(speeds.count())

# Translate each (u, v) node pair into the integer edge id used by `edges`.
# A pair that is missing from edge_id_to_int raises KeyError.
# zip() has no length, so the row count is passed to tqdm to get a real progress bar.
speeds["edge_id"] = [f"{u}_{v}" for u, v in tqdm(zip(speeds["u"], speeds["v"]), total=len(speeds))]
speeds["edge_int"] = [edge_id_to_int[eid] for eid in tqdm(speeds["edge_id"])]
del speeds["edge_id"]

# With FULL_TRAIN set, only the train part of the split feeds the encodings;
# otherwise every row is used.
# NOTE(review): the flag name suggests the opposite meaning - confirm intended.
if FULL_TRAIN:
    speeds_train, speeds_valid = split_train_valid(city_name, speeds)
else:
    speeds_train = speeds
# Not exactly the same shape as the labels - looks like a subset (almost identical though)

In [None]:
# Attach the per-(day, t) regime columns to every speed observation; rows whose
# (day, t) has no counter-based entry keep missing values (left join).
speeds_train = pd.merge(speeds_train, traffic, how="left", on=["day", "t"])

In [None]:
# Non-null count per column - shows how many rows received a regime flag in the merge.
speeds_train.count()

In [None]:
# Per-edge target encodings, computed separately for each traffic regime.
# Keys are tuples: (edge_int, high_traffic) for the median speeds and
# (edge_int, high_traffic, volume_class) for the volume-class counts.
mdn_speeds_traffic = speeds_train.groupby(["edge_int", "high_traffic"])["median_speed_kph"].median().to_dict()
vol_distributions_traffic = speeds_train.groupby(["edge_int", "high_traffic"])["volume_class"].value_counts().to_dict()

# Edges without any observation in a regime get a missing median speed (dict.get -> None).
edges["mdn_speed_traffic0"] = [ mdn_speeds_traffic.get((e, 0)) for e in tqdm(edges["edge_int"]) ]
edges["mdn_speed_traffic1"] = [ mdn_speeds_traffic.get((e, 1)) for e in tqdm(edges["edge_int"]) ]

# The loop variable is deliberately NOT called `traffic`: that name holds the
# per-(day, t) regime frame that is merged into speeds_train, and rebinding it
# to an int here would break re-running that merge cell.
for regime in [0, 1]:
    for vol in [1, 3, 5]:
        # Volume classes that were never observed for an edge count as 0.
        edges[f"count_vol{vol}_traffic{regime}"] = [ vol_distributions_traffic.get((e, regime, vol), 0) for e in tqdm(edges["edge_int"]) ]

edges["count_vol_total_traffic0"] = edges["count_vol1_traffic0"] + edges["count_vol3_traffic0"] + edges["count_vol5_traffic0"]
edges["count_vol_total_traffic1"] = edges["count_vol1_traffic1"] + edges["count_vol3_traffic1"] + edges["count_vol5_traffic1"]

for regime in [0, 1]:
    for vol in [1, 3, 5]:
        # Edges with a zero total yield NaN here (0 / 0).
        edges[f"proba_vol{vol}_traffic{regime}"] = edges[f"count_vol{vol}_traffic{regime}"] / edges[f"count_vol_total_traffic{regime}"]

# The lookup dicts are large and no longer needed.
del mdn_speeds_traffic, vol_distributions_traffic

In [None]:
# The regime-conditional features use a stricter minimum count than the rest.
sparse_threshold_traffic = 5
# An edge is sparse as soon as ANY (volume class, regime) count is below the threshold.
sparse_edge_id = (
    edges[
        [
            "count_vol1_traffic0",
            "count_vol3_traffic0",
            "count_vol5_traffic0",
            "count_vol1_traffic1",
            "count_vol3_traffic1",
            "count_vol5_traffic1",
        ]
    ]
    < sparse_threshold_traffic
).any(axis=1)

print(edges.shape)
print(edges.loc[sparse_edge_id].shape)

feats_to_safeguard = [
    "proba_vol1_traffic0",
    "proba_vol3_traffic0",
    "proba_vol5_traffic0",
    "proba_vol1_traffic1",
    "proba_vol3_traffic1",
    "proba_vol5_traffic1",
    "mdn_speed_traffic0",
    "mdn_speed_traffic1",
]
# Fallback per feature: the median of that feature over the sparse edges themselves.
safeguarded_vals = edges.loc[sparse_edge_id, feats_to_safeguard].median().to_dict()
print(safeguarded_vals)

# Replace the sparse edges' own values with the shared fallback.
for feat, fallback in safeguarded_vals.items():
    edges.loc[sparse_edge_id, feat] = fallback

## Similar target-encoding features, but not conditioned on traffic

In [None]:
# Per-edge encodings over all training rows, regardless of traffic regime.
mdn_speeds = speeds_train.groupby("edge_int")["median_speed_kph"].median().to_dict()
free_speeds = speeds_train.groupby("edge_int")["free_flow_kph"].median().to_dict()
# Keyed by (edge_int, volume_class) tuples.
vol_distributions = speeds_train.groupby("edge_int")["volume_class"].value_counts().to_dict()

# Volume classes that were never observed for an edge count as 0.
for vol in [1, 3, 5]:
    edges[f"count_vol{vol}"] = [ vol_distributions.get((e, vol), 0) for e in tqdm(edges["edge_int"]) ]

edges["count_vol_total"] = edges["count_vol1"] + edges["count_vol3"] + edges["count_vol5"]

# Edges with a zero total yield NaN here (0 / 0).
for vol in [1, 3, 5]:
    edges[f"proba_vol{vol}"] = edges[f"count_vol{vol}"] / edges["count_vol_total"]

# Edges without any observation get a missing value (dict.get -> None).
edges["mdn_speed"] = [ mdn_speeds.get(e) for e in tqdm(edges["edge_int"]) ]
edges["mdn_free_speed"] = [ free_speeds.get(e) for e in tqdm(edges["edge_int"]) ]

# Safeguarding against leakage: an edge is sparse as soon as any volume class
# has fewer than sparse_threshold observations.
sparse_edge_id = (edges["count_vol1"] < sparse_threshold) | (edges["count_vol3"] < sparse_threshold) | (edges["count_vol5"] < sparse_threshold)

print(edges.shape)

print(edges.loc[sparse_edge_id].shape)

feats_to_safeguard = ["proba_vol1", "proba_vol3", "proba_vol5", "mdn_speed", "mdn_free_speed"]
# Fallback per feature: the median of that feature over the sparse edges themselves.
safeguarded_vals = edges.loc[sparse_edge_id][feats_to_safeguard].median().to_dict()
# A bare expression in the middle of a cell is not displayed, so print explicitly
# (as the other safeguarding cells do).
print(safeguarded_vals)

for feat in feats_to_safeguard:
    edges.loc[sparse_edge_id, feat] = safeguarded_vals[feat]

print(edges.count())

# Persist the unconditional and the regime-conditional features, keyed by edge_int.
edges[[
    "edge_int",
    "count_vol_total",
    "proba_vol1",
    "proba_vol3",
    "proba_vol5",
    "mdn_speed",
    "mdn_free_speed",
    "proba_vol1_traffic0",
    "proba_vol3_traffic0",
    "proba_vol5_traffic0",
    "proba_vol1_traffic1",
    "proba_vol3_traffic1",
    "proba_vol5_traffic1",
    "mdn_speed_traffic0",
    "mdn_speed_traffic1"
]].to_parquet(data_dir / "traffic" / city_name / "bomber_feats.parquet")

## Finally, we calculate traffic-unconditional congestion-class logits, which we will use as the LightGBM initialization score.

Note that, when converting to logits, we need to scale the raw probabilities by the normalized class weights so that the resulting initialization scores match the custom loss function we optimize.

In [None]:
# Congestion-class labels for the city; edge_id_to_int is handed to the loader,
# and the cells below rely on the result having "edge_int" and "cc" columns.
labels = load_labels_core(city_name, edge_id_to_int)

In [None]:
# Sanity check: number of label rows and columns.
labels.shape

In [None]:
# With FULL_TRAIN set, only the train part of split_train_valid is kept (the
# validation part and the original frame are dropped); otherwise every label row is used.
# NOTE(review): the flag name suggests the opposite meaning - confirm intended.
if FULL_TRAIN:
    train, valid = split_train_valid(city_name, labels)
    del labels, valid
else:
    train = labels

In [None]:
# Number of observations per (edge_int, cc) pair, as a dict keyed by that tuple.
cc_distributions = train.groupby("edge_int")["cc"].value_counts().to_dict()

In [None]:
# Distribution of the per-(edge, class) observation counts over all classes.
train.groupby("edge_int")["cc"].value_counts().quantile([0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 0.999])

In [None]:
# Distribution of the per-edge observation counts for class cc == 1 only.
(
    train.loc[train["cc"] == 1]
    .groupby(["edge_int", "cc"])["cc"]
    .count()
    .quantile([0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 0.999])
)

In [None]:
# Distribution of the per-edge observation counts for class cc == 2 only.
(
    train.loc[train["cc"] == 2]
    .groupby(["edge_int", "cc"])["cc"]
    .count()
    .quantile([0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 0.999])
)

In [None]:
# Distribution of the per-edge observation counts for class cc == 3 only.
(
    train.loc[train["cc"] == 3]
    .groupby(["edge_int", "cc"])["cc"]
    .count()
    .quantile([0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 0.999])
)

In [None]:
# Per-edge observation counts for each congestion class (cc 1 / 2 / 3 are stored
# as green / yellow / red); edges that never appear in the labels get 0.
for colour, cc_value in (("green", 1), ("yellow", 2), ("red", 3)):
    edges[f"count_{colour}"] = [
        cc_distributions.get((e, cc_value), 0)
        for e in tqdm(edges["edge_int"])
    ]

In [None]:
# Total number of labelled observations per edge.
edges["count_total"] = edges["count_green"] + edges["count_yellow"] + edges["count_red"]

# Empirical class shares per edge; an edge with a zero total yields NaN (0 / 0).
for colour in ("green", "yellow", "red"):
    edges[f"proba_{colour}"] = edges[f"count_{colour}"] / edges["count_total"]

In [None]:
# An edge is sparse as soon as any of its three class counts is below sparse_threshold.
sparse_edge_id = (
    edges[["count_green", "count_yellow", "count_red"]] < sparse_threshold
).any(axis=1)

In [None]:
# Total number of edges (rows) and feature columns so far.
edges.shape

In [None]:
# How many of those edges are flagged as sparse.
edges.loc[sparse_edge_id].shape

In [None]:
# Median class shares among the sparse edges - these become the fallback values below.
edges.loc[sparse_edge_id, ["proba_green", "proba_yellow", "proba_red"]].median()

In [None]:
# Hard-coded class fractions per city (only these three cities are available;
# any other city_name raises KeyError).
class_fractions = {
    "london": {
        "green": 0.5367906303432076,
        "yellow": 0.35138063340805714,
        "red": 0.11182873624873524,
    },
    "madrid": {
        "green": 0.4976221039083026,
        "yellow": 0.3829591430424158,
        "red": 0.1194187530492816,
    },
    "melbourne": {
        "green": 0.7018930324884697,
        "yellow": 0.2223245729555099,
        "red": 0.0757823945560204,
    },
}

# Class weights in green / yellow / red order, then rescaled so that they sum to 1.
city_fractions = class_fractions[city_name]
class_weights = get_weights_from_class_fractions(
    [city_fractions["green"], city_fractions["yellow"], city_fractions["red"]]
)
norm_class_weights = np.array(class_weights) / np.sum(class_weights)

In [None]:
# Try to stop leakage: sparse edges do not keep their own class shares but get a
# shared fallback - the median share over the sparse edges themselves.
feats_to_safeguard = ["proba_green", "proba_yellow", "proba_red"]
safeguarded_vals = edges.loc[sparse_edge_id, feats_to_safeguard].median().to_dict()
print(safeguarded_vals)

for feat, fallback in safeguarded_vals.items():
    edges.loc[sparse_edge_id, feat] = fallback

# Previous approach (no longer used): edges with count_total < 30 were overwritten
# with fixed per-city values from a low_traffic_edge_distributions lookup.

In [None]:
# Convert each class share to a logit after scaling it by the normalized weight
# of its class (weights are in green / yellow / red order).
for position, colour in enumerate(("green", "yellow", "red")):
    edges[f"logit_{colour}"] = [
        proba_to_logit(p * norm_class_weights[position])
        for p in tqdm(edges[f"proba_{colour}"])
    ]

In [None]:
# Spread of the three logits across edges, including the extreme tails.
edges.loc[:, ["logit_green", "logit_yellow", "logit_red"]].quantile(
    q=[0.0001, 0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999, 0.9999]
)

In [None]:
# Smallest logit per class.
edges.loc[:, ["logit_green", "logit_yellow", "logit_red"]].min()

In [None]:
# Largest logit per class.
edges.loc[:, ["logit_green", "logit_yellow", "logit_red"]].max()

In [None]:
# Persist the per-edge class logits, keyed by edge_int, next to the other per-city traffic files.
edges[["edge_int", "logit_green", "logit_yellow", "logit_red"]].to_parquet(data_dir / "traffic" / city_name / "cc_dist.parquet")