In [None]:
!pip install lightgbm h3 shap hdmedians

In [None]:
import re
import pandas as pd
from tqdm import tqdm
import pickle
import numpy as np
import lightgbm as lgb
from pathlib import Path
import shap

In [None]:
from utils import create_nodes_with_counters, merge_pcas, split_train_valid, load_edges, load_labels_core
from conf import data_dir

In [None]:
# --- Run configuration -------------------------------------------------------
city_name = "melbourne"
model_name = "core_final"

traffic_path = data_dir / "traffic"

# No validation set? When True, train and valid are concatenated before training.
FULL_TRAIN = True
# When True, training is initialised from the checkpoint of CONTINUE_TRAINING_ITER.
CONTINUE_TRAINING = False
CONTINUE_TRAINING_ITER = 690
# Running locally on 32GB Mac works with 3e7
SAMPLE_ROW_COUNT = None
# k used for the nearest-counter KD-tree query.
NEIGHBORS_FOR_WEIGHTING = 10

# Per-city class fractions, used to derive the class weights for training.
class_fractions = {
    "london": {
        "green": 0.5367906303432076,
        "yellow": 0.35138063340805714,
        "red": 0.11182873624873524,
    },
    "madrid": {
        "green": 0.4976221039083026,
        "yellow": 0.3829591430424158,
        "red": 0.1194187530492816,
    },
    "melbourne": {
        "green": 0.7018930324884697,
        "yellow": 0.2223245729555099,
        "red": 0.0757823945560204,
    },
}

In [None]:
def create_categorical_features(data):
    """Integer-encode the categorical edge attributes.

    For each of ``oneway``, ``highway`` and ``tunnel`` a ``<name>_cat`` column
    holding the pandas category code is added to ``data`` (in place; pandas
    assigns code -1 to missing values).

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the three categorical columns.

    Returns
    -------
    tuple
        ``(data, feature_dicts)`` where ``feature_dicts[name]`` maps every
        original value of that column to its category code.
    """
    feats = ["oneway", "highway", "tunnel"]

    feature_dicts = {}
    for f in feats:
        # Encode categorical feature
        cat_codes = data[f].astype("category").cat.codes
        data[f"{f}_cat"] = cat_codes
        # Build the lookup from this feature's own codes. Doing it in a second
        # loop would pair every feature with the codes of the last one.
        feature_dicts[f] = dict(zip(data[f], cat_codes))

    return data, feature_dicts

In [None]:
# Load the road-graph node table of the selected city (node_id, x, y are used below).
nodes = pd.read_parquet(data_dir / f"road_graph/{city_name}/road_graph_nodes.parquet")

In [None]:
# Load the city's edges plus two lookups (judging by their names: edge id -> int and int -> edge id).
edges, edge_id_to_int, edge_int_to_id = load_edges(city_name)
# Add the integer-coded oneway/highway/tunnel columns; edge_dicts holds the value -> code mappings.
edges, edge_dicts = create_categorical_features(edges)
edges

In [None]:
# Get representative point of edge: attach the x/y coordinates of the edge's node `u`.
# The left merge keeps every edge and also adds a `node_id` column from the node table.
edges = edges.merge(nodes[["node_id", "x", "y"]], left_on="u", right_on="node_id", how="left")
# TODO should we take centre between u and v?

In [None]:
# Node table restricted to / enriched with counters (exact contents depend on the helper; its
# x and y columns are used for the nearest-counter search below).
nodes_with_counters = create_nodes_with_counters(city_name, blacklist=False)
nodes_with_counters

In [None]:
# Find nearest counter for edges which are not immediately at counter
# (BallTree is imported but not used in the visible cells.)
from sklearn.neighbors import KDTree, BallTree
# KD-tree over the counter coordinates, queried with each edge's representative point.
tree = KDTree(nodes_with_counters[["x", "y"]], metric="euclidean")
# dist / ind: per edge, distances to and row positions of the k nearest counters.
dist, ind = tree.query(edges[["x", "y"]], k=NEIGHBORS_FOR_WEIGHTING)
# NOTE: despite the name this is the row position inside nodes_with_counters, not a node_id.
edges["nearest_counter_id"] = ind[:,0]
# Distance to the single nearest counter ...
edges["counter_distance_euclidean"] = dist[:,0]
# ... and mean distance over all k returned neighbours.
edges["counter_distance_euclidean_mean_all"] = dist.mean(axis=1)
edges

## Data creation

In [None]:
# Load the congestion-class labels; later cells rely on its `cc`, `edge_int`, `day` and `t` columns.
labels = load_labels_core(city_name, edge_id_to_int)

In [None]:
# Shift the class label down by one (presumably from 1..3 to the 0..2 range LightGBM expects — confirm).
# NOTE: not idempotent — re-running this cell shifts the labels again.
labels["cc"] = labels["cc"] - 1
labels["cc"].value_counts()

In [None]:
# Merge edge features: static per-edge attributes are joined onto every label row.
edge_feature_columns = [
    "edge_int",
    "x",
    "y",
    "counter_distance_euclidean",
    "counter_distance_euclidean_mean_all",
    "parsed_maxspeed",
    "oneway_cat",
    "highway_cat",
    "length_meters",
    "counter_distance",
    "importance",
]
labels = labels.merge(edges[edge_feature_columns], on="edge_int")
labels.shape

In [None]:
# Non-null count per column, to spot features with missing values after the merge.
labels.count()

In [None]:
# These are just city average volume features which should be redundant, just in case
volume_agg_train_path = traffic_path / city_name / "volume_agg_train.parquet"
volume_agg_train = pd.read_parquet(volume_agg_train_path)
# Left join on (day, t): label rows without a matching aggregate are kept with NaNs.
labels = labels.merge(volume_agg_train, how="left", on=["day", "t"])
labels.shape

In [None]:
# Attach the PCA features via the project helper (columns starting with "PC" are picked up as features later).
labels = merge_pcas(city_name, labels)

In [None]:
# Merge edge target encodings
# Both merges use the default inner join on edge_int, so label rows whose edge is
# missing from either table are dropped.
print("Merging static target encodings")
cc_distributions = pd.read_parquet(data_dir / "traffic" / city_name / "cc_dist.parquet")
labels = labels.merge(cc_distributions, on="edge_int")
print(labels.shape)

# Second per-edge feature table; cc_distributions and bomber_feats are reused for the test set below.
bomber_feats = pd.read_parquet(data_dir / "traffic" / city_name / "bomber_feats.parquet")
labels = labels.merge(bomber_feats, on="edge_int")

In [None]:
# Transform logits to probabilities (softmax over the three class logits).
# Each exponential is computed once and shared between the three ratios.
exp_green = np.exp(labels["logit_green"])
exp_yellow = np.exp(labels["logit_yellow"])
exp_red = np.exp(labels["logit_red"])
exp_total = exp_green + exp_yellow + exp_red

labels["proba_green"] = exp_green / exp_total
labels["proba_yellow"] = exp_yellow / exp_total
labels["proba_red"] = exp_red / exp_total

# Release the temporaries; the frame can be very large.
del exp_green, exp_yellow, exp_red, exp_total

In [None]:
# Non-null count per column after all feature merges (shows what dropna() below will remove).
labels.count()

In [None]:
print(labels.shape)
# Drop rows where there was no counter data
# (dropna() removes every row that has a NaN in any column, whatever its origin).
labels = labels.dropna()
print(labels.shape)

## Train

In [None]:
# Split the labelled rows into train and validation frames (split rule lives in the helper).
train, valid = split_train_valid(city_name, labels)
# Drop the name of the combined frame so its memory can be reclaimed.
del labels

In [None]:
# Feature columns fed to LightGBM. Note that "tunnel_cat" is created on the edges but is
# not part of this list, and that the raw edge identifier "edge_int" is used as a feature.
features = [
    # Edge positional features
    "counter_distance_euclidean",
    "counter_distance_euclidean_mean_all",
    "x",
    "y",
    # Target encoding features
    "proba_green",
    "proba_yellow",
    "proba_red",
    "proba_vol1",
    "proba_vol3",
    "proba_vol5",
    "mdn_speed",
    "mdn_free_speed",
    "count_vol_total",
    "proba_vol1_traffic0",
    "proba_vol3_traffic0",
    "proba_vol5_traffic0",
    "proba_vol1_traffic1",
    "proba_vol3_traffic1",
    "proba_vol5_traffic1",
    "mdn_speed_traffic0",
    "mdn_speed_traffic1",
    # These are just city averages
    "volumes_gr",
    "volumes_sum",
    "volumes_last",
    # Edge features
    "edge_int",
    "parsed_maxspeed",
    "oneway_cat",
    "highway_cat",
    "importance",
    "length_meters",
    "counter_distance",
    # Secret sauce: city context PCA features
    # (every column of `train` whose name starts with "PC")
] + [f for f in train.columns if f.startswith("PC")]

# Target column (congestion class).
label = "cc"

In [None]:
def get_weights_from_class_fractions(class_fractions):
    """Turn class fractions into per-class weights.

    Each weight is ``sum(class_fractions) / (fraction * 3)``, so rarer classes
    receive larger weights. The result is a list in the order of the input.
    """
    total = np.sum(class_fractions)
    weights = []
    for fraction in class_fractions:
        weights.append(total / (fraction * 3))
    return weights

In [None]:
# Weights in the order green, yellow, red, i.e. list index == class label value used to index it later.
class_weights = get_weights_from_class_fractions([class_fractions[city_name][c] for c in ["green", "yellow", "red"]])
# We use these to weight training samples so that optimizing for logloss becomes equivalent to weighted crossentropy

In [None]:
# The per-class logits are handed to LightGBM as init_score for every row.
logit_columns = ["logit_green", "logit_yellow", "logit_red"]

if not FULL_TRAIN:
    # Separate train / validation datasets.
    init_score_train = train[logit_columns]
    init_score_valid = valid[logit_columns]

    lgb_train = lgb.Dataset(train[features], train[label], init_score=init_score_train)
    lgb_eval = lgb.Dataset(valid[features], valid[label], reference=lgb_train, init_score=init_score_valid)

    # Weight samples to optimize for weighted cross entropy
    weights_train = [class_weights[cls] for cls in train[label]]
    weights_eval = [class_weights[cls] for cls in valid[label]]
    lgb_train.set_weight(weights_train)
    lgb_eval.set_weight(weights_eval)
else:
    # No hold-out: fold the validation rows into the training frame.
    train = pd.concat([train, valid])
    print(train.shape)
    del valid

    init_score_train = train[logit_columns]
    lgb_set = lgb.Dataset(train[features], train[label], init_score=init_score_train)

    # Per-row weight looked up from the row's class label.
    weights_train = [class_weights[cls] for cls in train[label]]
    lgb_set.set_weight(weights_train)

In [None]:
# The rest is from a single Optuna run, we optimize only num_leaves a bit and num_iters
params = dict(
    boosting_type="gbdt",
    objective="multiclass",
    metric="multi_logloss",
    num_classes=3,
    # Crazy how large we can make this! These are individual decision trees with tens of thousands of leaves
    # A smaller number should be less risky though, as the valid loss plateau would span over a larger number of iterations
    # But we wanted to save training time here (10000 leaves was the alternative setting)
    num_leaves=5000,
    learning_rate=0.1,
    feature_fraction=1.0,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
    lambda_l1=8.544245989665272,
    lambda_l2=0.09577740930772316,
    min_child_samples=10,
)

# Directory receiving the saved models of this model/city combination; created if missing.
model_path = data_dir / "models" / model_name / city_name
model_path.mkdir(parents=True, exist_ok=True)

def save_model_callback(env):
    """LightGBM callback that checkpoints the booster during training.

    Once ``env.iteration`` exceeds 100, the model is written to
    ``model_path / "model_full_<iteration>.lgb"`` on every iteration that is a
    multiple of 10. All other iterations are a no-op.
    """
    iteration = env.iteration
    if iteration <= 100 or iteration % 10 != 0:
        return
    print("Saving!")
    env.model.save_model(model_path / f"model_full_{iteration}.lgb")

In [None]:
# With 5k/10k leaves, we need very few iterations
# Passed as num_boost_round to lgb.train and used in the file name of the final saved model.
NO_LGB_ITERS = 300

In [None]:
print('Starting training...')
# Evaluation logging goes through the lgb.log_evaluation callback: the `verbose_eval`
# argument of lgb.train was deprecated in LightGBM 3.3 and removed in 4.0.
if FULL_TRAIN:
    if CONTINUE_TRAINING:
        print(f"Continuing training from iter {CONTINUE_TRAINING_ITER}")
        # Resume from the checkpoint written by save_model_callback.
        init_model = model_path / f"model_full_{CONTINUE_TRAINING_ITER}.lgb"
    else:
        init_model = None

    # Full-data training: the only "validation" set is the training set itself,
    # so checkpoints are saved periodically instead of early stopping.
    gbm = lgb.train(params,
                    lgb_set,
                    num_boost_round=NO_LGB_ITERS,
                    valid_sets=[lgb_set],
                    callbacks=[save_model_callback, lgb.log_evaluation(period=10)],
                    init_model=init_model)
else:
    # Train/valid mode: stop early when the validation metric stops improving.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=NO_LGB_ITERS,
                    valid_sets=[lgb_train, lgb_eval],
                    callbacks=[lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=2)])

In [None]:
def shap_wrapped(data, model, features, sample_size=500, random_state=None):
    """Plot SHAP summaries of ``model`` on a random sample of ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing at least the ``features`` columns.
    model
        Tree model handed to ``shap.TreeExplainer``.
    features : list of str
        Columns passed to the explainer.
    sample_size : int, default 500
        Maximum number of rows to explain; capped at ``len(data)`` so small
        frames no longer make ``DataFrame.sample`` raise.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a seed for reproducible plots.
    """
    explainer = shap.TreeExplainer(model)
    n_rows = min(sample_size, len(data))
    X = data.sample(n_rows, random_state=random_state)[features]
    shap_values = explainer.shap_values(X)
    shap.initjs()
    # One summary plot in the default style, one as a bar chart.
    shap.summary_plot(shap_values, X)
    shap.summary_plot(shap_values, X, plot_type="bar")
    
# Explain the freshly trained booster on a random sample of the training rows.
shap_wrapped(train, gbm, features)

In [None]:
# Persist the final booster next to the periodic checkpoints, named after NO_LGB_ITERS.
gbm.save_model(model_path / f"model_full_{NO_LGB_ITERS}.lgb")

In [None]:
# Display which model configuration was just trained.
model_name

In [None]:
# Display the final feature list (including the dynamically added PC* columns).
features

In [None]:
# Guard cell: raising here stops a "Run All" before the test-prediction section below
# (presumably intentional — confirm before removing).
raise ValueError

## Generate test predictions

In [None]:
# Generally we want to cherry pick this on the test set (wouldn't work with double blind test set)
iters_to_use = 280

# Use a dedicated name for the checkpoint file: `model_path` is the model *directory*
# used by the training/saving cells and must not be overwritten with a file path,
# otherwise re-running those cells afterwards builds broken paths.
model_file = data_dir / "models" / model_name / city_name / f"model_full_{iters_to_use}.lgb"
gbm = lgb.Booster(model_file=model_file)
print(iters_to_use)
# Note that LGB trains separate tree per each logit
# Checkpoints are named after the 0-based iteration, hence the +1.
assert gbm.num_trees() == 3 * (iters_to_use + 1), (
    f"{model_file} holds {gbm.num_trees()} trees, expected {3 * (iters_to_use + 1)}"
)


In [None]:
# Test-set counter input; only its `test_idx` column is used in the visible cells.
test_path = data_dir / "test" / city_name / "input" / "counters_test.parquet"
counters_test = pd.read_parquet(test_path)

In [None]:
pd.options.mode.chained_assignment = None  # default='warn'

# For test set, we need to create a submission set of length len(edges) * counters_test["test_idx"].nunique()
# Do this in iterations, as direct join returned weird DF shape
# Iterate over the test_idx values that actually occur instead of range(nunique()):
# identical when the ids are exactly 0..n-1, and still correct when they are not.
test_indices = np.sort(counters_test["test_idx"].dropna().unique())
full_test = pd.concat([edges.assign(test_idx=t) for t in tqdm(test_indices)])
full_test.shape

In [None]:
# City-level volume aggregates for the test set, joined per test_idx (inner join).
volume_agg_test_path = traffic_path / city_name / "volume_agg_test.parquet"
volume_agg_test = pd.read_parquet(volume_agg_test_path)
full_test = full_test.merge(volume_agg_test, on=["test_idx"])
full_test.shape

In [None]:
# Attach the PCA features for the test set (same helper as in training, in "test" mode).
full_test = merge_pcas(city_name, full_test, mode="test")

In [None]:
# Same per-edge table as in training; relies on cc_distributions loaded in the data-creation section.
full_test = full_test.merge(cc_distributions, on="edge_int")

In [None]:
# Same per-edge table as in training; relies on bomber_feats loaded in the data-creation section.
full_test = full_test.merge(bomber_feats, on="edge_int")

In [None]:
# Softmax over the three class logits, mirroring the training features.
# Each exponential is computed once and shared between the three ratios.
test_exp_green = np.exp(full_test["logit_green"])
test_exp_yellow = np.exp(full_test["logit_yellow"])
test_exp_red = np.exp(full_test["logit_red"])
test_exp_total = test_exp_green + test_exp_yellow + test_exp_red

full_test["proba_green"] = test_exp_green / test_exp_total
full_test["proba_yellow"] = test_exp_yellow / test_exp_total
full_test["proba_red"] = test_exp_red / test_exp_total

# Release the temporaries; the frame can be very large.
del test_exp_green, test_exp_yellow, test_exp_red, test_exp_total

In [None]:
# Sanity check: every training feature must exist in the test frame; the assertion
# message names the first missing column.
for f in features:
    assert f in full_test.columns, f

In [None]:
# Non-null count per column of the test frame.
full_test.count()

In [None]:
import time
# Wall-clock timing of the prediction step.
stm = time.time()
# raw_score=True returns raw scores rather than probabilities; they are added to the
# input logits in the next cell.
gbm_preds = gbm.predict(full_test[features], raw_score=True)
print(f"Took {time.time() - stm} seconds")

In [None]:
# Add predictions to init_score because LGB learns the increment
# (assumes the prediction columns are ordered green, yellow, red like the init_score — confirm).
full_preds = gbm_preds + full_test[["logit_green", "logit_yellow", "logit_red"]]

In [None]:
# Save disk space: overwrite the input logits with the final logits, rounded to 3 decimals.
for logit_column in ("logit_green", "logit_yellow", "logit_red"):
    full_test[logit_column] = full_preds[logit_column].round(3)

In [None]:
# Number of distinct test samples in the submission frame.
full_test["test_idx"].nunique()

In [None]:
# Number of distinct edges in the submission frame.
full_test["edge_int"].nunique()

In [None]:
# Row count should equal (distinct test_idx) * (distinct edges) if no merge dropped rows.
full_test.shape

In [None]:
# Output location of the submission file for this model/city.
submission_path = data_dir / "submissions" / model_name / city_name / "labels" / "cc_labels_test.parquet"

# Columns written to the submission: final logits plus the edge endpoints and test index.
submission_features = [
    "logit_green",
    "logit_yellow",
    "logit_red",
    "u",
    "v",
    "test_idx"
]

import time
# Wall-clock timing of the write.
stm = time.time()
# Create the target directory if needed, then write the parquet file.
submission_path.parent.mkdir(parents=True, exist_ok=True)
full_test[submission_features].to_parquet(submission_path)
print(f"Took {time.time() - stm} seconds")

full_test[submission_features]