In [None]:
# Hide all CUDA devices so the benchmark runs on CPU (set before TensorFlow is imported below)
import os
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})

In [None]:
import tensorflow as tf

# Report whether TensorFlow can see any GPU device
print("Using GPU" if tf.config.list_physical_devices('GPU') else "Using CPU")

In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer as load_data
import numpy as np
from h3 import h3
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow import keras
from sklearn.ensemble import RandomForestClassifier
from keras.layers import Dense, Dropout, Embedding, concatenate, Flatten, BatchNormalization, Activation, Discretization
from keras import models
import keras
from tensorflow.keras.optimizers import RMSprop
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
import lightgbm as lgb

In [None]:
from utils import clean, preprocess

In [None]:
# Default every matplotlib figure in this notebook to 20x20 inches
plt.rcParams.update({'figure.figsize': [20, 20]})

In [None]:
# Load the raw training data (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="nyc-taxi-trip-duration/train.csv")

In [None]:
# Quick overview of the freshly loaded frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Parse the pickup time into a datetime column and order trips chronologically,
# so the time-based train/valid/test split further down is well defined
df = (
    df
    .assign(timestamp=pd.to_datetime(df["pickup_datetime"]))
    .sort_values("timestamp")
)

In [None]:
# Run the project helpers from utils: clean first, then preprocess.
# preprocess also returns hash_vocab_size and h3_cell_mappings, which are
# indexed as [point][h3_resolution] later in this notebook.
df, hash_vocab_size, h3_cell_mappings = preprocess(clean(df))

In [None]:
# Chronological hold-out: the final 4 weeks form the test set, the 2 weeks
# before that (cutoffs inclusive) the validation set, everything earlier is train
test_cutoff = df["timestamp"].max() - pd.Timedelta(weeks=4)
valid_cutoff = test_cutoff - pd.Timedelta(weeks=2)

df_train = df[df["timestamp"] < valid_cutoff]
df_valid = df[df["timestamp"].between(valid_cutoff, test_cutoff)]
df_test = df[df["timestamp"] > test_cutoff]

print(*(part.shape for part in (df_train, df_valid, df_test)), sep="\n")

h3_resolutions = [4, 5, 6, 7, 8, 9, 10]  # Which to actually use for embeddings

In [None]:
# Raw coordinate columns fed to the tree model; the column order matters because
# the decision-boundary helpers build prediction arrays positionally
numeric_features = [
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]



In [None]:
# Features are the raw coordinates; the binary target is 1 when trip_duration exceeds 600
X, y = df[numeric_features], df["trip_duration"].gt(600).astype(int)

X_train, y_train = df_train[numeric_features], df_train["trip_duration"].gt(600).astype(int)

X_valid, y_valid = df_valid[numeric_features], df_valid["trip_duration"].gt(600).astype(int)

X_test, y_test = df_test[numeric_features], df_test["trip_duration"].gt(600).astype(int)

In [None]:
# Select the hashed h3 index columns (names starting with "h3_hash_index") for each split
h3_feature_columns = [col for col in df.columns if col.startswith("h3_hash_index")]
X_train_h3, X_valid_h3, X_test_h3 = (
    part[h3_feature_columns] for part in (df_train, df_valid, df_test)
)

In [None]:
def train_lgb(num_estimators: int, params: dict) -> "lgb.Booster":
    """Train a LightGBM booster on the notebook's train split with early stopping.

    The data is read from the notebook globals ``X_train``/``y_train`` (training)
    and ``X_valid``/``y_valid`` (evaluation); they must be defined before calling.

    Args:
        num_estimators: Upper bound on the number of boosting rounds.
        params: LightGBM parameter dictionary forwarded to ``lgb.train``.

    Returns:
        The trained booster. Training stops once the evaluation metric has not
        improved for 50 consecutive rounds.
    """
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # Early stopping is configured through a callback: the `early_stopping_rounds`
    # keyword of lgb.train was removed in LightGBM 4.0, while the callback form
    # works on both old and new releases.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=num_estimators,
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
    )

    return model


In [None]:
# Maximum tree depth, used as 'max_depth' in the LightGBM parameters below
TREE_DEPTH = 5

In [None]:
# LightGBM settings for the binary classifier; no 'metric' entry is set here
lgb_binary_params = dict(
    boosting_type='gbdt',
    objective='binary',
    max_depth=TREE_DEPTH,
    learning_rate=0.1,
    verbose=0,
)

In [None]:
# Allow up to 10,000 boosting rounds; train_lgb applies early stopping on the validation split
lgb_binary = train_lgb(params=lgb_binary_params, num_estimators=10_000)

In [None]:
# Score the test split using the booster's best iteration
lgb_full_predictions = lgb_binary.predict(
    X_test,
    num_iteration=lgb_binary.best_iteration,
)

In [None]:
from keras_models import EmbeddedH3Model

In [None]:
# Training settings handed to EmbeddedH3Model
nn_hyperparams = dict(
    batch_size=128,
    epochs=100,
    starting_lr=1e-3,
)

In [None]:
# Neural model using embeddings of the hashed h3 cell indices as its inputs
h3_embed = EmbeddedH3Model(
    h3_resolutions=h3_resolutions,
    binary=True,
    hyperparams=nn_hyperparams,
)
h3_embed.train(
    x_train=X_train_h3,
    y_train=y_train,
    x_valid=X_valid_h3,
    y_valid=y_valid,
    embedding_vocab_size=hash_vocab_size,
)
h3_embed_pred = h3_embed.predict(X_test_h3)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Test accuracy of the LightGBM model, with its predictions rounded to 0/1 labels
print(accuracy_score(y_test, lgb_full_predictions.round()))

In [None]:
# Test accuracy of the h3 embedding model, with its predictions rounded to 0/1 labels
print(accuracy_score(y_test, h3_embed_pred.round()))

In [None]:
import mplleaflet

In [None]:
def create_decision_boundary(model, X, xlim=(-74.005, -73.96), ylim=(40.73, 40.78), pickup_lon=None, pickup_lat=None, grid_size=0.0001):
    """Evaluate ``model`` on a regular coordinate grid for one fixed pickup point.

    The pickup coordinates are held constant so that the model's decision can be
    visualised in 2D over the remaining two (dropoff) inputs.

    Args:
        model: Object whose ``predict`` accepts a 2D array with four columns
            (pickup lon, pickup lat, grid x, grid y) and returns one value per row.
        X: DataFrame with "pickup_longitude" and "pickup_latitude" columns. Its
            first two columns (by position) determine the grid extent.
        xlim: Not used inside this function; kept so existing callers keep working.
        ylim: Not used inside this function; kept so existing callers keep working.
        pickup_lon: Fixed pickup longitude. Defaults to the median of
            ``X["pickup_longitude"]`` when None.
        pickup_lat: Fixed pickup latitude. Defaults to the median of
            ``X["pickup_latitude"]`` when None.
        grid_size: Spacing of the grid in coordinate units.

    Returns:
        Tuple ``(xx, yy, preds)`` of equally shaped 2D arrays: the mesh x/y
        coordinates and the rounded model prediction at each grid node.
    """
    # Compare against None rather than truthiness so an explicit 0.0 coordinate is honoured
    if pickup_lon is None:
        pickup_lon = X["pickup_longitude"].median()
    if pickup_lat is None:
        pickup_lat = X["pickup_latitude"].median()

    X = X.to_numpy()
    # NOTE(review): the extent comes from columns 0 and 1, which are the *pickup*
    # coordinates when X follows numeric_features order, although the grid is fed
    # to the model as dropoff coordinates - confirm this is intended.
    x_min, x_max = np.percentile(X[:, 0], [0.1, 99.99])
    y_min, y_max = np.percentile(X[:, 1], [0.1, 99.99])
    xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_size), np.arange(y_min, y_max, grid_size))
    xx_ravel = xx.ravel()
    yy_ravel = yy.ravel()

    # One row per grid node: constant pickup point followed by the node's coordinates
    n_points = len(xx_ravel)
    pred_array = np.column_stack([
        np.repeat(pickup_lon, n_points),
        np.repeat(pickup_lat, n_points),
        xx_ravel,
        yy_ravel,
    ])
    preds = model.predict(pred_array)
    preds = preds.reshape(xx.shape).round()

    return xx, yy, preds

In [None]:
# Decision surface of the LightGBM model with the pickup fixed at the median test pickup.
# The helper defined in this notebook is create_decision_boundary (plot_decision_boundary does not exist).
xx, yy, preds = create_decision_boundary(lgb_binary, X_test)

In [None]:
# These names are only parameters of create_decision_boundary, not notebook globals,
# so define them here: the pickup marker mirrors the helper's default (median test
# pickup) and the map window mirrors its default xlim/ylim.
pickup_lon = X_test["pickup_longitude"].median()
pickup_lat = X_test["pickup_latitude"].median()
xlim = (-74.005, -73.96)
ylim = (40.73, 40.78)

plt.scatter(pickup_lon, pickup_lat, color="red")
plt.contourf(xx, yy, preds, alpha=0.4)
plt.xlabel("Longitude")
plt.xlim(xlim)
plt.ylim(ylim)
plt.ylabel("Latitude")

mplleaflet.display()

In [None]:
# Draw one random test trip and use its pickup location for the next boundary plot
pickup_sample = X_test.sample(n=1).iloc[0]
lon_sample, lat_sample = (
    pickup_sample["pickup_longitude"],
    pickup_sample["pickup_latitude"],
)

In [None]:
# Decision surface of the LightGBM model for the sampled pickup location.
# The helper defined in this notebook is create_decision_boundary (plot_decision_boundary does not exist).
xx_sample, yy_sample, preds_sample = create_decision_boundary(
    lgb_binary, X_test, pickup_lon=lon_sample, pickup_lat=lat_sample
)

In [None]:
# Map window for the plot; xlim/ylim are only default parameters of
# create_decision_boundary, so they must be defined at notebook level here.
xlim = (-74.005, -73.96)
ylim = (40.73, 40.78)

plt.scatter(lon_sample, lat_sample, color="red")
plt.contourf(xx_sample, yy_sample, preds_sample, alpha=0.4)
plt.xlabel("Longitude")
plt.xlim(xlim)
plt.ylim(ylim)
plt.ylabel("Latitude")

mplleaflet.display()

In [None]:
def create_decision_boundary_h3_model(model, X, pickup_lon=None, pickup_lat=None, grid_size=0.0001):
    """Evaluate the h3 embedding model on a regular dropoff grid for one fixed pickup.

    Builds a mesh of dropoff coordinates, converts pickup and dropoff points to
    h3 cells at every resolution in the global ``h3_resolutions``, maps the cells
    to integer indices via the global ``h3_cell_mappings`` (unknown cells get the
    out-of-vocabulary index from the global ``hash_vocab_size``) and predicts.

    Args:
        model: Object whose ``predict`` accepts the assembled mesh DataFrame.
        X: DataFrame with "pickup_longitude" and "pickup_latitude" columns. Its
            first two columns (by position) determine the grid extent.
        pickup_lon: Fixed pickup longitude. Defaults to the median of
            ``X["pickup_longitude"]`` when None.
        pickup_lat: Fixed pickup latitude. Defaults to the median of
            ``X["pickup_latitude"]`` when None.
        grid_size: Spacing of the grid in coordinate units.

    Returns:
        Tuple ``(xx, yy, nn_preds)`` of equally shaped 2D arrays: the mesh x/y
        coordinates and the rounded model prediction at each grid node.
    """
    # Compare against None rather than truthiness so an explicit 0.0 coordinate is honoured
    if pickup_lon is None:
        pickup_lon = X["pickup_longitude"].median()
    if pickup_lat is None:
        pickup_lat = X["pickup_latitude"].median()

    X = X.to_numpy()
    # NOTE(review): the extent comes from columns 0 and 1, which are the *pickup*
    # coordinates when X follows numeric_features order - confirm this is intended.
    x_min, x_max = np.percentile(X[:, 0], [0.1, 99.99])
    y_min, y_max = np.percentile(X[:, 1], [0.1, 99.99])
    xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_size), np.arange(y_min, y_max, grid_size))
    xx_ravel = xx.ravel()
    yy_ravel = yy.ravel()

    # Use the resolved pickup point (the previously referenced median_pickup_* names were never defined)
    df_mesh = pd.DataFrame({"pickup_longitude": pickup_lon, "pickup_latitude": pickup_lat, "dropoff_longitude": xx_ravel, "dropoff_latitude": yy_ravel})

    for h3_res in h3_resolutions:
        # The pickup point is identical on every row, so look its cell up once and broadcast it
        df_mesh[f"src_h3_{h3_res}"] = h3.geo_to_h3(float(pickup_lat), float(pickup_lon), h3_res)
        df_mesh[f"dst_h3_{h3_res}"] = [h3.geo_to_h3(lat, lon, h3_res) for lat, lon in
                                  zip(df_mesh["dropoff_latitude"], df_mesh["dropoff_longitude"])]

    for point in ["src", "dst"]:
        for h3_res in h3_resolutions:
            # Token of a cell is its position in the mapping for this point/resolution
            cell_tokens = {cell: i for i, cell in enumerate(h3_cell_mappings[point][h3_res])}
            # We have one unused embedding key to assign to OOV tokens - they'll get randomly initialised embeddings
            oov_token = hash_vocab_size[point][h3_res]
            df_mesh[f"h3_hash_index_{point}_{h3_res}"] = [
                int(cell_tokens.get(cell, oov_token)) for cell in df_mesh[f"{point}_h3_{h3_res}"]
            ]

    # NOTE(review): elsewhere in this notebook predict() receives only the
    # h3_hash_index_* columns; confirm it tolerates the extra columns in df_mesh.
    nn_preds = model.predict(df_mesh).round()
    nn_preds = nn_preds.reshape(xx.shape)

    return xx, yy, nn_preds

In [None]:
# Decision surface of the h3 embedding model; with no pickup given the helper uses the median test pickup
xx, yy, nn_preds = create_decision_boundary_h3_model(h3_embed, X_test)

In [None]:
# These names are not notebook globals on a fresh kernel, so define them here:
# the pickup marker mirrors the helper's default (median test pickup) and the
# map window matches the default xlim/ylim of create_decision_boundary.
pickup_lon = X_test["pickup_longitude"].median()
pickup_lat = X_test["pickup_latitude"].median()
xlim = (-74.005, -73.96)
ylim = (40.73, 40.78)

plt.scatter(pickup_lon, pickup_lat, color="red")
plt.contourf(xx, yy, nn_preds, alpha=0.4)
plt.xlabel("Longitude")
plt.xlim(xlim)
plt.ylim(ylim)
plt.ylabel("Latitude")

mplleaflet.display()