In [None]:
# Force to use CPU: "-1" matches no CUDA device id, so no GPU is exposed.
# This cell should run before TensorFlow is imported (next cell) so the
# setting is already in place when TensorFlow looks for devices.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [None]:
import tensorflow as tf

# Report whether TensorFlow can see any GPU device
print("Using GPU" if tf.config.list_physical_devices('GPU') else "Using CPU")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer as load_data
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import RMSprop
# Keep this after `from tensorflow import keras`: it rebinds the name `keras`
# to the standalone package, exactly as the original import order did.
import keras
from keras import models
from keras.layers import Dense, Dropout, Embedding, concatenate, Flatten, BatchNormalization, Activation, Discretization, Add

In [None]:
from utils import clean, preprocess

In [None]:
# Default figure size for every plot in this notebook (20 x 20);
# presumably chosen so the decision-tree plot further down stays legible.
plt.rcParams['figure.figsize'] = [20,20]

# Prepare data

In [None]:
# Load the raw training CSV; the path is relative to the notebook's working directory
df = pd.read_csv("nyc-taxi-trip-duration/train.csv")

In [None]:
# Depth cap shared by the single decision tree and the LightGBM `params` dict
TREE_DEPTH = 5

In [None]:
# Project-specific cleaning from utils.py; rebinds `df`, so re-running this cell
# applies `clean` to already-cleaned data.
# NOTE(review): what `clean` removes is defined in utils, not here - confirm there.
df = clean(df)

In [None]:
# Feature engineering from utils.py. `hash_vocab_size` is later passed to the H3
# embedding model as its vocabulary size; `h3_cell_mappings` is not used in the
# cells shown here.
# NOTE(review): presumably this adds the `timestamp` and `h3_hash_index*` columns
# that later cells rely on - confirm in utils.preprocess.
df, hash_vocab_size, h3_cell_mappings  = preprocess(df)

In [None]:
# Number of whole weeks between the earliest and latest timestamp
time_span = df["timestamp"].max() - df["timestamp"].min()
weeks_of_data = time_span.days // 7
print(weeks_of_data)

In [None]:
# Test window: everything after this point, i.e. the final 4 weeks of data
test_cutoff = df["timestamp"].max() - pd.Timedelta(weeks=4)

In [None]:
# Validation window: the 2 weeks immediately before the test window
valid_cutoff = test_cutoff - pd.Timedelta(weeks=2)

In [None]:
# Split by timestamp to avoid temporal leakage
#   test : strictly after test_cutoff
#   train: strictly before valid_cutoff
#   valid: valid_cutoff .. test_cutoff (Series.between includes both ends by default)
df_test = df[df["timestamp"] > test_cutoff]
df_train = df[df["timestamp"] < valid_cutoff]
df_valid = df[df["timestamp"].between(valid_cutoff, test_cutoff)]

In [None]:
# (rows, columns) of the train, validation and test windows, in that order
for split_frame in (df_train, df_valid, df_test):
    print(split_frame.shape)

In [None]:
# Which H3 resolutions to actually use for embeddings (passed to EmbeddedH3Model below)
h3_resolutions = [4, 5, 6, 7, 8, 9, 10] # Which to actually use for embeddings

In [None]:
# Raw numeric input columns. Order matters: the quantile-binning code keys each
# feature's bin boundaries by its position in this list.
numeric_features = [
"pickup_longitude",
"pickup_latitude",
"dropoff_longitude",
"dropoff_latitude",
# "haversine_distance"
]

In [None]:
# Target over the full (unsplit) frame.
# NOTE(review): the models below use y_train / y_valid / y_test instead; this
# name is not referenced again in the cells shown.
y = df["trip_duration"]

In [None]:
# Numeric features over the full (unsplit) frame.
# NOTE(review): the models below use X_train / X_valid / X_test instead.
X = df[numeric_features]

In [None]:
# Numeric feature matrix and trip-duration target for each temporal split
X_train, y_train = df_train[numeric_features], df_train["trip_duration"]
X_valid, y_valid = df_valid[numeric_features], df_valid["trip_duration"]
X_test, y_test = df_test[numeric_features], df_test["trip_duration"]

In [None]:
# (rows, columns) of the numeric feature matrices: train, validation, test
for features in (X_train, X_valid, X_test):
    print(features.shape)

In [None]:
# Hashed H3 cell-index columns: resolve the column list once and reuse it for
# every split instead of re-scanning df.columns three times.
h3_columns = [col for col in df.columns if col.startswith("h3_hash_index")]
X_train_h3 = df_train[h3_columns]
X_valid_h3 = df_valid[h3_columns]
X_test_h3 = df_test[h3_columns]

In [None]:
# Drop the full frame; everything below works from the per-split frames.
# Note: cells above that read `df` cannot be re-run after this without reloading.
del df

# Train trees

In [None]:
# LightGBM settings shared by the small and the full ensemble below
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',  # evaluation metric reported on the eval sets during training
    'max_depth': TREE_DEPTH,  # same depth cap as the single decision tree
    'learning_rate': 0.1,
    'verbose': 0
}

In [None]:
def train_lgb(num_estimators: int, params: dict, stopping_rounds: int = 50):
    """Fit a LightGBM booster on the training split with early stopping.

    Reads the notebook-level ``X_train`` / ``y_train`` and ``X_valid`` /
    ``y_valid`` rather than taking the data as arguments.

    Args:
        num_estimators: Maximum number of boosting rounds.
        params: LightGBM parameter dict forwarded to ``lgb.train``.
        stopping_rounds: Stop when the validation metric has not improved for
            this many consecutive rounds (default 50, the previous fixed value).

    Returns:
        The trained booster returned by ``lgb.train``.
    """
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # The `early_stopping_rounds=` keyword of lgb.train() was removed in
    # LightGBM 4.0; the callback form works on both 3.x and 4.x.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=num_estimators,
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[lgb.early_stopping(stopping_rounds=stopping_rounds)],
    )

    return model

In [None]:
# Single decision tree
# The target is treated as a regression problem everywhere else in this notebook
# (LightGBM objective 'regression', MAE / MSE / R2 metrics), so fit a regressor.
# A classifier would treat every distinct trip duration as its own class.
# The variable keeps the name `clf` because the prediction cell below uses it.
clf = DecisionTreeRegressor(max_depth=TREE_DEPTH)
clf = clf.fit(X_train, y_train)
plot_tree(clf, feature_names=list(X_train.columns))
plt.show()

In [None]:
# Single-tree predictions on the held-out test window
clf_pred = clf.predict(X_test)

In [None]:
# Small ensemble of DT: at most 10 boosting rounds
lgb_tiny = train_lgb(num_estimators=10, params=params)

In [None]:
# Small-ensemble predictions on the held-out test window
lgb_tiny_pred = lgb_tiny.predict(X_test)

In [None]:
# Full ensemble of DT: 50000 is only an upper bound on boosting rounds;
# train_lgb uses early stopping on the validation split to end training sooner.
lgb_full = train_lgb(num_estimators=50000, params=params)

In [None]:
# Predict on the test window using the booster's recorded best iteration
lgb_full_pred = lgb_full.predict(X_test, num_iteration=lgb_full.best_iteration)

## Discretize features

In [None]:
def create_feature_bins(X, all_feature_splits: dict):
    """Bucketize numeric columns of ``X`` into bin ids.

    Args:
        X: Frame holding the columns named in the notebook-level
            ``numeric_features`` list.
        all_feature_splits: Maps a position in ``numeric_features`` to the
            list of bin boundaries for that feature.

    Returns:
        A new DataFrame with one column of Discretization output per entry
        of ``all_feature_splits``, named after the corresponding feature.
    """
    binned = pd.DataFrame()
    for position, boundaries in all_feature_splits.items():
        column = numeric_features[position]
        bucketize = Discretization(bin_boundaries=boundaries)
        binned[column] = bucketize(X[column])
    return binned

In [None]:
# Create bins based on quantiles
QUANTILE_BINS = 100
# Use interior quantile edges only. Discretization adds an open-ended bin on
# each side of the boundary list, so N boundaries produce N + 1 bin ids.
# QUANTILE_BINS - 1 boundaries therefore give exactly QUANTILE_BINS ids
# (0 .. QUANTILE_BINS - 1), matching the vocabulary size that is passed to
# EmbeddedBinModel as discrete_bin_vocab_size=QUANTILE_BINS.
quantiles = np.linspace(0, 1, num=QUANTILE_BINS + 1)[1:-1]
print(f"Calculating {len(quantiles)} quantiles")

# Boundaries are computed from the training split only;
# keys are positions in numeric_features, as create_feature_bins expects.
quantile_values = {
    position: list(np.quantile(X_train[feature], quantiles))
    for position, feature in enumerate(numeric_features)
}

# Binned frames in the order: train, valid, test
X_binned_by_quantile = [
    create_feature_bins(split, quantile_values)
    for split in (X_train, X_valid, X_test)
]

# Train neural networks

In [None]:
from keras_models import MLPModel, EmbeddedBinModel, EmbeddedH3Model

In [None]:
# # Logistic regression on quantile bins
# lr = LogisticRegressionTrainer()
# lr.train(x_train=X_binned_by_quantile[0], y_train=y_train, x_valid=X_binned_by_quantile[1], y_valid=y_valid)

In [None]:
# Training settings handed to every Keras model wrapper below.
# NOTE(review): how these keys are consumed is defined in keras_models,
# not in this notebook - confirm there.
hyperparams = {
    "batch_size": 128,
    "epochs": 100,
    "starting_lr": 1e-3
}

In [None]:
# Simple MLP on raw coordinates (the unbinned numeric_features columns)
mlp = MLPModel(hyperparams=hyperparams)
mlp.train(x_train=X_train, y_train=y_train, x_valid=X_valid, y_valid=y_valid)
mlp_pred = mlp.predict(X_test)

In [None]:
# Embeddings on quantized continuous features
# X_binned_by_quantile holds the binned frames in the order [train, valid, test]
quant_embed = EmbeddedBinModel(numeric_features=numeric_features, hyperparams=hyperparams)
quant_embed.train(x_train=X_binned_by_quantile[0], y_train=y_train, x_valid=X_binned_by_quantile[1], y_valid=y_valid, discrete_bin_vocab_size=QUANTILE_BINS)
quant_embed_pred = quant_embed.predict(X_binned_by_quantile[2])

In [None]:
# Embeddings on h3 cell indices (the h3_hash_index* columns);
# the embedding vocabulary size is the one returned by preprocess()
h3_embed = EmbeddedH3Model(h3_resolutions=h3_resolutions, hyperparams=hyperparams)
h3_embed.train(x_train=X_train_h3, y_train=y_train, x_valid=X_valid_h3, y_valid=y_valid, embedding_vocab_size=hash_vocab_size)
h3_embed_pred = h3_embed.predict(X_test_h3)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error

In [None]:
def validate_metrics(y_true, y_pred):
    """Print a fixed set of regression scores for ``y_pred`` against ``y_true``.

    For each metric the short name is printed on one line and the value,
    rounded to 2 decimals, on the next. "Mdn" is the median of the
    predictions themselves, not an error measure. Nothing is returned.
    """
    def prediction_median(y_true, y_pred):
        # Takes the same keyword arguments as the sklearn scorers; y_true is ignored
        return np.median(y_pred)

    scorers = (
        ("MAE", mean_absolute_error),
        ("MdAE", median_absolute_error),
        ("MSE", mean_squared_error),
        ("Mdn", prediction_median),
        ("R2", r2_score),
    )

    for label, scorer in scorers:
        print(label)
        score = scorer(y_true=y_true, y_pred=y_pred)
        print(round(score, 2))

In [None]:
# Single decision tree
validate_metrics(y_true=y_test, y_pred=clf_pred)

In [None]:
# Small LightGBM ensemble
validate_metrics(y_true=y_test, y_pred=lgb_tiny_pred)

In [None]:
# Full LightGBM ensemble
validate_metrics(y_true=y_test, y_pred=lgb_full_pred)

In [None]:
# The MLP is trained and used for prediction above, but its predictions were
# never scored; evaluate it here so all three neural models are compared.
print("MLP (raw coordinates)")
validate_metrics(y_true=y_test, y_pred=mlp_pred)

print("Quantile-bin embeddings")
validate_metrics(y_true=y_test, y_pred=quant_embed_pred)

In [None]:
# H3 cell-index embeddings
validate_metrics(y_true=y_test, y_pred=h3_embed_pred)