In [None]:
import os

# Output directory (presumably for trained model artifacts; nothing in the
# visible cells writes to it yet).
MODEL_PATH = 'models'
# exist_ok=True keeps the cell idempotent on re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(MODEL_PATH, exist_ok=True)

# Numeric model inputs. The pickup_* columns are derived in prep_df from
# tpep_pickup_datetime; passenger_count is read straight from the raw data.
numeric_feat = [
    'pickup_weekday',
    # 'pickup_weekofyear',  # disabled; its derivation in prep_df is commented out too
    'pickup_hour',
    'pickup_week_hour',
    'pickup_minute',
    'passenger_count',
]
# Identifier columns. prep_df casts them to float together with the numeric ones.
categorical_feat = [
    'PULocationID',
    'DOLocationID',
]
# Full ordered list of model input columns, and the name of the target column.
features = numeric_feat + categorical_feat
y_col = 'tip_fraction'

In [None]:
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# Start a local cluster through dask_cuda's LocalCUDACluster with default
# settings (presumably one worker per visible GPU -- confirm in the dask_cuda docs).
cluster = LocalCUDACluster()
# Connect a distributed client to that cluster; later cells reuse `client`.
client = Client(cluster)
# Bare trailing expression: rendered as this cell's output.
client

In [None]:
import dask.dataframe as dd

# Share of rows kept for the experiment, and the seed that makes that subset
# the same on every kernel restart (an unseeded sample changes run to run).
SAMPLE_FRAC = 0.3
SAMPLE_SEED = 42

# Lazily read one month of yellow-taxi trips from the public bucket.
# NOTE(review): confirm this CSV object is still published at this path.
taxi = dd.read_csv(
    's3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv',
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    storage_options={'anon': True},  # anonymous (unsigned) S3 access
    assume_missing=True,  # read un-typed integer columns as float so NaNs fit
).sample(frac=SAMPLE_FRAC, replace=False, random_state=SAMPLE_SEED)

In [None]:
import dask

# Evaluate both statistics in a single dask.compute call so they share one
# pass over the data; len(taxi) followed by a separate .compute() would read
# and parse the CSV twice.
num_rows, num_bytes = dask.compute(
    taxi.shape[0],
    taxi.memory_usage(deep=True).sum(),
)
# int() keeps the row count printed as an integer regardless of how the
# lazy shape value is represented.
print(f'Num rows: {int(num_rows)}, Size: {num_bytes / 1e6} MB')

In [None]:
def prep_df(df: 'dd.DataFrame', feature_cols=None, target_col=None) -> 'dd.DataFrame':
    '''
    Generate model-ready features and the tip-fraction target from a raw taxi dataframe.

    Parameters
    ----------
    df : dataframe
        Raw trips. Must contain ``fare_amount``, ``tip_amount`` and a
        datetime-typed ``tpep_pickup_datetime``, plus any raw column named in
        ``feature_cols``. The input frame is not modified.
    feature_cols : sequence of str, optional
        Input columns to keep. Defaults to the module-level ``features``.
    target_col : str, optional
        Label column to keep. Defaults to the module-level ``y_col``.

    Returns
    -------
    dataframe
        A new frame holding ``feature_cols + [target_col]`` cast to float.
        Rows with a non-positive or missing fare are removed, rows whose
        target is missing are removed, and missing feature values are
        replaced by the sentinel -1.
    '''
    if feature_cols is None:
        feature_cols = features
    if target_col is None:
        target_col = y_col

    df = df[df.fare_amount > 0]  # avoid divide-by-zero

    pickup_ts = df.tpep_pickup_datetime
    weekday = pickup_ts.dt.weekday
    hour = pickup_ts.dt.hour

    # .assign returns a new frame, so the filtered selection above is never
    # written to in place.
    df = df.assign(
        tip_fraction=df.tip_amount / df.fare_amount,
        pickup_weekday=weekday,
        # pickup_weekofyear=pickup_ts.dt.isocalendar().week,
        pickup_hour=hour,
        pickup_week_hour=(weekday * 24) + hour,  # hour index within the week
        pickup_minute=pickup_ts.dt.minute,
    )

    df = df[list(feature_cols) + [target_col]].astype(float)
    # Drop unlabeled rows before imputing: filling the target with the -1
    # sentinel would train the model on a fabricated label.
    df = df.dropna(subset=[target_col])
    return df.fillna(-1)
    
# Apply the feature engineering to the sampled trips; the cells below read `taxi_train`.
taxi_train = prep_df(taxi)

In [None]:
# Sanity check: preview the first rows of the engineered frame.
taxi_train.head()

In [None]:
%%time
# Materialize the engineered frame on the cluster once. Without this, each
# to_dask_array(lengths=True) call and each persist() below re-runs the whole
# read -> sample -> prep graph to obtain partition lengths and values.
taxi_train_persisted = taxi_train.persist()
_ = wait(taxi_train_persisted)

# lengths=True resolves chunk sizes so the arrays have a known shape.
dX = taxi_train_persisted[features].to_dask_array(lengths=True).persist()
dy = taxi_train_persisted[y_col].to_dask_array(lengths=True).persist()
_ = wait([dX, dy])

# The arrays now hold their own persisted chunks; release the frame's copy.
del taxi_train_persisted

In [None]:
from sklearn.model_selection import GridSearchCV

import xgboost as xgb
import joblib

# Hyper-parameter candidates for the grid search:
# 2 depths x 2 child weights x 1 learning rate = 4 combinations.
param_grid = dict(
    max_depth=[5, 10],
    min_child_weight=[10, 20],
    learning_rate=[0.05],
)

def _neg_holdout_rmse(est, X, y):
    """
    Score a fitted estimator by the negated RMSE on the fold it is handed.

    Parameters
    ----------
    est : fitted estimator exposing ``predict``.
    X, y : features and targets of the held-out split passed in by the
        search; NumPy and dask arrays are both accepted.

    Returns
    -------
    float
        ``-RMSE``. The sign is flipped because the search maximizes scores,
        so the candidate with the smallest error must get the largest value.
    """
    residual = y - est.predict(X)
    # float() turns the reduction into a concrete number for eager and lazy
    # array types alike.
    return -float((residual ** 2).mean() ** 0.5)


# The key stays "rmse" so refit="rmse" and the cv_results_ column names keep
# working; the stored values are -RMSE (closer to zero is better).
# The scorer measures error on the held-out fold instead of reading the
# training-time eval log, whose maximum is the worst boosting round.
scoring = {
    "rmse": _neg_holdout_rmse
}
# Distributed XGBoost regressor using the GPU histogram tree method; RMSE is
# the metric it records for any eval_set passed to fit().
dreg = xgb.dask.DaskXGBRegressor(
    tree_method="gpu_hist",
    eval_metric="rmse",
)
# Bind the estimator to the distributed client created in an earlier cell.
dreg.client = client

# 2-fold grid search over param_grid, run serially (n_jobs=1). Candidates are
# ranked by the "rmse" entry of `scoring` (larger score wins) and the winner is
# refit on all data; error_score='raise' surfaces fit failures immediately.
dgrid = GridSearchCV(
    estimator=dreg,
    param_grid=param_grid,
    scoring=scoring,
    refit="rmse",
    cv=2,
    n_jobs=1,
    error_score='raise',
)

In [None]:
%%time
# Run the grid search. eval_set is forwarded unchanged to every candidate's fit().
# NOTE(review): the eval set is the full (dX, dy), so the per-round metric logged
# for each fold also covers that fold's held-out rows.
dgrid.fit(dX, dy, eval_set=[(dX, dy)])