In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import numba, socket
import cudf
import dask, dask_cudf
from dask_cuda import LocalCUDACluster
from dask.delayed import delayed
import dask.dataframe as dd
from dask.distributed import Client, wait, LocalCluster
from dask_ml.model_selection import train_test_split
from dask.utils import parse_bytes

# Configure the scheduler *before* the cluster (and therefore the scheduler)
# is created, so the setting is in place when the scheduler is set up.
# Setting it afterwards only changes the local config value.
dask.config.set({'distributed.scheduler.work-stealing': False})

# One worker per visible GPU, each initialised with an RMM memory pool.
# NOTE(review): the pool size below is 48GB while the inline comment says each
# GPU has 16GB -- confirm which of the two is intended for this hardware.
cluster = LocalCUDACluster(
    rmm_pool_size=parse_bytes("48GB") # Each GPU has 16GB of memory
)
client = Client(cluster)
# Start from clean workers (drops any state left over from a previous run).
client.restart()

# Echo the effective setting as the cell output.
dask.config.get('distributed.scheduler.work-stealing')

distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize


False

In [2]:
# Read the training CSV into a dask_cudf dataframe (path is relative to the notebook).
df = dask_cudf.read_csv('data/train.csv')

In [3]:
# Normalise column dtypes in a single pass: timestamps become datetime64[ns],
# coordinates and the fare become float32, the passenger count becomes uint8.
df = df.assign(
    key=df['key'].astype('datetime64[ns]'),
    fare_amount=df['fare_amount'].astype('float32'),
    pickup_datetime=df['pickup_datetime'].astype('datetime64[ns]'),
    pickup_longitude=df['pickup_longitude'].astype('float32'),
    pickup_latitude=df['pickup_latitude'].astype('float32'),
    dropoff_longitude=df['dropoff_longitude'].astype('float32'),
    dropoff_latitude=df['dropoff_latitude'].astype('float32'),
    passenger_count=df['passenger_count'].astype('uint8'),
)

In [4]:
# Apply a list of filter conditions to throw out records with missing or
# outlier values. Each fragment bounds one column; the fragments are AND-ed
# together into a single query expression:
#   - fare between 2.5 (inclusive) and 500 (exclusive)
#   - 1 to 5 passengers
#   - pickup and dropoff longitude strictly inside (-75, -73)
#   - pickup and dropoff latitude strictly inside (40, 42)
query_frags = [
    'fare_amount >= 2.5 and fare_amount < 500',
    'passenger_count > 0 and passenger_count < 6',
    'pickup_longitude > -75 and pickup_longitude < -73',
    'dropoff_longitude > -75 and dropoff_longitude < -73',
    'pickup_latitude > 40 and pickup_latitude < 42',
    'dropoff_latitude > 40 and dropoff_latitude < 42'
]
df = df.query(' and '.join(query_frags))

In [5]:
import math
from math import cos, sin, asin, sqrt, pi
        
def jfk_distance(dropoff_latitude, dropoff_longitude, jfk_distance):
    """Write the haversine distance (km) from each drop-off to the JFK point.

    Row-wise kernel in the form used with ``apply_rows`` in ``add_features``:
    the two input sequences hold coordinates in degrees, and the result for
    row ``i`` is stored into ``jfk_distance[i]``. Nothing is returned.
    The reference point is (40.6413, -73.7781).
    """
    # Loop-invariant values, computed once per call.
    deg_to_rad = pi/180
    earth_radius_km = 6371  # Radius of earth in kilometers
    ref_lat = deg_to_rad * 40.6413
    ref_lon = deg_to_rad * -73.7781

    for row, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        delta_lon = ref_lon - lon
        delta_lat = ref_lat - lat
        # Haversine term: squared half-chord length between the two points.
        hav = sin(delta_lat/2)**2 + cos(lat) * cos(ref_lat) * sin(delta_lon/2)**2

        central_angle = 2 * asin(sqrt(hav))
        jfk_distance[row] = central_angle * earth_radius_km
        
def lga_distance(dropoff_latitude, dropoff_longitude, lga_distance):
    """Write the haversine distance (km) from each drop-off to the LGA point.

    Row-wise kernel in the form used with ``apply_rows`` in ``add_features``:
    the two input sequences hold coordinates in degrees, and the result for
    row ``i`` is stored into ``lga_distance[i]``. Nothing is returned.
    The reference point is (40.7769, -73.8740).
    """
    # Loop-invariant values, computed once per call.
    deg_to_rad = pi/180
    earth_radius_km = 6371  # Radius of earth in kilometers
    ref_lat = deg_to_rad * 40.7769
    ref_lon = deg_to_rad * -73.8740

    for row, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        delta_lon = ref_lon - lon
        delta_lat = ref_lat - lat
        # Haversine term: squared half-chord length between the two points.
        hav = sin(delta_lat/2)**2 + cos(lat) * cos(ref_lat) * sin(delta_lon/2)**2

        central_angle = 2 * asin(sqrt(hav))
        lga_distance[row] = central_angle * earth_radius_km
        
def ewr_distance(dropoff_latitude, dropoff_longitude, ewr_distance):
    """Write the haversine distance (km) from each drop-off to the EWR point.

    Row-wise kernel in the form used with ``apply_rows`` in ``add_features``:
    the two input sequences hold coordinates in degrees, and the result for
    row ``i`` is stored into ``ewr_distance[i]``. Nothing is returned.
    The reference point is (40.6895, -74.1745).
    """
    # Loop-invariant values, computed once per call.
    deg_to_rad = pi/180
    earth_radius_km = 6371  # Radius of earth in kilometers
    ref_lat = deg_to_rad * 40.6895
    ref_lon = deg_to_rad * -74.1745

    for row, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        delta_lon = ref_lon - lon
        delta_lat = ref_lat - lat
        # Haversine term: squared half-chord length between the two points.
        hav = sin(delta_lat/2)**2 + cos(lat) * cos(ref_lat) * sin(delta_lon/2)**2

        central_angle = 2 * asin(sqrt(hav))
        ewr_distance[row] = central_angle * earth_radius_km
        
def tsq_distance(dropoff_latitude, dropoff_longitude, tsq_distance):
    """Write the haversine distance (km) from each drop-off to the TSQ point.

    Row-wise kernel in the form used with ``apply_rows`` in ``add_features``:
    the two input sequences hold coordinates in degrees, and the result for
    row ``i`` is stored into ``tsq_distance[i]``. Nothing is returned.
    The reference point is (40.7580, -73.9855).
    """
    # Loop-invariant values, computed once per call.
    deg_to_rad = pi/180
    earth_radius_km = 6371  # Radius of earth in kilometers
    ref_lat = deg_to_rad * 40.7580
    ref_lon = deg_to_rad * -73.9855

    for row, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        delta_lon = ref_lon - lon
        delta_lat = ref_lat - lat
        # Haversine term: squared half-chord length between the two points.
        hav = sin(delta_lat/2)**2 + cos(lat) * cos(ref_lat) * sin(delta_lon/2)**2

        central_angle = 2 * asin(sqrt(hav))
        tsq_distance[row] = central_angle * earth_radius_km
        
def met_distance(dropoff_latitude, dropoff_longitude, met_distance):
    """Write the haversine distance (km) from each drop-off to the MET point.

    Row-wise kernel in the form used with ``apply_rows`` in ``add_features``:
    the two input sequences hold coordinates in degrees, and the result for
    row ``i`` is stored into ``met_distance[i]``. Nothing is returned.
    The reference point is (40.7794, -73.9632).
    """
    # Loop-invariant values, computed once per call.
    deg_to_rad = pi/180
    earth_radius_km = 6371  # Radius of earth in kilometers
    ref_lat = deg_to_rad * 40.7794
    ref_lon = deg_to_rad * -73.9632

    for row, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        delta_lon = ref_lon - lon
        delta_lat = ref_lat - lat
        # Haversine term: squared half-chord length between the two points.
        hav = sin(delta_lat/2)**2 + cos(lat) * cos(ref_lat) * sin(delta_lon/2)**2

        central_angle = 2 * asin(sqrt(hav))
        met_distance[row] = central_angle * earth_radius_km
        
def wtc_distance(dropoff_latitude, dropoff_longitude, wtc_distance):
    """Write the haversine distance (km) from each drop-off to the WTC point.

    Row-wise kernel in the form used with ``apply_rows`` in ``add_features``:
    the two input sequences hold coordinates in degrees, and the result for
    row ``i`` is stored into ``wtc_distance[i]``. Nothing is returned.
    The reference point is (40.7126, -74.0099).
    """
    # Loop-invariant values, computed once per call.
    deg_to_rad = pi/180
    earth_radius_km = 6371  # Radius of earth in kilometers
    ref_lat = deg_to_rad * 40.7126
    ref_lon = deg_to_rad * -74.0099

    for row, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        delta_lon = ref_lon - lon
        delta_lat = ref_lat - lat
        # Haversine term: squared half-chord length between the two points.
        hav = sin(delta_lat/2)**2 + cos(lat) * cos(ref_lat) * sin(delta_lon/2)**2

        central_angle = 2 * asin(sqrt(hav))
        wtc_distance[row] = central_angle * earth_radius_km
        
def add_features(df):
    """Derive the model features for one dataframe partition.

    Adds calendar features taken from ``pickup_datetime`` (hour, year, month,
    day, weekday) and one drop-off distance column per landmark kernel, then
    drops the raw ``pickup_datetime`` and ``key`` columns.

    The input frame is not modified: this function is run through
    ``dask.delayed``, where mutating a task's input can leak the extra
    columns into any other consumer of the same partition.

    Parameters
    ----------
    df : dataframe partition providing ``pickup_datetime``, ``key``,
        ``dropoff_latitude`` and ``dropoff_longitude`` and supporting
        ``assign`` / ``apply_rows`` (a cudf DataFrame in this notebook).

    Returns
    -------
    A new dataframe with the added feature columns, in the order:
    hour, year, month, day, weekday, then the six ``*_distance`` columns.
    """
    pickup = df['pickup_datetime']

    # assign() returns a new frame, leaving the caller's partition untouched.
    # Keyword order fixes the resulting column order.
    df = df.assign(
        hour=pickup.dt.hour,
        year=pickup.dt.year,
        month=pickup.dt.month,
        day=pickup.dt.day,
        weekday=pickup.dt.weekday,
    )

    # Every kernel takes the same two input columns and writes to an output
    # argument that carries the name of the resulting column.
    distance_kernels = (
        (jfk_distance, 'jfk_distance'),
        (lga_distance, 'lga_distance'),
        (ewr_distance, 'ewr_distance'),
        (tsq_distance, 'tsq_distance'),
        (met_distance, 'met_distance'),
        (wtc_distance, 'wtc_distance'),
    )
    for kernel, out_column in distance_kernels:
        df = df.apply_rows(kernel, incols=['dropoff_latitude', 'dropoff_longitude'],
                           outcols={out_column: np.float32}, kwargs=dict())

    # The raw timestamp and the record key are not used as model features.
    df = df.drop(['pickup_datetime','key'], axis=1)

    return df

In [6]:
# Now add the features: wrap add_features in dask.delayed and apply it to each
# partition of df, then reassemble the per-partition results into a dask_cudf
# dataframe. Note that this rebinds `df` to the feature-augmented frame.
parts = [dask.delayed(add_features)(part) for part in df.to_delayed()]
df = dask_cudf.from_delayed(parts)


In [7]:
# Separate the regression target (fare_amount) from the feature columns.
y = df['fare_amount']
X = df.drop(['fare_amount'], axis=1)

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)



In [12]:
# Wrap the training and hold-out splits for distributed XGBoost.
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
dvalid = xgb.dask.DaskDMatrix(client, X_test, y_test)
# Both sets are scored during training; the labels appear in the log lines
# (train-rmse / valid-rmse).
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Squared-error regression with GPU histogram tree construction.
params = {
    'booster' : 'gbtree',
    'eval_metric': "rmse",
    'tree_method':'gpu_hist',
    'objective': 'reg:squarederror',
    'min_child_weight': 1,
    'colsample_bytree': 0.7,
    'learning_rate': 0.05,
}

# Up to 2000 boosting rounds, with early stopping after 100 rounds and
# metrics printed every 100 rounds.
# The result is indexed with 'booster' further down to get the fitted model.
trained_model = xgb.dask.train(
    client,
    params,
    dtrain,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=100,
)

[19:41:05] task [xgboost.dask]:tcp://10.1.0.7:38745 got new rank 0
[19:41:05] task [xgboost.dask]:tcp://10.1.0.7:36841 got new rank 1
[19:41:05] task [xgboost.dask]:tcp://10.1.0.7:41243 got new rank 2
[19:41:05] task [xgboost.dask]:tcp://10.1.0.7:45365 got new rank 3


[0]	train-rmse:13.89134	valid-rmse:13.89349
[100]	train-rmse:4.38305	valid-rmse:4.39411
[200]	train-rmse:4.08317	valid-rmse:4.09658
[300]	train-rmse:3.95006	valid-rmse:3.96496
[400]	train-rmse:3.87741	valid-rmse:3.89451
[500]	train-rmse:3.83173	valid-rmse:3.85081
[600]	train-rmse:3.80098	valid-rmse:3.82192
[700]	train-rmse:3.77527	valid-rmse:3.79801
[800]	train-rmse:3.75531	valid-rmse:3.77970
[900]	train-rmse:3.73800	valid-rmse:3.76422
[1000]	train-rmse:3.72250	valid-rmse:3.74999
[1100]	train-rmse:3.70962	valid-rmse:3.73874
[1200]	train-rmse:3.69982	valid-rmse:3.73033
[1300]	train-rmse:3.68939	valid-rmse:3.72140
[1400]	train-rmse:3.68043	valid-rmse:3.71399
[1500]	train-rmse:3.67236	valid-rmse:3.70721
[1600]	train-rmse:3.66474	valid-rmse:3.70078
[1700]	train-rmse:3.65801	valid-rmse:3.69575
[1800]	train-rmse:3.65193	valid-rmse:3.69094
[1900]	train-rmse:3.64615	valid-rmse:3.68685
[1999]	train-rmse:3.64063	valid-rmse:3.68271


In [28]:
# Persist the trained booster to disk (path is relative to the notebook).
trained_model['booster'].save_model("xg_gpu.model")

In [22]:
# Inspect the type of the object stored under the 'booster' key.
type(trained_model['booster'])

xgboost.core.Booster

In [29]:
# Load the saved model file into a fresh XGBRegressor and display it.
# NOTE(review): the repr shown for this cell lists values that differ from the
# `params` used for training (e.g. learning_rate 0.3 vs 0.05, colsample_bytree
# 1 vs 0.7) -- presumably these are the wrapper's defaults rather than restored
# hyperparameters; confirm before relying on them.
model2 = xgb.XGBRegressor()
model2.load_model("xg_gpu.model")
model2

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=24, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='auto', validate_parameters=1, verbosity=None)

In [9]:
# Prepare the test set: load it, then give its columns the same dtypes as the
# corresponding training columns (the test file is not cast for fare_amount).
test = dask_cudf.read_csv('data/test.csv')

test = test.assign(
    key=test['key'].astype('datetime64[ns]'),
    pickup_datetime=test['pickup_datetime'].astype('datetime64[ns]'),
    pickup_longitude=test['pickup_longitude'].astype('float32'),
    pickup_latitude=test['pickup_latitude'].astype('float32'),
    dropoff_longitude=test['dropoff_longitude'].astype('float32'),
    dropoff_latitude=test['dropoff_latitude'].astype('float32'),
    passenger_count=test['passenger_count'].astype('uint8'),
)

# Build the engineered features partition by partition, exactly as for the
# training data, and stitch the partitions back together.
tparts = [dask.delayed(add_features)(chunk) for chunk in test.to_delayed()]
test = dask_cudf.from_delayed(tparts)

# Show the first rows as a pandas frame to check the result.
test.head().to_pandas()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,year,month,day,weekday,jfk_distance,lga_distance,ewr_distance,tsq_distance,met_distance,wtc_distance
0,-73.97332,40.763805,-73.98143,40.743835,1,13,2015,1,27,1,20.588026,9.76666,17.357328,1.611907,4.242198,4.221205
1,-73.986862,40.719383,-73.998886,40.739201,1,13,2015,1,27,1,21.564537,11.323337,15.799259,2.375148,5.386536,3.100046
2,-73.982521,40.751259,-73.979652,40.74614,1,11,2011,10,8,5,20.607002,9.533081,17.587698,1.407808,3.949447,4.517171
3,-73.981163,40.767807,-73.990448,40.751637,1,21,2012,12,1,5,21.7027,10.201569,16.980177,0.821209,3.846622,4.639801
4,-73.966049,40.789776,-73.988564,40.744427,1,21,2012,12,1,5,21.126984,10.302486,16.818722,1.531182,4.436936,3.969482


In [10]:
# Make predictions: wrap the engineered test features (no labels) for
# distributed XGBoost, predict with the trained booster, and materialize the
# result in `s` with compute().
dtest = xgb.dask.DaskDMatrix(client, test)
prediction = xgb.dask.predict(client, trained_model['booster'], dtest)
s = prediction.compute()