In [1]:
import sys
!cp ../input/rapids/rapids.0.16.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import numba, socket
import cudf
import dask, dask_cudf
from dask_cuda import LocalCUDACluster
from dask.delayed import delayed
import dask.dataframe as dd
from dask.distributed import Client, wait, LocalCluster
from dask_ml.model_selection import train_test_split

# Scheduler settings are read when the scheduler is constructed, so they must
# be in place before the cluster is started; setting them afterwards leaves the
# running scheduler on its defaults.
dask.config.set({'distributed.scheduler.work-stealing': False})
dask.config.set({'distributed.scheduler.bandwidth': 1})

# Single-worker local cluster with a client attached to it.
cluster = LocalCluster(n_workers=1)
client = Client(cluster)
client.restart()

# Last expression of the cell: display the client / cluster summary.
client


+---------+--------+-----------+---------+
| Package | client | scheduler | workers |
+---------+--------+-----------+---------+
| numpy   | 1.18.5 | 1.18.5    | 1.19.2  |
| tornado | 5.0.2  | 5.0.2     | 6.0.4   |
+---------+--------+-----------+---------+


0,1
Client  Scheduler: tcp://127.0.0.1:41765  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 2  Memory: 13.96 GB


In [3]:
# Print the nvidia-smi report for this session.
!nvidia-smi

Sun Dec  6 22:37:16 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    34W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [4]:
# Print the version of the installed CUDA compiler driver (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Wed_Oct_23_19:24:38_PDT_2019
Cuda compilation tools, release 10.2, V10.2.89


In [5]:
# Read the training CSV with dask_cudf.
train_csv = '/kaggle/input/new-york-city-taxi-fare-prediction/train.csv'
df = dask_cudf.read_csv(train_csv)

In [6]:
# Target dtype for each column; the casts are applied in the order listed.
column_dtypes = {
    'key': 'datetime64[ns]',
    'fare_amount': 'float32',
    'pickup_datetime': 'datetime64[ns]',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'dropoff_latitude': 'float32',
    'passenger_count': 'uint8',
}
for column, target_dtype in column_dtypes.items():
    df[column] = df[column].astype(target_dtype)

In [7]:
# Throw out records with missing or outlier values by keeping only the rows
# that pass every range check below.
# Each entry: (column, lower-bound operator, lower bound, exclusive upper bound).
value_ranges = [
    ('fare_amount', '>=', '2.5', '500'),
    ('passenger_count', '>', '0', '6'),
    ('pickup_longitude', '>', '-75', '-73'),
    ('dropoff_longitude', '>', '-75', '-73'),
    ('pickup_latitude', '>', '40', '42'),
    ('dropoff_latitude', '>', '40', '42'),
]
query_frags = [
    f'{column} {lower_op} {lower} and {column} < {upper}'
    for column, lower_op, lower, upper in value_ranges
]
df = df.query(' and '.join(query_frags))

In [8]:
import math
from math import cos, sin, asin, sqrt, pi

def h_distance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude, h_distance):
    """Fill ``h_distance`` with the haversine distance of each trip.

    Row-wise kernel: for every row, the pickup and dropoff coordinates (given
    in degrees) are converted to radians and the great-circle distance between
    them, in kilometres on a sphere of radius 6371 km, is written to the
    matching position of the ``h_distance`` output array. Nothing is returned.
    """
    deg_to_rad = pi/180
    earth_radius_km = 6371
    for i, (lat_a, lon_a, lat_b, lon_b) in enumerate(zip(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude)):
        phi_a = deg_to_rad * lat_a
        lam_a = deg_to_rad * lon_a
        phi_b = deg_to_rad * lat_b
        lam_b = deg_to_rad * lon_b

        half_dphi = (phi_b - phi_a)/2
        half_dlam = (lam_b - lam_a)/2
        # Haversine term of the central angle between the two points.
        hav = sin(half_dphi)**2 + cos(phi_a) * cos(phi_b) * sin(half_dlam)**2

        h_distance[i] = 2 * asin(sqrt(hav)) * earth_radius_km
        
def jfk_distance(dropoff_latitude, dropoff_longitude, jfk_distance):
    """Fill ``jfk_distance`` with the haversine distance from each dropoff to JFK.

    Row-wise kernel: each dropoff coordinate (degrees) is compared with the
    fixed reference point (40.6413, -73.7781) and the great-circle distance in
    kilometres (sphere radius 6371 km) is written to the matching position of
    the ``jfk_distance`` output array. Nothing is returned.
    """
    deg_to_rad = pi/180
    ref_lat = deg_to_rad * 40.6413
    ref_lon = deg_to_rad * -73.7781
    for i, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        half_dlat = (ref_lat - lat)/2
        half_dlon = (ref_lon - lon)/2
        # Haversine term of the central angle between dropoff and reference.
        hav = sin(half_dlat)**2 + cos(lat) * cos(ref_lat) * sin(half_dlon)**2

        jfk_distance[i] = 2 * asin(sqrt(hav)) * 6371
        
def lga_distance(dropoff_latitude, dropoff_longitude, lga_distance):
    """Fill ``lga_distance`` with the haversine distance from each dropoff to LGA.

    Row-wise kernel: each dropoff coordinate (degrees) is compared with the
    fixed reference point (40.7769, -73.8740) and the great-circle distance in
    kilometres (sphere radius 6371 km) is written to the matching position of
    the ``lga_distance`` output array. Nothing is returned.
    """
    deg_to_rad = pi/180
    ref_lat = deg_to_rad * 40.7769
    ref_lon = deg_to_rad * -73.8740
    for i, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        half_dlat = (ref_lat - lat)/2
        half_dlon = (ref_lon - lon)/2
        # Haversine term of the central angle between dropoff and reference.
        hav = sin(half_dlat)**2 + cos(lat) * cos(ref_lat) * sin(half_dlon)**2

        lga_distance[i] = 2 * asin(sqrt(hav)) * 6371
        
def ewr_distance(dropoff_latitude, dropoff_longitude, ewr_distance):
    """Fill ``ewr_distance`` with the haversine distance from each dropoff to EWR.

    Row-wise kernel: each dropoff coordinate (degrees) is compared with the
    fixed reference point (40.6895, -74.1745) and the great-circle distance in
    kilometres (sphere radius 6371 km) is written to the matching position of
    the ``ewr_distance`` output array. Nothing is returned.
    """
    deg_to_rad = pi/180
    ref_lat = deg_to_rad * 40.6895
    ref_lon = deg_to_rad * -74.1745
    for i, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        half_dlat = (ref_lat - lat)/2
        half_dlon = (ref_lon - lon)/2
        # Haversine term of the central angle between dropoff and reference.
        hav = sin(half_dlat)**2 + cos(lat) * cos(ref_lat) * sin(half_dlon)**2

        ewr_distance[i] = 2 * asin(sqrt(hav)) * 6371
        
def tsq_distance(dropoff_latitude, dropoff_longitude, tsq_distance):
    """Fill ``tsq_distance`` with the haversine distance from each dropoff to TSQ.

    Row-wise kernel: each dropoff coordinate (degrees) is compared with the
    fixed reference point (40.7580, -73.9855) and the great-circle distance in
    kilometres (sphere radius 6371 km) is written to the matching position of
    the ``tsq_distance`` output array. Nothing is returned.
    """
    deg_to_rad = pi/180
    ref_lat = deg_to_rad * 40.7580
    ref_lon = deg_to_rad * -73.9855
    for i, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        half_dlat = (ref_lat - lat)/2
        half_dlon = (ref_lon - lon)/2
        # Haversine term of the central angle between dropoff and reference.
        hav = sin(half_dlat)**2 + cos(lat) * cos(ref_lat) * sin(half_dlon)**2

        tsq_distance[i] = 2 * asin(sqrt(hav)) * 6371
        
def met_distance(dropoff_latitude, dropoff_longitude, met_distance):
    """Fill ``met_distance`` with the haversine distance from each dropoff to MET.

    Row-wise kernel: each dropoff coordinate (degrees) is compared with the
    fixed reference point (40.7794, -73.9632) and the great-circle distance in
    kilometres (sphere radius 6371 km) is written to the matching position of
    the ``met_distance`` output array. Nothing is returned.
    """
    deg_to_rad = pi/180
    ref_lat = deg_to_rad * 40.7794
    ref_lon = deg_to_rad * -73.9632
    for i, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        half_dlat = (ref_lat - lat)/2
        half_dlon = (ref_lon - lon)/2
        # Haversine term of the central angle between dropoff and reference.
        hav = sin(half_dlat)**2 + cos(lat) * cos(ref_lat) * sin(half_dlon)**2

        met_distance[i] = 2 * asin(sqrt(hav)) * 6371
        
def wtc_distance(dropoff_latitude, dropoff_longitude, wtc_distance):
    """Fill ``wtc_distance`` with the haversine distance from each dropoff to WTC.

    Row-wise kernel: each dropoff coordinate (degrees) is compared with the
    fixed reference point (40.7126, -74.0099) and the great-circle distance in
    kilometres (sphere radius 6371 km) is written to the matching position of
    the ``wtc_distance`` output array. Nothing is returned.
    """
    deg_to_rad = pi/180
    ref_lat = deg_to_rad * 40.7126
    ref_lon = deg_to_rad * -74.0099
    for i, (lat_deg, lon_deg) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
        lat = deg_to_rad * lat_deg
        lon = deg_to_rad * lon_deg

        half_dlat = (ref_lat - lat)/2
        half_dlon = (ref_lon - lon)/2
        # Haversine term of the central angle between dropoff and reference.
        hav = sin(half_dlat)**2 + cos(lat) * cos(ref_lat) * sin(half_dlon)**2

        wtc_distance[i] = 2 * asin(sqrt(hav)) * 6371
        
def add_features(df):
    """Derive calendar and distance features for one DataFrame partition.

    Adds ``hour``, ``year``, ``month``, ``day`` and ``weekday`` columns taken
    from ``pickup_datetime``, then appends the trip distance (``h_distance``)
    and one dropoff-to-landmark distance column per landmark kernel through
    ``apply_rows`` (all float32 outputs), and finally drops the
    ``pickup_datetime`` and ``key`` columns.

    Returns the transformed DataFrame. The calendar columns are assigned on
    the frame that was passed in.
    """
    for part in ('hour', 'year', 'month', 'day', 'weekday'):
        df[part] = getattr(df['pickup_datetime'].dt, part)

    trip_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
    df = df.apply_rows(h_distance, incols=trip_cols,
                       outcols={'h_distance': np.float32}, kwargs={})

    # Landmark kernels, applied in this order so the output columns keep the
    # same sequence.
    landmark_kernels = (
        ('jfk_distance', jfk_distance),
        ('lga_distance', lga_distance),
        ('ewr_distance', ewr_distance),
        ('tsq_distance', tsq_distance),
        ('met_distance', met_distance),
        ('wtc_distance', wtc_distance),
    )
    for out_name, kernel in landmark_kernels:
        df = df.apply_rows(kernel, incols=['dropoff_latitude', 'dropoff_longitude'],
                           outcols={out_name: np.float32}, kwargs={})

    return df.drop(['pickup_datetime','key'], axis=1)

In [9]:
# Wrap add_features as a delayed function and apply it to every partition.
lazy_add_features = dask.delayed(add_features)
parts = [lazy_add_features(partition) for partition in df.to_delayed()]
df = dask_cudf.from_delayed(parts)

# Display the first rows as a pandas DataFrame.
df.head().to_pandas()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,year,month,day,weekday,h_distance,jfk_distance,lga_distance,ewr_distance,tsq_distance,met_distance,wtc_distance
0,4.5,-73.844315,40.721317,-73.841614,40.712276,1,17,2009,6,15,0,1.030742,9.538091,7.686337,28.176201,13.14625,12.673676,14.18404
1,16.9,-74.016045,40.711304,-73.979271,40.782005,1,16,2010,1,5,1,8.450001,23.071407,8.881979,19.400442,2.720324,1.383812,8.13742
2,5.7,-73.982735,40.761269,-73.991241,40.750561,2,0,2011,8,18,3,1.389632,21.690807,10.299277,16.870735,0.958202,3.982538,4.504328
3,7.7,-73.987129,40.733143,-73.99157,40.758091,1,4,2012,4,21,5,2.799211,22.192249,10.119627,17.199213,0.511319,3.364836,5.288893
4,5.3,-73.968094,40.768009,-73.956657,40.783764,1,7,2010,3,9,1,1.999081,21.850355,7.001391,21.136244,3.755865,0.734109,9.09577


In [10]:
# Separate the label column from the feature columns.
target = 'fare_amount'
y = df[target]
X = df.drop([target], axis=1)

# Hold out a test_size=0.2 fraction for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)



In [11]:
%%time
# Wrap the training and validation splits for distributed XGBoost.
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
dvalid = xgb.dask.DaskDMatrix(client, X_test, y_test)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Booster parameters only.
# 'verbosity': 0 silences XGBoost's own log messages (it replaces the
# deprecated 'silent' flag). Evaluation printing is an argument of train()
# (verbose_eval below), not a booster parameter, and the deprecated 'n_gpus'
# key is not passed.
params = {
    'learning_rate': 0.05,
    'max_depth': 11,
    'objective': 'reg:squarederror',
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 1,
    'gamma': 1,
    'verbosity': 0,
    'booster' : 'gbtree',
    'eval_metric': 'rmse',
    'tree_method':'gpu_hist',
}

# Up to 7000 rounds, stopping early after 100 rounds without improvement;
# metrics are printed every 100 rounds.
trained_model = xgb.dask.train(client, params, dtrain, num_boost_round=7000, evals=watchlist, early_stopping_rounds=100, verbose_eval=100)

  self.sync(self._update_scheduler_info)


CPU times: user 25.4 s, sys: 5.64 s, total: 31.1 s
Wall time: 16min 38s


In [12]:
# prepare test set
test = dask_cudf.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/test.csv')

# Target dtype for each column; the casts are applied in the order listed.
test_dtypes = {
    'key': 'datetime64[ns]',
    'pickup_datetime': 'datetime64[ns]',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'dropoff_latitude': 'float32',
    'passenger_count': 'uint8',
}
for column, target_dtype in test_dtypes.items():
    test[column] = test[column].astype(target_dtype)

# now add the features, one delayed add_features call per partition
lazy_add_features = dask.delayed(add_features)
tparts = [lazy_add_features(partition) for partition in test.to_delayed()]
test = dask_cudf.from_delayed(tparts)

# inspect the result
test.head().to_pandas()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,year,month,day,weekday,h_distance,jfk_distance,lga_distance,ewr_distance,tsq_distance,met_distance,wtc_distance
0,-73.97332,40.763805,-73.98143,40.743835,1,13,2015,1,27,1,2.32326,20.588026,9.76666,17.357328,1.611907,4.242198,4.221205
1,-73.986862,40.719383,-73.998886,40.739201,1,13,2015,1,27,1,2.425353,21.564537,11.323337,15.799259,2.375148,5.386536,3.100046
2,-73.982521,40.751259,-73.979652,40.74614,1,11,2011,10,8,5,0.618412,20.607002,9.533081,17.587698,1.407808,3.949447,4.517171
3,-73.981163,40.767807,-73.990448,40.751637,1,21,2012,12,1,5,1.960778,21.7027,10.201569,16.980177,0.821209,3.846622,4.639801
4,-73.966049,40.789776,-73.988564,40.744427,1,21,2012,12,1,5,5.38728,21.126984,10.302486,16.818722,1.531182,4.436936,3.969482


In [13]:
# make predictions
dtest = xgb.dask.DaskDMatrix(client, test)
prediction = xgb.dask.predict(client, trained_model['booster'], dtest)
s = prediction.compute()

# Pair the sample submission's keys with the predicted fares and write the CSV.
# NOTE(review): assumes the predictions come back in the same row order as
# sample_submission.csv — TODO confirm.
sub = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/sample_submission.csv')
submission = sub[['key']].assign(fare_amount=s)
submission.to_csv('submission.csv', index = False)

In [14]:
# Display the first rows of the submission frame.
submission.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,8.782512
1,2015-01-27 13:08:24.0000003,10.056155
2,2011-10-08 11:53:44.0000002,3.943451
3,2012-12-01 21:12:12.0000002,9.362447
4,2012-12-01 21:12:12.0000003,16.856165


In [15]:
# Display the last rows of the submission frame.
submission.tail()

Unnamed: 0,key,fare_amount
9909,2015-05-10 12:37:51.0000002,9.417111
9910,2015-01-12 17:05:51.0000001,10.57254
9911,2015-04-19 20:44:15.0000001,52.712883
9912,2015-01-31 01:05:19.0000005,18.493864
9913,2015-01-18 14:06:23.0000006,6.873839
