## 1. Importing necessary modules & starting the timer

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import dask, dask_cudf
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from dask_ml.model_selection import train_test_split
from dask.utils import parse_bytes
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import cos, sin, asin, sqrt, pi
import time
# Baseline timestamp taken right after the imports; the cluster-creation and
# total durations printed at the end of the notebook are measured from it.
start = time.perf_counter()

## 2. Create CUDA Cluster

In [2]:
# Scheduler options have to be in place *before* the cluster is created: the
# scheduler reads `distributed.scheduler.work-stealing` when it starts, so
# setting it afterwards does not change the already-running scheduler.
dask.config.set({'distributed.scheduler.work-stealing': False})

cluster = LocalCUDACluster(
    # RMM memory pool size requested for the GPU workers; set this according
    # to your setup.
    # NOTE(review): the original note mentioned 32GB of GPU memory while 48GB
    # is requested here - confirm the pool actually fits on your device(s).
    rmm_pool_size=parse_bytes("48GB")
)
client = Client(cluster)
# Restart the workers so the run begins from a clean worker state.
client.restart()

cluster_creation_time = time.perf_counter()

distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize


## 3. Reading Data

In [3]:
# Load the training CSV as a dask_cudf dataframe.
# NOTE(review): dask collections are normally built lazily, so `read_time`
# probably captures graph construction rather than the actual file read -
# confirm before trusting the "Reading" figure in the timing report.
df = dask_cudf.read_csv('../data/train.csv')
read_time = time.perf_counter()

## 4. Dtype Conversion

In [4]:
def dtype_conversion(df):
    """Cast the taxi-fare columns to the dtypes used by the rest of the pipeline.

    The two timestamp columns (``key``, ``pickup_datetime``) become
    ``datetime64[ns]``, the fare and the four coordinate columns become
    ``float32`` and ``passenger_count`` becomes ``uint8``.

    Parameters
    ----------
    df : dataframe
        Frame containing all eight columns listed in the mapping below.

    Returns
    -------
    dataframe
        A new frame with the converted dtypes. The frame passed in is left
        untouched, so re-running the calling cell cannot observe a
        half-converted input.
    """
    target_dtypes = {
        'key': 'datetime64[ns]',
        'fare_amount': 'float32',
        'pickup_datetime': 'datetime64[ns]',
        'pickup_longitude': 'float32',
        'pickup_latitude': 'float32',
        'dropoff_longitude': 'float32',
        'dropoff_latitude': 'float32',
        'passenger_count': 'uint8',
    }
    # A single astype call with a column -> dtype mapping returns a new frame
    # instead of assigning the columns back into the caller's object.
    return df.astype(target_dtypes)

# Apply the casts to the training frame, then timestamp the end of this stage.
df = dtype_conversion(df)
dtype_conversion_time = time.perf_counter()

## 5. Applying Constraints for NYC

In [5]:
def apply_constraints(df):
    """Drop rows whose fare, passenger count or coordinates are out of range.

    A row is kept only when every condition holds:

    * ``2.5 <= fare_amount < 500``
    * ``0 < passenger_count < 6``
    * pickup and dropoff longitude strictly between -75 and -73
    * pickup and dropoff latitude strictly between 40 and 42

    Returns the filtered frame produced by ``df.query``.
    """
    fare_ok = 'fare_amount >= 2.5 and fare_amount < 500'
    passengers_ok = 'passenger_count > 0 and passenger_count < 6'
    coords_ok = (
        'pickup_longitude > -75 and pickup_longitude < -73',
        'dropoff_longitude > -75 and dropoff_longitude < -73',
        'pickup_latitude > 40 and pickup_latitude < 42',
        'dropoff_latitude > 40 and dropoff_latitude < 42',
    )
    # All fragments are AND-ed together into one query expression.
    predicate = ' and '.join((fare_ok, passengers_ok) + coords_ok)
    return df.query(predicate)

# Filter the training frame, then timestamp the end of this stage.
df = apply_constraints(df)
apply_constraints_time = time.perf_counter()

## 6. Feature Engineering

In [6]:
def feature_engg(df):
    """Add calendar and landmark-distance features to every partition of ``df``.

    For each partition this adds ``hour``, ``year``, ``month``, ``day`` and
    ``weekday`` (taken from ``pickup_datetime``) and one
    ``<landmark>_distance`` column per entry in ``landmarks`` holding the
    haversine distance in kilometres from the dropoff point to that landmark.
    The ``pickup_datetime`` and ``key`` columns are dropped afterwards.

    Parameters
    ----------
    df : dask_cudf dataframe
        Must provide ``pickup_datetime``, ``key``, ``dropoff_latitude`` and
        ``dropoff_longitude``.

    Returns
    -------
    dask_cudf dataframe
        Rebuilt from the per-partition results via ``dask_cudf.from_delayed``.
    """
    # (latitude, longitude) in degrees. Insertion order fixes the order in
    # which the distance columns are appended: jfk, lga, ewr, tsq, met, wtc.
    landmarks = {
        'jfk': (40.6413, -73.7781),
        'lga': (40.7769, -73.8740),
        'ewr': (40.6895, -74.1745),
        'tsq': (40.7580, -73.9855),
        'met': (40.7794, -73.9632),
        'wtc': (40.7126, -74.0099),
    }

    def landmark_distance(dropoff_latitude, dropoff_longitude, distance_km,
                          landmark_lat, landmark_lon):
        # Row-wise kernel for `apply_rows`: writes the haversine distance (km)
        # between each dropoff point and the landmark into `distance_km`.
        # The landmark coordinates arrive as scalar kwargs, so one kernel
        # serves every landmark instead of one hand-copied kernel each.
        x_ref = pi/180 * landmark_lat
        y_ref = pi/180 * landmark_lon
        r = 6371  # Radius of earth in kilometers

        for i, (x_1, y_1) in enumerate(zip(dropoff_latitude, dropoff_longitude)):
            x_1 = pi/180 * x_1
            y_1 = pi/180 * y_1

            dlon = y_ref - y_1
            dlat = x_ref - x_1
            a = sin(dlat/2)**2 + cos(x_1) * cos(x_ref) * sin(dlon/2)**2

            c = 2 * asin(sqrt(a))

            distance_km[i] = c * r

    def add_features(part):
        # Work on a shallow copy so the partition handed in by dask is not
        # modified in place (delayed functions should not mutate their inputs).
        part = part.copy(deep=False)

        pickup = part['pickup_datetime']
        part['hour'] = pickup.dt.hour
        part['year'] = pickup.dt.year
        part['month'] = pickup.dt.month
        part['day'] = pickup.dt.day
        part['weekday'] = pickup.dt.weekday

        for name, (lat, lon) in landmarks.items():
            part = part.apply_rows(
                landmark_distance,
                incols=['dropoff_latitude', 'dropoff_longitude'],
                outcols=dict(distance_km=np.float32),
                kwargs=dict(landmark_lat=lat, landmark_lon=lon),
            )
            # The kernel's output column is named after its parameter; give it
            # the per-landmark name the rest of the notebook expects.
            part = part.rename(columns={'distance_km': name + '_distance'})

        return part.drop(['pickup_datetime', 'key'], axis=1)

    parts = [dask.delayed(add_features)(part) for part in df.to_delayed()]
    return dask_cudf.from_delayed(parts)

# Add the engineered features to the training frame, then timestamp the stage.
df = feature_engg(df)
feature_engg_time = time.perf_counter()

## 7. Splitting Data

In [7]:
# Target is the fare; every remaining column is used as a feature.
y = df['fare_amount']
X = df.drop(['fare_amount'], axis=1)

# Shuffled 90/10 train/validation split with a fixed seed (42).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=True)
split_data_time = time.perf_counter()

## 8. Training

In [8]:
# Wrap both splits as distributed DMatrix objects; the pair list below is
# passed as `evals`, so an RMSE is logged for both 'train' and 'valid'.
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
dvalid = xgb.dask.DaskDMatrix(client, X_test, y_test)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Booster configuration: squared-error regression with the GPU histogram tree
# method, evaluated with RMSE. `max_depth` is left at the library default
# because its entry is commented out.
params = {
    'min_child_weight': 1,
    'learning_rate': 0.05,
    'colsample_bytree': 0.7,
    #'max_depth': 11,
    'subsample': 0.7,
    'booster' : 'gbtree',
    'objective': 'reg:squarederror',
    'tree_method':'gpu_hist',
    'eval_metric': "rmse",
    }

# Train for at most 1000 rounds with an early-stopping patience of 100 rounds,
# logging the evaluation metrics every 100 rounds.
# NOTE(review): `model` is whatever xgb.dask.train returns; it is passed
# unchanged to xgb.dask.predict in the testing section.
model = xgb.dask.train(client, params, dtrain, num_boost_round=1000, evals=watchlist, early_stopping_rounds=100, verbose_eval=100)
training_time = time.perf_counter()

[22:07:46] task [xgboost.dask]:tcp://10.1.0.7:34153 got new rank 0
[22:07:46] task [xgboost.dask]:tcp://10.1.0.7:32851 got new rank 1
[22:07:46] task [xgboost.dask]:tcp://10.1.0.7:42629 got new rank 2
[22:07:46] task [xgboost.dask]:tcp://10.1.0.7:37207 got new rank 3


[0]	train-rmse:13.89172	valid-rmse:13.89161
[100]	train-rmse:4.40133	valid-rmse:4.40682
[200]	train-rmse:4.07343	valid-rmse:4.08141
[300]	train-rmse:3.95726	valid-rmse:3.96766
[400]	train-rmse:3.88254	valid-rmse:3.89547
[500]	train-rmse:3.83883	valid-rmse:3.85384
[600]	train-rmse:3.80886	valid-rmse:3.82628
[700]	train-rmse:3.78088	valid-rmse:3.80052
[800]	train-rmse:3.75761	valid-rmse:3.77917
[900]	train-rmse:3.74116	valid-rmse:3.76522
[999]	train-rmse:3.72612	valid-rmse:3.75228


## 9. Testing

In [9]:
def eval_metrics(actual, pred):
    """Score predictions against the ground truth.

    Returns a ``(rmse, mae, r2)`` tuple: the square root of the mean squared
    error, the mean absolute error and the R-squared score, each computed by
    the corresponding scikit-learn metric on ``(actual, pred)``.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

def test(client, model, path='../data/test.csv'):
    """Evaluate ``model`` on a held-out CSV and report RMSE, MAE and R-squared.

    The file is pushed through the same ``dtype_conversion`` ->
    ``apply_constraints`` -> ``feature_engg`` steps as the training data, so
    it must contain the same columns, including ``fare_amount``.

    Parameters
    ----------
    client : dask.distributed.Client
        Client connected to the cluster used for prediction.
    model
        The object returned by ``xgb.dask.train``.
    path : str, optional
        CSV file to evaluate on. Defaults to ``'../data/test.csv'``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)``; the same values are also printed.
    """
    df = dask_cudf.read_csv(path)
    df = dtype_conversion(df)
    df = apply_constraints(df)
    df = feature_engg(df)

    # Bring the ground truth to host memory as a NumPy array. `values_host`
    # is used instead of `to_array()`, which newer cuDF releases deprecate.
    actual = df['fare_amount'].compute().values_host

    # Keep the feature frame and the DMatrix under separate names rather than
    # rebinding `df` to an object of a different kind.
    features = df.drop('fare_amount', axis = 1)
    dtest = xgb.dask.DaskDMatrix(client, features)

    pred = xgb.dask.predict(client, model, dtest)
    pred = pred.compute()

    rmse, mae, r2 = eval_metrics(actual, pred)
    print("Root Mean Squared Error : ", rmse)
    print("Mean Absolute Error : ", mae)
    print("R-squared Score : ", r2)
    return rmse, mae, r2

# Evaluate on the held-out file. `test_time` closes the "Testing" stage and
# `stop`, taken immediately after it, closes the overall run.
test(client, model)
test_time = time.perf_counter()
stop = time.perf_counter()

Root Mean Squared Error :  3.7332113
Mean Absolute Error :  1.6824707
R-squared Score :  0.851153974000958


## 10. Time Taken

In [1]:
# Each entry is (label, stage start, stage end); a stage starts where the
# previous one ended, and the final row spans the whole run.
stage_durations = [
    ("Creating CUDA Cluster : ", start, cluster_creation_time),
    ("Reading : ", cluster_creation_time, read_time),
    ("Data Type Conversion : ", read_time, dtype_conversion_time),
    ("Applying Constraints : ", dtype_conversion_time, apply_constraints_time),
    ("Feature Engineering : ", apply_constraints_time, feature_engg_time),
    ("Splitting Data : ", feature_engg_time, split_data_time),
    ("Training : ", split_data_time, training_time),
    ("Testing : ", training_time, test_time),
    ("Total : ", start, stop),
]
for label, began, ended in stage_durations:
    print(label, ended - began, "s")

Creating CUDA Cluster : 12.231508846045472s
Reading :  2.5630480609834194s
Data Type Conversion :  1.0433651669882238s
Applying Constraints :  0.7912436389597133s
Feature Engineering :  5.620851180050522s
Splitting Data :  0.25998906802851707s
Training :  576.3746650048997s
Testing :  7.6042080810293555s
Total :  606.4889245460508s
