In [2]:
# !pip install "dask-cloudprovider[azure]"
# !pip install "dask-cloudprovider[azure]" --upgrade
# !pip install --upgrade azure-mgmt-network azure-mgmt-compute
# !pip install gcsfs
# !pip install dask_xgboost
# !pip install azureml

In [3]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask
import cudf
import dask_cudf

from dask_ml.model_selection import train_test_split
from cuml.metrics import mean_squared_error

from cuml.dask.ensemble import RandomForestRegressor
from cuml.dask.common import utils as dask_utils

import numpy as np
import pandas as pd
import os
from urllib.request import urlretrieve
import gzip

In [4]:
import numpy as np
import numba, xgboost, socket
import dask, dask_cudf
from dask.distributed import Client, wait

In [5]:
# list of column names that need to be re-mapped
remap = {}
remap['tpep_pickup_datetime'] = 'pickup_datetime'
remap['tpep_dropoff_datetime'] = 'dropoff_datetime'
remap['ratecodeid'] = 'rate_code'

#create a list of columns & dtypes the df must have
must_haves = {
 'pickup_datetime': 'datetime64[ms]',
 'dropoff_datetime': 'datetime64[ms]',
 'passenger_count': 'int32',
 'trip_distance': 'float32',
 'pickup_longitude': 'float32',
 'pickup_latitude': 'float32',
 'rate_code': 'int32',
 'dropoff_longitude': 'float32',
 'dropoff_latitude': 'float32',
 'fare_amount': 'float32'
}

query_frags = [
    'fare_amount > 0 and fare_amount < 500',
    'passenger_count > 0 and passenger_count < 6',
    'pickup_longitude > -75 and pickup_longitude < -73',
    'dropoff_longitude > -75 and dropoff_longitude < -73',
    'pickup_latitude > 40 and pickup_latitude < 42',
    'dropoff_latitude > 40 and dropoff_latitude < 42'
]

In [6]:
def clean(df_part, remap, must_haves):
    tmp = {col:col.strip().lower() for col in list(df_part.columns)}
    df_part = df_part.rename(columns=tmp)
    # rename using the supplied mapping
    df_part = df_part.rename(columns=remap)
    # iterate through columns in this df partition
    for col in df_part.columns:
        # drop anything not in our expected list
        if col not in must_haves:
            df_part = df_part.drop(col, axis=1)
            continue

        # fixes datetime error found by Ty Mckercher and fixed by Paul Mahler
        if df_part[col].dtype == 'object' and col in ['pickup_datetime', 'dropoff_datetime']:
            df_part[col] = df_part[col].astype('datetime64[ms]')
            continue

        # if column was read as a string, recast as float
        if df_part[col].dtype == 'object':
            df_part[col] = df_part[col].str.fillna('-1')
            df_part[col] = df_part[col].astype('float32')
        else:
            # downcast from 64bit to 32bit types
            # Tesla T4 are faster on 32bit ops
            if 'int' in str(df_part[col].dtype):
                df_part[col] = df_part[col].astype('int32')
            if 'float' in str(df_part[col].dtype):
                df_part[col] = df_part[col].astype('float32')
            df_part[col] = df_part[col].fillna(-1)
    return df_part

In [7]:
import math
from math import cos, sin, asin, sqrt, pi

def haversine_distance_kernel(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude, h_distance):
    for i, (x_1, y_1, x_2, y_2) in enumerate(zip(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude)):
        x_1 = pi/180 * x_1
        y_1 = pi/180 * y_1
        x_2 = pi/180 * x_2
        y_2 = pi/180 * y_2
        
        dlon = y_2 - y_1
        dlat = x_2 - x_1
        a = sin(dlat/2)**2 + cos(x_1) * cos(x_2) * sin(dlon/2)**2
        
        c = 2 * asin(sqrt(a)) 
        r = 6371 # Radius of earth in kilometers
        
        h_distance[i] = c * r

def day_of_the_week_kernel(day, month, year, day_of_week):
    for i, (d_1, m_1, y_1) in enumerate(zip(day, month, year)):
        if month[i] <3:
            shift = month[i]
        else:
            shift = 0
        Y = year[i] - (month[i] < 3)
        y = Y - 2000
        c = 20
        d = day[i]
        m = month[i] + shift + 1
        day_of_week[i] = (d + math.floor(m*2.6) + y + (y//4) + (c//4) -2*c)%7
        
def add_features(df):
    df['hour'] = df['pickup_datetime'].dt.hour
    df['year'] = df['pickup_datetime'].dt.year
    df['month'] = df['pickup_datetime'].dt.month
    df['day'] = df['pickup_datetime'].dt.day
    df['diff'] = df['dropoff_datetime'].astype('int32') - df['pickup_datetime'].astype('int32')
    
    df['pickup_latitude_r'] = df['pickup_latitude']//.01*.01
    df['pickup_longitude_r'] = df['pickup_longitude']//.01*.01
    df['dropoff_latitude_r'] = df['dropoff_latitude']//.01*.01
    df['dropoff_longitude_r'] = df['dropoff_longitude']//.01*.01
    
    df = df.drop('pickup_datetime', axis=1)
    df = df.drop('dropoff_datetime', axis =1)
    
    
    df = df.apply_rows(haversine_distance_kernel,
                   incols=['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'],
                   outcols=dict(h_distance=np.float32),
                   kwargs=dict())
    
    
    df = df.apply_rows(day_of_the_week_kernel,
                      incols=['day', 'month', 'year'],
                      outcols=dict(day_of_week=np.float32),
                      kwargs=dict())
    
    
    df['is_weekend'] = (df['day_of_week']<2)
    return df

In [8]:
location = ""
resource_group = ""
vnet = ""
security_group = ""

vm_size = "Standard_NC12s_v3"
docker_image = "rapidsai/rapidsai:cuda10.2-runtime-ubuntu18.04-py3.8"
worker_class = "dask_cuda.CUDAWorker"
 
n_workers = 1
env_vars = {"EXTRA_PIP_PACKAGES": "gcsfs"} 

In [9]:
from distributed import Client
from dask_cloudprovider.azure import AzureVMCluster

cluster = AzureVMCluster(
    location=location,
    resource_group=resource_group,
    vnet=vnet,
    security_group=security_group,
    vm_size=vm_size,
    docker_image=docker_image,
    worker_class=worker_class,
    env_vars=env_vars,
)

Creating scheduler instance
Assigned public IP
Network interface ready
Creating VM
Created VM dask-ec6ab826-scheduler
Waiting for scheduler to run
Scheduler is running


  next(self.gen)


In [10]:
cu_rf_params = {
    'n_estimators': 100,
    'max_depth': 16,
}

In [13]:
with Client(cluster) as client:
    import dask_cudf
    cluster.scale(2)
    client.wait_for_workers(2)
    from cuml.dask.ensemble import RandomForestRegressor

    base_path = 'gcs://anaconda-public-data/nyc-taxi/csv/'
    df_2014 = dask_cudf.read_csv(base_path+'2014/yellow_tripdata_2014*.csv', n_rows=1000)

    df_2014 = clean(df_2014, remap, must_haves)

    df_2014 = df_2014.query(' and '.join(query_frags))

    taxi_df = df_2014.map_partitions(add_features)

    taxi_df = taxi_df.dropna()
    taxi_df = taxi_df.astype("float32")
    X, y = taxi_df.drop(["fare_amount"], axis=1), taxi_df["fare_amount"].astype('float32')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

    workers = client.has_what().keys()

    X_train, X_test, y_train, y_test = dask_utils.persist_across_workers(client,
                                                           [X_train, X_test, y_train, y_test],
                                                           workers=workers)
    cu_dask_rf = RandomForestRegressor(**cu_rf_params, ignore_empty_partitions=True)
    cu_dask_rf = cu_dask_rf.fit(X_train, y_train)

    y_pred = cu_dask_rf.predict(X_test)
    
    _y_pred, _y_test = y_pred.compute().to_array(), y_test.compute().to_array()
    
    score = mean_squared_error(_y_pred, _y_test)
    print("RMSE: ", np.sqrt(score))


+---------+--------+-----------+---------+
| Package | client | scheduler | workers |
+---------+--------+-----------+---------+
| blosc   | 1.10.1 | None      | None    |
| lz4     | 3.1.1  | None      | None    |
+---------+--------+-----------+---------+


RMSE:  3.0230505
