In [None]:
import os
import socket

In [None]:
import dask
import dask_cudf
import distributed
import dask_xgboost as dxgb

In [None]:
# Adjust the dask scheduler configuration for this workload (work-stealing
# off, scheduler bandwidth set to 1) and echo the values now in effect.
print("- setting dask settings")
dask.config.set({
    'distributed.scheduler.work-stealing': False,
    'distributed.scheduler.bandwidth': 1,
})

print("-- Changes to dask settings")
work_stealing_setting = dask.config.get('distributed.scheduler.work-stealing')
bandwidth_setting = dask.config.get('distributed.scheduler.bandwidth')
print("--- Setting work-stealing to ", work_stealing_setting)
print("--- Setting scheduler bandwidth to ", bandwidth_setting)
print("-- Settings updates complete")

In [None]:
# Resolve this host's IP address and connect to the dask scheduler that is
# expected to be listening there on port 8786.
ip = socket.gethostbyname(socket.gethostname())
scheduler = "tcp://" + ip + ":8786"
client = distributed.Client(scheduler)
# restart the cluster through the client before any work is submitted
# (presumably to start from clean workers; confirm this is still wanted)
client.restart()
# last expression of the cell: the notebook renders the client summary
client

In [None]:
# Root of the Azure datastore. If you downloaded the NYC Taxi Trip dataset,
# update this path to the datastore location reported under Tracked Metrics.
# The loading cell looks for "data/nyctaxi" underneath this path and falls
# back to a public bucket when that directory does not exist.
datastore = "/path/to/azure/datastore"

In [None]:
# Column names that need to be re-mapped: source name -> name used downstream.
remap = {
    'tpep_pickup_datetime': 'pickup_datetime',
    'tpep_dropoff_datetime': 'dropoff_datetime',
    'ratecodeid': 'rate_code',
}

# Columns (with their dtypes) that the cleaned DataFrame must have.
must_haves = dict([
    ('pickup_datetime', 'datetime64[ms]'),
    ('dropoff_datetime', 'datetime64[ms]'),
    ('passenger_count', 'int32'),
    ('trip_distance', 'float32'),
    ('pickup_longitude', 'float32'),
    ('pickup_latitude', 'float32'),
    ('rate_code', 'int32'),
    ('dropoff_longitude', 'float32'),
    ('dropoff_latitude', 'float32'),
    ('fare_amount', 'float32'),
])

In [None]:
# helper applied to every DataFrame partition
def clean(df_part, remap, must_haves):
    """Normalise one partition: tidy column names, drop extras, fix dtypes.

    Parameters
    ----------
    df_part : DataFrame partition
        The partition to clean.
    remap : dict
        Maps (already stripped and lower-cased) source column names to the
        names used downstream.
    must_haves : dict or other container
        Column names to keep. Only membership is checked here; the dtype
        values it may hold are not consulted.

    Returns
    -------
    The cleaned partition.
    """
    # some headers carry stray spaces or capitals: strip and lower-case them
    normalised = {}
    for name in list(df_part.columns):
        normalised[name] = name.strip().lower()
    df_part = df_part.rename(normalised)

    # then apply the caller-supplied renames
    df_part = df_part.rename(remap)

    datetime_columns = ['pickup_datetime', 'dropoff_datetime']

    for name in df_part.columns:
        # anything that is not on the expected list is discarded
        if name not in must_haves:
            df_part = df_part.drop(name)
            continue

        if df_part[name].dtype == 'object':
            if name in datetime_columns:
                # timestamps that were read as strings become ms datetimes
                df_part[name] = df_part[name].astype('datetime64[ms]')
            else:
                # any other string column: fill missing with '-1', cast to float
                df_part[name] = df_part[name].str.fillna('-1')
                df_part[name] = df_part[name].astype('float32')
            continue

        # downcast from 64bit to 32bit types
        # Tesla T4 are faster on 32bit ops
        dtype_name = str(df_part[name].dtype)
        if 'int' in dtype_name:
            df_part[name] = df_part[name].astype('int32')
        elif 'float' in dtype_name:
            df_part[name] = df_part[name].astype('float32')
        # missing values in non-string columns are replaced with -1
        df_part[name] = df_part[name].fillna(-1)

    return df_part

In [None]:
# Years of trip data to load: a year is read only when its flag is True.
# Adjust this dictionary if you'd like to use a different year in this workload.
is_valid_years = {
    "2014": False,
    "2015": False,
    "2016": True
}

In [None]:
# Location of the trip data inside the datastore.
data_path = os.path.join(datastore, "data/nyctaxi")


def _read_and_clean(csv_paths):
    """Read a CSV path/glob (or list of paths) and clean every partition."""
    return dask_cudf.read_csv(csv_paths).map_partitions(clean, remap, must_haves)


dfs = []
if not os.path.exists(data_path):
    # No local copy: fall back to the public bucket and read the CSVs remotely.
    print("WARNING: the NYC Taxi Trip Data was not found in the Azure datastore")
    print("WARNING: updating the data path to use a public datastore")
    print("WARNING: data will be downloaded and processed in-situ")
    print("WARNING: this degrades performance")
    print("WARNING: to avoid this performance degradation, use the `--download_nyctaxi_data=True` option when using start_azureml.py")
    data_path = "gcs://anaconda-public-data/nyc-taxi/csv/"
    if is_valid_years["2014"]:
        taxi_df_2014 = _read_and_clean(os.path.join(data_path, "2014/yellow_*.csv"))
        dfs.append(taxi_df_2014)
    if is_valid_years["2015"]:
        taxi_df_2015 = _read_and_clean(os.path.join(data_path, "2015/yellow_*.csv"))
        dfs.append(taxi_df_2015)
    if is_valid_years["2016"]:
        # only the files for months 01-06 of 2016 are read from the public bucket
        valid_months_2016 = [str(x).rjust(2, '0') for x in range(1, 7)]
        valid_files_2016 = [os.path.join(data_path, "2016/yellow_tripdata_2016-{}.csv".format(month)) for month in valid_months_2016]
        taxi_df_2016 = _read_and_clean(valid_files_2016)
        dfs.append(taxi_df_2016)
else:
    # Local copy present: a year is loaded only if it is selected AND its own
    # directory exists (each year is checked against its own folder).
    if is_valid_years["2014"] and os.path.exists(os.path.join(data_path, "2014")):
        taxi_df_2014 = _read_and_clean(os.path.join(data_path, "2014/yellow_*.csv"))
        dfs.append(taxi_df_2014)
    if is_valid_years["2015"] and os.path.exists(os.path.join(data_path, "2015")):
        taxi_df_2015 = _read_and_clean(os.path.join(data_path, "2015/yellow_*.csv"))
        dfs.append(taxi_df_2015)
    if is_valid_years["2016"] and os.path.exists(os.path.join(data_path, "2016")):
        taxi_df_2016 = _read_and_clean(os.path.join(data_path, "2016/yellow_*.csv"))
        dfs.append(taxi_df_2016)

# Fail with an actionable message instead of an opaque concat error when no
# year was selected or none of the selected years is available.
if not dfs:
    raise ValueError(
        "No NYC Taxi data was loaded from {!r}: enable at least one year in "
        "`is_valid_years` and make sure its data is available".format(data_path)
    )

taxi_df = dask.dataframe.multi.concat(dfs)

In [None]:
# list the columns of the combined frame, one name per line
print("Column names are as follows:")
for column_name in taxi_df.columns:
    print(column_name)

In [None]:
# Filter conditions that throw out records with missing or outlier values:
# each fragment bounds one column, and a row must satisfy all of them.
query_frags = [
    'fare_amount > 0 and fare_amount < 500',
    'passenger_count > 0 and passenger_count < 6',
    'pickup_longitude > -75 and pickup_longitude < -73',
    'dropoff_longitude > -75 and dropoff_longitude < -73',
    'pickup_latitude > 40 and pickup_latitude < 42',
    'dropoff_latitude > 40 and dropoff_latitude < 42'
]
valid_trip_filter = ' and '.join(query_frags)
taxi_df = taxi_df.query(valid_trip_filter)

# show the first rows to inspect the results of cleaning
taxi_df.head().to_pandas()

In [None]:
import math
from math import cos, sin, asin, sqrt, pi
import numpy as np

def haversine_distance_kernel(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude, h_distance):
    """Row kernel: haversine distance in kilometres between pickup and dropoff.

    The four coordinate inputs are converted from degrees to radians; one
    distance per row is written into ``h_distance``. Nothing is returned.
    """
    deg_to_rad = pi / 180
    earth_radius_km = 6371  # Radius of earth in kilometers

    for i, (lat_a, lon_a, lat_b, lon_b) in enumerate(zip(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude)):
        lat_a = deg_to_rad * lat_a
        lon_a = deg_to_rad * lon_a
        lat_b = deg_to_rad * lat_b
        lon_b = deg_to_rad * lon_b

        delta_lon = lon_b - lon_a
        delta_lat = lat_b - lat_a

        # haversine term, then the central angle between the two points
        hav = sin(delta_lat / 2)**2 + cos(lat_a) * cos(lat_b) * sin(delta_lon / 2)**2
        central_angle = 2 * asin(sqrt(hav))

        h_distance[i] = central_angle * earth_radius_km

def day_of_the_week_kernel(day, month, year, day_of_week):
    """Row kernel: day of the week from (day, month, year) via Zeller's congruence.

    One value per row is written into ``day_of_week`` using Zeller's
    numbering: 0 = Saturday, 1 = Sunday, 2 = Monday, ..., 6 = Friday.
    Nothing is returned.

    The century term is hard-coded to 20, so the formula is only intended
    for dates in the 2000s.
    """
    for i, (d_1, m_1, y_1) in enumerate(zip(day, month, year)):
        # Zeller's congruence counts January and February as months 13 and 14
        # of the previous year, so both the month and the year are shifted.
        if m_1 < 3:
            zeller_month = m_1 + 12
            zeller_year = y_1 - 1
        else:
            zeller_month = m_1
            zeller_year = y_1

        century = 20
        year_of_century = zeller_year - 2000

        # floor(13 * (m + 1) / 5), in integer arithmetic to avoid float rounding
        month_term = (13 * (zeller_month + 1)) // 5

        day_of_week[i] = (d_1 + month_term + year_of_century + (year_of_century // 4)
                          + (century // 4) - 2 * century) % 7
        
def add_features(df):
    """Derive the model features for one partition and return the new frame.

    Adds the pickup hour/year/month/day, a ``diff`` column computed from the
    two timestamps, coordinates floored to multiples of 0.01, the haversine
    distance (``h_distance``), ``day_of_week`` and an ``is_weekend`` flag.
    The two datetime columns are dropped.
    """
    # calendar parts of the pickup timestamp
    for part in ('hour', 'year', 'month', 'day'):
        df[part] = getattr(df['pickup_datetime'].dt, part)

    # NOTE(review): both timestamps are cast to int32 before subtracting;
    # confirm the narrowing of the datetime values is intended.
    dropoff_as_int = df['dropoff_datetime'].astype('int32')
    pickup_as_int = df['pickup_datetime'].astype('int32')
    df['diff'] = dropoff_as_int - pickup_as_int

    # coordinates snapped down to a 0.01-wide grid
    rounded_columns = [
        ('pickup_latitude_r', 'pickup_latitude'),
        ('pickup_longitude_r', 'pickup_longitude'),
        ('dropoff_latitude_r', 'dropoff_latitude'),
        ('dropoff_longitude_r', 'dropoff_longitude'),
    ]
    for rounded_name, source_name in rounded_columns:
        df[rounded_name] = df[source_name] // .01 * .01

    # the raw timestamps are no longer needed once the parts are extracted
    for timestamp_column in ('pickup_datetime', 'dropoff_datetime'):
        df = df.drop(timestamp_column)

    df = df.apply_rows(
        haversine_distance_kernel,
        incols=['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'],
        outcols={'h_distance': np.float32},
        kwargs={},
    )

    df = df.apply_rows(
        day_of_the_week_kernel,
        incols=['day', 'month', 'year'],
        outcols={'day_of_week': np.float32},
        kwargs={},
    )

    # flag rows whose day_of_week value is below 2
    weekend_mask = df['day_of_week']<2
    df['is_weekend'] = weekend_mask.astype("int32")
    return df

In [None]:
%%time

# Actually add the features: run add_features on every partition and persist
# the result, replacing taxi_df with the feature-augmented frame.
taxi_df = taxi_df.map_partitions(add_features).persist()
# block until the persisted partitions have finished computing
done = distributed.wait(taxi_df)
# optional: uncomment the next line to inspect the result
# taxi_df.head().to_pandas()

In [None]:
%matplotlib inline
# Plot the mean fare_amount for each pickup hour (computed on the cluster,
# converted to pandas and sorted by hour); the trailing ";" hides the repr
# of the returned plot object.
taxi_df.groupby('hour').fare_amount.mean().compute().to_pandas().sort_index().plot(legend=True);

In [None]:
%%time

# training rows: trips whose pickup day-of-month is before the 25th
X_train = taxi_df.query('day < 25').persist()

# create a Y_train ddf with just the target variable
Y_train = X_train[['fare_amount']].persist()
# drop the target variable from the training ddf
# (the remaining columns are selected via columns.difference)
X_train = X_train[X_train.columns.difference(['fare_amount'])]

# this wont return until all data is in GPU memory
done = distributed.wait([X_train, Y_train])

In [None]:
%%time

# XGBoost parameters for the regression model (squared-error objective,
# 'gpu_hist' tree method).
# NOTE(review): 'silent' and 'verbose_eval' are passed as booster parameters
# here; confirm the installed xgboost version still honours them in this form.
params = {
    'learning_rate'  : 0.3,
    'max_depth'      : 8,
    'objective'      : 'reg:squarederror',
    'subsample'      : 0.6,
    'gamma'          : 1,
    'silent'         : True,
    'verbose_eval'   : True,
    'tree_method'    :'gpu_hist'
}

# train for 100 boosting rounds on the cluster behind `client`
trained_model = dxgb.train(client, params, X_train, Y_train, num_boost_round=100)

In [None]:
def drop_empty_partitions(df):
    """Return ``df`` restricted to the partitions that contain at least one row.

    The length of every partition is computed eagerly, then the partitions
    are selected with a boolean mask (True = keep).
    """
    partition_sizes = df.map_partitions(len).compute()
    keep_mask = []
    for size in partition_sizes:
        keep_mask.append(size > 0)
    return df.partitions[keep_mask]

In [None]:
# test rows: trips whose pickup day-of-month is the 25th or later
X_test = taxi_df.query('day >= 25').persist()
# remove partitions that ended up with no rows after the filter
X_test = drop_empty_partitions(X_test)

# Create Y_test with just the fare amount
Y_test = X_test[['fare_amount']]

# Drop the fare amount from X_test
# (same columns.difference selection as used for the training features)
X_test = X_test[X_test.columns.difference(['fare_amount'])]

# optional: uncomment to display the test set size
# len(X_test)

In [None]:
# generate predictions on the test set and store them next to the true
# fares as a new 'prediction' column of Y_test

Y_test['prediction'] = dxgb.predict(client, trained_model, X_test)

In [None]:
# per-row squared difference between the predicted and the actual fare
Y_test['squared_error'] = (Y_test['prediction'] - Y_test['fare_amount'])**2

# inspect the results to make sure our calculation looks right
Y_test.head().to_pandas()

In [None]:
# compute the actual RMSE over the full test set
# note: despite its name, `RMSE` holds the MEAN squared error at this point;
# the square root taken on the last line is the value the cell displays
RMSE = Y_test.squared_error.mean().compute()
math.sqrt(RMSE)