# Create datasets for machine learning

- add unique id to each observation
- create time-based features
- select features and label for each ML task
- split data (last 6 months of 2019 as test data)

**ML tasks**
- `total_amount`: regression, all observations
- `tip_fraction`: regression, credit card payments only, calculated as `tip_amount / total_amount`


In [1]:
import dask.dataframe as dd
import os
import numpy as np

# TAXI_S3 is the S3 prefix that every read and write in this notebook is rooted at.
# An empty value is rejected just like a missing one: with an empty prefix,
# f'{taxi_path}/data/...' would collapse to the rootless path '/data/...'.
taxi_path = os.environ.get('TAXI_S3')
if not taxi_path:
    raise ValueError('Set TAXI_S3 environment variable to an S3 location that you have read/write access to')

In [2]:
from dask.distributed import Client
from dask_saturn import SaturnCluster
import time

# Cluster sizing for this job: 20 workers with 8 threads each,
# scheduler and workers both on '2xlarge' instances.
cluster = SaturnCluster(
    n_workers=20,
    scheduler_size='2xlarge',
    worker_size='2xlarge',
    nthreads=8,
)
# Connect a Dask client to the cluster created above.
client = Client(cluster)

# Display the cluster as the cell output.
cluster

[2020-08-04 12:30:31] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [3]:
# Open the taxi trip parquet dataset under TAXI_S3 as a Dask DataFrame
taxi = dd.read_parquet(f'{taxi_path}/data/taxi_parquet', engine='pyarrow')
# Preview the first rows to check the columns that were loaded
taxi.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_taxizone_id,dropoff_taxizone_id,pickup_latitude,...,dropoff_longitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,VTS,2009-01-04 02:52:00,2009-01-04 03:02:00,1.0,2.63,,0,,,40.721567,...,-73.993803,2,8.9,0.5,,0.0,0.0,,9.4,
1,VTS,2009-01-04 03:31:00,2009-01-04 03:38:00,3.0,4.55,,0,,,40.73629,...,-73.95585,1,12.1,0.5,,2.0,0.0,,14.6,
2,VTS,2009-01-03 15:43:00,2009-01-03 15:57:00,5.0,10.35,,0,,,40.739748,...,-73.869983,1,23.7,0.0,,4.74,0.0,,28.44,
3,DDS,2009-01-01 20:52:58,2009-01-01 21:14:00,1.0,5.0,,0,,,40.790955,...,-73.996558,1,14.9,0.5,,3.05,0.0,,18.45,
4,DDS,2009-01-24 16:18:23,2009-01-24 16:24:56,1.0,0.4,,0,,,40.719382,...,-74.008378,2,3.7,0.0,,0.0,0.0,,3.7,


In [73]:
%%time
# Total number of rows in the raw dataset, before any filtering
len(taxi)

CPU times: user 190 ms, sys: 11.8 ms, total: 202 ms
Wall time: 11.4 s


1611604226

In [18]:
# Only use zones for ML tasks: keep trips where both the pickup and the
# dropoff taxi zone are known.
# TODO: change to a spatial join to pull in old data if there's time
# (presumably to derive zones for rows that only have lat/lon; confirm).
has_both_zones = taxi.pickup_taxizone_id.notnull() & taxi.dropoff_taxizone_id.notnull()
taxi = taxi[has_both_zones]

## Create some features and split data

- These features are stateless, so we can compute them before splitting into train/test sets
    - Can you think of more features that might make our models better?
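- Use last 6 months of 2019 as test set; the train set is everything from 2009-01-01 up to (but not including) 2019-07-01. Rows with pickup times outside these ranges are dropped.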

In [19]:
# Label for the tip task: the tip as a fraction of the total amount charged.
# NOTE(review): rows with total_amount == 0 would produce inf/NaN here; confirm
# how those should be treated.
taxi['tip_fraction'] = taxi['tip_amount'] / taxi['total_amount']

# Calendar and time-of-day parts of the pickup timestamp, stored as pickup_<part>.
# NOTE(review): `.dt.weekofyear` is deprecated in newer pandas releases; confirm
# against the pinned pandas/dask versions before upgrading.
for part in ('weekday', 'weekofyear', 'hour', 'minute'):
    taxi[f'pickup_{part}'] = getattr(taxi['pickup_datetime'].dt, part)

# Hour within the week: weekday * 24 + hour of day.
taxi['pickup_week_hour'] = (taxi['pickup_weekday'] * 24) + taxi['pickup_hour']

In [20]:
# Model inputs shared by both ML tasks; each task appends its own label column.
features = (
    # where the trip starts and ends
    ['pickup_taxizone_id', 'dropoff_taxizone_id']
    # when the trip starts
    + ['pickup_weekday', 'pickup_weekofyear', 'pickup_hour', 'pickup_minute', 'pickup_week_hour']
    # who is riding
    + ['passenger_count']
)

In [21]:
import datetime

# Split boundaries; every interval is closed on the left and open on the right.
train_start = datetime.datetime(2009, 1, 1)
test_start = datetime.datetime(2019, 7, 1)   # end of train, start of test
test_end = datetime.datetime(2020, 1, 1)

# Train: 2009-01-01 up to (not including) 2019-07-01.
in_train = (taxi.pickup_datetime >= train_start) & (taxi.pickup_datetime < test_start)
train = taxi[in_train]

# Test: the last six months of 2019.
in_test = (taxi.pickup_datetime >= test_start) & (taxi.pickup_datetime < test_end)
test = taxi[in_test]

In [78]:
from dask import compute

# For each split, collect the row count and the earliest/latest pickup time,
# then evaluate all six values in one compute() call.
split_checks = []
for split in (train, test):
    split_checks += [
        split.shape[0],
        split.pickup_datetime.min(),
        split.pickup_datetime.max(),
    ]

compute(*split_checks)

(322518322,
 Timestamp('2009-01-01 00:00:00'),
 Timestamp('2019-06-30 23:59:59'),
 39939208,
 Timestamp('2019-07-01 00:00:00'),
 Timestamp('2019-12-31 23:59:52'))

## Write out files

**NOTE**: ids are random UUIDs generated independently for each ML dataset, so rows cannot be linked between datasets by `id`.

- `amount`: predict total amount
- `tip`: predict tip fraction (`tip_amount / total_amount`)

In [4]:
import s3fs
import uuid

# S3 filesystem handle; write_df uses it to check for and remove existing output
fs = s3fs.S3FileSystem()
# Prefix under TAXI_S3 where the ML datasets are written
ml_path = f'{taxi_path}/data/ml'


def write_df(df: dd.DataFrame, path: str, partition_size='100MB', rm=True) -> None:
    """
    Add uuid, repartition, and write dataframe to parquet

    A random ``uuid4`` hex string is generated for every row and stored in a
    leading ``id`` column. The caller's ``df`` is left unmodified, so the same
    frame can be passed to this function more than once. If ``df`` already has
    an ``id`` column it is replaced by the newly generated ids.

    Parameters:
        df (DataFrame): DataFrame to write
        path (str): S3 Path to write to
        partition_size (str): size for repartitioning; a falsy value skips repartitioning
        rm (bool): Whether to remove files before writing

    Returns:
        None. The data is written to ``path`` as snappy-compressed parquet.
    """
    if rm and fs.exists(path):
        fs.rm(path, recursive=True)

    # Leave any existing 'id' out of the selection so it is replaced, not duplicated.
    cols = [col for col in df.columns if col != 'id']

    # The first column is only used to get one apply() call per row; its values are ignored.
    ids = df.iloc[:, 0].apply(lambda _: uuid.uuid4().hex, meta=('id', 'object'))

    # assign() builds a new frame. Item assignment (df['id'] = ...) would add the
    # column to the caller's frame as well.
    out = df.assign(id=ids)[['id'] + cols]

    if partition_size:
        out = out.repartition(partition_size=partition_size)

    out.to_parquet(path, engine='pyarrow', compression='snappy')

### Total amount prediction, use all data

In [98]:
# Total-amount task: the shared features plus the `total_amount` label, for every row in each split.
amt_columns = features + ['total_amount']
amt_train, amt_test = train[amt_columns], test[amt_columns]

In [104]:
# Write the total-amount training set under ml_path (slow: about 24 minutes in the recorded run)
%time write_df(amt_train, f'{ml_path}/amount_train')

CPU times: user 6.24 s, sys: 58 ms, total: 6.3 s
Wall time: 23min 57s


In [99]:
# Write the total-amount test set under ml_path
%time write_df(amt_test, f'{ml_path}/amount_test')

CPU times: user 5.32 s, sys: 40.6 ms, total: 5.36 s
Wall time: 6min 25s


<br>

### Tip fraction prediction, only use credit card payments

`payment_type` of 1 means credit card, based on [data dictionary](https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf)

In [22]:
# Tip task: the shared features plus the `tip_fraction` label.
tip_columns = features + ['tip_fraction']


def _credit_card_tips(split):
    """Keep rows paid by credit card (payment_type == 1) that have a non-null tip_fraction."""
    paid_by_card = split.payment_type == 1
    has_label = split.tip_fraction.notnull()
    return split[paid_by_card & has_label][tip_columns]


tip_train = _credit_card_tips(train)
tip_test = _credit_card_tips(test)

In [23]:
# Write the tip training set under ml_path (slow: about 16 minutes in the recorded run)
%time write_df(tip_train, f'{ml_path}/tip_train')

CPU times: user 6.07 s, sys: 53.3 ms, total: 6.12 s
Wall time: 16min 8s


In [24]:
# Write the tip test set under ml_path
%time write_df(tip_test, f'{ml_path}/tip_test')

CPU times: user 5.42 s, sys: 33.5 ms, total: 5.46 s
Wall time: 3min 21s


## Check our work

(might need to restart kernel, `pyarrow` gets a bit confused)

In [5]:
# Read the written total-amount training set back and count its rows.
# The count should equal the train row count computed before writing
# (322518322 in the recorded run).
amt_train_ = dd.read_parquet(f'{ml_path}/amount_train', engine='pyarrow')
len(amt_train_)

322518322

In [6]:
# Check the written schema: `id` first, then the features, then the `total_amount` label
amt_train_.head()

Unnamed: 0,id,pickup_taxizone_id,dropoff_taxizone_id,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_week_hour,passenger_count,total_amount
0,6784edfd34d445f9bd80245a4bbe93b5,263.0,236.0,6,27,6,56,150,1.0,9.0
1,6bb1030dd0b1442381df8e7bbd9a4df1,142.0,163.0,6,27,10,50,154,5.0,8.16
2,3eddd06df6984d45a126a7cc634fa736,74.0,66.0,6,27,10,50,154,1.0,27.8
3,9807ff55b3e944dd987e0da65abcc2cd,264.0,264.0,6,27,10,50,154,1.0,6.3
4,45bcb2475494482ab04188421eec6934,48.0,68.0,6,27,10,50,154,1.0,6.3


In [15]:
# Summary statistics per numeric column, rounded to 3 decimals and transposed to one row per column.
# The recorded output shows extreme values (passenger_count max 192, total_amount from -800.3 to
# 1084772.17), so outliers are present in the written data.
%time np.round(amt_train_.describe().compute(), 3).T

CPU times: user 9.18 s, sys: 126 ms, total: 9.31 s
Wall time: 31.8 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pickup_taxizone_id,322518322.0,162.917,66.556,1.0,132.0,162.0,234.0,265.0
dropoff_taxizone_id,322518322.0,160.983,70.48,1.0,125.0,163.0,236.0,265.0
pickup_weekday,322518322.0,3.019,1.945,0.0,2.0,4.0,6.0,6.0
pickup_weekofyear,322518322.0,26.782,15.022,1.0,18.0,36.0,52.0,52.0
pickup_hour,322518322.0,13.709,6.216,0.0,12.0,17.0,21.0,23.0
pickup_minute,322518322.0,29.571,17.334,0.0,15.0,31.0,45.0,59.0
pickup_week_hour,322518322.0,86.154,46.607,0.0,61.0,112.0,162.0,167.0
passenger_count,322518322.0,1.613,1.258,0.0,1.0,1.0,2.0,192.0
total_amount,322518322.0,16.734,189.021,-800.3,11.16,15.36,22.33,1084772.17


In [7]:
# Read the written total-amount test set back and count its rows.
# The count should equal the test row count computed before writing
# (39939208 in the recorded run).
amt_test_ = dd.read_parquet(f'{ml_path}/amount_test', engine='pyarrow')
len(amt_test_)

39939208

In [8]:
# Check the written schema: `id` first, then the features, then the `total_amount` label
amt_test_.head()

Unnamed: 0,id,pickup_taxizone_id,dropoff_taxizone_id,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_week_hour,passenger_count,total_amount
652305,15bce2261c9848bb98ff2f7961c3bf8a,211.0,113.0,0,37,9,44,9,1.0,6.3
652306,6c01c56e81134585baa28fdf98a39143,162.0,170.0,0,37,10,19,10,1.0,10.3
667249,af034981fdff4eb880ea06fe1dda1e0e,237.0,236.0,0,37,10,42,10,1.0,12.36
667250,d705ca12f46640dca91a0d650a30ae2d,43.0,170.0,0,37,11,7,11,1.0,14.3
682800,9ed65c90386145ecb03bc1b0de4ba40f,162.0,141.0,0,37,11,33,11,1.0,10.0


In [16]:
# Summary statistics per numeric column, rounded to 3 decimals and transposed to one row per column.
# In the recorded output the passenger_count count (39692607) is below the row count (39939208),
# i.e. the test set contains rows with a null passenger_count.
%time np.round(amt_test_.describe().compute(), 3).T

CPU times: user 1.22 s, sys: 7.59 ms, total: 1.23 s
Wall time: 5.63 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pickup_taxizone_id,39939208.0,162.939,65.851,1.0,132.0,162.0,234.0,265.0
dropoff_taxizone_id,39939208.0,160.995,70.189,1.0,116.0,162.0,234.0,265.0
pickup_weekday,39939208.0,2.953,1.944,0.0,2.0,4.0,6.0,6.0
pickup_weekofyear,39939208.0,39.524,8.169,1.0,36.0,44.0,52.0,52.0
pickup_hour,39939208.0,13.897,6.017,0.0,11.0,16.0,20.0,23.0
pickup_minute,39939208.0,29.554,17.341,0.0,15.0,30.0,45.0,59.0
pickup_week_hour,39939208.0,84.761,46.625,0.0,59.0,111.0,155.0,167.0
passenger_count,39692607.0,1.554,1.191,0.0,1.0,1.0,2.0,9.0
total_amount,39939208.0,19.668,94.979,-1871.8,11.8,15.95,23.76,411042.81


In [25]:
# Read the written tip training set back and count its rows.
# Only credit card rows with a non-null tip_fraction were written.
tip_train_ = dd.read_parquet(f'{ml_path}/tip_train', engine='pyarrow')
len(tip_train_)

219897929

In [26]:
# Check the written schema: `id` first, then the features, then the `tip_fraction` label
tip_train_.head()

Unnamed: 0,id,pickup_taxizone_id,dropoff_taxizone_id,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_week_hour,passenger_count,tip_fraction
0,733c487ba20a4f3db5659c945b05f6bf,263.0,236.0,6,27,6,56,150,1.0,0.3
1,d349a5902cb14eba9422fadf6a4b030d,142.0,163.0,6,27,10,50,154,5.0,0.166667
2,dda7ccc0a58b44e4b11c9de18c237617,74.0,66.0,6,27,10,50,154,1.0,0.0
5,abd5c77a61e2427d9be07b9421e83317,161.0,142.0,6,27,10,50,154,1.0,0.128205
7,f05e9663a2be42f8826d56b127b21ab2,226.0,261.0,6,27,10,50,154,1.0,0.166667


In [27]:
# Summary statistics per numeric column, rounded to 3 decimals and transposed to one row per column.
# The recorded output shows tip_fraction ranging from -0.01 to 1.0, so slightly negative labels exist.
%time np.round(tip_train_.describe().compute(), 3).T

CPU times: user 6.52 s, sys: 63.3 ms, total: 6.58 s
Wall time: 22 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pickup_taxizone_id,219897929.0,163.467,66.117,1.0,137.0,162.0,234.0,265.0
dropoff_taxizone_id,219897929.0,161.519,70.23,1.0,125.0,163.0,234.0,265.0
pickup_weekday,219897929.0,2.983,1.93,0.0,2.0,4.0,6.0,6.0
pickup_weekofyear,219897929.0,26.531,15.031,1.0,18.0,36.0,52.0,52.0
pickup_hour,219897929.0,13.782,6.269,0.0,12.0,17.0,20.0,23.0
pickup_minute,219897929.0,29.587,17.331,0.0,15.0,30.0,45.0,59.0
pickup_week_hour,219897929.0,85.381,46.17,0.0,62.0,111.0,160.0,167.0
passenger_count,219897929.0,1.598,1.251,0.0,1.0,1.0,2.0,192.0
tip_fraction,219897929.0,0.153,0.054,-0.01,0.146,0.167,0.167,1.0


In [28]:
# Read the written tip test set back and count its rows.
# Only credit card rows with a non-null tip_fraction were written.
tip_test_ = dd.read_parquet(f'{ml_path}/tip_test', engine='pyarrow')
len(tip_test_)

28433203

In [29]:
# Check the written schema: `id` first, then the features, then the `tip_fraction` label
tip_test_.head()

Unnamed: 0,id,pickup_taxizone_id,dropoff_taxizone_id,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_week_hour,passenger_count,tip_fraction
652306,c0ef188c059445baa5e73d75976a78a6,162.0,170.0,0,37,10,19,10,1.0,0.097087
667249,ee10146cfa5a4f33a88b401b7d00df5b,237.0,236.0,0,37,10,42,10,1.0,0.166667
682800,34946e497c814f5fb822727ee40b2546,162.0,141.0,0,37,11,33,11,1.0,0.12
581537,31aadae2b0b842baaea0b39243ee512b,264.0,264.0,2,47,22,10,70,5.0,0.089474
602848,94c02f92e5e94293a4b097731113c570,264.0,264.0,3,47,2,58,74,5.0,0.0


In [30]:
# Summary statistics per numeric column, rounded to 3 decimals and transposed to one row per column
%time np.round(tip_test_.describe().compute(), 3).T

CPU times: user 961 ms, sys: 15.1 ms, total: 976 ms
Wall time: 4 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pickup_taxizone_id,28433203.0,164.268,65.5,1.0,132.0,162.0,234.0,265.0
dropoff_taxizone_id,28433203.0,162.371,69.742,1.0,125.0,162.0,234.0,265.0
pickup_weekday,28433203.0,2.919,1.929,0.0,2.0,4.0,6.0,6.0
pickup_weekofyear,28433203.0,39.624,8.046,1.0,36.0,44.0,52.0,52.0
pickup_hour,28433203.0,13.977,6.073,0.0,12.0,16.0,20.0,23.0
pickup_minute,28433203.0,29.568,17.339,0.0,15.0,30.0,45.0,59.0
pickup_week_hour,28433203.0,84.038,46.195,0.0,62.0,108.0,146.0,167.0
passenger_count,28433203.0,1.537,1.182,0.0,1.0,1.0,2.0,9.0
tip_fraction,28433203.0,0.15,0.052,0.0,0.131,0.167,0.167,1.0
