# Create datasets for machine learning

- add unique id to each observation
- create time-based features
- select features and label for each ML task
- split data (last 6 months of 2019 as test data)

**ML tasks**
- `total_amount`: regression, all observations
- `tip_fraction`: regression, calculated as `tip_amount` / `total_amount` (credit card payments only)


In [1]:
import dask.dataframe as dd
import os
import numpy as np

# The S3 location is supplied through the environment so no bucket name is
# hard-coded in the notebook; fail early with a clear message if it is missing.
taxi_path = os.environ.get('TAXI_S3')
if taxi_path is None:
    raise ValueError('Set TAXI_S3 environment variable to an S3 location that you have read/write access to')

In [2]:
from dask.distributed import Client
from dask_saturn import SaturnCluster
import time

# Start (or attach to) the Saturn-managed Dask cluster and connect a client to it.
cluster = SaturnCluster(
    n_workers=20,
    nthreads=8,
    worker_size='2xlarge',
    scheduler_size='2xlarge',
)
client = Client(cluster)

# Last expression: render the cluster widget as the cell output.
cluster

[2020-08-06 19:12:06] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [3]:
# Lazily open the raw taxi parquet dataset stored under the configured S3 prefix.
raw_taxi_path = f'{taxi_path}/data/taxi_parquet'
taxi = dd.read_parquet(raw_taxi_path, engine='pyarrow')

# Preview the first rows to check the schema.
taxi.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_taxizone_id,dropoff_taxizone_id,pickup_latitude,...,dropoff_longitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,VTS,2009-01-04 02:52:00,2009-01-04 03:02:00,1.0,2.63,,0,,,40.721567,...,-73.993803,2,8.9,0.5,,0.0,0.0,,9.4,
1,VTS,2009-01-04 03:31:00,2009-01-04 03:38:00,3.0,4.55,,0,,,40.73629,...,-73.95585,1,12.1,0.5,,2.0,0.0,,14.6,
2,VTS,2009-01-03 15:43:00,2009-01-03 15:57:00,5.0,10.35,,0,,,40.739748,...,-73.869983,1,23.7,0.0,,4.74,0.0,,28.44,
3,DDS,2009-01-01 20:52:58,2009-01-01 21:14:00,1.0,5.0,,0,,,40.790955,...,-73.996558,1,14.9,0.5,,3.05,0.0,,18.45,
4,DDS,2009-01-24 16:18:23,2009-01-24 16:24:56,1.0,0.4,,0,,,40.719382,...,-74.008378,2,3.7,0.0,,0.0,0.0,,3.7,


In [4]:
%%time
# Total number of rows in the raw dataset (evaluated eagerly on the cluster).
len(taxi)

CPU times: user 128 ms, sys: 4.4 ms, total: 132 ms
Wall time: 11.3 s


1611604226

In [5]:
# The ML tasks only use trips that have taxi zone ids on both ends.
# TODO: if there's time, switch to a spatial join so older data (without zone ids) can be pulled in.
has_pickup_zone = taxi.pickup_taxizone_id.notnull()
has_dropoff_zone = taxi.dropoff_taxizone_id.notnull()
taxi = taxi[has_pickup_zone & has_dropoff_zone]

## Create some features and split data

- These features are stateless, so we can compute them before splitting into train/test sets
    - Can you think of more features that might make our models better?
- Use last 6 months of 2019 as test set, all other data for train set

In [6]:
# Derive the tip label and the pickup-time features in a single assign() call.
# The weekday and hour components are computed once up front because
# `pickup_week_hour` (hour of the week, 0-167) is built from both of them.
pickup_dt = taxi.pickup_datetime.dt
weekday = pickup_dt.weekday
hour = pickup_dt.hour

# NOTE(review): rows with total_amount == 0 yield NaN/inf for tip_fraction;
# only the NaN case is filtered later (via notnull) - confirm inf cannot occur.
# NOTE(review): `.dt.weekofyear` is deprecated in newer pandas releases in favour
# of `.dt.isocalendar().week` - confirm against the pinned pandas/dask versions.
taxi = taxi.assign(
    tip_fraction=taxi.tip_amount / taxi.total_amount,
    pickup_weekday=weekday,
    pickup_weekofyear=pickup_dt.weekofyear,
    pickup_hour=hour,
    pickup_minute=pickup_dt.minute,
    pickup_week_hour=(weekday * 24) + hour,
)

In [7]:
# Model inputs, grouped by origin; the concatenation order defines the
# column order of every ML dataset written by this notebook.
zone_features = ['pickup_taxizone_id', 'dropoff_taxizone_id']
time_features = [
    'pickup_weekday',
    'pickup_weekofyear',
    'pickup_hour',
    'pickup_minute',
    'pickup_week_hour',
]
trip_features = ['passenger_count']

features = zone_features + time_features + trip_features

In [8]:
import datetime

# Time-based split: everything from 2009 up to (not including) July 2019 is
# training data; the last six months of 2019 form the test set.
train_start = datetime.datetime(2009, 1, 1)
test_start = datetime.datetime(2019, 7, 1)
test_end = datetime.datetime(2020, 1, 1)

pickup_time = taxi.pickup_datetime
train = taxi[(pickup_time >= train_start) & (pickup_time < test_start)]
test = taxi[(pickup_time >= test_start) & (pickup_time < test_end)]

In [9]:
from dask import compute

# Collect the lazy row count and pickup-time range for each split, then
# evaluate all of them with one compute() call.
split_stats = []
for split in (train, test):
    split_stats.append(split.shape[0])
    split_stats.append(split.pickup_datetime.min())
    split_stats.append(split.pickup_datetime.max())

compute(*split_stats)

(322518322,
 Timestamp('2009-01-01 00:00:00'),
 Timestamp('2019-06-30 23:59:59'),
 39939208,
 Timestamp('2019-07-01 00:00:00'),
 Timestamp('2019-12-31 23:59:52'))

## Write out files

**NOTE**: UUIDs are generated independently for each ML dataset, so rows cannot be linked between datasets by `id`.

- `amount`: predict total amount
- `tip`: predict tip fraction (`tip_amount` / `total_amount`)

In [3]:
import s3fs
import uuid

# Shared S3 filesystem handle; write_df() uses it to check for and remove existing output.
fs = s3fs.S3FileSystem()
# S3 prefix under which the ML datasets of this notebook are written.
ml_path = f'{taxi_path}/data/ml'


def write_df(df: dd.DataFrame, path: str, partition_size='100MB', rm=True, create_id=True) -> None:
    """
    Add uuid, repartition, and write dataframe to parquet

    The input DataFrame is never modified: the id column is added to a new
    DataFrame, so the same input can be written again (e.g. when a cell is
    re-run) without accumulating extra `id` columns.

    Parameters:
        df (DataFrame): DataFrame to write
        path (str): S3 Path to write to
        partition_size (str): size for repartitioning; a falsy value skips repartitioning
        rm (bool): Whether to remove files before writing
        create_id (bool): Whether to add a unique id column (placed first)

    Returns:
        None. The data is written to `path` as snappy-compressed parquet
        without the index.
    """
    # Uses the module-level `fs` filesystem handle.
    if rm and fs.exists(path):
        fs.rm(path, recursive=True)

    if create_id:
        cols = df.columns.values.tolist()
        # One random uuid per row; the first column is only used as a row-aligned carrier.
        ids = df.iloc[:, 0].apply(lambda _: uuid.uuid4().hex, meta=('id', 'object'))
        # assign() builds a new DataFrame. `df['id'] = ...` would instead update
        # the caller's Dask DataFrame in place and leak the column out of this function.
        df = df.assign(id=ids)[['id'] + cols]

    if partition_size:
        df = df.repartition(partition_size=partition_size)

    df.to_parquet(path, write_index=False, engine='pyarrow', compression='snappy')

### Total amount prediction, use all data

In [11]:
# Features plus the `total_amount` label, taken from every row of each split.
amount_columns = features + ['total_amount']
amt_train = train[amount_columns]
amt_test = test[amount_columns]

In [12]:
# Write the amount training set (adds an id column, repartitions, replaces any existing output).
%time write_df(amt_train, f'{ml_path}/amount_train')

CPU times: user 6.72 s, sys: 84.4 ms, total: 6.8 s
Wall time: 29min


In [13]:
# Write the amount test set (adds an id column, repartitions, replaces any existing output).
%time write_df(amt_test, f'{ml_path}/amount_test')

CPU times: user 5.55 s, sys: 27.1 ms, total: 5.58 s
Wall time: 7min 47s


<br>

### Tip percentage prediction, only use credit card payment

`payment_type` of 1 means credit card, based on [data dictionary](https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf)

In [14]:
# Keep only credit card trips (payment_type == 1) with a defined tip fraction,
# then select the features plus the `tip_fraction` label.
tip_columns = features + ['tip_fraction']

train_tip_rows = (train.payment_type == 1) & train.tip_fraction.notnull()
tip_train = train[train_tip_rows][tip_columns]

test_tip_rows = (test.payment_type == 1) & test.tip_fraction.notnull()
tip_test = test[test_tip_rows][tip_columns]

In [15]:
# Write the tip training set (adds an id column, repartitions, replaces any existing output).
%time write_df(tip_train, f'{ml_path}/tip_train')

CPU times: user 6.88 s, sys: 2.33 s, total: 9.2 s
Wall time: 14min 27s


In [16]:
# Write the tip test set (adds an id column, repartitions, replaces any existing output).
%time write_df(tip_test, f'{ml_path}/tip_test')

CPU times: user 5.45 s, sys: 18.7 ms, total: 5.47 s
Wall time: 2min 43s


### Write samples for single-node work

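The sampled files are small, so only a few output files are expected. Note that `write_df` is called with its default `partition_size='100MB'`, so the number of partitions follows that setting rather than being forced to 1.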

In [4]:
%%time

# Draw a reproducible 5% sample (without replacement) from each written tip
# dataset. The ids already exist in the source files, so no new id is created.
sample_options = dict(frac=0.05, replace=False, random_state=42)

tip_train_written = dd.read_parquet(f'{ml_path}/tip_train', engine='pyarrow')
tip_train_sample = tip_train_written.sample(**sample_options)
write_df(tip_train_sample, f'{ml_path}/tip_train_sample', create_id=False)

tip_test_written = dd.read_parquet(f'{ml_path}/tip_test', engine='pyarrow')
tip_test_sample = tip_test_written.sample(**sample_options)
write_df(tip_test_sample, f'{ml_path}/tip_test_sample', create_id=False)

CPU times: user 1.44 s, sys: 65 ms, total: 1.51 s
Wall time: 38.7 s


## Check our work

(might need to restart kernel, `pyarrow` gets a bit confused)

In [17]:
# Re-read the written amount training set; its row count should match the
# training split size computed before writing.
amt_train_ = dd.read_parquet(f'{ml_path}/amount_train', engine='pyarrow')
len(amt_train_)

322518322

In [18]:
# Peek at the first rows: `id` should come first, followed by the features and the label.
amt_train_.head()

Unnamed: 0,id,pickup_taxizone_id,dropoff_taxizone_id,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_week_hour,passenger_count,total_amount
0,e624521847324f29aea590ab888242ae,263.0,236.0,6,27,6,56,150,1.0,9.0
1,2b222cbab875464f9ab842dee5700b4f,142.0,163.0,6,27,10,50,154,5.0,8.16
2,41d6e65728804c72ab41911b8b927a1e,74.0,66.0,6,27,10,50,154,1.0,27.8
3,d17abc2ab2564d8bb9855cc16cd24ae8,264.0,264.0,6,27,10,50,154,1.0,6.3
4,bd24cda21bce46518e3599f42f443523,48.0,68.0,6,27,10,50,154,1.0,6.3


In [19]:
# Summary statistics, rounded to 3 decimals and transposed (one row per dataset column).
# NOTE(review): the output below shows extreme values (negative and very large
# total_amount, passenger_count up to 192) - consider outlier handling downstream.
%time np.round(amt_train_.describe().compute(), 3).T

CPU times: user 9.16 s, sys: 187 ms, total: 9.35 s
Wall time: 25.4 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pickup_taxizone_id,322518322.0,162.917,66.556,1.0,132.0,162.0,234.0,265.0
dropoff_taxizone_id,322518322.0,160.983,70.48,1.0,125.0,163.0,236.0,265.0
pickup_weekday,322518322.0,3.019,1.945,0.0,2.0,4.0,6.0,6.0
pickup_weekofyear,322518322.0,26.782,15.022,1.0,18.0,36.0,52.0,52.0
pickup_hour,322518322.0,13.709,6.216,0.0,12.0,17.0,21.0,23.0
pickup_minute,322518322.0,29.571,17.334,0.0,15.0,31.0,45.0,59.0
pickup_week_hour,322518322.0,86.154,46.607,0.0,61.0,112.0,162.0,167.0
passenger_count,322518322.0,1.613,1.258,0.0,1.0,1.0,2.0,192.0
total_amount,322518322.0,16.734,189.021,-800.3,11.16,15.36,22.33,1084772.17


In [20]:
# Re-read the written amount test set; its row count should match the
# test split size computed before writing.
amt_test_ = dd.read_parquet(f'{ml_path}/amount_test', engine='pyarrow')
len(amt_test_)

39939208

In [21]:
# Peek at the first rows: `id` should come first, followed by the features and the label.
amt_test_.head()

Unnamed: 0,id,pickup_taxizone_id,dropoff_taxizone_id,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_week_hour,passenger_count,total_amount
0,e3998ec7b49848948aed269b53eea9bf,211.0,113.0,0,37,9,44,9,1.0,6.3
1,82b77260da9a4cdea3f33e488ddb9d40,162.0,170.0,0,37,10,19,10,1.0,10.3
2,4a4eb1915b564822954aed9e3e853c0f,237.0,236.0,0,37,10,42,10,1.0,12.36
3,d9e78065d9c344f3a395ec5359580393,43.0,170.0,0,37,11,7,11,1.0,14.3
4,0fcc5c62d32747c7a958816155ae80b4,162.0,141.0,0,37,11,33,11,1.0,10.0


In [22]:
# Summary statistics, rounded to 3 decimals and transposed (one row per dataset column).
# NOTE(review): in the output below the passenger_count count is lower than the
# row count, i.e. the test set contains missing passenger_count values.
%time np.round(amt_test_.describe().compute(), 3).T

CPU times: user 1.16 s, sys: 19 ms, total: 1.18 s
Wall time: 13.2 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pickup_taxizone_id,39939208.0,162.939,65.851,1.0,132.0,162.0,234.0,265.0
dropoff_taxizone_id,39939208.0,160.995,70.189,1.0,116.0,162.0,234.0,265.0
pickup_weekday,39939208.0,2.953,1.944,0.0,2.0,4.0,6.0,6.0
pickup_weekofyear,39939208.0,39.524,8.169,1.0,36.0,44.0,52.0,52.0
pickup_hour,39939208.0,13.897,6.017,0.0,11.0,16.0,20.0,23.0
pickup_minute,39939208.0,29.554,17.341,0.0,15.0,30.0,45.0,59.0
pickup_week_hour,39939208.0,84.761,46.625,0.0,59.0,111.0,155.0,167.0
passenger_count,39692607.0,1.554,1.191,0.0,1.0,1.0,2.0,9.0
total_amount,39939208.0,19.668,94.979,-1871.8,11.8,15.95,23.76,411042.81


In [23]:
# Re-read the written tip training set and count its rows
# (credit card trips with a defined tip fraction only).
tip_train_ = dd.read_parquet(f'{ml_path}/tip_train', engine='pyarrow')
len(tip_train_)

219897929

In [24]:
# Peek at the first rows: `id` should come first, followed by the features and `tip_fraction`.
tip_train_.head()

Unnamed: 0,id,pickup_taxizone_id,dropoff_taxizone_id,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_week_hour,passenger_count,tip_fraction
0,c92d7d1d969a4c5a89c7dc93f905881e,263.0,236.0,6,27,6,56,150,1.0,0.3
1,f6f9b9c710ff4adfb29c8687ee646d0d,142.0,163.0,6,27,10,50,154,5.0,0.166667
2,cadce931261a42a2b26878a5f610e900,74.0,66.0,6,27,10,50,154,1.0,0.0
3,445763736d5d456697048f87ca10f47f,161.0,142.0,6,27,10,50,154,1.0,0.128205
4,51c777d26f2d4f278e8862d2ce6dbd42,226.0,261.0,6,27,10,50,154,1.0,0.166667


In [25]:
# Summary statistics, rounded to 3 decimals and transposed (one row per dataset column).
# NOTE(review): the output below shows a slightly negative minimum tip_fraction.
%time np.round(tip_train_.describe().compute(), 3).T

CPU times: user 6.45 s, sys: 85.4 ms, total: 6.53 s
Wall time: 23.9 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pickup_taxizone_id,219897929.0,163.467,66.117,1.0,137.0,162.0,234.0,265.0
dropoff_taxizone_id,219897929.0,161.519,70.23,1.0,125.0,163.0,234.0,265.0
pickup_weekday,219897929.0,2.983,1.93,0.0,2.0,4.0,6.0,6.0
pickup_weekofyear,219897929.0,26.531,15.031,1.0,18.0,36.0,52.0,52.0
pickup_hour,219897929.0,13.782,6.269,0.0,12.0,17.0,20.0,23.0
pickup_minute,219897929.0,29.587,17.331,0.0,15.0,30.0,45.0,59.0
pickup_week_hour,219897929.0,85.381,46.17,0.0,62.0,111.0,160.0,167.0
passenger_count,219897929.0,1.598,1.251,0.0,1.0,1.0,2.0,192.0
tip_fraction,219897929.0,0.153,0.054,-0.01,0.146,0.167,0.167,1.0


In [26]:
# Re-read the written tip test set and count its rows
# (credit card trips with a defined tip fraction only).
tip_test_ = dd.read_parquet(f'{ml_path}/tip_test', engine='pyarrow')
len(tip_test_)

28433203

In [27]:
# Peek at the first rows: `id` should come first, followed by the features and `tip_fraction`.
tip_test_.head()

Unnamed: 0,id,pickup_taxizone_id,dropoff_taxizone_id,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_week_hour,passenger_count,tip_fraction
0,8e8109754e3e4cb7879c4e9ee216d58d,162.0,170.0,0,37,10,19,10,1.0,0.097087
1,a30e7c87866f417ab15dee5617f272a0,237.0,236.0,0,37,10,42,10,1.0,0.166667
2,1a7a611d0809489d99a5120727e0476a,162.0,141.0,0,37,11,33,11,1.0,0.12
3,736e84ca12a640cc858c210bd58f744c,264.0,264.0,2,47,22,10,70,5.0,0.089474
4,f2c24299d9a34ce986b7a271c5cc80b2,264.0,264.0,3,47,2,58,74,5.0,0.0


In [28]:
# Summary statistics, rounded to 3 decimals and transposed (one row per dataset column).
%time np.round(tip_test_.describe().compute(), 3).T

CPU times: user 876 ms, sys: 15.2 ms, total: 891 ms
Wall time: 10.4 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pickup_taxizone_id,28433203.0,164.268,65.5,1.0,132.0,162.0,234.0,265.0
dropoff_taxizone_id,28433203.0,162.371,69.742,1.0,125.0,162.0,234.0,265.0
pickup_weekday,28433203.0,2.919,1.929,0.0,2.0,4.0,6.0,6.0
pickup_weekofyear,28433203.0,39.624,8.046,1.0,36.0,44.0,52.0,52.0
pickup_hour,28433203.0,13.977,6.073,0.0,12.0,16.0,20.0,23.0
pickup_minute,28433203.0,29.568,17.339,0.0,15.0,30.0,45.0,59.0
pickup_week_hour,28433203.0,84.038,46.195,0.0,62.0,108.0,146.0,167.0
passenger_count,28433203.0,1.537,1.182,0.0,1.0,1.0,2.0,9.0
tip_fraction,28433203.0,0.15,0.052,0.0,0.131,0.167,0.167,1.0
