# Introduction

This notebook is the ETL for the yellow and green NYC taxi data for the years 2019-2020.

The data has been downloaded separately from https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page and stored in `../data/raw/`. The file names are in the format `<colour>_tripdata_yyyy-mm.csv`.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path
from dotenv import find_dotenv
from functools import reduce
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import (
    IntegerType,
    DateType,
    FloatType,
    StringType
)
from pyspark.ml.regression import (
    RandomForestRegressor,
    GBTRegressor,
    GeneralizedLinearRegression    
)
from pyspark.ml.feature import (
    StringIndexer,
    VectorAssembler,
    VectorIndexer,
    OneHotEncoderEstimator
)
from pyspark.ml.evaluation import (
    RegressionEvaluator
)
from pyspark.ml import Pipeline

In [3]:
project_dir = Path(find_dotenv()).cwd().parent
data_dir = project_dir / 'data'
raw_data_dir = data_dir / 'raw'
processed_data_dir = data_dir / 'processed'
reports_dir = project_dir / 'reports'
models_dir = project_dir / 'models'

In [4]:
spark = (
    SparkSession
    .builder
    .appName('new_york_taxis')
    .getOrCreate()
)

In [5]:
spark.version

'2.4.5'

In [6]:
spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()

'3.0.0'

# Load data

In [21]:
path = processed_data_dir.joinpath('df_cleaned').as_posix()
df = spark.read.parquet(path)

# Choose features

**Drop  columns**
* The `Fare_amount` column is **not** to be used.
* The date columns are to be dropped because the features have already been extracted, e.g. `pickup_dayofyear`.

The label column is `Total_amount`.

In [8]:
df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- Passenger_count: integer (nullable = true)
 |-- Fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- Total_amount: float (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- congestion_surcharge: float (nullable = true)
 |-- colour: string (nullable = true)
 |-- pickup_year: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_dayofyear: integer (nullable = true)
 |-- pickup_dayofmonth: integer (nullable = true)
 |-- picku

In [61]:
df.select('Total_amount').describe().show()

+-------+------------------+
|summary|      Total_amount|
+-------+------------------+
|  count|         115510730|
|   mean|1.4713562514553544|
| stddev| 5.751008121661385|
|    min|               0.0|
|    max|            4012.3|
+-------+------------------+



In [22]:
drop_cols = [
    'pickup_datetime',
    'dropoff_datetime',
    'Fare_amount',
]

In [23]:
df_dropped = reduce(DataFrame.drop, drop_cols, df)

In [24]:
df_dropped.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- Passenger_count: integer (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- Total_amount: float (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- congestion_surcharge: float (nullable = true)
 |-- colour: string (nullable = true)
 |-- pickup_year: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_dayofyear: integer (nullable = true)
 |-- pickup_dayofmonth: integer (nullable = true)
 |-- pickup_dayofweek: integer (nullable = true)
 |-- pickup_weekofyear: integer (nullable = true)
 |-- pickup_hourofday: integer (nullable = true)
 |-- 

In [62]:
df_sampled = df_dropped.sample(fraction=0.05)
for col in df_sampled.columns:
    df_sampled.select(col).describe().show()

+-------+-------------------+
|summary|           VendorID|
+-------+-------------------+
|  count|            5679245|
|   mean| 1.6619224914579314|
| stddev|0.48763179817301777|
|    min|                  1|
|    max|                  4|
+-------+-------------------+

+-------+------------------+
|summary|Store_and_fwd_flag|
+-------+------------------+
|  count|            334910|
|   mean|              null|
| stddev|              null|
|    min|                 N|
|    max|                 Y|
+-------+------------------+

+-------+------------------+
|summary|        RatecodeID|
+-------+------------------+
|  count|           5730608|
|   mean|3.0218915619424704|
| stddev|123.76770517109335|
|    min|              -.06|
|    max|          99521.33|
+-------+------------------+

+-------+------------------+
|summary|      PULocationID|
+-------+------------------+
|  count|           5726160|
|   mean| 8.162409712617182|
| stddev|32.367606620129976|
|    min|                 1|
| 

In [64]:
df_dropped.describe().show()

+-------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+-----------------+---------------------+------------------+------------------+--------------------+---------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+
|summary|          VendorID|Store_and_fwd_flag|        RatecodeID|     PULocationID|      DOLocationID|   Passenger_count|             extra|          mta_tax|        tip_amount|     tolls_amount|improvement_surcharge|      Total_amount|      Payment_type|congestion_surcharge|   colour|       pickup_year|      pickup_month|  pickup_dayofyear| pickup_dayofmonth|  pickup_dayofweek| pickup_weekofyear|  pickup_hourofday|      dropoff_year|     trip_duration| trip_distance_km|             speed|
+-------

# Formatting

## Categorical columns

PySpark algorithms requires the data to be in a certain format, e.g. categorical values are to be represented as indexes.

In [25]:
string_cols = [item[0] for item in df_dropped.dtypes if item[1].startswith('string')]
string_cols

['VendorID',
 'Store_and_fwd_flag',
 'RatecodeID',
 'PULocationID',
 'DOLocationID',
 'Payment_type',
 'colour']

In [26]:
stages = []
ohe_cols = []
for col in string_cols:
    index_col = f'{col}_index'
    ohe_col = f'{col}_ohe'
    col_indexer = StringIndexer(inputCol=col, outputCol=index_col).setHandleInvalid('keep')
    col_encoder = OneHotEncoderEstimator(inputCols=[index_col], outputCols=[ohe_col])
    stages += [col_indexer, col_encoder]
    ohe_cols.append(ohe_col)

In [27]:
ohe_cols

['VendorID_ohe',
 'Store_and_fwd_flag_ohe',
 'RatecodeID_ohe',
 'PULocationID_ohe',
 'DOLocationID_ohe',
 'Payment_type_ohe',
 'colour_ohe']

## Numerical columns

In [28]:
num_cols = list(set(df_dropped.columns) - set(string_cols) - set(['Total_amount']))
num_cols

['pickup_month',
 'trip_duration',
 'dropoff_year',
 'pickup_year',
 'tip_amount',
 'tolls_amount',
 'mta_tax',
 'trip_distance_km',
 'extra',
 'pickup_hourofday',
 'speed',
 'Passenger_count',
 'pickup_dayofweek',
 'pickup_weekofyear',
 'pickup_dayofmonth',
 'congestion_surcharge',
 'pickup_dayofyear',
 'improvement_surcharge']

## Fillna

The categorical columns will be filled with "missing". `VectorAssembler` doesn't handle null values.

In [29]:
df_filled = (
    df_dropped
    .fillna(value='missing', subset=string_cols)
    .fillna(value=0, subset=num_cols)
)

## Assemble the columns

In [30]:
assembler = VectorAssembler(inputCols=num_cols + ohe_cols, outputCol='features')
stages += [assembler]

## Pipeline

In [31]:
stages

[StringIndexer_654ac2d1e024,
 OneHotEncoderEstimator_2d1977d41002,
 StringIndexer_e3afd387a4c6,
 OneHotEncoderEstimator_70023d4644a9,
 StringIndexer_218e20b1952a,
 OneHotEncoderEstimator_d1ee9e4f41bf,
 StringIndexer_1f3144c17704,
 OneHotEncoderEstimator_455e6e113c31,
 StringIndexer_82cc46283c24,
 OneHotEncoderEstimator_9edf116eb6d7,
 StringIndexer_bb19802f4547,
 OneHotEncoderEstimator_0f05de519926,
 StringIndexer_c370622cbd3d,
 OneHotEncoderEstimator_488afd6c47d8,
 VectorAssembler_7d6f018be5b8]

In [32]:
pipeline = Pipeline(stages=stages)

In [33]:
pipeline_model = pipeline.fit(df_filled)

In [34]:
df_transformed = pipeline_model.transform(df_filled)

In [22]:
df_transformed.limit(10).show()

+--------+------------------+----------+------------+------------+---------------+-----+-------+----------+------------+---------------------+------------+------------+--------------------+------+-----------+------------+----------------+-----------------+----------------+-----------------+----------------+------------+-------------+------------------+------------------+--------------+-------------+------------------------+----------------------+----------------+-------------------+------------------+----------------+------------------+----------------+------------------+----------------+------------+-------------+--------------------+
|VendorID|Store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|Passenger_count|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|Total_amount|Payment_type|congestion_surcharge|colour|pickup_year|pickup_month|pickup_dayofyear|pickup_dayofmonth|pickup_dayofweek|pickup_weekofyear|pickup_hourofday|dropoff_year|trip_duration|  trip_distance_km|  

# Split data

The last three months of the data are to be used as the testing set.

In [35]:
test_set_mask = (F.col('pickup_year') == 2020) & (F.col('pickup_month') >= 10)
df_test = df_transformed.filter(test_set_mask)

In [24]:
df_test.groupby(['pickup_year', 'pickup_month']).count().show()

+-----------+------------+-------+
|pickup_year|pickup_month|  count|
+-----------+------------+-------+
|       2020|          11|1575114|
|       2020|          12|1522612|
|       2020|          10|1751017|
+-----------+------------+-------+



In [25]:
df_train = df_transformed.filter(~test_set_mask)

In [64]:
df_train.groupby(['pickup_year', 'pickup_month']).count().sort(['pickup_year', 'pickup_month']).show(24)

+-----------+------------+-------+
|pickup_year|pickup_month|  count|
+-----------+------------+-------+
|       2019|           1|8210354|
|       2019|           2|7515946|
|       2019|           3|8348950|
|       2019|           4|7869061|
|       2019|           5|7985822|
|       2019|           6|7327653|
|       2019|           7|6701176|
|       2019|           8|6443544|
|       2019|           9|6934948|
|       2019|          10|7606108|
|       2019|          11|7246815|
|       2019|          12|7265200|
|       2020|           1|6776129|
|       2020|           2|6626305|
|       2020|           3|3194072|
|       2020|           4| 268174|
|       2020|           5| 398037|
|       2020|           6| 603136|
|       2020|           7| 859777|
|       2020|           8|1073624|
|       2020|           9|1407156|
+-----------+------------+-------+



In [47]:
df_train_path = processed_data_dir.joinpath('df_train').as_posix()
df_test_path = processed_data_dir.joinpath('df_test').as_posix()

In [28]:
df_train.write.parquet(df_train_path)
df_test.write.parquet(df_test_path)

In [48]:
df_train = spark.read.parquet(df_train_path)
df_test = spark.read.parquet(df_test_path)

# Save subsets

## 1%

In [31]:
fraction = 0.01
filename = f'df_train_{fraction}'.replace('.', '')
df_train_001 = df_train.sample(fraction=fraction)

In [7]:
df_train_001_path = processed_data_dir.joinpath(filename).as_posix()
# df_train_1m.write.parquet(df_train_1m_path)
df_train_001 = spark.read.parquet(df_train_001_path)

## 5%

In [49]:
fraction = 0.05
df_train_005 = df_train.sample(fraction=fraction)

In [57]:
filename = f'df_train_{fraction}'.replace('.', '')
df_train_005_path = processed_data_dir.joinpath(filename).as_posix()
# df_train_005.write.parquet(df_train_005_path)
df_train_005 = spark.read.parquet(df_train_005_path)

# Modelling 

## Random Forest on 1%
Training on 1% of the training data.

In [8]:
rf_001 = RandomForestRegressor(featuresCol='features', labelCol='Total_amount').fit(df_train_001)

In [10]:
train_preds = rf_005.transform(df_train_001)

In [39]:
test_preds = rf_1m.transform(df_test)

In [15]:
model_name = 'rf_001'
path = models_dir.joinpath(model_name).as_posix()
rf_001.save(path)

## Random Forest on 5%
Training on 5% of the training data.

In [58]:
rf_005 = RandomForestRegressor(featuresCol='features', labelCol='Total_amount').fit(df_train_005)

KeyboardInterrupt: 

In [63]:
train_preds = rf_005.transform(df_train_005)

NameError: name 'rf_005' is not defined

In [None]:
test_preds = rf_1m.transform(df_test)

In [None]:
model_name = 'rf_005'
path = models_dir.joinpath(model_name).as_posix()
rf_005.save(path)

## GBT on 5%
Training on 5% of the training data.

In [58]:
gbt_005 = GBTRegressor(featuresCol='features', labelCol='Total_amount').fit(df_train_005)

KeyboardInterrupt: 

In [None]:
train_preds = rf_005.transform(df_train_005)

In [None]:
test_preds = rf_1m.transform(df_test)

In [None]:
model_name = 'rf_005'
path = models_dir.joinpath(model_name).as_posix()
rf_005.save(path)

## Evaluation

In [19]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='Total_amount')

In [20]:
evaluator.evaluate(train_preds)

1.5544066754621952

In [40]:
evaluator.evaluate(test_preds)

2.5168938613512313