# Introduction

This notebook is the ETL for the yellow and green NYC taxi data for the years 2019-2020.

The data has been downloaded separately from https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page and stored in `../data/raw/`. The file names are in the format `<colour>_tripdata_yyyy-mm.csv`.

In [1]:
%load_ext autoreload
%autoreload 2

In [32]:
import pandas as pd
from pathlib import Path
from dotenv import find_dotenv
from functools import reduce
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import (
    IntegerType,
    DateType,
    FloatType,
    StringType
)
from pyspark.ml.regression import (
    RandomForestRegressor,
    GBTRegressor
)
from pyspark.ml.feature import (
    StringIndexer,
    VectorAssembler,
    VectorIndexer,
    OneHotEncoderEstimator
)
from pyspark.ml.evaluation import (
    RegressionEvaluator
)
from pyspark.ml import Pipeline

ModuleNotFoundError: No module named 'pyspark.sql.DataFrame'

In [None]:
project_dir = Path(find_dotenv()).cwd().parent
data_dir = project_dir / 'data'
raw_data_dir = data_dir / 'raw'
processed_data_dir = data_dir / 'processed'
reports_dir = project_dir / 'reports'
models_dir = project_dir / 'models'

In [4]:
spark = (
    SparkSession
    .builder
    .appName('new_york_taxis')
    .getOrCreate()
)

In [5]:
spark.version

'2.4.5'

In [6]:
spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()

'3.0.0'

# Load data

In [7]:
path = processed_data_dir.joinpath('df_cleaned').as_posix()
df = spark.read.parquet(path)

# Choose features

**Drop  columns**
* The `Fare_amount` column is **not** to be used.
* The date columns are to be dropped because the features have already been extracted, e.g. `pickup_dayofyear`.

The label column is `Total_amount`.

In [8]:
df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- Passenger_count: integer (nullable = true)
 |-- Fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- Total_amount: float (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- congestion_surcharge: float (nullable = true)
 |-- colour: string (nullable = true)
 |-- pickup_year: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_dayofyear: integer (nullable = true)
 |-- pickup_dayofmonth: integer (nullable = true)
 |-- picku

In [9]:
drop_cols = [
    'pickup_datetime',
    'dropoff_datetime',
    'Fare_amount',
]

In [10]:
df_dropped = reduce(DataFrame.drop, drop_cols, df)

In [11]:
df_dropped.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- Passenger_count: integer (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- Total_amount: float (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- congestion_surcharge: float (nullable = true)
 |-- colour: string (nullable = true)
 |-- pickup_year: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_dayofyear: integer (nullable = true)
 |-- pickup_dayofmonth: integer (nullable = true)
 |-- pickup_dayofweek: integer (nullable = true)
 |-- pickup_weekofyear: integer (nullable = true)
 |-- pickup_hourofday: integer (nullable = true)
 |-- 

# Formatting

## Categorical columns

PySpark algorithms requires the data to be in a certain format, e.g. categorical values are to be represented as indexes.

In [12]:
string_cols = [item[0] for item in df_dropped.dtypes if item[1].startswith('string')]
string_cols

['VendorID',
 'Store_and_fwd_flag',
 'RatecodeID',
 'PULocationID',
 'DOLocationID',
 'Payment_type',
 'colour']

In [35]:
stages = []
ohe_cols = []
for col in string_cols:
    index_col = f'{col}_index'
    ohe_col = f'{col}_ohe'
    col_indexer = StringIndexer(inputCol=col, outputCol=index_col).setHandleInvalid('keep')
    col_encoder = OneHotEncoderEstimator(inputCols=[index_col], outputCols=[ohe_col])
    stages += [col_indexer, col_encoder]
    ohe_cols.append(ohe_col)

In [36]:
ohe_cols

['VendorID_ohe',
 'Store_and_fwd_flag_ohe',
 'RatecodeID_ohe',
 'PULocationID_ohe',
 'DOLocationID_ohe',
 'Payment_type_ohe',
 'colour_ohe']

## Numerical columns

In [25]:
num_cols = list(set(df_dropped.columns) - set(string_cols) - set(['Total_amount']))
num_cols

['pickup_weekofyear',
 'pickup_dayofyear',
 'pickup_dayofweek',
 'improvement_surcharge',
 'pickup_year',
 'tip_amount',
 'mta_tax',
 'trip_duration',
 'extra',
 'Passenger_count',
 'pickup_dayofmonth',
 'trip_distance_km',
 'pickup_month',
 'congestion_surcharge',
 'dropoff_year',
 'tolls_amount',
 'pickup_hourofday',
 'speed']

## Fillna

The categorical columns will be filled with "missing". `VectorAssembler` doesn't handle null values.

In [41]:
df_filled = (
    df_dropped
    .fillna(value='missing', subset=string_cols)
    .fillna(value=0, subset=num_cols)
)

## Assemble the columns

In [37]:
assembler = VectorAssembler(inputCols=num_cols + ohe_cols, outputCol='features')
stages += [assembler]

## Pipeline

In [56]:
stages

[StringIndexer_74d9d696ff31,
 OneHotEncoderEstimator_3f78119b65eb,
 StringIndexer_581dc28a7afc,
 OneHotEncoderEstimator_831f8fe1d5ae,
 StringIndexer_0f99705d934e,
 OneHotEncoderEstimator_2cdfccea4eb3,
 StringIndexer_bc570d18dfbd,
 OneHotEncoderEstimator_438a104ef2db,
 StringIndexer_14edce1b3154,
 OneHotEncoderEstimator_62269237b186,
 StringIndexer_770840f22e9c,
 OneHotEncoderEstimator_d0d1193ab3f9,
 StringIndexer_2ae6c3a1de0a,
 OneHotEncoderEstimator_f688e56e8016,
 VectorAssembler_577b694d400d]

In [39]:
pipeline = Pipeline(stages=stages)

In [57]:
pipeline_model = pipeline.fit(df_filled)

In [58]:
df_transformed = pipeline_model.transform(df_filled)

In [59]:
df_transformed.limit(10).show()

+--------+------------------+----------+------------+------------+---------------+-----+-------+----------+------------+---------------------+------------+------------+--------------------+------+-----------+------------+----------------+-----------------+----------------+-----------------+----------------+------------+-------------+------------------+------------------+--------------+-------------+------------------------+----------------------+----------------+-------------------+------------------+----------------+------------------+----------------+------------------+----------------+------------+-------------+--------------------+
|VendorID|Store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|Passenger_count|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|Total_amount|Payment_type|congestion_surcharge|colour|pickup_year|pickup_month|pickup_dayofyear|pickup_dayofmonth|pickup_dayofweek|pickup_weekofyear|pickup_hourofday|dropoff_year|trip_duration|  trip_distance_km|  

# Split data

The last three months of the data are to be used as the testing set.

In [61]:
test_set_mask = (F.col('pickup_year') == 2020) & (F.col('pickup_month') >= 10)
df_test = df_transformed.filter(test_set_mask)

In [62]:
df_test.groupby(['pickup_year', 'pickup_month']).count().show()

+-----------+------------+-------+
|pickup_year|pickup_month|  count|
+-----------+------------+-------+
|       2020|          11|1575114|
|       2020|          12|1522612|
|       2020|          10|1751017|
+-----------+------------+-------+



In [63]:
df_train = df_transformed.filter(~test_set_mask)

In [64]:
df_train.groupby(['pickup_year', 'pickup_month']).count().sort(['pickup_year', 'pickup_month']).show(24)

+-----------+------------+-------+
|pickup_year|pickup_month|  count|
+-----------+------------+-------+
|       2019|           1|8210354|
|       2019|           2|7515946|
|       2019|           3|8348950|
|       2019|           4|7869061|
|       2019|           5|7985822|
|       2019|           6|7327653|
|       2019|           7|6701176|
|       2019|           8|6443544|
|       2019|           9|6934948|
|       2019|          10|7606108|
|       2019|          11|7246815|
|       2019|          12|7265200|
|       2020|           1|6776129|
|       2020|           2|6626305|
|       2020|           3|3194072|
|       2020|           4| 268174|
|       2020|           5| 398037|
|       2020|           6| 603136|
|       2020|           7| 859777|
|       2020|           8|1073624|
|       2020|           9|1407156|
+-----------+------------+-------+



# Modelling

In [None]:
rf = RandomForestRegressor(featuresCol='features', labelCol='Total_amount').fit(df_train)

In [None]:
train_preds = rf.transform(df_test)

In [None]:
test_preds = rf.transform()

In [None]:
model_name = 'rf'
path = models_dir / model_name
rf.save(path, rf)

## Evaluation

In [None]:
evaluator = RegressionEvaluator(metricName='rmse')