# Introduction

This notebook is the ETL for the yellow and green NYC taxi data for the years 2019-2020.

The data has been downloaded separately from https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page and stored in `../data/raw/`. The file names are in the format `<colour>_tripdata_yyyy-mm.csv`.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path
from dotenv import find_dotenv
from functools import reduce
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import (
    IntegerType,
    DateType,
    FloatType,
    StringType
)
from pyspark.ml.regression import (
    RandomForestRegressor,
    GBTRegressor,
    GeneralizedLinearRegression
)
from pyspark.ml.feature import (
    StringIndexer,
    VectorAssembler,
    VectorIndexer,
    OneHotEncoderEstimator
)
from pyspark.ml.evaluation import (
    RegressionEvaluator
)
from pyspark.ml import Pipeline

In [3]:
project_dir = Path(find_dotenv()).cwd().parent
data_dir = project_dir / 'data'
raw_data_dir = data_dir / 'raw'
processed_data_dir = data_dir / 'processed'
reports_dir = project_dir / 'reports'
models_dir = project_dir / 'models'
results_dir = processed_data_dir / 'results'

In [4]:
spark = (
    SparkSession
    .builder
    .appName('new_york_taxis')
    .getOrCreate()
)

In [5]:
spark.version

'2.4.5'

In [6]:
spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()

'3.0.0'

In [11]:
labelCol = 'Total_amount'

# Load data

In [8]:
path = processed_data_dir.joinpath('df_cleaned').as_posix()
df = spark.read.parquet(path)

# Choose features

**Drop  columns**
* The `Fare_amount` column is **not** to be used.
* The date columns are to be dropped because the features have already been extracted, e.g. `pickup_dayofyear`.

The label column is `Total_amount`.

In [10]:
df.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- Passenger_count: integer (nullable = true)
 |-- Fare_amount: float (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- Total_amount: float (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- congestion_surcharge: float (nullable = true)
 |-- colour: string (nullable = true)
 |-- pickup_year: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_dayofyear: integer (nullable = true)
 |-- pickup_dayofmonth: integer (nullable = true)
 |-- picku

In [11]:
drop_cols = [
    'pickup_datetime',
    'dropoff_datetime',
    'Fare_amount',
]

In [12]:
df_dropped = reduce(DataFrame.drop, drop_cols, df)

In [13]:
df_dropped.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- Store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- Passenger_count: integer (nullable = true)
 |-- extra: float (nullable = true)
 |-- mta_tax: float (nullable = true)
 |-- tip_amount: float (nullable = true)
 |-- tolls_amount: float (nullable = true)
 |-- improvement_surcharge: float (nullable = true)
 |-- Total_amount: float (nullable = true)
 |-- Payment_type: string (nullable = true)
 |-- congestion_surcharge: float (nullable = true)
 |-- colour: string (nullable = true)
 |-- pickup_year: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_dayofyear: integer (nullable = true)
 |-- pickup_dayofmonth: integer (nullable = true)
 |-- pickup_dayofweek: integer (nullable = true)
 |-- pickup_weekofyear: integer (nullable = true)
 |-- pickup_hourofday: integer (nullable = true)
 |-- 

# Formatting

## Categorical columns

PySpark algorithms requires the data to be in a certain format, e.g. categorical values are to be represented as indexes.

In [14]:
string_cols = [item[0] for item in df_dropped.dtypes if item[1].startswith('string')]
string_cols

['VendorID',
 'Store_and_fwd_flag',
 'RatecodeID',
 'PULocationID',
 'DOLocationID',
 'Payment_type',
 'colour']

In [15]:
stages = []
ohe_cols = []
for col in string_cols:
    index_col = f'{col}_index'
    ohe_col = f'{col}_ohe'
    col_indexer = StringIndexer(inputCol=col, outputCol=index_col).setHandleInvalid('keep')
    col_encoder = OneHotEncoderEstimator(inputCols=[index_col], outputCols=[ohe_col])
    stages += [col_indexer, col_encoder]
    ohe_cols.append(ohe_col)

In [16]:
ohe_cols

['VendorID_ohe',
 'Store_and_fwd_flag_ohe',
 'RatecodeID_ohe',
 'PULocationID_ohe',
 'DOLocationID_ohe',
 'Payment_type_ohe',
 'colour_ohe']

## Numerical columns

In [17]:
num_cols = list(set(df_dropped.columns) - set(string_cols) - set(['Total_amount']))
num_cols

['tolls_amount',
 'tip_amount',
 'pickup_dayofmonth',
 'pickup_hourofday',
 'pickup_month',
 'pickup_year',
 'dropoff_year',
 'pickup_dayofweek',
 'improvement_surcharge',
 'congestion_surcharge',
 'trip_distance_km',
 'mta_tax',
 'extra',
 'speed',
 'Passenger_count',
 'trip_duration',
 'pickup_dayofyear',
 'pickup_weekofyear']

## Fillna

The categorical columns will be filled with "missing". `VectorAssembler` doesn't handle null values.

In [18]:
df_filled = (
    df_dropped
    .fillna(value='missing', subset=string_cols)
    .fillna(value=0, subset=num_cols)
)

## Assemble the columns

In [19]:
assembler = VectorAssembler(inputCols=num_cols + ohe_cols, outputCol='features')
stages += [assembler]

## Pipeline

In [20]:
stages

[StringIndexer_91d32ac5a070,
 OneHotEncoderEstimator_b8a0f6257a58,
 StringIndexer_2b80ec0eb67b,
 OneHotEncoderEstimator_7402d5ad0959,
 StringIndexer_3f360700db91,
 OneHotEncoderEstimator_acab85cf6c94,
 StringIndexer_2f7dedbb407c,
 OneHotEncoderEstimator_52a9a3a3f4f2,
 StringIndexer_37378636444e,
 OneHotEncoderEstimator_a200e54e9640,
 StringIndexer_3700f881e298,
 OneHotEncoderEstimator_282dde2a35d3,
 StringIndexer_cf74eb556fc8,
 OneHotEncoderEstimator_7c7c56fec0b8,
 VectorAssembler_f5e17d877239]

In [21]:
pipeline = Pipeline(stages=stages)

In [22]:
pipeline_model = pipeline.fit(df_filled)

In [23]:
df_transformed = pipeline_model.transform(df_filled)

In [24]:
df_transformed.limit(10).show()

+--------+------------------+----------+------------+------------+---------------+-----+-------+----------+------------+---------------------+------------+------------+--------------------+------+-----------+------------+----------------+-----------------+----------------+-----------------+----------------+------------+-------------+------------------+------------------+--------------+-------------+------------------------+----------------------+----------------+--------------+------------------+----------------+------------------+----------------+------------------+----------------+------------+-------------+--------------------+
|VendorID|Store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|Passenger_count|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|Total_amount|Payment_type|congestion_surcharge|colour|pickup_year|pickup_month|pickup_dayofyear|pickup_dayofmonth|pickup_dayofweek|pickup_weekofyear|pickup_hourofday|dropoff_year|trip_duration|  trip_distance_km|       

# Split data

The last three months of the data are to be used as the testing set.

In [25]:
test_set_mask = (F.col('pickup_year') == 2020) & (F.col('pickup_month') >= 10)
df_test = df_transformed.filter(test_set_mask)

In [26]:
df_test.groupby(['pickup_year', 'pickup_month']).count().show()

+-----------+------------+-------+
|pickup_year|pickup_month|  count|
+-----------+------------+-------+
|       2020|          11|1575084|
|       2020|          12|1522569|
|       2020|          10|1750997|
+-----------+------------+-------+



In [27]:
df_train = df_transformed.filter(~test_set_mask)

In [28]:
df_train.groupby(['pickup_year', 'pickup_month']).count().sort(['pickup_year', 'pickup_month']).show(24)

+-----------+------------+-------+
|pickup_year|pickup_month|  count|
+-----------+------------+-------+
|       2019|           1|8210352|
|       2019|           2|7515945|
|       2019|           3|8348950|
|       2019|           4|7869060|
|       2019|           5|7985822|
|       2019|           6|7327652|
|       2019|           7|6701176|
|       2019|           8|6443544|
|       2019|           9|6934948|
|       2019|          10|7606108|
|       2019|          11|7246813|
|       2019|          12|7265198|
|       2020|           1|6776128|
|       2020|           2|6626300|
|       2020|           3|3194064|
|       2020|           4| 268170|
|       2020|           5| 398016|
|       2020|           6| 603131|
|       2020|           7| 859769|
|       2020|           8|1073604|
|       2020|           9|1407131|
+-----------+------------+-------+



In [8]:
df_train_path = processed_data_dir.joinpath('df_train').as_posix()
df_test_path = processed_data_dir.joinpath('df_test').as_posix()

In [31]:
df_train.write.parquet(df_train_path, mode='overwrite')
df_test.write.parquet(df_test_path, mode='overwrite')

In [9]:
df_train = spark.read.parquet(df_train_path)
df_test = spark.read.parquet(df_test_path)

# Save subsets

## 1%

In [34]:
fraction = 0.01
filename = f'df_train_{fraction}'.replace('.', '')
df_train_001 = df_train.sample(fraction=fraction)
df_train_001_path = processed_data_dir.joinpath(filename).as_posix()
df_train_001.write.parquet(df_train_001_path, mode='overwrite')

## 5%

In [35]:
fraction = 0.05
filename = f'df_train_{fraction}'.replace('.', '')

df_train_005 = df_train.sample(fraction=fraction)
df_train_005_path = processed_data_dir.joinpath(filename).as_posix()
df_train_005.write.parquet(df_train_005_path, mode='overwrite')

# Modelling 

In [14]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='Total_amount')

## GLM

In [37]:
model_name = 'glm'
model = GeneralizedLinearRegression(family="gaussian", 
                                    link="identity",
                                    maxIter=10,
                                    regParam=0.3,
                                    labelCol=labelCol).fit(df_train)

path = models_dir.joinpath(model_name).as_posix()
model.save(path)

In [38]:
train_preds = model.transform(df_train)
test_preds = model.transform(df_test)

train_score = evaluator.evaluate(train_preds)
test_score = evaluator.evaluate(test_preds)

In [40]:
print(f'Train RMSE for {model_name} model: {train_score}')
print(f'Test RMSE for {model_name} model: {test_score}')

Train RMSE for glm model: 129.00596329943053
Test RMSE for glm model: 488.29038922405243


## Random Forest

In [12]:
model_name = 'rf'
model = RandomForestRegressor(featuresCol='features',
                              labelCol=labelCol).fit(df_train)

path = models_dir.joinpath(model_name).as_posix()
model.save(path)

In [15]:
train_preds = model.transform(df_train)
test_preds = model.transform(df_test)

train_score = evaluator.evaluate(train_preds)
test_score = evaluator.evaluate(test_preds)

In [17]:
print(f'Train RMSE for {model_name} model: {train_score}')
print(f'Test RMSE for {model_name} model: {test_score}')

Train RMSE for rf model: 167.7277791381423
Test RMSE for rf model: 489.1960407601678


## Gradient Boosted Trees

In [18]:
model_name = 'gbt'
model = GBTRegressor(featuresCol='features',
                     labelCol=labelCol).fit(df_train)

path = models_dir.joinpath(model_name).as_posix()
model.save(path)

KeyboardInterrupt: 

In [None]:
train_preds = model.transform(df_train)
test_preds = model.transform(df_test)

train_score = evaluator.evaluate(train_preds)
test_score = evaluator.evaluate(test_preds)

In [None]:
print(f'Train RMSE for {model_name} model: {train_score}')
print(f'Test RMSE for {model_name} model: {test_score}')

In [19]:
spark.sparkContext.stop()