In [2]:
# Install libraries within the notebook scope.
# Packages are installed one at a time, in the order listed.
for pypi_package in ("boto3", "pandas", "requests", "s3fs", "fsspec"):
    sc.install_pypi_package(pypi_package)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting boto3
  Using cached https://files.pythonhosted.org/packages/f2/d9/e8a56bd0953914f60207af4c41bb3947c47ca03577b1fe26258249dd9af7/boto3-1.16.6-py2.py3-none-any.whl
Collecting botocore<1.20.0,>=1.19.6 (from boto3)
  Using cached https://files.pythonhosted.org/packages/15/6c/f5b074e14823f250e0a73e53714c1ed80d689d530468936d35a9d336f1dd/botocore-1.19.6-py2.py3-none-any.whl
Collecting s3transfer<0.4.0,>=0.3.0 (from boto3)
  Using cached https://files.pythonhosted.org/packages/69/79/e6afb3d8b0b4e96cefbdc690f741d7dd24547ff1f94240c997a26fa908d3/s3transfer-0.3.3-py2.py3-none-any.whl
Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.20.0,>=1.19.6->boto3)
  Using cached https://files.pythonhosted.org/packages/d4/70/d60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/python_dateutil-2.8.1-py2.py3-none-any.whl
Installing collected packages: python-dateutil, botocore, s3transfer, boto3
Successfully installed boto3-1.16.6 botocore-1.19.6 python-dateutil-2.8.1 s3transfer-0.3.3


In [3]:
import boto3
from datetime import datetime
import fsspec
import pandas as pd
from pyspark.sql.dataframe import DataFrame
from pyspark.sql import functions as f, types as t, Window
from pathlib import Path
import re
import requests
import s3fs
import subprocess
import timeit
from urllib.parse import urlparse

# Removes truncation of columns and column values in pandas by default.
# The fully qualified "display." option names are used on purpose: a bare
# 'max_columns' is treated as a pattern, and pandas raises an OptionError
# when that pattern matches more than one registered option.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Monkey patching the DataFrame transform method for Spark 2.4.
# This is available by default in Spark 3.0, so the patch is only installed
# when DataFrame does not already provide `transform`; that way the native
# implementation is never replaced.
def transform(self, f):
    """
    Apply `f` to this dataframe and return its result.

    self: the dataframe the method is called on.
    f: a callable that takes a dataframe and returns a dataframe.
       (Inside this function the name shadows the `f` functions alias.)
    """
    return f(self)

if not hasattr(DataFrame, "transform"):
    DataFrame.transform = transform

# Override the timeit template to return the command's
# return value in addition to the time: (elapsed_seconds, retval)
# Reference: https://stackoverflow.com/questions/24812253/how-can-i-capture-return-value-with-python-timeit-module
# `retval` is initialised before the loop so that a run with number=0
# returns (elapsed, None) instead of failing on an unbound local name.
# Note: the statement is spliced in as the right-hand side of an assignment,
# so it has to be a single expression.
timeit.template = """
def inner(_it, _timer{init}):
    {setup}
    retval = None
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""

def shell_cmd(cmd):
    """
    Run a shell command and print its standard output, one line at a time.

    Some helpful recipes:
    - List files on hdfs: shell_cmd("hdfs dfs -ls hdfs:///tmp/data/")
    - Remove files from hdfs: shell_cmd("hdfs dfs -rm -r hdfs:///tmp/data/test_pyspark")

    Parameters:
        cmd: the command line as a single string. It is handed to the shell
             unchanged (shell=True), so only pass trusted input.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
        status.
    """
    raw_output = subprocess.check_output(cmd, shell=True)
    # Decode so lines print as plain text rather than as b'...' reprs;
    # undecodable bytes are replaced instead of aborting the printout.
    text_output = raw_output.decode("utf-8", errors="replace")
    # splitlines() does not yield the empty trailing entry that
    # split('\n') produces when the output ends with a newline.
    for line in text_output.splitlines():
        print(line)

def timer_method(cmd):
    """
    Time one execution of `cmd` with timeit and return timeit's result.

    `cmd` is a string holding the call to time, for example:
     run_time, result = timer_method("myfunction(arg1, arg2)")
    With the timeit template override used in this notebook the result is a
    (run_time, return_value) pair; otherwise it is whatever timeit.timeit
    returns.
    """
    # The statement runs against this module's globals so it can resolve
    # the functions and variables it names.
    namespace = globals()
    outcome = timeit.timeit(cmd, number=1, globals=namespace)
    return outcome

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Set your s3 bucket name
This should be data-scale-oreilly-{your name}   
If you don't remember it, check the [S3 console](https://s3.console.aws.amazon.com/)

In [4]:
# Replace with your own bucket name (expected form: data-scale-oreilly-{your name})
MY_BUCKET_NAME = "data-scale-oreilly"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Ingesting from an S3 bucket - NYC Taxi Data

https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
* Taxi data 
* Data dictionaries 
* Taxi zone lookup table

Data ingestion has the ultimate goal of collecting, aggregating, and surfacing data for a specific purpose; an analysis, an API, a dashboard, etc. Think about how you might use the taxi data to answer the following questions:

1. Which borough is the most popular pickup or drop off spot?
1. Are green taxis more popular for trips within the same borough vs yellow taxis?
1. Build a recommendation engine that predicts surge pricing for a given time of day based on historical data  

With this in mind, let's work through bringing this data onto the cluster.

In [4]:
# Note, if you copy the link from the taxi data website you will see:
# https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv
# Two things to notice:
#  - the first path segment after "s3.amazonaws.com/" ("nyc-tlc") is the bucket name
#  - in "trip+data" the "+" stands for a space, so the s3:// path uses "trip data"
taxi_data_path = "s3://nyc-tlc/trip data/yellow_tripdata_2020-01.csv"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Pandas uses s3fs to read_csv from s3.
# keep_default_na=False turns off pandas' default NA-string handling, so
# empty cells are kept as empty strings rather than becoming NaN (this is
# what the astype example later in the notebook trips over).
pd_df_taxi= pd.read_csv(taxi_data_path, keep_default_na=False)
print(pd_df_taxi.head())
pd_df_taxi.dtypes

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

  VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count  \
0        1  2020-01-01 00:28:15   2020-01-01 00:33:03               1   
1        1  2020-01-01 00:35:39   2020-01-01 00:43:04               1   
2        1  2020-01-01 00:47:41   2020-01-01 00:53:52               1   
3        1  2020-01-01 00:55:23   2020-01-01 01:00:14               1   
4        2  2020-01-01 00:01:58   2020-01-01 00:04:16               1   

   trip_distance RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0            1.2          1                  N           238           239   
1            1.2          1                  N           239           238   
2            0.6          1                  N           238           238   
3            0.8          1                  N           238           151   
4            0.0          1                  N           193           193   

  payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \
0            1     

In [10]:
# For reference, look at the Spark DataFrameReader, csv:
# https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html
# header=True takes the column names from the first row; inferSchema=True
# asks Spark to infer the column types from the data.
ps_df_taxi = spark.read.option('header', True).option('inferSchema', True).csv(taxi_data_path)
ps_df_taxi.show()
ps_df_taxi.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|       1| 2020-01-01 00:28:15|  2020-01-01 00:33:03|              1|          1.2|         1|                 N|         238|         239|           1|        6.0|  3.0|    0.5|      1.47|         0.0|                  0.3

In [12]:
# Talk through ingest practices around retaining original data vs augmenting.
# For example, we may want to keep the data in its default format so we can
# refer back to it if there are bugs in our data ingestion code.
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# the write raises an error once hdfs:///tmp/input/taxi_data already exists.
(ps_df_taxi
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("hdfs:///tmp/input/taxi_data"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# Discuss how spark writes files out: the output path is a directory that
# holds several part-* files plus a _SUCCESS marker
shell_cmd("hdfs dfs -ls hdfs:///tmp/input/taxi_data")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

b'Found 6 items'
b'-rw-r--r--   1 livy hadoop          0 2020-10-28 02:53 hdfs:///tmp/input/taxi_data/_SUCCESS'
b'-rw-r--r--   1 livy hadoop  154591771 2020-10-28 02:53 hdfs:///tmp/input/taxi_data/part-00000-935fbf80-41e9-4477-8419-94a78c1e7e3d-c000.csv'
b'-rw-r--r--   1 livy hadoop  154402085 2020-10-28 02:53 hdfs:///tmp/input/taxi_data/part-00001-935fbf80-41e9-4477-8419-94a78c1e7e3d-c000.csv'
b'-rw-r--r--   1 livy hadoop  154437757 2020-10-28 02:53 hdfs:///tmp/input/taxi_data/part-00002-935fbf80-41e9-4477-8419-94a78c1e7e3d-c000.csv'
b'-rw-r--r--   1 livy hadoop  154401471 2020-10-28 02:53 hdfs:///tmp/input/taxi_data/part-00003-935fbf80-41e9-4477-8419-94a78c1e7e3d-c000.csv'
b'-rw-r--r--   1 livy hadoop   65977667 2020-10-28 02:53 hdfs:///tmp/input/taxi_data/part-00004-935fbf80-41e9-4477-8419-94a78c1e7e3d-c000.csv'
b''

In [16]:
# Columns used by the ingestion examples in the rest of this section
column_subset = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'passenger_count',
    'trip_distance',
    'PULocationID',
    'DOLocationID',
    'fare_amount',
    'tip_amount',
]
ps_df_taxi.select(*column_subset).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------------------+---------------+-------------+------------+------------+-----------+----------+
|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|PULocationID|DOLocationID|fare_amount|tip_amount|
+--------------------+---------------------+---------------+-------------+------------+------------+-----------+----------+
| 2020-01-01 00:28:15|  2020-01-01 00:33:03|              1|          1.2|         238|         239|        6.0|      1.47|
| 2020-01-01 00:35:39|  2020-01-01 00:43:04|              1|          1.2|         239|         238|        7.0|       1.5|
| 2020-01-01 00:47:41|  2020-01-01 00:53:52|              1|          0.6|         238|         238|        6.0|       1.0|
| 2020-01-01 00:55:23|  2020-01-01 01:00:14|              1|          0.8|         238|         151|        5.5|      1.36|
| 2020-01-01 00:01:58|  2020-01-01 00:04:16|              1|          0.0|         193|         193|        3.5|       0.0|
| 2020-0

In [17]:
# Summary statistics (count, mean, stddev, min, max) for the selected columns
ps_df_taxi.select(*column_subset).describe().show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+------------------+-----------------+------------------+-----------------+------------------+------------------+
|summary|   passenger_count|    trip_distance|      PULocationID|     DOLocationID|       fare_amount|        tip_amount|
+-------+------------------+-----------------+------------------+-----------------+------------------+------------------+
|  count|           6339567|          6405008|           6405008|          6405008|           6405008|           6405008|
|   mean|1.5153326717739555|2.929643933309735|164.73225778952968|162.6626908194338|12.694108119770615|2.1893418306433965|
| stddev| 1.151594213427813| 83.1591059732502| 65.54373944111758|69.91260629496094|12.127295340046553| 2.760028392378395|
|    min|                 0|           -30.62|                 1|                1|           -1238.0|             -91.0|
|    max|                 9|        210240.07|               265|              265|            4265.0|            1100.0|
+-------+---------------

### Lab 2.1 - Write an ingestion function that does the following:
Given a file path to a taxi data csv (i.e. s3://nyc-tlc/trip data/green_tripdata_2020-01.csv) create a function that does the following:
1. Read the file into a Spark dataframe
1. Limit to the `column_subset` columns
1. Write the data as json to hdfs in append mode to `hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json`

Function signature:  
`def ingest_taxi_data(file_name)`

See the subsequent cell for more info on how the `ingest_taxi_data` function will be used   
Reference: https://spark.apache.org/docs/2.4.5/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter  

When you're done, run the next 2 cells to ingest several taxi data files and examine the result

In [20]:
def ingest_taxi_data(file_name):
    """
    Read a taxi csv, keep the `column_subset` columns and append them as json.

    file_name: path of the csv file to read; the first row supplies the
    column names and the column types are inferred.
    The json is appended to
    hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json
    """
    csv_reader = spark.read.option('header', True).option("inferSchema", True)
    trips = csv_reader.csv(file_name).select(*column_subset)
    json_writer = trips.write.mode("append")
    json_writer.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# Run the ingest for several files
taxi_data_prefix = "s3://nyc-tlc/trip data"
taxi_data_files = ["yellow_tripdata_2019-01.csv", "yellow_tripdata_2018-01.csv"]
for file_name in taxi_data_files: 
    # Build the full s3 path for this file, then append it to the json output
    taxi_data_path = f"{taxi_data_prefix}/{file_name}"
    ingest_taxi_data(taxi_data_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# How did the types fare in this conversion?
# The timestamp columns come back as strings. It turns out this is a bug:
# https://issues.apache.org/jira/browse/SPARK-26325
# https://stackoverflow.com/questions/53697388/interpret-timestamp-fields-in-spark-while-reading-json
df = spark.read.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
df.printSchema()
# Show 5 rows without truncating the column values
df.show(5, False)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- DOLocationID: long (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- trip_distance: double (nullable = true)

+------------+------------+-----------+---------------+----------+------------------------+------------------------+-------------+
|DOLocationID|PULocationID|fare_amount|passenger_count|tip_amount|tpep_dropoff_datetime   |tpep_pickup_datetime    |trip_distance|
+------------+------------+-----------+---------------+----------+------------------------+------------------------+-------------+
|239         |151         |7.0        |1              |1.65      |2019-01-01T00:53:20.000Z|2019-01-01T00:46:40.000Z|1.5          |
|246         |239         |14.0       |1              |1.0       |2019-01-01T01:18:59.000Z|2019-01-01T00:5

#### Transforming data types 

Available pyspark types are listed in the pyspark.sql.types module https://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#module-pyspark.sql.types

pyspark.types is imported as t, so to apply the IntegerType use t.IntegerType()

For pandas, see the following resources on converting types https://stackoverflow.com/questions/15891038/change-column-type-in-pandas

In [25]:
# Pyspark: cast the string column to a timestamp and inspect the resulting dtype
(df.select("tpep_dropoff_datetime")
 .withColumn("tpep_dropoff_datetime", f.col("tpep_dropoff_datetime").cast(t.TimestampType()))
).dtypes

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('tpep_dropoff_datetime', 'timestamp')]

In [26]:
# Casting pandas columns to a type - this will give an error on empty cells
# (the error is the point of this cell: '' cannot be parsed as an integer)
(pd_df_taxi[[*column_subset]]
        .astype({'passenger_count': 'Int64'}))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

invalid literal for int() with base 10: ''
Traceback (most recent call last):
  File "/tmp/1603853034423-0/lib/python3.7/site-packages/pandas/core/generic.py", line 5531, in astype
    col.astype(dtype=dtype[col_name], copy=copy, errors=errors)
  File "/tmp/1603853034423-0/lib/python3.7/site-packages/pandas/core/generic.py", line 5546, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,)
  File "/tmp/1603853034423-0/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 595, in astype
    return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
  File "/tmp/1603853034423-0/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 406, in apply
    applied = getattr(b, f)(**kwargs)
  File "/tmp/1603853034423-0/lib/python3.7/site-packages/pandas/core/internals/blocks.py", line 595, in astype
    values = astype_nansafe(vals1d, dtype, copy=True)
  File "/tmp/1603853034423-0/lib/python3.7/site-packages/pandas/core/dtypes/cast.

In [67]:
# To convert to Integer using pandas, we have to first deal with the null values.
# to_numeric with errors='coerce' replaces values that cannot be parsed with NaN;
# casting to the nullable 'Int64' dtype (available in later versions of pandas)
# then keeps those entries as missing values instead of failing:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html
pd.to_numeric(pd_df_taxi.passenger_count, errors='coerce').astype('Int64').dtypes

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Int64Dtype()

In [27]:
# Modified taxi_data_ingest with transformed timestamps
def ingest_taxi_data(file_name):
    """
    Read a taxi csv, keep `column_subset`, add date columns and append as json.

    file_name: path of the csv file to read; the first row supplies the
    column names and the column types are inferred.
    Two columns are added, tpep_pickup_date and tpep_dropoff_date, each the
    matching datetime column cast to a date. The json is appended to
    hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json
    """
    trips = (spark.read
             .option('header', True)
             .option("inferSchema", True)
             .csv(file_name)
             .select(*column_subset))
    pickup_date = f.col("tpep_pickup_datetime").cast(t.DateType())
    dropoff_date = f.col("tpep_dropoff_datetime").cast(t.DateType())
    trips_with_dates = (trips
                        .withColumn("tpep_pickup_date", pickup_date)
                        .withColumn("tpep_dropoff_date", dropoff_date))
    trips_with_dates.write.mode("append").json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Remove previous data so the next ingest (append mode) starts from an
# empty output directory
shell_cmd("hdfs dfs -rm -r hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

b'Deleted hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json'
b''

In [30]:
# Re-run the ingest with the redefined ingest_taxi_data
taxi_data_prefix = "s3://nyc-tlc/trip data"
taxi_data_files = ["yellow_tripdata_2019-01.csv", "yellow_tripdata_2018-01.csv"]
for file_name in taxi_data_files: 
    taxi_data_path = f"{taxi_data_prefix}/{file_name}"
    ingest_taxi_data(taxi_data_path)

# Read the combined json output back and preview the first rows
df = spark.read.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
df.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------------+-----------+---------------+----------+-----------------+---------------------+----------------+--------------------+-------------+
|DOLocationID|PULocationID|fare_amount|passenger_count|tip_amount|tpep_dropoff_date|tpep_dropoff_datetime|tpep_pickup_date|tpep_pickup_datetime|trip_distance|
+------------+------------+-----------+---------------+----------+-----------------+---------------------+----------------+--------------------+-------------+
|          24|          41|        4.5|              1|       0.0|       2018-01-01| 2018-01-01T00:24:...|      2018-01-01|2018-01-01T00:21:...|          0.5|
|         140|         239|       14.0|              1|       0.0|       2018-01-01| 2018-01-01T01:03:...|      2018-01-01|2018-01-01T00:44:...|          2.7|
|         141|         262|        6.0|              2|       1.0|       2018-01-01| 2018-01-01T00:14:...|      2018-01-01|2018-01-01T00:08:...|          0.8|
|         257|         140|       33.5|       

### Testing ingestion code

The `ingest_taxi_data` method is not well structured for testing:
* Writes to the file system
* Requires an input file to test
* What other shortcomings?

To make this code more testable, split out the transformation logic so it can be unit tested.  
Defining a transformation function that takes a dataframe and returns a dataframe provides a better interface for unit testing, and a more extensible structure in case we need to add more dataframe functions before or after the transformation step.

In [31]:
def transform_taxi_data(df):
    """
    Add date-typed copies of the pickup and dropoff datetime columns.

    df: dataframe with tpep_pickup_datetime and tpep_dropoff_datetime columns.
    Returns the dataframe with tpep_pickup_date and tpep_dropoff_date added,
    in that order.
    """
    pickup_date = f.col("tpep_pickup_datetime").cast(t.DateType())
    dropoff_date = f.col("tpep_dropoff_datetime").cast(t.DateType())
    with_pickup_date = df.withColumn("tpep_pickup_date", pickup_date)
    return with_pickup_date.withColumn("tpep_dropoff_date", dropoff_date)

# Option 1
def ingest_taxi_data_method(file_name):
    """
    Read a taxi csv, transform it with `transform_taxi_data` and append as json.

    file_name: path of the csv file to read. This argument is what gets read;
    the notebook-level `taxi_data_path` variable is not consulted.
    The json is appended to
    hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json
    """
    df_input = (spark
         .read
         .option('header', True)
         .csv(file_name)
         .select(*column_subset))

    (transform_taxi_data(df_input)
         .write
         .mode("append")
         .json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
    )

# Option 2
def ingest_taxi_data_transform(file_name):
    """
    Read a taxi csv, chain `transform_taxi_data` via DataFrame.transform and
    append the result as json.

    file_name: path of the csv file to read. This argument is what gets read;
    the notebook-level `taxi_data_path` variable is not consulted.
    The json is appended to
    hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json
    """
    # Requires patching of Dataframe.transform method in Spark 2.4, but available natively
    # in Spark 3.0 https://mungingdata.com/pyspark/chaining-dataframe-transformations/
    (spark
         .read
         .option('header', True)
         .csv(file_name)
         .select(*column_subset)
         .transform(transform_taxi_data)
         .write
         .mode("append")
         .json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
    )

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
# Small in-memory input: one json record per string, datetime fields only
test_data = [
    "{'tpep_pickup_datetime': '2020-05-23', 'tpep_dropoff_datetime': '2020-05-23'}",
    "{'tpep_pickup_datetime': '2020-10-01', 'tpep_dropoff_datetime': '2020-10-01'}",
    "{'tpep_pickup_datetime': '2020-02-02', 'tpep_dropoff_datetime': '2020-02-03'}"
]
# Column name -> Spark type name expected after the transformation
expected_types = {'tpep_dropoff_date': 'date', 'tpep_pickup_date': 'date', 'tpep_pickup_datetime':'string', 'tpep_dropoff_datetime':'string'}

test_df = spark.read.json(sc.parallelize(test_data))
print(test_df.dtypes)
test = transform_taxi_data(test_df)
# dtypes is a list of (column name, type name) pairs
test_types = dict(test.dtypes)

print(expected_types == test_types)

test.printSchema()

# Fail the cell on a mismatch instead of only printing False
assert test_types == expected_types, f"unexpected column types: {test_types}"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('tpep_dropoff_datetime', 'string'), ('tpep_pickup_datetime', 'string')]
True
root
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_pickup_date: date (nullable = true)
 |-- tpep_dropoff_date: date (nullable = true)

#### Let's try running the ingestion code on the other taxi data sets

In [35]:
# Try using the ingest code we created for yellow taxi for all the taxis
# This will fail because the datetime fields have different names across different services

taxi_data_prefix = "s3://nyc-tlc/trip data/"
taxi_data_files = ["green_tripdata_2020-01.csv", "fhv_tripdata_2020-01.csv", "fhvhv_tripdata_2020-01.csv"]
for file_name in taxi_data_files:
    taxi_data_path = f"{taxi_data_prefix}{file_name}"
    # Hand over the full s3 path; the bare file name cannot be read on its own
    ingest_taxi_data_transform(taxi_data_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

"cannot resolve '`tpep_pickup_datetime`' given input columns: [passenger_count, mta_tax, tolls_amount, trip_distance, trip_type, payment_type, total_amount, ehail_fee, fare_amount, congestion_surcharge, lpep_dropoff_datetime, tip_amount, extra, DOLocationID, VendorID, lpep_pickup_datetime, store_and_fwd_flag, improvement_surcharge, RatecodeID, PULocationID];;\n'Project ['tpep_pickup_datetime, 'tpep_dropoff_datetime, passenger_count#1938, trip_distance#1939, PULocationID#1936, DOLocationID#1937, fare_amount#1940, tip_amount#1943]\n+- Relation[VendorID#1931,lpep_pickup_datetime#1932,lpep_dropoff_datetime#1933,store_and_fwd_flag#1934,RatecodeID#1935,PULocationID#1936,DOLocationID#1937,passenger_count#1938,trip_distance#1939,fare_amount#1940,extra#1941,mta_tax#1942,tip_amount#1943,tolls_amount#1944,ehail_fee#1945,improvement_surcharge#1946,total_amount#1947,payment_type#1948,trip_type#1949,congestion_surcharge#1950] csv\n"
Traceback (most recent call last):
  File "<stdin>", line 27, in in

#### How can we ingest all taxi services AND be able to tell them apart?

Taxi file names: 
* yellow_tripdata_2020-01.csv
* green_tripdata_2020-01.csv
* fhv_tripdata_2020-01.csv
* fhvhv_tripdata_2020-01.csv

The file name provides information including:
* Service type (yellow, green, etc)
* File date

We want to augment the taxi data with this information so we can refer back to it in analysis.

Is there other data we might want to augment the raw data with? Some things to consider:
* Additional fields that could help with analysis
* Metadata, such as when the record was last updated

In [8]:
# Using matched groups, we can extract information from the taxi file names
# i.e. yellow_tripdata_2020-01.csv
# Raw string, with the dot before "csv" escaped so that it only matches a
# literal "." (an unescaped dot matches any character).
TAXI_DATA_PATTERN = r"(?P<service>[a-zA-Z0-9]+)_tripdata_(?P<year>[0-9]{4})-(?P<month>[0-9]{2})\.csv"

def extract_file_info(file_name):
    """
    Pull the service, year and month out of a taxi data file name.

    file_name: bare file name such as "yellow_tripdata_2020-01.csv".
    Returns a (service, year, month) tuple of strings, or None when the
    name does not start with the expected pattern.
    """
    m = re.match(TAXI_DATA_PATTERN, file_name)
    if m is None:
        return None
    return (m.group("service"), m.group("year"), m.group("month"))

extract_file_info("yellow_tripdata_2020-01.csv")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

('yellow', '2020', '01')

### Lab 2.2 - Ingesting multiple taxi service types

See the [Taxi data website](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) for reference

Using the template in the next cell, create the following functions:
* Service specific transformations to match the schema below
* A general transformation function to apply metadata and other common transformations

Schema:

* pickup_datetime Timestamp
* dropoff_datetime Timestamp
* pickup_date Date
* dropoff_date Date
* passenger_count Integer
* fare_amount Float
* tip_amount Float
* trip_distance Float
* PULocationID Integer
* DOLocationID Integer

Metadata fields:  explore `f.lit` to add these columns
* service
* year
* month

Refer to `ingest_taxi_data_multi_service` to see how these functions will be used    

You may find some helpful info here: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html  

In [5]:
def transform_yellow_taxi(df):
    """
    Bring yellow taxi data to the shared column names.

    Renames the tpep_* datetime columns to pickup_datetime and
    dropoff_datetime, keeps the common subset of columns, then adds
    dropoff_date and pickup_date as the datetime columns cast to dates.
    """
    keep = ['pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID', 'fare_amount', 'tip_amount']
    renamed = (df
               .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime")
               .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime"))
    trimmed = renamed.select(*keep)
    with_dropoff_date = trimmed.withColumn("dropoff_date", f.col("dropoff_datetime").cast(t.DateType()))
    return with_dropoff_date.withColumn("pickup_date", f.col("pickup_datetime").cast(t.DateType()))
        
def transform_green_taxi(df):
    """
    Bring green taxi data to the shared column names.

    Renames the lpep_* datetime columns to pickup_datetime and
    dropoff_datetime, keeps the common subset of columns, then adds
    dropoff_date and pickup_date as the datetime columns cast to dates.
    """
    keep = ['pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID', 'fare_amount', 'tip_amount']
    renamed = (df
               .withColumnRenamed("lpep_pickup_datetime", "pickup_datetime")
               .withColumnRenamed("lpep_dropoff_datetime", "dropoff_datetime"))
    trimmed = renamed.select(*keep)
    with_dropoff_date = trimmed.withColumn("dropoff_date", f.col("dropoff_datetime").cast(t.DateType()))
    return with_dropoff_date.withColumn("pickup_date", f.col("pickup_datetime").cast(t.DateType()))

def transform_fhv(df):
    """
    Keep only the datetime and location columns of an FHV dataframe.

    The selected columns are pickup_datetime, dropoff_datetime,
    PULocationID and DOLocationID, in that order.
    """
    return df.select(
        "pickup_datetime",
        "dropoff_datetime",
        "PULocationID",
        "DOLocationID",
    )

def transform_all(df, service, year, month):
    """
    Apply the transformations shared by every taxi service.

    Adds service, year and month as literal columns, then (re)computes
    dropoff_date and pickup_date by casting the datetime columns to dates.
    """
    metadata = (("service", service), ("year", year), ("month", month))
    for column_name, value in metadata:
        df = df.withColumn(column_name, f.lit(value))
    dropoff_date = f.col("dropoff_datetime").cast(t.DateType())
    pickup_date = f.col("pickup_datetime").cast(t.DateType())
    return (df
            .withColumn("dropoff_date", dropoff_date)
            .withColumn("pickup_date", pickup_date))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
def ingest_taxi_data_multi_service(file_name, ingested_on):
    """
    Ingest one taxi data csv of any service type and append it as json.

    file_name: full path of the csv; the service, year and month are parsed
        from its last path component with `extract_file_info`.
    ingested_on: value stored in the `ingested_on` metadata column.

    Raises:
        ValueError: if the file name does not look like a taxi data file.
    """
    print(f"Processing {file_name}")
    file_info = extract_file_info(Path(file_name).name)
    if file_info is None:
        # Without this check the tuple unpacking below fails with an
        # unhelpful "cannot unpack NoneType" error.
        raise ValueError(f"Unrecognized taxi data file name: {file_name}")
    (service, year, month) = file_info
    input_df = spark.read.option('header', True).option('inferSchema', True).csv(file_name)

    if service == 'yellow':
        df_transform = transform_yellow_taxi(input_df)
    elif service == 'green':
        df_transform = transform_green_taxi(input_df)
    else:
        # FHV. What happens if there are more taxi services added?
        df_transform = transform_fhv(input_df)

    print(df_transform.dtypes)

    # Use the `ingested_on` argument rather than a notebook-level variable,
    # so the function does not depend on what the caller named its timestamp.
    (transform_all(df_transform, service, year, month)
         .withColumn("ingested_on", f.lit(ingested_on))
         .write
         .mode("append")
         .json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
    )

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [82]:
# Clear the earlier json output before ingesting with the multi-service function
shell_cmd("hdfs dfs -rm -r hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

b'Deleted hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json'
b''

In [None]:
# One timestamp for the whole run, stored with every ingested record.
# NOTE: datetime.now() returns a naive datetime, so the %z directive
# contributes an empty string to the formatted value.
ingest_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S%z")
taxi_data_prefix = "s3://nyc-tlc/trip data/"
# Only the yellow file is ingested here. The other services can be added:
# "green_tripdata_2020-01.csv", "fhv_tripdata_2020-01.csv", "fhvhv_tripdata_2020-01.csv"
taxi_data_files = ["yellow_tripdata_2020-01.csv"]
for file_name in taxi_data_files:
    taxi_data_path = f"{taxi_data_prefix}{file_name}"
    ingest_taxi_data_multi_service(taxi_data_path, ingest_timestamp)

# Read the output back, preview it and count the records per service
df_taxi_output = spark.read.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
df_taxi_output.show(5)
df_taxi_output.groupby("service").count().show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Handling bad data
How to design for the inevitability of bad data  
Reference: https://blog.knoldus.com/apache-spark-handle-corrupt-bad-records/

In [28]:
# Sample json records; the last one is deliberately malformed (the
# 'fare_amount key is never closed and has no value)
bad_data = [
    "{'pickup_datetime': '2020-05-23 21:05:23', 'fare_amount': '0.05'}",
    "{'pickup_datetime': '2020-05-23 08:05:23', 'fare_amount': '10.05'}",
    "{'pickup_datetime': '2020-05-23 21:05:23', 'fare_amount}"
]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
# PERMISSIVE mode keeps the malformed record: its raw text goes into the
# column named by columnNameOfCorruptRecord and its other fields are null
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="PERMISSIVE", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-----------+-------------------+
|     _corrupt_record|fare_amount|    pickup_datetime|
+--------------------+-----------+-------------------+
|                null|       0.05|2020-05-23 21:05:23|
|                null|      10.05|2020-05-23 08:05:23|
|{'pickup_datetime...|       null|               null|
+--------------------+-----------+-------------------+

In [30]:
# DROPMALFORMED mode silently drops the malformed record; only the two
# well-formed rows remain and no corrupt-record column is produced
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="DROPMALFORMED", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+-------------------+
|fare_amount|    pickup_datetime|
+-----------+-------------------+
|       0.05|2020-05-23 21:05:23|
|      10.05|2020-05-23 08:05:23|
+-----------+-------------------+

In [31]:
# FAILFAST mode raises as soon as a malformed record is found, so this
# cell is expected to end in an error
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="FAILFAST", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error occurred while calling o587.json.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 15 in stage 58.0 failed 4 times, most recent failure: Lost task 15.3 in stage 58.0 (TID 737, ip-172-31-5-146.ec2.internal, executor 1): org.apache.spark.SparkException: Malformed records are detected in schema inference. Parse Mode: FAILFAST.
	at org.apache.spark.sql.catalyst.json.JsonInferSchema$$anonfun$1$$anonfun$apply$1.apply(JsonInferSchema.scala:66)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema$$anonfun$1$$anonfun$apply$1.apply(JsonInferSchema.scala:53)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
	at scala.collection.Iterator$class.isEmpty(Iterator.scala:331)
	at scala.collection.AbstractIterator.isEmpty(Iterator.scala:1334)
	at scala.collection.TraversableOnce$class.reduceLeftOption(TraversableOnce.scala:203)
	at scala.collection.AbstractIterator.reduceLeftOption(Ite

### Lab 2.3 - Write an ingestion for the taxi zone lookup
File location - Yes, there is a space between taxi and the '_'  

s3://nyc-tlc/misc/taxi _zone_lookup.csv

`def ingest_taxi_lookup():`
1. Read taxi lookup data, ensuring data types are correct
1. Add relevant metadata
1. Save to hdfs:///tmp/data/nyc-taxi/zone-lookup/output/section2/json
1. What write mode should be used?

Refer back to Taxi Data page for more info: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [68]:
def taxi_zone_transform(df, ingested_on=None):
    """Append an ``ingested_on`` metadata column to a DataFrame.

    Args:
        df: Spark DataFrame to annotate.
        ingested_on: Value written to every row of the new column. When
            omitted, the notebook-level ``ingest_timestamp`` global is used,
            which keeps the original one-argument call working.

    Returns:
        ``df`` with an additional literal ``ingested_on`` column.
    """
    if ingested_on is None:
        # Backward-compatible fallback to the notebook-level global.
        ingested_on = ingest_timestamp
    return df.withColumn("ingested_on", f.lit(ingested_on))


def ingest_taxi_lookup(ingest_timestamp,
                       source_path="s3://nyc-tlc/misc/taxi _zone_lookup.csv",
                       output_path="hdfs:///tmp/data/nyc-taxi/zone-lookup/output/section2/json"):
    """Ingest the taxi zone lookup CSV and persist it as JSON.

    Args:
        ingest_timestamp: Value stamped into the ``ingested_on`` column. It is
            passed to the transform explicitly, so the written value is the
            one the caller supplied rather than whatever global happens to be
            defined in the kernel.
        source_path: CSV to read. The default path really does contain a
            space before ``_zone_lookup``.
        output_path: Destination directory for the JSON output.

    Returns:
        None. The result is written to ``output_path``; ``overwrite`` mode
        replaces any previous output, so the ingestion can be re-run.
    """
    def _stamp(df):
        return taxi_zone_transform(df, ingest_timestamp)

    (spark.read
        .option("header", True)
        .option("inferSchema", True)
        .csv(source_path)
        .transform(_stamp)
        .write
        .mode("overwrite")
        .json(output_path))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [69]:
# One timestamp is captured up front and handed to the ingestion.
# datetime.now() is naive, so the %z directive renders as an empty string.
ingest_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S%z")
print(timer_method("ingest_taxi_lookup(ingest_timestamp)"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(8.81471834199965, None)

# Break

### Lab 2.4 - Case Study 1: Month over month, get the total count of pickups per borough
#### Do not blindly run these cells; you can bork your cluster

In [70]:
# Input locations for the case studies below.
# taxiLookupPath is the directory written by ingest_taxi_lookup; taxiPath is
# presumably the output of the taxi data ingestion from an earlier lab -- verify.
taxiPath = "hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json/"
taxiLookupPath = "hdfs:///tmp/data/nyc-taxi/zone-lookup/output/section2/json/"

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [35]:
# Join boroughs
# Expected error: implicit cartesian product (original note: most likely a
# carryover bug from 2.0).
# Both join conditions are built from the same df_taxi_lookup.LocationID
# reference. The error message shows the second condition bound to the
# LocationID column already present from the pickup join instead of the new
# lookup side, so the second join has no condition tying its two sides together.
spark.conf.set("spark.sql.crossJoin.enabled", "false") #<-- default
df_taxi = spark.read.json(taxiPath)
df_taxi_lookup = spark.read.json(taxiLookupPath)

# Keep only trips that have both a pickup and a dropoff timestamp.
taxi_filtered = (df_taxi
 .filter(df_taxi.pickup_datetime.isNotNull())
 .filter(df_taxi.dropoff_datetime.isNotNull()))
# First join: attach the pickup borough.
taxi_pu = (taxi_filtered
.join(df_taxi_lookup
       .select("LocationID", "Borough")
       .withColumnRenamed("Borough", "PUBorough"), 
       df_taxi_lookup.LocationID == df_taxi.PULocationID))
# Second join: intended to attach the dropoff borough; this is the join that fails.
taxi = (taxi_pu.join(df_taxi_lookup
       .select("LocationID", "Borough")
       .withColumnRenamed("Borough", "DOBorough"), 
       df_taxi_lookup.LocationID == taxi_pu.DOLocationID))
taxi_pu.show()
taxi.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'Detected implicit cartesian product for INNER join between logical plans\nJoin Inner, ((LocationID#1692L = PULocationID#1662L) && (LocationID#1692L = DOLocationID#1661L))\n:- Filter ((isnotnull(pickup_datetime#1668) && isnotnull(dropoff_datetime#1663)) && ((isnotnull(PULocationID#1662L) && (DOLocationID#1661L = PULocationID#1662L)) && isnotnull(DOLocationID#1661L)))\n:  +- Relation[DOLocationID#1661L,PULocationID#1662L,dropoff_datetime#1663,fare_amount#1664,ingested_on#1665,month#1666,passenger_count#1667L,pickup_datetime#1668,service#1669,tip_amount#1670,trip_distance#1671,year#1672] json\n+- Project [LocationID#1692L, Borough#1691 AS PUBorough#1703]\n   +- Filter isnotnull(LocationID#1692L)\n      +- Relation[Borough#1691,LocationID#1692L,Zone#1693,ingested_on#1694,service_zone#1695] json\nand\nProject [LocationID#1754L, Borough#1753 AS DOBorough#1750]\n+- Relation[Borough#1753,LocationID#1754L,Zone#1755,ingested_on#1756,service_zone#1757] json\nJoin condition is missing or trivial.

In [71]:
# Same pipeline as the failing cell, but with cross joins allowed so the query
# plan can be inspected with explain() instead of raising.
# NOTE(review): enabling cross joins only turns off the safety check; the second
# join condition is presumably still bound the same way as in the failing cell,
# so inspect the plan before trusting `taxi` -- confirm.
spark.conf.set("spark.sql.crossJoin.enabled", "true")
df_taxi = spark.read.json(taxiPath)
df_taxi_lookup = spark.read.json(taxiLookupPath)

# Keep only trips that have both a pickup and a dropoff timestamp.
taxi_filtered = (df_taxi
 .filter(df_taxi.pickup_datetime.isNotNull())
 .filter(df_taxi.dropoff_datetime.isNotNull()))
# First join: attach the pickup borough.
taxi_pu = (taxi_filtered
.join(df_taxi_lookup
       .select("LocationID", "Borough")
       .withColumnRenamed("Borough", "PUBorough"), 
       df_taxi_lookup.LocationID == df_taxi.PULocationID))
# Second join: intended to attach the dropoff borough.
taxi = (taxi_pu.join(df_taxi_lookup
       .select("LocationID", "Borough")
       .withColumnRenamed("Borough", "DOBorough"), 
       df_taxi_lookup.LocationID == taxi_pu.DOLocationID))
taxi_pu.show()
# Only the plan is printed for the two-join result; it is not materialised here.
taxi.explain()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------------+------------+--------------------+-----------+-------------------+-----+---------------+-----------+--------------------+-------+----------+-------------+----+----------+---------+
|DOLocationID|PULocationID|dropoff_date|    dropoff_datetime|fare_amount|        ingested_on|month|passenger_count|pickup_date|     pickup_datetime|service|tip_amount|trip_distance|year|LocationID|PUBorough|
+------------+------------+------------+--------------------+-----------+-------------------+-----+---------------+-----------+--------------------+-------+----------+-------------+----+----------+---------+
|         239|         238|  2020-01-01|2020-01-01T00:33:...|        6.0|2020-10-28 04:15:09|   01|              1| 2020-01-01|2020-01-01T00:28:...| yellow|      1.47|          1.2|2020|       238|Manhattan|
|         238|         239|  2020-01-01|2020-01-01T00:43:...|        7.0|2020-10-28 04:15:09|   01|              1| 2020-01-01|2020-01-01T00:35:...| yellow|       1.5| 

In [37]:
def get_monthly_totals_pyspark(taxiPath, taxiLookupPath):
    """Join trips to the full zone lookup to demonstrate an ambiguous column.

    Both inputs carry an ``ingested_on`` column, so selecting it by name from
    the joined frame raises an AnalysisException. That failure is the purpose
    of this version of the function.

    Args:
        taxiPath: Location of the ingested trip JSON.
        taxiLookupPath: Location of the ingested zone lookup JSON.

    Returns:
        The joined DataFrame (only reached if the select does not raise).
    """
    trips = spark.read.json(taxiPath)
    zones = spark.read.json(taxiLookupPath)

    has_pickup = trips.pickup_datetime.isNotNull()
    has_dropoff = trips.dropoff_datetime.isNotNull()
    complete_trips = trips.filter(has_pickup).filter(has_dropoff)

    pickup_zone_match = complete_trips.PULocationID == zones.LocationID
    groupDF = complete_trips.join(zones, pickup_zone_match)
    # expected error: "ingested_on" exists on both sides of the join
    groupDF.select("ingested_on").show()
    return groupDF

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [38]:
print(timer_method("get_monthly_totals_pyspark(taxiPath, taxiLookupPath)"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

"Reference 'ingested_on' is ambiguous, could be: ingested_on, ingested_on.;"
Traceback (most recent call last):
  File "<stdin>", line 58, in timer_method
  File "/usr/lib64/python3.7/timeit.py", line 233, in timeit
    return Timer(stmt, setup, timer, globals).timeit(number)
  File "/usr/lib64/python3.7/timeit.py", line 177, in timeit
    timing = self.inner(it, self.timer)
  File "<timeit-src>", line 6, in inner
  File "<stdin>", line 9, in get_monthly_totals_pyspark
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 1326, in select
    jdf = self._jdf.select(self._jcols(*cols))
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 69, in deco
    raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: "Reference 'ingested_on' is ambiguous, co

In [39]:
def get_monthly_totals_pandas(taxiPath, taxiLookupPath):
    """Count pickups per month and borough using pandas only.

    Args:
        taxiPath: Location of the trip records as line-delimited JSON that
            ``pd.read_json`` can open.
        taxiLookupPath: Location of the zone lookup as line-delimited JSON.

    Returns:
        DataFrame with columns ``pickup_month`` (``YYYYMM`` string),
        ``Borough`` and ``count``, sorted by month descending, count
        descending, then borough ascending.

    Raises:
        Whatever ``pd.read_json`` raises when the location cannot be opened
        (for example an HDFS path without the required libraries).
    """
    # Spark's JSON writer emits one object per line, hence lines=True.
    taxi = pd.read_json(taxiPath, lines=True)
    taxi_lookup = pd.read_json(taxiLookupPath, lines=True)
    taxi_filtered = taxi.dropna(subset=['pickup_datetime', 'dropoff_datetime'])

    # Keep only the lookup columns that are needed: both inputs may carry
    # columns with the same name (e.g. ingested_on), which DataFrame.join rejects.
    boroughs = taxi_lookup[['LocationID', 'Borough']].set_index('LocationID')
    groupDF = taxi_filtered.join(boroughs, on='PULocationID')
    groupDF = groupDF.assign(
        pickup_month=pd.to_datetime(groupDF['pickup_datetime']).dt.strftime('%Y%m'))

    return (groupDF
            .groupby(['pickup_month', 'Borough'])
            .size()
            .reset_index(name='count')
            .sort_values(by=['pickup_month', 'count', 'Borough'],
                         ascending=[False, False, True]))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [40]:
print(timer_method("get_monthly_totals_pandas(taxiPath, taxiLookupPath)"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

pyarrow and local java libraries required for HDFS
Traceback (most recent call last):
  File "<stdin>", line 58, in timer_method
  File "/usr/lib64/python3.7/timeit.py", line 233, in timeit
    return Timer(stmt, setup, timer, globals).timeit(number)
  File "/usr/lib64/python3.7/timeit.py", line 177, in timeit
    timing = self.inner(it, self.timer)
  File "<timeit-src>", line 6, in inner
  File "<stdin>", line 2, in get_monthly_totals_pandas
  File "/tmp/1603838861189-0/lib/python3.7/site-packages/pandas/util/_decorators.py", line 199, in wrapper
    return func(*args, **kwargs)
  File "/tmp/1603838861189-0/lib/python3.7/site-packages/pandas/util/_decorators.py", line 296, in wrapper
    return func(*args, **kwargs)
  File "/tmp/1603838861189-0/lib/python3.7/site-packages/pandas/io/json/_json.py", line 594, in read_json
    path_or_buf, encoding=encoding, compression=compression
  File "/tmp/1603838861189-0/lib/python3.7/site-packages/pandas/io/common.py", line 222, in get_filepath_or

In [72]:
def get_monthly_totals_pandas(taxiPath, taxiLookupPath):
    """Count pickups per month and borough in pandas, loading the data via Spark.

    Spark reads the JSON, then ``toPandas()`` collects each full dataset onto
    the driver, where all further work is done with pandas.

    Args:
        taxiPath: Location of the ingested trip JSON.
        taxiLookupPath: Location of the ingested zone lookup JSON.

    Returns:
        pandas DataFrame with ``pickup_month`` (``YYYYMM``), ``Borough`` and
        ``count``, sorted by month descending, count descending, borough ascending.
    """
    spark_trips = spark.read.json(taxiPath)
    spark_zones = spark.read.json(taxiLookupPath)

    trips = spark_trips.toPandas()
    zones = spark_zones.toPandas()

    complete_trips = trips.dropna(subset=['pickup_datetime', 'dropoff_datetime'])
    borough_by_zone = zones[["Borough", "LocationID"]].set_index('LocationID')
    joined = complete_trips.join(borough_by_zone, on='PULocationID')

    # Derive the grouping key from the pickup timestamp.
    joined['pickup_month'] = pd.to_datetime(joined['pickup_datetime']).dt.strftime('%Y%m')

    counts = (joined
              .groupby(['pickup_month', 'Borough'])
              .size()
              .reset_index(name='count'))
    return counts.sort_values(by=['pickup_month', 'count', 'Borough'],
                              ascending=[False, False, True])

def get_monthly_totals_pyspark(taxiPath, taxiLookupPath):
    """Count pickups per month and borough with Spark.

    Only ``Borough`` and ``LocationID`` are taken from the lookup, so the
    joined frame has no duplicated column names.

    Args:
        taxiPath: Location of the ingested trip JSON.
        taxiLookupPath: Location of the ingested zone lookup JSON.

    Returns:
        Spark DataFrame with ``pickup_month`` (``yyyyMM``), ``borough`` and
        ``count``, ordered by month descending, count descending, borough.
        The first rows are also printed with ``show()``.
    """
    trips = spark.read.json(taxiPath)
    zones = spark.read.json(taxiLookupPath)

    has_pickup = trips.pickup_datetime.isNotNull()
    has_dropoff = trips.dropoff_datetime.isNotNull()
    complete_trips = trips.filter(has_pickup).filter(has_dropoff)

    borough_by_zone = zones.select("Borough", "LocationID")
    joined = complete_trips.join(borough_by_zone, complete_trips.PULocationID == zones.LocationID)
    with_month = joined.withColumn("pickup_month", f.date_format("pickup_datetime", "yyyyMM"))

    groupDF = (with_month
               .groupBy("pickup_month", "borough")
               .count()
               .orderBy(f.desc("pickup_month"), f.desc("count"), "borough"))
    groupDF.show()
    return groupDF

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [42]:
# Running this command with the original cluster size, will crash the cluster
# All functions utilizing pandas from this command forward, need an upscaled driver node
print(timer_method("get_monthly_totals_pandas(taxiPath, taxiLookupPath)"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(460.658796917,    pickup_month        Borough     count
31       202101      Manhattan         3
30       202007      Manhattan         6
29       202006      Manhattan         1
28       202005      Manhattan         5
27       202004      Manhattan         1
26       202003      Manhattan         5
23       202002      Manhattan        34
24       202002         Queens         9
22       202002       Brooklyn         3
21       202002          Bronx         1
25       202002        Unknown         1
17       202001      Manhattan  14817800
15       202001       Brooklyn   5628274
18       202001         Queens   4509457
14       202001          Bronx   2536611
20       202001        Unknown   1615247
19       202001  Staten Island    260258
16       202001            EWR      3779
11       201912      Manhattan       129
12       201912         Queens        17
9        201912          Bronx         2
10       201912       Brooklyn         1
13       201912        Unknown         1


In [73]:
print(timer_method("get_monthly_totals_pyspark(taxiPath, taxiLookupPath)"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-------------+--------+
|pickup_month|      borough|   count|
+------------+-------------+--------+
|      202101|    Manhattan|       3|
|      202007|    Manhattan|       6|
|      202006|    Manhattan|       1|
|      202005|    Manhattan|       5|
|      202004|    Manhattan|       1|
|      202003|    Manhattan|       5|
|      202002|    Manhattan|      34|
|      202002|       Queens|       9|
|      202002|     Brooklyn|       3|
|      202002|        Bronx|       1|
|      202002|      Unknown|       1|
|      202001|    Manhattan|14817800|
|      202001|     Brooklyn| 5628274|
|      202001|       Queens| 4509457|
|      202001|        Bronx| 2536611|
|      202001|      Unknown| 1615247|
|      202001|Staten Island|  260258|
|      202001|          EWR|    3779|
|      201912|    Manhattan|     129|
|      201912|       Queens|      17|
+------------+-------------+--------+
only showing top 20 rows

(36.60000383999977, DataFrame[pickup_month: string, borough: s

In [None]:
## Expected error for maxResultSize: setting it at runtime (this cell) won't work.
## The subsequent cells could be tried, but they restart the state of the notebook and don't work as expected.
## Instead, restart the cluster and edit the Software config with: [{"classification":"spark-defaults", "properties":{"spark.driver.maxResultSize":"5G", "spark.ui.killEnabled":"true"}, "configurations":[]}]
## Then rerun the taxi and taxi lookup ingests:
## Run -> Run All Above Selected Cell
## Second expected error: {"msg":"requirement failed: Session isn't active."}, after which the notebook hangs. The driver node ran out of memory and needs to be upscaled.
# Print the setting before and after the attempted change.
print(spark.conf.get('spark.driver.maxResultSize'))
spark.conf.set("spark.driver.maxResultSize", "5G")
print(spark.conf.get('spark.driver.maxResultSize'))

In [None]:
%%configure -f
{"conf":{"spark.driver.maxResultSize":"5G"}}

In [None]:
%%info

In [74]:
def get_monthly_totals_concat_pandas(taxiPath, taxiLookupPath):
    """Count pickups per month and borough in pandas, keyed on ``year`` + ``month``.

    Differs from ``get_monthly_totals_pandas`` only in how the month key is
    built: the ``year`` and ``month`` columns are added together instead of
    parsing ``pickup_datetime``.

    Args:
        taxiPath: Location of the ingested trip JSON.
        taxiLookupPath: Location of the ingested zone lookup JSON.

    Returns:
        pandas DataFrame with ``pickup_month``, ``Borough`` and ``count``,
        sorted by month descending, count descending, borough ascending.
    """
    spark_trips = spark.read.json(taxiPath)
    spark_zones = spark.read.json(taxiLookupPath)

    trips = spark_trips.toPandas()
    zones = spark_zones.toPandas()

    complete_trips = trips.dropna(subset=['pickup_datetime', 'dropoff_datetime'])
    borough_by_zone = zones[["Borough", "LocationID"]].set_index('LocationID')
    joined = complete_trips.join(borough_by_zone, on='PULocationID')
    joined['pickup_month'] = joined['year'] + joined['month']

    counts = (joined
              .groupby(['pickup_month', 'Borough'])
              .size()
              .reset_index(name='count'))
    return counts.sort_values(by=['pickup_month', 'count', 'Borough'],
                              ascending=[False, False, True])
    
def get_monthly_totals_concat_pyspark(taxiPath, taxiLookupPath):
    """Count pickups per month and borough with Spark, keyed on ``year`` + ``month``.

    The month key is the concatenation of the ``year`` and ``month`` columns
    rather than a value derived from ``pickup_datetime``.

    Args:
        taxiPath: Location of the ingested trip JSON.
        taxiLookupPath: Location of the ingested zone lookup JSON.

    Returns:
        Spark DataFrame with ``pickup_month``, ``borough`` and ``count``,
        ordered by month descending, count descending, borough. The result is
        also printed with ``show()``.
    """
    trips = spark.read.json(taxiPath)
    zones = spark.read.json(taxiLookupPath)

    has_pickup = trips.pickup_datetime.isNotNull()
    has_dropoff = trips.dropoff_datetime.isNotNull()
    complete_trips = trips.filter(has_pickup).filter(has_dropoff)

    joined = complete_trips.join(zones, complete_trips.PULocationID == zones.LocationID)
    # Project down to the needed columns right after building the key.
    keyed = (joined
             .withColumn("pickup_month", f.concat("year", "month"))
             .select("pickup_datetime", "borough", "pickup_month"))

    groupDF = (keyed
               .groupBy("pickup_month", "borough")
               .count()
               .orderBy(f.desc("pickup_month"), f.desc("count"), "borough"))
    groupDF.show()
    return groupDF

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [45]:
print(timer_method("get_monthly_totals_concat_pandas(taxiPath, taxiLookupPath)"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(322.6805907979999,   pickup_month        Borough     count
3       202001      Manhattan  14818011
1       202001       Brooklyn   5628278
4       202001         Queens   4509499
0       202001          Bronx   2536615
6       202001        Unknown   1615251
5       202001  Staten Island    260258
2       202001            EWR      3779)

In [75]:
print(timer_method("get_monthly_totals_concat_pyspark(taxiPath, taxiLookupPath)"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-------------+--------+
|pickup_month|      borough|   count|
+------------+-------------+--------+
|      202001|    Manhattan|14818011|
|      202001|     Brooklyn| 5628278|
|      202001|       Queens| 4509499|
|      202001|        Bronx| 2536615|
|      202001|      Unknown| 1615251|
|      202001|Staten Island|  260258|
|      202001|          EWR|    3779|
+------------+-------------+--------+

(19.413056725000388, DataFrame[pickup_month: string, borough: string, count: bigint])

### Lab 2.5 - Case Study 2: Month over month, get the borough with the most pickups per month

In [76]:
def get_most_pickups_per_month_pandas(taxiPath, taxiLookupPath):
    """Return, for each month, the borough with the most pickups (pandas).

    Args:
        taxiPath: Location of the ingested trip JSON.
        taxiLookupPath: Location of the ingested zone lookup JSON.

    Returns:
        pandas DataFrame with one row per ``pickup_month`` (newest first)
        holding the top ``Borough`` and its ``count``. Ties on count go to
        the alphabetically first borough.
    """
    inputDF = get_monthly_totals_pandas(taxiPath, taxiLookupPath)
    # Rank explicitly so the result does not silently depend on the row order
    # that get_monthly_totals_pandas happens to return.
    ranked = inputDF.sort_values(by=['pickup_month', 'count', 'Borough'],
                                 ascending=[False, False, True])
    # head(1) keeps the first (highest-count) row of each month, in ranked order.
    return ranked.groupby("pickup_month").head(1).reset_index(drop=True)

def get_most_pickups_per_month_pyspark(taxiPath, taxiLookupPath):
    """Return, for each month, the borough with the most pickups (Spark aggregate).

    The previous ``orderBy(...).groupBy(...).agg(first(...))`` form relied on
    row order surviving the groupBy shuffle, which Spark does not guarantee.
    This version uses an order-independent aggregate instead.

    Args:
        taxiPath: Location of the ingested trip JSON.
        taxiLookupPath: Location of the ingested zone lookup JSON.

    Returns:
        Spark DataFrame with ``pickup_month`` and ``borough``, newest month
        first. The winning borough is exposed under the explicit name
        ``borough`` rather than the auto-generated aggregate column name.
        The plan and the rows are also printed.
    """
    inputDF = get_monthly_totals_pyspark(taxiPath, taxiLookupPath)
    # Structs compare field by field, so the minimum of (-count, borough) is the
    # highest count, with ties going to the alphabetically first borough.
    best = f.min(
        f.struct((-f.col("count")).alias("neg_count"), f.col("borough"))
    ).alias("best")
    firstDF = (inputDF
               .groupBy("pickup_month")
               .agg(best)
               .select("pickup_month", f.col("best.borough").alias("borough"))
               .orderBy(f.desc("pickup_month")))
    firstDF.explain()
    firstDF.show()
    return firstDF

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [53]:
print(timer_method("get_most_pickups_per_month_pandas(taxiPath, taxiLookupPath)"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(519.8932285199999,    pickup_month    Borough     count
0        202101  Manhattan         3
1        202007  Manhattan         6
2        202006  Manhattan         1
3        202005  Manhattan         5
4        202004  Manhattan         1
5        202003  Manhattan         5
6        202002  Manhattan        34
7        202001  Manhattan  14817800
8        201912  Manhattan       129
9        201009     Queens         3
10       200901  Manhattan        19
11       200812  Manhattan         8
12       200301     Queens         1)

In [77]:
print(timer_method("get_most_pickups_per_month_pyspark(taxiPath, taxiLookupPath)"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-------------+--------+
|pickup_month|      borough|   count|
+------------+-------------+--------+
|      202101|    Manhattan|       3|
|      202007|    Manhattan|       6|
|      202006|    Manhattan|       1|
|      202005|    Manhattan|       5|
|      202004|    Manhattan|       1|
|      202003|    Manhattan|       5|
|      202002|    Manhattan|      34|
|      202002|       Queens|       9|
|      202002|     Brooklyn|       3|
|      202002|        Bronx|       1|
|      202002|      Unknown|       1|
|      202001|    Manhattan|14817800|
|      202001|     Brooklyn| 5628274|
|      202001|       Queens| 4509457|
|      202001|        Bronx| 2536611|
|      202001|      Unknown| 1615247|
|      202001|Staten Island|  260258|
|      202001|          EWR|    3779|
|      201912|    Manhattan|     129|
|      201912|       Queens|      17|
+------------+-------------+--------+
only showing top 20 rows

== Physical Plan ==
AdaptiveSparkPlan(isFinalPlan=false)
+- So

In [78]:
def get_most_pickups_per_month_window_pyspark(taxiPath, taxiLookupPath):
    """Return, for each month, the borough with the most pickups (window function).

    Rows are numbered within each month by descending count, and only row
    number 1 is kept.

    Args:
        taxiPath: Location of the ingested trip JSON.
        taxiLookupPath: Location of the ingested zone lookup JSON.

    Returns:
        Spark DataFrame with ``pickup_month``, ``borough``, ``count`` and the
        helper column ``row_num`` (always 1), newest month first. The plan and
        every result row are also printed.
    """
    inputDF = get_monthly_totals_pyspark(taxiPath, taxiLookupPath)
    # borough is a tie-breaker: without it row_number() picks an arbitrary
    # winner when two boroughs have the same count in a month.
    win = Window.partitionBy("pickup_month").orderBy(f.desc("count"), f.asc("borough"))
    firstDF = (inputDF
               .withColumn("row_num", f.row_number().over(win))
               .where("row_num == 1")
               .orderBy(f.desc("pickup_month")))
    firstDF.explain()
    # show() truncates to 20 rows by default; pass the row count to print all months.
    firstDF.show(firstDF.count())
    return firstDF

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [79]:
print(timer_method("get_most_pickups_per_month_window_pyspark(taxiPath, taxiLookupPath)"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+-------------+--------+
|pickup_month|      borough|   count|
+------------+-------------+--------+
|      202101|    Manhattan|       3|
|      202007|    Manhattan|       6|
|      202006|    Manhattan|       1|
|      202005|    Manhattan|       5|
|      202004|    Manhattan|       1|
|      202003|    Manhattan|       5|
|      202002|    Manhattan|      34|
|      202002|       Queens|       9|
|      202002|     Brooklyn|       3|
|      202002|        Bronx|       1|
|      202002|      Unknown|       1|
|      202001|    Manhattan|14817800|
|      202001|     Brooklyn| 5628274|
|      202001|       Queens| 4509457|
|      202001|        Bronx| 2536611|
|      202001|      Unknown| 1615247|
|      202001|Staten Island|  260258|
|      202001|          EWR|    3779|
|      201912|    Manhattan|     129|
|      201912|       Queens|      17|
+------------+-------------+--------+
only showing top 20 rows

== Physical Plan ==
AdaptiveSparkPlan(isFinalPlan=false)
+- So

### Lab 2.6 - Run and time the overall pipeline

In [None]:
# Reset notebook kernel
def ingest_main():
    """Run the whole pipeline: both ingestions, then the top-borough report.

    Returns:
        None. Each step writes its own output or prints its own result.
    """
    # taxi_zone_transform reads the notebook-level ingest_timestamp, so it is
    # (re)defined here to make the pipeline runnable on a freshly reset kernel.
    global ingest_timestamp
    ingest_timestamp = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S%z")

    # NOTE(review): ingest_taxi_data_multi_service is defined outside this
    # section; its call is left as it was -- confirm its expected arguments.
    ingest_taxi_data_multi_service("s3://nyc-tlc/trip data/yellow_tripdata_2020-01.csv")
    # ingest_taxi_lookup takes the ingest timestamp, not a source path.
    ingest_taxi_lookup(ingest_timestamp)
    # The report needs both input locations; calling it with no arguments
    # raised a TypeError.
    get_most_pickups_per_month_window_pyspark(taxiPath, taxiLookupPath)

In [None]:
print(timer_method("ingest_main()"))