In [None]:
# Install the required PyPI packages within the notebook scope.
# pandas is pinned to an exact version; the others are left unpinned.
for pypi_package in ("boto3", "pandas==1.0.0", "requests", "s3fs", "fsspec"):
    sc.install_pypi_package(pypi_package)

In [None]:
import boto3
from datetime import datetime
import fsspec
import pandas as pd
from pyspark.sql.dataframe import DataFrame
from pyspark.sql import functions as f, types as t, Window
from pathlib import Path
import re
import requests
import s3fs
import subprocess
import timeit
from urllib.parse import urlparse

# Removes truncation of columns and column values when pandas renders
# a DataFrame (both are truncated by default).
# The fully-qualified "display." keys are used on purpose: a bare
# "max_columns" is treated as a pattern and is rejected when it matches
# more than one registered option.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Monkey patching the DataFrame transform method for Spark 2.4
# This is available by default in Spark 3.0
def transform(self, f):
    """Apply the callable ``f`` to this DataFrame and return its result.

    This lets DataFrame -> DataFrame functions be chained fluently:
    ``df.transform(step_one).transform(step_two)``.
    """
    return f(self)

# Only patch when the method is missing, so a native implementation
# is never replaced by this minimal version.
if not hasattr(DataFrame, "transform"):
    DataFrame.transform = transform

# Override the timeit template so that the timed statement's return value
# is captured in `retval` and returned together with the elapsed time,
# i.e. the generated inner function returns (elapsed, retval).
# Because the statement is substituted into `retval = {stmt}`, it must be
# a single expression (for example a function call).
# Reference: https://stackoverflow.com/questions/24812253/how-can-i-capture-return-value-with-python-timeit-module
timeit.template = """
def inner(_it, _timer{init}):
    {setup}
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""

def shell_cmd(cmd):
    """
    Run a shell command and print its standard output, one line at a time.

    Some helpful recipes:
    - List files on hdfs: shell_cmd("hdfs dfs -ls hdfs:///tmp/data/")
    - Remove files from hdfs: shell_cmd("hdfs dfs -rm -r hdfs:///tmp/data/test_pyspark")

    Args:
        cmd: Command line string. It is executed through the shell, so only
            pass trusted strings.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    # universal_newlines=True decodes the output to str so lines print as
    # plain text instead of bytes literals such as b'...'
    output = subprocess.check_output(cmd, shell=True, universal_newlines=True)
    # splitlines() does not produce the trailing empty entry that
    # split('\n') yields for output ending in a newline
    for line in output.splitlines():
        print(line)

def timer_method(cmd):
    """
    Run a statement exactly once through timeit and return timeit's result.

    With the custom timeit.template installed in this notebook the result is
    a (run_time, return_value) pair; with the stock template it is just the
    run time.

    Pass the call to time as a string, for example:
     run_time, result = timer_method("myfunction(arg1, arg2)")
    """
    # The statement is evaluated against this module's globals so that it
    # can see the functions and variables defined in the notebook
    namespace = globals()
    return timeit.timeit(stmt=cmd, globals=namespace, number=1)

### Set your s3 bucket name
This should be data-scale-oreilly-{your name}   
If you don't remember it, check the [S3 console](https://s3.console.aws.amazon.com/)

In [None]:
# Name of your S3 bucket; expected form is data-scale-oreilly-{your name}
MY_BUCKET_NAME = "data-scale-oreilly"

# Ingesting from an S3 bucket - NYC Taxi Data

https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
* Taxi data 
* Data dictionaries 
* Taxi zone lookup table

Data ingestion has the ultimate goal of collecting, aggregating, and surfacing data for a specific purpose; an analysis, an API, a dashboard, etc. Think about how you might use the taxi data to answer the following questions:

1. Which borough is the most popular pickup or drop off spot?
1. Are green taxis more popular for trips within the same borough vs yellow taxis?
1. Build a recommendation engine that predicts surge pricing for a given time of day based on historical data  

With this in mind, let's work through bringing this data onto the cluster.

In [None]:
# Note: if you copy the link from the taxi data website you will see:
# https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv
# Two things to notice:
#  1. The path segment right after "amazonaws.com" (nyc-tlc) is the bucket name
#  2. The "+" in "trip+data" stands for a space, hence "trip data" below
taxi_data_path = "s3://nyc-tlc/trip data/yellow_tripdata_2020-01.csv"

In [None]:
# Pandas uses s3fs to read_csv from s3.
# keep_default_na=False stops pandas from converting empty cells (and
# strings such as "NA") to NaN, so values stay as they appear in the file.
pd_df_taxi= pd.read_csv(taxi_data_path, keep_default_na=False)
print(pd_df_taxi.head())
# Last expression of the cell: displays the dtype pandas chose per column
pd_df_taxi.dtypes

In [None]:
# Read the same file with Spark, treating the first row as the header and
# letting Spark infer the column types.
# For reference, look at the Spark DataFrameReader, csv:
# https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html
ps_df_taxi = spark.read.option('header', True).option('inferSchema', True).csv(taxi_data_path)
ps_df_taxi.show()
ps_df_taxi.printSchema()

In [None]:
# Talk through ingest practices around retaining original data vs augmenting
# For example, we may want to keep the data in its default format so we can
# refer back to it if there are bugs in our data ingestion code
# mode("overwrite") keeps this cell re-runnable: with the writer's default
# save mode the write fails once the target directory already exists.
ps_df_taxi.write.mode("overwrite").option("header", True).csv("hdfs:///tmp/input/taxi_data")

In [None]:
# Discuss how spark writes files out: list what the csv write above
# produced under the output path
shell_cmd("hdfs dfs -ls hdfs:///tmp/input/taxi_data")

In [None]:
# Columns of interest for the rest of this section; the ingest functions
# defined later in the notebook select exactly these columns
column_subset = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID', 'fare_amount', 'tip_amount']
ps_df_taxi.select(*column_subset).show()

In [None]:
# Summary statistics for the selected columns
ps_df_taxi.select(*column_subset).describe().show()

### Lab 2.1 - Write an ingestion function that does the following:
Given a file path to a taxi data csv (e.g. s3://nyc-tlc/trip data/green_tripdata_2020-01.csv) create a function that does the following:
1. Read the file into a Spark dataframe
1. Limit to the `column_subset` columns
1. Write the data as json to hdfs in append mode to `hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json`

Function signature:  
`def ingest_taxi_data(file_name)`

See the subsequent cell for more info on how the `ingest_taxi_data` function will be used   
Reference: https://spark.apache.org/docs/2.4.5/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter  

When you're done, run the next 2 cells to ingest several taxi data files and examine the result

In [None]:
def ingest_taxi_data(file_name):
    """
    Read a taxi data csv, keep only the `column_subset` columns and append
    the rows as json to hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json

    file_name: path of the csv to read; the first row is used as the header
    and the column types are inferred by Spark.
    """
    csv_reader = spark.read.option('header', True).option("inferSchema", True)
    taxi_subset = csv_reader.csv(file_name).select(*column_subset)
    # Append so that several input files accumulate in the same output location
    json_writer = taxi_subset.write.mode("append")
    json_writer.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")

In [None]:
# Run the ingest for several files
# Note: this loop rebinds the notebook-level `taxi_data_path` variable on
# every iteration, so after the cell runs it points at the last file.
taxi_data_prefix = "s3://nyc-tlc/trip data"
taxi_data_files = ["yellow_tripdata_2019-01.csv", "yellow_tripdata_2018-01.csv"]
for file_name in taxi_data_files: 
    taxi_data_path = f"{taxi_data_prefix}/{file_name}"
    ingest_taxi_data(taxi_data_path)

In [None]:
# How did the types fare in this conversion?
# Turns out it's a bug! See:
# https://issues.apache.org/jira/browse/SPARK-26325
# https://stackoverflow.com/questions/53697388/interpret-timestamp-fields-in-spark-while-reading-json
df = spark.read.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
df.printSchema()
# Show 5 rows without truncating the column values
df.show(5, False)

#### Transforming data types 

Available pyspark types are listed in the pyspark.sql.types module https://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#module-pyspark.sql.types

pyspark.types is imported as t, so to apply the IntegerType use t.IntegerType()

For pandas, see the following resources on converting types https://stackoverflow.com/questions/15891038/change-column-type-in-pandas

In [None]:
# Pyspark: cast the dropoff column to a timestamp and inspect the
# resulting (column name, type) pairs
(df.select("tpep_dropoff_datetime")
 .withColumn("tpep_dropoff_datetime", f.col("tpep_dropoff_datetime").cast(t.TimestampType()))
).dtypes

In [None]:
# Casting pandas columns to a type - this will give an error on empty cells
# (expected failure; the next cell shows how to handle the empty values)
(pd_df_taxi[[*column_subset]]
        .astype({'passenger_count': 'Int64'}))

In [None]:
# To convert to Integer using pandas, we first have to deal with the null values.
# to_numeric with errors='coerce' replaces values that cannot be parsed with np.NaN;
# the Int64 type in later versions of pandas then stores np.NaN as a missing
# value in a nullable integer column:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html
pd.to_numeric(pd_df_taxi.passenger_count, errors='coerce').astype('Int64').dtypes

In [None]:
# Modified ingest_taxi_data that also derives date columns from the timestamps
def ingest_taxi_data(file_name):
    """
    Read a taxi data csv, keep only the `column_subset` columns, add
    `tpep_pickup_date` / `tpep_dropoff_date` columns (the datetime columns
    cast to DateType) and append the rows as json to
    hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json

    file_name: path of the csv to read; the first row is used as the header
    and the column types are inferred by Spark.
    """
    csv_reader = spark.read.option('header', True).option("inferSchema", True)
    taxi_subset = csv_reader.csv(file_name).select(*column_subset)
    with_pickup_date = taxi_subset.withColumn(
        "tpep_pickup_date", f.col("tpep_pickup_datetime").cast(t.DateType()))
    with_both_dates = with_pickup_date.withColumn(
        "tpep_dropoff_date", f.col("tpep_dropoff_datetime").cast(t.DateType()))
    # Append so that several input files accumulate in the same output location
    json_writer = with_both_dates.write.mode("append")
    json_writer.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")

In [None]:
# Remove previous data so that the appending ingest below starts from an
# empty output location
shell_cmd("hdfs dfs -rm -r hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json/")

In [None]:
# Re-run the ingest with the modified ingest_taxi_data, then read the json
# output back to check the result
taxi_data_prefix = "s3://nyc-tlc/trip data"
taxi_data_files = ["yellow_tripdata_2019-01.csv", "yellow_tripdata_2018-01.csv"]
for file_name in taxi_data_files: 
    taxi_data_path = f"{taxi_data_prefix}/{file_name}"
    ingest_taxi_data(taxi_data_path)
    
df = spark.read.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
df.show(5)

### Testing ingestion code

The `ingest_taxi_data` method is not well structured for testing:
* Writes to the file system
* Requires an input file to test
* What other shortcomings?

To make this code more testable, split out the transformation logic so it can be unit tested.  
Defining a transformation function that takes a dataframe and returns a dataframe provides a better interface for unit testing, and a more extensible structure in case we need to add more dataframe functions before or after the transformation step.

In [None]:
def transform_taxi_data(df):
    """
    Return `df` with two extra columns, `tpep_pickup_date` and
    `tpep_dropoff_date`, holding the matching `*_datetime` column cast to
    DateType. Performs no I/O, so it can be unit tested with an in-memory
    DataFrame.
    """
    with_pickup_date = df.withColumn(
        "tpep_pickup_date", f.col("tpep_pickup_datetime").cast(t.DateType()))
    with_both_dates = with_pickup_date.withColumn(
        "tpep_dropoff_date", f.col("tpep_dropoff_datetime").cast(t.DateType()))
    return with_both_dates

# Option 1
def ingest_taxi_data_method(file_name):
    """
    Read the taxi data csv at `file_name`, apply `transform_taxi_data` and
    append the result as json to
    hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json

    file_name: full path of the csv to read (first row is the header).
    """
    # Read from the argument rather than the notebook-level `taxi_data_path`,
    # so the function ingests the file it was asked to ingest
    df_input = (spark
         .read
         .option('header', True).csv(file_name)
         .select(*column_subset))
    
    (transform_taxi_data(df_input)
         .write
         .mode("append")
         .json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
    )

# Option 2
def ingest_taxi_data_transform(file_name):
    """
    Same pipeline as Option 1, written as a single chain by means of
    DataFrame.transform.

    Requires patching of the DataFrame.transform method in Spark 2.4, but it is
    available natively in Spark 3.0:
    https://mungingdata.com/pyspark/chaining-dataframe-transformations/

    file_name: full path of the csv to read (first row is the header).
    """
    # Read from the argument rather than the notebook-level `taxi_data_path`.
    # The chain ends in a write, which returns nothing, so its result is
    # deliberately not bound to a variable.
    (spark
         .read
         .option('header', True).csv(file_name)
         .select(*column_subset)
         .transform(transform_taxi_data)
         .write
         .mode("append")
         .json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
    )

In [None]:
# Minimal in-memory unit test for transform_taxi_data: build a DataFrame
# from three json records and check the column types after the transform.
test_data = [
    "{'tpep_pickup_datetime': '2020-05-23', 'tpep_dropoff_datetime': '2020-05-23'}",
    "{'tpep_pickup_datetime': '2020-10-01', 'tpep_dropoff_datetime': '2020-10-01'}",
    "{'tpep_pickup_datetime': '2020-02-02', 'tpep_dropoff_datetime': '2020-02-03'}"
]
expected_types = {'tpep_dropoff_date': 'date', 'tpep_pickup_date': 'date', 'tpep_pickup_datetime':'string', 'tpep_dropoff_datetime':'string'}

test_df = spark.read.json(sc.parallelize(test_data))
print(test_df.dtypes)
test = transform_taxi_data(test_df)
# DataFrame.dtypes is a list of (column name, type name) pairs
test_types = dict(test.dtypes)

# Fail loudly on a mismatch instead of only printing True/False
assert test_types == expected_types, f"unexpected column types: {test_types}"

test.printSchema()

#### Let's try running the ingestion code on the other taxi data sets

In [None]:
# Try using the ingest code we created for yellow taxi for all the taxis
# This will fail because the datetime fields have different names across different services

taxi_data_prefix = "s3://nyc-tlc/trip data/"
taxi_data_files = ["green_tripdata_2020-01.csv", "fhv_tripdata_2020-01.csv", "fhvhv_tripdata_2020-01.csv"]
for file_name in taxi_data_files: 
    taxi_data_path = f"{taxi_data_prefix}{file_name}"
    # Pass the full S3 path; the bare file name is not a readable location
    ingest_taxi_data_transform(taxi_data_path)

#### How can we ingest all taxi services AND be able to tell them apart?

Taxi file names: 
* yellow_tripdata_2020-01.csv
* green_tripdata_2020-01.csv
* fhv_tripdata_2020-01.csv
* fhvhv_tripdata_2020-01.csv

The file name provides information including:
* Service type (yellow, green, etc)
* File date

We want to augment the taxi data with this information so we can refer back to it in analysis.

Is there other data we might want to augment the raw data with? Some things to consider:
* Additional fields that could help with analysis
* Metadata, such as when the record was last updated

In [None]:
# Using named groups, we can extract information from the taxi file names
# i.e. yellow_tripdata_2020-01.csv
# Raw string with an escaped dot, so only a literal ".csv" suffix matches
TAXI_DATA_PATTERN = r"(?P<service>[a-zA-Z0-9]+)_tripdata_(?P<year>[0-9]{4})-(?P<month>[0-9]{2})\.csv"

def extract_file_info(file_name):
    """
    Parse a taxi file name of the form <service>_tripdata_<yyyy>-<mm>.csv

    Returns:
        A (service, year, month) tuple of strings, or None when the name
        does not start with the expected pattern.
    """
    m = re.match(TAXI_DATA_PATTERN, file_name)
    if m is None:
        return None
    return (m.group("service"), m.group("year"), m.group("month"))
    
extract_file_info("yellow_tripdata_2020-01.csv")

### Lab 2.2 - Ingesting multiple taxi service types

See the [Taxi data website](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) for reference

Using the template in the next cell, create the following functions:
* Service specific transformations to match the schema below
* A general transformation function to apply metadata and other common transformations

Schema:

* pickup_datetime Timestamp
* dropoff_datetime Timestamp
* pickup_date Date
* dropoff_date Date
* passenger_count Integer
* fare_amount Float
* tip_amount Float
* trip_distance Float
* PULocationID Integer
* DOLocationID Integer

Metadata fields:  explore `f.lit` to add these columns
* service
* year
* month

Refer to `ingest_taxi_data_multi_service` to see how these functions will be used    

You may find some helpful info here: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html  

In [None]:
def transform_yellow_taxi(df):
    """
    Map a yellow taxi DataFrame onto the common schema: rename the tpep_*
    datetime columns to pickup_datetime / dropoff_datetime, keep the shared
    subset of columns and add dropoff_date / pickup_date (DateType casts).
    """
    renamed = (df
        .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime")
        .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime"))
    trimmed = renamed.select(
        'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance',
        'PULocationID', 'DOLocationID', 'fare_amount', 'tip_amount')
    with_dropoff_date = trimmed.withColumn(
        "dropoff_date", f.col("dropoff_datetime").cast(t.DateType()))
    return with_dropoff_date.withColumn(
        "pickup_date", f.col("pickup_datetime").cast(t.DateType()))
        
def transform_green_taxi(df):
    """
    Map a green taxi DataFrame onto the common schema: rename the lpep_*
    datetime columns to pickup_datetime / dropoff_datetime, keep the shared
    subset of columns and add dropoff_date / pickup_date (DateType casts).
    """
    renamed = (df
        .withColumnRenamed("lpep_pickup_datetime", "pickup_datetime")
        .withColumnRenamed("lpep_dropoff_datetime", "dropoff_datetime"))
    trimmed = renamed.select(
        'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance',
        'PULocationID', 'DOLocationID', 'fare_amount', 'tip_amount')
    with_dropoff_date = trimmed.withColumn(
        "dropoff_date", f.col("dropoff_datetime").cast(t.DateType()))
    return with_dropoff_date.withColumn(
        "pickup_date", f.col("pickup_datetime").cast(t.DateType()))

def transform_fhv(df):
    """
    Reduce a for-hire-vehicle DataFrame to the four columns it shares with
    the common schema: the pickup/dropoff datetimes and location ids.
    """
    fhv_columns = ("pickup_datetime", "dropoff_datetime", "PULocationID", "DOLocationID")
    return df.select(*fhv_columns)

def transform_all(df, service, year, month):
    """
    Apply the transformations common to every taxi service: add the
    `service`, `year` and `month` metadata as literal columns, then add
    `dropoff_date` and `pickup_date` as DateType casts of the datetime columns.
    """
    tagged = df
    # Metadata taken from the file name, stored as constant columns
    for column_name, value in (("service", service), ("year", year), ("month", month)):
        tagged = tagged.withColumn(column_name, f.lit(value))
    for prefix in ("dropoff", "pickup"):
        tagged = tagged.withColumn(
            prefix + "_date", f.col(prefix + "_datetime").cast(t.DateType()))
    return tagged

In [None]:
def ingest_taxi_data_multi_service(file_name, ingested_on=None):
    """
    Ingest one taxi data csv of any service type and append it as json to
    hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json

    The service, year and month are parsed from the file name and stored as
    metadata columns, together with an `ingested_on` column.

    Args:
        file_name: Full path of the csv; its last path component must look
            like <service>_tripdata_<yyyy>-<mm>.csv
        ingested_on: Value for the `ingested_on` column. Defaults to the
            current local time formatted as "%Y-%m-%d %H:%M:%S%z".

    Raises:
        ValueError: If the file name does not match the expected pattern.
    """
    print(f"Processing {file_name}")
    file_info = extract_file_info(Path(file_name).name)
    if file_info is None:
        # Without this check the tuple unpacking below fails with an opaque TypeError
        raise ValueError(f"Unrecognized taxi data file name: {file_name}")
    (service, year, month) = file_info
    if ingested_on is None:
        # astimezone() attaches the local UTC offset so that %z is not empty
        ingested_on = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S%z")
    input_df = spark.read.option('header', True).option('inferSchema', True).csv(file_name)
    
    if service == 'yellow':
        df_transform = transform_yellow_taxi(input_df)
    elif service == 'green':
        df_transform = transform_green_taxi(input_df)
    else:
        # FHV. What happens if there are more taxi services added?
        df_transform = transform_fhv(input_df)
        
    print(df_transform.dtypes)

    # Use the argument, not the notebook-level `ingest_timestamp` variable
    (transform_all(df_transform, service, year, month)
         .withColumn("ingested_on", f.lit(ingested_on))
         .write
         .mode("append")
         .json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
    )

In [None]:
# Remove previously ingested data so the multi-service ingest starts clean
shell_cmd("hdfs dfs -rm -r hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json/")

In [None]:
# astimezone() attaches the local UTC offset; on a naive datetime the %z
# directive would render as an empty string
ingest_timestamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S%z")
taxi_data_prefix = "s3://nyc-tlc/trip data/"
taxi_data_files = ["yellow_tripdata_2020-01.csv"]#, "green_tripdata_2020-01.csv", "fhv_tripdata_2020-01.csv", "fhvhv_tripdata_2020-01.csv"]
for file_name in taxi_data_files: 
    taxi_data_path = f"{taxi_data_prefix}{file_name}"
    ingest_taxi_data_multi_service(taxi_data_path, ingest_timestamp)
    
# Read the output back and check how many rows each service contributed
df_taxi_output = spark.read.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
df_taxi_output.show(5)
df_taxi_output.groupby("service").count().show()

### Handling bad data
How to design for the inevitability of bad data  
Reference: https://blog.knoldus.com/apache-spark-handle-corrupt-bad-records/

In [None]:
# Two well-formed json records followed by a malformed one: the last
# record has an unterminated 'fare_amount key and no value
bad_data = [
    "{'pickup_datetime': '2020-05-23 21:05:23', 'fare_amount': '0.05'}",
    "{'pickup_datetime': '2020-05-23 08:05:23', 'fare_amount': '10.05'}",
    "{'pickup_datetime': '2020-05-23 21:05:23', 'fare_amount}"
]

In [None]:
# PERMISSIVE mode: malformed records are kept, with their raw text placed in
# the column named by columnNameOfCorruptRecord
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="PERMISSIVE", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

In [None]:
# DROPMALFORMED mode: malformed records are left out of the result
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="DROPMALFORMED", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

In [None]:
# FAILFAST mode: reading raises an error when a malformed record is
# encountered, so this cell is expected to fail on bad_data
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="FAILFAST", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

### Lab 2.3 - Write an ingestion for the taxi zone lookup
File location - Yes, there is a space between taxi and the '_'  

s3://nyc-tlc/misc/taxi _zone_lookup.csv

`def ingest_taxi_lookup():`
1. Read taxi lookup data, ensuring data types are correct
1. Add relevant metadata
1. Save to hdfs:///tmp/data/nyc-taxi/zone-lookup/output/section2/json
1. What write mode should be used?

Refer back to Taxi Data page for more info: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [None]:
def taxi_zone_transform(df, ingested_on=None):
    """
    Add an `ingested_on` metadata column to the taxi zone lookup DataFrame.

    Args:
        df: Taxi zone lookup DataFrame.
        ingested_on: Value for the new column. Defaults to the current local
            time formatted as "%Y-%m-%d %H:%M:%S%z".
    """
    if ingested_on is None:
        # astimezone() attaches the local UTC offset so that %z is not empty
        ingested_on = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S%z")
    return df.withColumn("ingested_on", f.lit(ingested_on))

def ingest_taxi_lookup(ingest_timestamp):
    """
    Read the taxi zone lookup csv from S3, stamp every row with
    `ingest_timestamp` and write the result as json to
    hdfs:///tmp/data/nyc-taxi/zone-lookup/output/section2/json

    The write uses overwrite mode, so each run replaces the previous output.
    """
    (spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3://nyc-tlc/misc/taxi _zone_lookup.csv")
    # Hand the argument to the transform explicitly instead of relying on a
    # notebook-level variable that happens to share the parameter's name
    .transform(lambda zone_df: taxi_zone_transform(zone_df, ingest_timestamp))
    .write
    .mode("overwrite")
    .json("hdfs:///tmp/data/nyc-taxi/zone-lookup/output/section2/json"))

In [None]:
# astimezone() attaches the local UTC offset; on a naive datetime the %z
# directive would render as an empty string
ingest_timestamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S%z")
# Time the lookup ingest
print(timer_method("ingest_taxi_lookup(ingest_timestamp)"))

# Break

### Lab 2.4 - Case Study 1: Month over month, get the total count of pickups per borough
#### Do not blindly run these cells, you can bork your cluster

In [None]:
# HDFS locations of the json written by the taxi data ingest and the
# taxi zone lookup ingest earlier in this notebook
taxiPath = "hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json/"
taxiLookupPath = "hdfs:///tmp/data/nyc-taxi/zone-lookup/output/section2/json/"

In [None]:
# Join boroughs: attach the pickup and dropoff borough to every trip
# Expected error: cartesian join. Most likely a carryover bug from 2.0
spark.conf.set("spark.sql.crossJoin.enabled", "false") #<-- default
df_taxi = spark.read.json(taxiPath)
df_taxi_lookup = spark.read.json(taxiLookupPath)

# Keep only trips that have both a pickup and a dropoff time
taxi_filtered = (df_taxi
 .filter(df_taxi.pickup_datetime.isNotNull())
 .filter(df_taxi.dropoff_datetime.isNotNull()))
# First join: lookup borough for the pickup location
taxi_pu = (taxi_filtered
.join(df_taxi_lookup
       .select("LocationID", "Borough")
       .withColumnRenamed("Borough", "PUBorough"), 
       df_taxi_lookup.LocationID == df_taxi.PULocationID))
# Second join: lookup borough for the dropoff location
taxi = (taxi_pu.join(df_taxi_lookup
       .select("LocationID", "Borough")
       .withColumnRenamed("Borough", "DOBorough"), 
       df_taxi_lookup.LocationID == taxi_pu.DOLocationID))
taxi_pu.show()
taxi.show()

In [None]:
# Same joins as the previous cell, but with cross joins enabled; the query
# plan of the double join is printed instead of its rows
spark.conf.set("spark.sql.crossJoin.enabled", "true")
df_taxi = spark.read.json(taxiPath)
df_taxi_lookup = spark.read.json(taxiLookupPath)

# Keep only trips that have both a pickup and a dropoff time
taxi_filtered = (df_taxi
 .filter(df_taxi.pickup_datetime.isNotNull())
 .filter(df_taxi.dropoff_datetime.isNotNull()))
# First join: lookup borough for the pickup location
taxi_pu = (taxi_filtered
.join(df_taxi_lookup
       .select("LocationID", "Borough")
       .withColumnRenamed("Borough", "PUBorough"), 
       df_taxi_lookup.LocationID == df_taxi.PULocationID))
# Second join: lookup borough for the dropoff location
taxi = (taxi_pu.join(df_taxi_lookup
       .select("LocationID", "Borough")
       .withColumnRenamed("Borough", "DOBorough"), 
       df_taxi_lookup.LocationID == taxi_pu.DOLocationID))
taxi_pu.show()
taxi.explain()

In [None]:
def get_monthly_totals_pyspark(taxiPath, taxiLookupPath):
    """
    First attempt: join the taxi trips to the zone lookup on the pickup location.

    Reads both json datasets, drops trips without a pickup or dropoff time
    and joins on PULocationID == LocationID. Returns the joined DataFrame.
    """
    taxi = spark.read.json(taxiPath)
    taxi_lookup = spark.read.json(taxiLookupPath)
    taxi_filtered = (taxi
     .filter(taxi.pickup_datetime.isNotNull())
     .filter(taxi.dropoff_datetime.isNotNull()))
                     
    groupDF = taxi_filtered.join(taxi_lookup, taxi_filtered.PULocationID == taxi_lookup.LocationID)
    # Both ingests add an `ingested_on` column, so the joined frame carries
    # two columns with that name and this select is ambiguous
    groupDF.select("ingested_on").show() # expected error
    return groupDF

In [None]:
# Time the first pyspark attempt (it is expected to raise, see the function)
print(timer_method("get_monthly_totals_pyspark(taxiPath, taxiLookupPath)"))

In [None]:
def get_monthly_totals_pandas(taxiPath, taxiLookupPath):
    """
    Count pickups per month and borough using pandas only.

    Args:
        taxiPath: Location of the taxi trips, one json record per line.
        taxiLookupPath: Location of the taxi zone lookup, one json record per line.

    Returns:
        DataFrame with columns pickup_month (yyyymm string), Borough and
        count, sorted by count descending and then Borough ascending.

    NOTE(review): pandas reads a single file here, while the Spark ingest
    writes a directory of part files - confirm the paths passed in are
    readable by pandas before relying on this version.
    """
    # lines=True: the ingest writes one json object per line
    taxi = pd.read_json(taxiPath, lines=True)
    taxi_lookup = pd.read_json(taxiLookupPath, lines=True)
    taxi_filtered = taxi.dropna(subset=['pickup_datetime', 'dropoff_datetime'])
    
    # Only the borough is needed from the lookup; keeping the join narrow
    # also avoids clashes on columns that exist in both frames
    groupDF = taxi_filtered.join(taxi_lookup[['Borough', 'LocationID']].set_index('LocationID'), on='PULocationID')
    groupDF['pickup_month'] = pd.to_datetime(groupDF['pickup_datetime']).dt.strftime('%Y%m')
    # size() counts the rows per group; reset_index names that column `count`
    groupDF = (groupDF
               .groupby(['pickup_month', 'Borough'])
               .size()
               .reset_index(name='count')
               .sort_values(by=['count', 'Borough'], ascending=[False, True]))
    return groupDF

In [None]:
# Time the pandas-only implementation
print(timer_method("get_monthly_totals_pandas(taxiPath, taxiLookupPath)"))

In [None]:
def get_monthly_totals_pandas(taxiPath, taxiLookupPath):
    """
    Count pickups per month and borough, doing the aggregation in pandas.

    Both datasets are read with Spark and then collected to the driver as
    pandas DataFrames. Returns a DataFrame with columns pickup_month
    (yyyymm string), Borough and count, sorted by pickup_month descending,
    count descending and Borough ascending.
    """
    trips_spark = spark.read.json(taxiPath)
    zones_spark = spark.read.json(taxiLookupPath)

    # toPandas() pulls the full dataset into driver memory
    trips = trips_spark.toPandas()
    zones = zones_spark.toPandas()

    complete_trips = trips.dropna(subset=['pickup_datetime', 'dropoff_datetime'])
    borough_by_location = zones[["Borough", "LocationID"]].set_index('LocationID')
    with_borough = complete_trips.join(borough_by_location, on='PULocationID')
    with_borough['pickup_month'] = pd.to_datetime(with_borough['pickup_datetime']).dt.strftime('%Y%m')

    pickup_counts = with_borough.groupby(['pickup_month', 'Borough']).size().reset_index(name='count')
    return pickup_counts.sort_values(
        by=['pickup_month', 'count', 'Borough'],
        ascending=[False, False, True])

def get_monthly_totals_pyspark(taxiPath, taxiLookupPath):
    """
    Count pickups per month and borough with Spark.

    Shows the result and returns it as a DataFrame with columns
    pickup_month (yyyyMM string), borough and count, ordered by pickup_month
    descending, count descending and borough ascending.
    """
    trips = spark.read.json(taxiPath)
    zones = spark.read.json(taxiLookupPath)

    complete_trips = (trips
        .filter(trips.pickup_datetime.isNotNull())
        .filter(trips.dropoff_datetime.isNotNull()))

    # Only the borough and its id are needed from the lookup
    monthly_totals = (complete_trips
        .join(zones.select("Borough", "LocationID"), complete_trips.PULocationID == zones.LocationID)
        .withColumn("pickup_month", f.date_format("pickup_datetime", "yyyyMM"))
        .groupBy("pickup_month", "borough")
        .count()
        .orderBy(f.desc("pickup_month"), f.desc("count"), "borough"))
    monthly_totals.show()
    return monthly_totals

In [None]:
# Running this command with the original cluster size will crash the cluster.
# All functions utilizing pandas from this command forward need an upscaled driver node.
print(timer_method("get_monthly_totals_pandas(taxiPath, taxiLookupPath)"))

In [None]:
# Time the Spark implementation for comparison with the pandas one
print(timer_method("get_monthly_totals_pyspark(taxiPath, taxiLookupPath)"))

In [None]:
## Expected error for maxResultSize: This won't work. Could try the subsequent cells
## Those restart the state of the notebook and don't work as expected
## Need to restart the cluster and edit the Software config with: [{"classification":"spark-defaults", "properties":{"spark.driver.maxResultSize":"5G", "spark.ui.killEnabled":"true"}, "configurations":[]}]
## Then need to rerun the taxi and taxi lookup ingests
## Run -> Run All Above Selected Cell
## Second expected error for {"msg":"requirement failed: Session isn't active."} and will hang. Driver node ran out of mem. Will need to go and upscale
# Print the setting before and after attempting to change it at runtime
print(spark.conf.get('spark.driver.maxResultSize'))
spark.conf.set("spark.driver.maxResultSize", "5G")
print(spark.conf.get('spark.driver.maxResultSize'))

In [None]:
%%configure -f
{"conf":{"spark.driver.maxResultSize":"5G"}}

In [None]:
%%info

In [None]:
def get_monthly_totals_concat_pandas(taxiPath, taxiLookupPath):
    """
    Count pickups per month and borough in pandas, building the month key
    from the ingested `year` and `month` metadata columns instead of parsing
    the pickup timestamp.

    Returns a DataFrame with columns pickup_month, Borough and count, sorted
    by pickup_month descending, count descending and Borough ascending.
    """
    trips_spark = spark.read.json(taxiPath)
    zones_spark = spark.read.json(taxiLookupPath)

    # toPandas() pulls the full dataset into driver memory
    trips = trips_spark.toPandas()
    zones = zones_spark.toPandas()

    complete_trips = trips.dropna(subset=['pickup_datetime', 'dropoff_datetime'])
    borough_by_location = zones[["Borough", "LocationID"]].set_index('LocationID')
    with_borough = complete_trips.join(borough_by_location, on='PULocationID')
    with_borough['pickup_month'] = with_borough['year'] + with_borough['month']

    pickup_counts = with_borough.groupby(['pickup_month', 'Borough']).size().reset_index(name='count')
    return pickup_counts.sort_values(
        by=['pickup_month', 'count', 'Borough'],
        ascending=[False, False, True])
    
def get_monthly_totals_concat_pyspark(taxiPath, taxiLookupPath):
    """
    Count pickups per month and borough with Spark, building the month key
    by concatenating the ingested `year` and `month` metadata columns.

    Shows the result and returns it as a DataFrame with columns
    pickup_month, borough and count, ordered by pickup_month descending,
    count descending and borough ascending.
    """
    trips = spark.read.json(taxiPath)
    zones = spark.read.json(taxiLookupPath)
    complete_trips = (trips
        .filter(trips.pickup_datetime.isNotNull())
        .filter(trips.dropoff_datetime.isNotNull()))

    joined = complete_trips.join(zones, complete_trips.PULocationID == zones.LocationID)
    keyed = (joined
        .withColumn("pickup_month", f.concat("year", "month"))
        .select("pickup_datetime", "borough", "pickup_month"))
    monthly_totals = (keyed
        .groupBy("pickup_month", "borough")
        .count()
        .orderBy(f.desc("pickup_month"), f.desc("count"), "borough"))
    monthly_totals.show()
    return monthly_totals

In [None]:
# Time the pandas variant that builds the month key from year + month
print(timer_method("get_monthly_totals_concat_pandas(taxiPath, taxiLookupPath)"))

In [None]:
# Time the Spark variant that builds the month key from year + month
print(timer_method("get_monthly_totals_concat_pyspark(taxiPath, taxiLookupPath)"))

### Lab 2.5 - Case Study 2: Month over month, get the borough with the most amount of pickups per month

In [None]:
def get_most_pickups_per_month_pandas(taxiPath, taxiLookupPath):
    """
    Return, for every pickup month, the first row of the monthly totals.

    Takes the first row of each pickup_month group in the order produced by
    get_monthly_totals_pandas, and renumbers the index from zero.
    """
    monthly_totals = get_monthly_totals_pandas(taxiPath, taxiLookupPath)
    first_per_month = monthly_totals.groupby("pickup_month").head(1)
    return first_per_month.reset_index(drop=True)

def get_most_pickups_per_month_pyspark(taxiPath, taxiLookupPath):
    """
    Return one borough per pickup month, taken as the first borough of each
    month after ordering the monthly totals by month and count (descending).

    Prints the query plan, shows the result and returns it.

    NOTE(review): this depends on first() seeing the rows in the order set
    by the preceding orderBy - confirm that the ordering is guaranteed to
    survive the groupBy.
    """
    monthly_totals = get_monthly_totals_pyspark(taxiPath, taxiLookupPath)
    ranked = monthly_totals.orderBy(f.desc("pickup_month"), f.desc("count"))
    top_per_month = ranked.groupBy("pickup_month").agg(f.first("borough"))
    result = top_per_month.orderBy(f.desc("pickup_month"))
    result.explain()
    result.show()
    return result

In [None]:
# Time the pandas implementation of the top borough per month
print(timer_method("get_most_pickups_per_month_pandas(taxiPath, taxiLookupPath)"))

In [None]:
# Time the Spark implementation of the top borough per month
print(timer_method("get_most_pickups_per_month_pyspark(taxiPath, taxiLookupPath)"))

In [None]:
def get_most_pickups_per_month_window_pyspark(taxiPath, taxiLookupPath):
    """
    Return the top row per pickup month using a window function.

    Rows are numbered within each pickup_month by descending count and only
    row number 1 is kept. Prints the query plan, shows every result row and
    returns the DataFrame (which includes the helper `row_num` column).
    """
    monthly_totals = get_monthly_totals_pyspark(taxiPath, taxiLookupPath)
    by_month_desc_count = Window.partitionBy("pickup_month").orderBy(f.desc("count"))
    numbered = monthly_totals.withColumn("row_num", f.row_number().over(by_month_desc_count))
    top_per_month = numbered.where("row_num == 1").orderBy(f.desc("pickup_month"))
    top_per_month.explain()
    # Pass the row count so that show() does not stop at its default limit
    top_per_month.show(top_per_month.count())
    return top_per_month

In [None]:
# Time the window-function implementation of the top borough per month
print(timer_method("get_most_pickups_per_month_window_pyspark(taxiPath, taxiLookupPath)"))

### Lab - 2.6 Run and time the overall pipeline

In [None]:
# Reset notebook kernel
def ingest_main():
    """
    Run the whole pipeline once: ingest the yellow taxi data, ingest the
    taxi zone lookup, then compute the borough with the most pickups per month.

    Both ingests are stamped with the same ingestion timestamp. The analysis
    step reads from the notebook-level `taxiPath` and `taxiLookupPath`.
    """
    # astimezone() attaches the local UTC offset so that %z is not empty
    ingested_on = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S%z")
    ingest_taxi_data_multi_service("s3://nyc-tlc/trip data/yellow_tripdata_2020-01.csv", ingested_on)
    # ingest_taxi_lookup takes the ingestion timestamp; the csv location is
    # fixed inside the function
    ingest_taxi_lookup(ingested_on)
    get_most_pickups_per_month_window_pyspark(taxiPath, taxiLookupPath)

In [None]:
# Time the end-to-end pipeline
print(timer_method("ingest_main()"))