In [None]:
# Install libraries within the notebook scope
for pypi_package in ("boto3", "pandas", "requests", "s3fs"):
    sc.install_pypi_package(pypi_package)

In [None]:
import boto3
from datetime import datetime
import pandas as pd
from pyspark.sql.dataframe import DataFrame
from pyspark.sql import functions as f, types as t
from pathlib import Path
import re
import requests
import s3fs
import subprocess
import timeit
from urllib.parse import urlparse

# Removes truncation of columns, column values in Pandas
# by default.
# The fully qualified option names are used because short names are resolved
# by pattern matching, and pandas raises an OptionError when a pattern matches
# more than one option (newer releases also define `styler.render.max_columns`).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Monkey patching the DataFrame transform method for Spark 2.4
# This is available by default in Spark 3.0
def transform(self, f):
    """
    Apply `f` to this dataframe and return the result, enabling chained custom transformations.

    f: function that takes a dataframe and returns a dataframe
       (inside this function the name shadows the `functions as f` import)
    returns: whatever f(self) returns
    """
    return f(self)

# Only patch when the method is missing, so the native Spark 3 implementation is not replaced
if not hasattr(DataFrame, "transform"):
    DataFrame.transform = transform

# Override the timeit template to return the command's
# return value in addition to the time
# Reference: https://stackoverflow.com/questions/24812253/how-can-i-capture-return-value-with-python-timeit-module
# Notes on the override:
# * the statement is spliced in as `retval = {stmt}`, so it has to be a single expression
# * the override is set on the timeit module itself, so every timeit call made after this
#   cell runs returns an (elapsed time, retval) tuple rather than a bare float
timeit.template = """
def inner(_it, _timer{init}):
    {setup}
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""

def shell_cmd(cmd):
    """
    Run `cmd` through the shell and print its standard output line by line.

    cmd: command line string; it is handed to the shell as-is, so only pass trusted input
    raises: subprocess.CalledProcessError if the command exits with a non-zero status
    """
    output = subprocess.check_output(cmd, shell=True)
    # Decode so the lines print as text instead of bytes reprs (b'...'); splitlines()
    # also drops the empty trailing entry that split(b'\n') produced
    for line in output.decode("utf-8", errors="replace").splitlines():
        print(line)

def timer_method(cmd):
    """
    Time a single execution of `cmd` using timeit.

    cmd: expression (as a string) to run; it is evaluated against this module's globals
         so it can reference names defined in the notebook
    returns: the result of timeit.timeit - with the timeit.template override in place
             that is a (run time, value produced by cmd) tuple
    """
    notebook_namespace = globals()
    return timeit.timeit(stmt=cmd, number=1, globals=notebook_namespace)

# Ingesting from an S3 bucket - NYC Taxi Data

https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
* Taxi data 
* Data dictionaries 
* Taxi zone lookup table

Take a few minutes to look through the Data Dictionaries and Metadata and the Taxi Zone Maps and Lookup Tables. What are some things you notice about the data?

Data ingestion has the ultimate goal of collecting, aggregating, and surfacing data for a specific purpose; an analysis, an API, a dashboard, etc. Think about how you might use the taxi data to answer the following questions:

1. Which borough is the most popular pickup or drop off spot?
1. Are green taxis more popular for trips within the same borough vs yellow taxis?
1. Build a recommendation engine that predicts surge pricing for a given time of day based on historical data  

With this in mind, let's work through bringing this data onto the cluster.

In [None]:
# Note, if you copy the link from the taxi data website you will see:
# https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv
# Two things - first, the portion of the URL following "aws.com" is the 
# bucket name. Second, in "trip+data" the "+" is a space
taxi_data_path = "s3://nyc-tlc/trip data/yellow_tripdata_2020-01.csv"

# When working with big data it can be challenging to view the data. How would you 
# go about getting a sample of this data? (download it, use requests, pandas, etc)

# Pandas uses s3fs to read_csv from s3:
pd_run_time, pd_df_taxi = timer_method("pd.read_csv(taxi_data_path, keep_default_na=False)")
print(f"runtime: {pd_run_time}")

In [None]:
# Take a look at the data. Notice how pandas will try to assign types. Is this desirable?
# Why or why not?
# Since we have column names it also seems this data has a header
pd_df_taxi.dtypes

In [None]:
# For reference, look at the Spark DataFrameReader, csv:
# https://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html
ps_run_time, ps_df_taxi = timer_method("spark.read.option('header', True).csv(taxi_data_path)")
print(f"runtime: {ps_run_time}")

# Talk through the Spark UI here; particularly note that the cell number will show up in the Spark UI
# Job list next to the Job id. For example, this would be Job 2 (cell number)
# Look at runtime and talk about lazy evaluation

In [None]:
ps_df_taxi.show(10, False)

In [None]:
# Lets see how the spark dataframe reader interpreted the data
# Talk about nullable vs non nullable and maybe a small bit about data schemas
ps_df_taxi.printSchema()

In [None]:
# Talk through ingest practices around retaining original data vs augmenting
# For example, we may want to keep the data in its default format so we can
# refer back to it if there are bugs in our data ingestion code
# mode("overwrite") keeps this cell re-runnable: the default save mode raises an
# error when the output path already exists
ps_df_taxi.write.mode("overwrite").option("header", True).csv("hdfs:///tmp/input/taxi_data")

In [None]:
# Discuss how spark writes files out
shell_cmd("hdfs dfs -ls hdfs:///tmp/input/taxi_data")

In [None]:
# Examine the data, what do you notice?
column_subset = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID', 'fare_amount', 'tip_amount']
ps_df_taxi.select(*column_subset).show(10, False)

In [None]:
# Keep in mind the datatype when considering these results
ps_df_taxi.select(*column_subset).describe().show()

In [None]:
# Casting columns to a type
(ps_df_taxi.select("passenger_count")
 .withColumn("passenger_count", f.col("passenger_count").cast(t.IntegerType()))
).distinct().show()

In [None]:
# Casting pandas columns to a type - this will give an error on empty cells
(pd_df_taxi[[*column_subset]]
         .astype({'passenger_count': 'Int64'}))

In [None]:
# To convert to Integer using pandas, we have to first deal with the null values
# to_numeric with 'coerce' will fill invalid integer values with np.NaN
# the Int64 type in later versions of pandas will convert np.NaN to a nullable
# integer type: https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html
pd.to_numeric(pd_df_taxi.passenger_count, errors='coerce').astype('Int64').unique()

# Some other type conversions you may find useful:
# pd.to_datetime(pd_df_taxi.tpep_pickup_datetime)
# pd.to_numeric produces float type

### Lab 2.1 - Transform the column_subset of taxi data to data types that accurately represent the data

Result: a transformed_taxi dataframe with the column_subset columns cast to an appropriate type

Available types are listed in the pyspark.sql.types module https://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#module-pyspark.sql.types  

This is imported as `t`, so to apply the IntegerType use `t.IntegerType()`

For pandas, see the following resources on converting types
https://stackoverflow.com/questions/15891038/change-column-type-in-pandas

In [None]:
# Talk about transforming
# the data into a type schema to surface for analytical operations - i.e. pay the penalty of time
# on the ingest to transform strings to timestamps so the analysis side can use datetime methods
# without having to remember to cast


def transform_taxi_ps(ps_df_taxi):
    """
    ps_df_taxi: pyspark dataframe
    returns: dataframe of column_subset with types applied to all fields
    """

def transform_taxi_pd(pd_df_taxi):
    """
    pd_df_taxi: pandas dataframe
    returns: dataframe of column_subset with types applied to all fields
    """



In [None]:
ps_run_time, ps_transform_taxi = timer_method("transform_taxi_ps(ps_df_taxi)")
pd_run_time, pd_transform_taxi = timer_method("transform_taxi_pd(pd_df_taxi)")
print(f"pyspark runtime: {ps_run_time} pandas runtime {pd_run_time}")

In [None]:
pd_transform_taxi.head()

In [None]:
ps_transform_taxi.show(5)

In [None]:
# Note that this only shows results for numeric and string columns - now that the data has been cast to 
# types, we can explore it a bit more
ps_transform_taxi.describe().show()

In [None]:
# Take a look at the date ranges...
ps_transform_taxi.select("tpep_pickup_datetime").sort(f.asc("tpep_pickup_datetime")).show(10, False)

In [None]:
# Write out transformed data to EBS
ps_transform_taxi.write.mode("append").json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")

### Lab 2.2 - Write an ingestion function that does the following:
Given a file path to a taxi data csv (i.e. s3://nyc-tlc/trip data/green_tripdata_2020-01.csv)
1. Read the file into a Spark dataframe
2. Transform the column_subset
3. Write the data as json to hdfs in append mode to `hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json`

Function signature:  
`def ingest_taxi_data(file_name)`

Inputs can be created from:  
`taxi_data_prefix = "s3://nyc-tlc/trip data/"
taxi_data_files = ["yellow_tripdata_2019-01.csv", "yellow_tripdata_2018-01.csv", "yellow_tripdata_2017-01.csv"]  `

In [None]:
def ingest_taxi_data(file_name):
    """
    file_name: path to file, i.e. s3://bucket/file.csv
    """

In [None]:
# Run the ingest for several files
taxi_data_prefix = "s3://nyc-tlc/trip data/"
taxi_data_files = ["yellow_tripdata_2019-01.csv", "yellow_tripdata_2018-01.csv", "yellow_tripdata_2017-01.csv"]
for file_name in taxi_data_files: 
    taxi_data_path = f"{taxi_data_prefix}{file_name}"
    ingest_taxi_data(taxi_data_path)

In [None]:
df = spark.read.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
df.printSchema()

# Talk through how the read.json interpreted the Integers as Longs, set stage for using schemas

### Testing ingestion code

The `ingest_taxi_data` method is not well structured for testing:
* Writes to the file system
* Requires an input file to test
* What other shortcomings?

To make this code more testable, split out the transformation logic so it can be unit tested.  
Defining a transformation function that takes a dataframe and returns a dataframe provides a better interface for unit testing, and a more extensible structure in case we need to add more dataframe functions before or after the transformation step.

In [None]:
def transform_taxi_data(df):
    """
    Cast the column_subset fields of the yellow taxi data to typed columns.

    df: pyspark dataframe containing the columns listed below
    returns: dataframe with each listed column replaced by its cast version
    """
    target_types = [
        ("tpep_pickup_datetime", t.TimestampType()),
        ("tpep_dropoff_datetime", t.TimestampType()),
        ("passenger_count", t.IntegerType()),
        ("trip_distance", t.FloatType()),
        ("PULocationID", t.IntegerType()),
        ("DOLocationID", t.IntegerType()),
        ("fare_amount", t.FloatType()),
        ("tip_amount", t.FloatType()),
    ]
    typed_df = df
    # Apply the casts one column at a time, in the order listed
    for column_name, spark_type in target_types:
        typed_df = typed_df.withColumn(column_name, f.col(column_name).cast(spark_type))
    return typed_df
    
def ingest_taxi_data_transform(file_name):
    """
    Read a taxi csv, cast the column_subset and append the result as json to hdfs.

    file_name: path to file, i.e. s3://bucket/file.csv. A bare file name (no "/") cannot
               be read on its own; in that case the notebook-level `taxi_data_path`
               variable is read instead, which is how existing cells call this function.
    """
    # Requires patching of Dataframe.transform method in Spark 2.4, but available natively
    # in Spark 3.0 https://mungingdata.com/pyspark/chaining-dataframe-transformations/
    input_path = file_name if "/" in file_name else taxi_data_path
    (spark
         .read
         .option('header', True).csv(input_path)
         .select(*column_subset)
         .transform(transform_taxi_data)
         .write
         .mode("append")
         .json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
    )
    
def ingest_taxi_data_method(file_name):
    """
    Read a taxi csv, cast the column_subset and append the result as json to hdfs.

    Equivalent code without using the monkey-patched transform method for DataFrame.

    file_name: path to file, i.e. s3://bucket/file.csv. A bare file name (no "/") cannot
               be read on its own; in that case the notebook-level `taxi_data_path`
               variable is read instead, which is how existing cells call this function.
    """
    input_path = file_name if "/" in file_name else taxi_data_path
    df_input = (spark
         .read
         .option('header', True).csv(input_path)
         .select(*column_subset))

    (transform_taxi_data(df_input)
         .write
         .mode("append")
         .json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
    )

In [None]:
test_data = [
    "{'tpep_pickup_datetime': '2020-05-23 21:05:23', 'tpep_dropoff_datetime': '2020-05-23 08:05:23', 'passenger_count': 0, 'trip_distance': 10.5, 'PULocationID': 1, 'DOLocationID': 254, 'fare_amount': 0.05, 'tip_amount': 1.00}",
    "{'tpep_pickup_datetime': '2020-10-01 01:05:23', 'tpep_dropoff_datetime': '2020-10-01 02:05:23', 'passenger_count': 1, 'trip_distance': 0.1, 'PULocationID': 45, 'DOLocationID': 3, 'fare_amount': 10.0, 'tip_amount': 5.00}",
    "{'tpep_pickup_datetime': '2020-02-02 15:22:23', 'tpep_dropoff_datetime': '2020-02-03 15:44:23', 'passenger_count': 3, 'trip_distance': 3.25, 'PULocationID': 10, 'DOLocationID': 24, 'fare_amount': 5.05, 'tip_amount': 1.00}"
]
expected_types = {'DOLocationID': 'int', 'PULocationID': 'int', 'fare_amount': 'float', 'passenger_count': 'int', 'tip_amount': 'float', 'tpep_dropoff_datetime': 'timestamp', 'tpep_pickup_datetime': 'timestamp', 'trip_distance': 'float'}

test_df = spark.read.json(sc.parallelize(test_data))
test = transform_taxi_data(test_df)
test_types = {item[0]:item[1] for item in test.dtypes}

assert expected_types == test_types

In [None]:
shell_cmd("hdfs dfs -rm hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json/*")

In [None]:
# Re run the ingestion using the functions with the transformation broken out
taxi_data_prefix = "s3://nyc-tlc/trip data/"
taxi_data_files = ["yellow_tripdata_2019-01.csv", "yellow_tripdata_2018-01.csv", "yellow_tripdata_2017-01.csv"]
for file_name in taxi_data_files:
    taxi_data_path = f"{taxi_data_prefix}{file_name}"
    # Pass the full path - the ingest functions take a path to the file, not a bare file name
    ingest_taxi_data_method(taxi_data_path)
    #ingest_taxi_data_transform(taxi_data_path)

In [None]:
df = spark.read.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
df.count()

#### Let's try running the ingestion code on the other taxi data sets

In [None]:
# Try using the ingest code we created for yellow taxi for all the taxis
# This will fail because the datetime fields have different names across different services

taxi_data_prefix = "s3://nyc-tlc/trip data/"
taxi_data_files = ["green_tripdata_2020-01.csv", "fhv_tripdata_2020-01.csv", "fhvhv_tripdata_2020-01.csv"]
for file_name in taxi_data_files:
    taxi_data_path = f"{taxi_data_prefix}{file_name}"
    # Pass the full path - the ingest functions take a path to the file, not a bare file name
    ingest_taxi_data_transform(taxi_data_path)

### Data modeling
Create a model for all taxi data, given that there are differences across the services in the kind of data collected

Common fields across all services:
* PULocationID
* DOLocationID

Fields we want to normalize across all services - this data is in all services but is named differently
* pickup datetime
* drop off datetime

Service specific fields. These are only in green or yellow data
* Passenger_count
* Trip_distance
* Fare_amount
* Tip_amount


### Lab 2.3 - Using pyspark, write different transformation functions for each taxi service type to match the following signature and schema:
Fields / Types:

* pickup_datetime Timestamp
* dropoff_datetime Timestamp
* passenger_count Integer
* fare_amount Float
* tip_amount Float
* PULocationID Integer
* DOLocationID Integer

`def transform_function(dataframe):  
    return transformed_dataframe
`

Once the transformation functions are done, rewrite `ingest_taxi_data` to use these new functions depending on the file being processed

In [None]:
def transform_yellow_taxi(df):

        
def transform_green_taxi(df):

 
def transform_fhv(df):
    

Now that we have the ability to read all the taxi data into the same dataset, how will we be able to tell where the original data came from? The file name provides information including:
* Service type (yellow, green, etc)
* File date

We want to augment the taxi data with this information so we can refer back to it in analysis.

Is there other data we might want to augment the raw data with? Some things to consider:
* Additional fields that could help with analysis
* Metadata, like when the record was last updated

In [None]:
# Using matched groups, we can extract information from the taxi file names
# Raw string with the "." escaped, so only a literal ".csv" suffix is accepted
TAXI_DATA_PATTERN = r"(?P<service>[a-zA-Z0-9]+)_tripdata_(?P<year>[0-9]{4})-(?P<month>[0-9]{2})\.csv"

def extract_file_info(file_name):
    """
    Parse the service, year and month out of a taxi data file name.

    file_name: base name of a taxi data file, i.e. yellow_tripdata_2020-01.csv
    returns: (service, year, month) tuple of strings, or None if file_name does not
             match TAXI_DATA_PATTERN
    """
    m = re.match(TAXI_DATA_PATTERN, file_name)
    if m is None:
        return None
    return m.group("service", "year", "month")


In [None]:
def ingest_taxi_data_multi_service(file_name):
    """
    Ingest one taxi data csv of any service type and append it as json to hdfs.

    The service, year and month parsed from the file name are added as columns so
    records can be traced back to the file they came from.

    file_name: path to file, i.e. s3://bucket/yellow_tripdata_2020-01.csv
    raises: ValueError if extract_file_info cannot parse the file name
    """
    print(f"Processing {file_name}")
    file_info = extract_file_info(Path(file_name).name)
    if file_info is None:
        # Fail with a readable message rather than a tuple-unpacking TypeError
        raise ValueError(f"Unrecognized taxi data file name: {file_name}")
    (service, year, month) = file_info
    input_df = spark.read.option('header', True).csv(file_name)

    if service == 'yellow':
        df_transform = transform_yellow_taxi(input_df)
    elif service == 'green':
        df_transform = transform_green_taxi(input_df)
    else:
        # FHV. What happens if there are more taxi services added?
        df_transform = transform_fhv(input_df)

    (df_transform
         .withColumn("service", f.lit(service))
         .withColumn("year", f.lit(year))
         .withColumn("month", f.lit(month))
         .write
         .mode("append")
         .json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")
    )

In [None]:
shell_cmd("hdfs dfs -rm -r hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json/")

In [None]:
taxi_data_prefix = "s3://nyc-tlc/trip data/"
taxi_data_files = ["yellow_tripdata_2020-01.csv", "green_tripdata_2020-01.csv", "fhv_tripdata_2020-01.csv", "fhvhv_tripdata_2020-01.csv"]
for file_name in taxi_data_files: 
    taxi_data_path = f"{taxi_data_prefix}{file_name}"
    ingest_taxi_data_multi_service(taxi_data_path)

In [None]:
df_taxi_output = spark.read.json("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json")

In [None]:
df_taxi_output.groupby("service").count().show()

### Handling bad data
How to design for the inevitability of bad data  
Reference: https://blog.knoldus.com/apache-spark-handle-corrupt-bad-records/

In [None]:
bad_data = [
    "{'pickup_datetime': '2020-05-23 21:05:23', 'fare_amount': '0.05'}",
    "{'pickup_datetime': '2020-05-23 08:05:23', 'fare_amount': '10.05'}",
    "{'pickup_datetime': '2020-05-23 21:05:23', 'fare_amount}"
]

In [None]:
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="PERMISSIVE", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

In [None]:
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="DROPMALFORMED", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

In [None]:
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="FAILFAST", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

### Lab 2.4 - Write an ingestion for the taxi zone lookup
File location - Yes, there is a space between taxi and the '_'  

s3://nyc-tlc/misc/taxi _zone_lookup.csv

`def ingest_taxi_lookup():`
1. Read the taxi zone lookup data
1. Cast to correct data types
1. Save to hdfs:///tmp/data/nyc-taxi/zone-lookup/output/section2/json

Refer back to Taxi Data page for more info: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page  
Remember to structure your code for testability

In [None]:
def ingest_taxi_lookup():


In [None]:
print(timer_method("ingest_taxi_lookup()"))

# Break

# Weather data ingestion

NOAA GHCND dataset  
https://docs.opendata.aws/noaa-ghcn-pds/readme.html  

Scroll down to 'FORMAT OF “ghcnd-stations.txt” file' for the schema of the fixed-width stations data


In [None]:
ghcnd_stations_path = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt"
stations_s3 = "s3://data-scale-oreilly/data/ghcnd/stations/input/ghcnd_stations.txt"
stations_local = "hdfs:///tmp/data/ghcnd/stations/input/ghcnd-stations.txt"
stations_output = "hdfs:///tmp/data/ghcnd/stations/output/ghcnd-stations.txt"

In [None]:
# Workaround reading HTTPS -> HDFS, HTTPS -> S3 -> HDFS
# Spark cant read data directly from HTTP, so copy the file to S3 and read into a dataframe from there
# Then save the file to HDFS for further processing
# astimezone() attaches the local UTC offset; on a naive datetime %z renders as an empty string
ingest_timestamp = datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S%z")
# A timeout stops the cell from hanging indefinitely if the server does not respond
resp = requests.get(ghcnd_stations_path, timeout=60)
if resp.status_code != 200:
    print("Couldn't get station data")
else:
    # Derive bucket and key from stations_s3 so the upload target and the path read
    # below cannot drift apart
    stations_s3_url = urlparse(stations_s3)
    s3 = boto3.client('s3')
    res = s3.put_object(Body=resp.content, Bucket=stations_s3_url.netloc, Key=stations_s3_url.path.lstrip("/"))
    if res['ResponseMetadata']['HTTPStatusCode'] != 200:
        print(f"Unable to create ghcnd_stations.txt in s3, response {res['ResponseMetadata']['HTTPStatusCode']}")
    else:
        (spark
         .read
         .text(stations_s3)
         .write
         .format("text")
         .mode("overwrite")
         .save(stations_local))

In [None]:
# Take a look at the stations file we just saved to HDFS
stations = spark.read.text(stations_local)
stations.show(10, False)

In [None]:
# Example of doing a substring transformation
# Column.substr positions are 1-based, so (1, 11) selects the 11 character station id
(stations
    .withColumn("id", f.col("value").substr(1, 11))
    .drop("value")
).show(10, False)

In [None]:
# How does this work using pandas?
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_fwf.html
# colspecs are half-open [from, to) intervals, so (0, 11) covers the 11 character station id
# header=None stops the first station record from being consumed as a header row
pd.read_fwf(stations_s3, colspecs=[(0, 11)], header=None, names=["id"])

### Lab 2.5 - Build the ingestion for the weather stations data

Reference the fixed width schema provided under **FORMAT OF “ghcnd-stations.txt” file**   
https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt  
#### Important note - FORMAT OF is incorrect for: elevation - should be 7 char not 6

Create transformation code to convert the value column into the following schema:
* id String
* lat Float
* long Float
* elevation Float
* state String
* name String

Drop the value column, save the data in JSON format to s3://data-scale-oreilly/data/ghcnd/stations/output/section2_json 

In [None]:
def transform_stations_ps(stations_local):
    """
    stations_local: Path to fixed width data
    returns: pyspark dataframe
    """

def transform_stations_pd(stations_s3):
    """
    stations_s3: Path to fixed width data
    returns: pandas dataframe
    """
    
    

In [None]:
ps_run_time, stations_df = timer_method("transform_stations_ps(stations_local)")
pd_run_time, pd_stations_df = timer_method("transform_stations_pd(stations_s3)")
print(f"pyspark runtime: {ps_run_time} pandas runtime {pd_run_time}")

#### A key aspect of designing scalable systems is to be judicious about the data being stored and processed. 
#### The GHCND stations file contains data on stations across the US, but we are only interested in data near NYC

In [None]:
stations_df.count()

In [None]:
# Lets look at some performance tradeoffs between pyspark and pandas
# The .toPandas() function in pyspark can be convenient if you are familiar with pandas manipulations
# but this can quickly become very expensive as it collects all data on the driver to do the conversion.

def filter_ny_stations_pandas(stations_df, max_lat=40.9124):
    """
    Filter the stations down to NYC by converting to pandas on the driver.

    stations_df: pyspark dataframe with at least `state` and `lat` columns
    max_lat: stations with a latitude below this value are kept; the default is the
             latitude of south Yonkers (~40.9124)
    returns: pyspark dataframe of the NY stations south of max_lat
    """
    ny_stations = stations_df.filter("state == 'NY'")

    # toPandas() collects every remaining row onto the driver
    ny_pandas = ny_stations.toPandas()
    # Strip padding from string values only, so columns that are already numeric
    # (where the .str accessor would raise) pass through untouched
    ny_pandas = ny_pandas.apply(
        lambda column: column.map(lambda value: value.strip() if isinstance(value, str) else value))
    # filter down to just stations in NY in NYC
    lat = pd.to_numeric(ny_pandas['lat'])
    nyc_stations = ny_pandas[lat < max_lat]
    return spark.createDataFrame(nyc_stations)

def filter_ny_stations_pyspark(stations_df):
    """
    Filter the stations down to NYC using pyspark operations only.

    stations_df: pyspark dataframe with at least `state` and `lat` columns
    returns: pyspark dataframe of NY stations with `lat` cast to float and below 40.9124
    """
    print("Filtering stations to NY only")
    lat_as_float = f.col("lat").cast(t.FloatType())
    return (stations_df
            .filter("state == 'NY'")
            .withColumn("lat", lat_as_float)
            .filter("lat < 40.9124"))

In [None]:
pd_run_time, pd_ny_stations = timer_method("filter_ny_stations_pandas(stations_df)")
print(f"runtime {pd_run_time}")

In [None]:
# Are these runtimes really comparable with lazy eval? I think we need to do some combined actions, 
# like filter and aggregate, to look at performance tradeoffs
ps_run_time, ps_ny_stations = timer_method("filter_ny_stations_pyspark(stations_df)")
print(f"runtime {ps_run_time}")

### Lab 2.6 - Write two ingest functions for the stations data, one using filter_ny_stations_pandas and the other using filter_ny_stations_pyspark. What do you notice about the differences?

The functions should:
* Read the station data from local or s3
* Transform the station data into columns from the fixed width format
* Use the above filter functions
* Write the output to stations_output as json in overwrite mode

In [None]:
def ingest_station_pandas():
    
    
def ingest_station_pyspark():


In [None]:
#shell_cmd("hdfs dfs -rm -r hdfs:///tmp/data/test_pyspark")

In [None]:
ps_result = timer_method("ingest_station_pyspark()")
pd_result = timer_method("ingest_station_pandas()")
print(f"pandas: {pd_result} pyspark: {ps_result}")

### Lab 2.7 - Case Study 1: Month over month, get the total count of taxi rides per borough

These functions should:
* Read the taxi and taxi lookup data from local
* Join the taxi lookup table to the taxi data to get the name of the relevant boroughs for both the pickup and drop-off locations
* Get the count of taxi rides per month using both pandas and pyspark
* Display and return the dataframe containing the month in the format YYYYMM, the borough, and the taxi counts for that month and borough

In [None]:
# These should point to the paths where the data from the taxi and taxi-lookup ingests were written
taxiPath = "hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json/"
taxiLookupPath = "hdfs:///tmp/data/nyc-taxi/zone-lookup/output/section2/json/"

In [None]:
# A note about joins
# Try running this cell to join the boroughs for the pickup and  dropoff locations 
df_taxi = spark.read.json(taxiPath)
df_taxi_lookup = spark.read.json(taxiLookupPath)

taxi_filtered = (df_taxi
 .filter(df_taxi.pickup_datetime.isNotNull())
 .filter(df_taxi.dropoff_datetime.isNotNull()))
taxi_pu = (taxi_filtered
.join(df_taxi_lookup
       .select("LocationID", "Borough")
       .withColumnRenamed("Borough", "PUBorough"), 
       df_taxi_lookup.LocationID == df_taxi.PULocationID))
taxi = (taxi_pu.join(df_taxi_lookup
       .select("LocationID", "Borough")
       .withColumnRenamed("Borough", "DOBorough"), 
       df_taxi_lookup.LocationID == taxi_pu.DOLocationID))
taxi_pu.show()
taxi.show()

In [None]:
def get_monthly_totals_pyspark(taxiPath, taxiLookupPath):


In [None]:
print(timer_method("get_monthly_totals_pyspark(taxiPath, taxiLookupPath)"))

In [None]:
def get_monthly_totals_pandas(taxiPath, taxiLookupPath):


In [None]:
print(timer_method("get_monthly_totals_pandas(taxiPath, taxiLookupPath)"))

### Lab 2.8 - Case Study 2: Month over month, get the borough with the most amount of pickups per month

These functions should:
* Utilize the functions from the previous lab to get the monthly taxi ride counts per borough
* Utilizing both pandas and pyspark: display and return a dataframe containing the month, the borough with the most taxi rides for that month, and the count of taxi rides

In [None]:
def get_most_pickups_per_month_pandas(taxiPath, taxiLookupPath):

def get_most_pickups_per_month_pyspark(taxiPath, taxiLookupPath):

In [None]:
print(timer_method("get_most_pickups_per_month_pandas(taxiPath, taxiLookupPath)"))
print(timer_method("get_most_pickups_per_month_pyspark(taxiPath, taxiLookupPath)"))

### Lab 2.9 - Run and time the overall pipeline

This function should:
* Ingest the taxi and taxi-lookup data from their source
* Perform the aggregations from the previous labs and write the data to local ebs storage

In [None]:
# Reset notebook kernel
def ingest_main():


In [None]:
# The statement needs the call parentheses; timing the bare name only looks the function up
print(timer_method("ingest_main()"))