In [None]:
# Install libraries within the notebook scope
# NOTE(review): no versions are pinned here, so the installed versions
# presumably depend on what PyPI serves at run time — consider pinning.
sc.install_pypi_package("boto3")
sc.install_pypi_package("pandas")
sc.install_pypi_package("requests")
sc.install_pypi_package("s3fs")

In [None]:
import boto3
from datetime import datetime
from datetime import timedelta
import pandas as pd
from pyspark.sql.dataframe import DataFrame
from pyspark.sql import functions as f, types as t, Window
from pathlib import Path
import requests
import s3fs
import subprocess
import timeit
from urllib.parse import urlparse

# Removes truncation of columns, column values in Pandas
# by default.
# The fully qualified "display." keys are used on purpose: short option names
# are resolved by pattern matching, and a short name that matches more than
# one registered option is rejected by pandas.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Monkey patching the DataFrame transform method for spark 2.4
def transform(self, f):
    """Apply ``f`` to this DataFrame and return whatever ``f`` returns.

    Allows chaining custom transformations: ``df.transform(step_a).transform(step_b)``.

    Args:
        f: Callable that accepts the DataFrame as its only argument.

    Returns:
        The result of ``f(self)``.
    """
    return f(self)


# Only install the backport when the running PySpark does not already provide
# DataFrame.transform, so a newer runtime keeps its own implementation.
if not hasattr(DataFrame, "transform"):
    DataFrame.transform = transform

# Override the timeit template to return the command's
# return value in addition to the time
# Reference: https://stackoverflow.com/questions/24812253/how-can-i-capture-return-value-with-python-timeit-module
# The generated function returns a (elapsed_seconds, retval) tuple, where
# retval is the value of the last execution of the timed statement.
# retval is initialised to None so that a run with zero iterations still
# returns a tuple instead of failing on an unbound local.
timeit.template = """
def inner(_it, _timer{init}):
    {setup}
    retval = None
    _t0 = _timer()
    for _i in _it:
        retval = {stmt}
    _t1 = _timer()
    return _t1 - _t0, retval
"""

def shell_cmd(cmd):
    """Run ``cmd`` through the shell and print its standard output line by line.

    Args:
        cmd: Command line to execute; it is handed to the shell unmodified.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    # NOTE: shell=True executes `cmd` verbatim, so only pass trusted strings.
    raw_output = subprocess.check_output(cmd, shell=True)
    # Decode before printing so lines show up as text rather than b'...' reprs;
    # splitlines() also avoids the empty trailing entry a split on newlines leaves.
    for line in raw_output.decode("utf-8", errors="replace").splitlines():
        print(line)

def timer_method(cmd):
    """Execute the statement string ``cmd`` exactly once under ``timeit``.

    The statement is evaluated against this module's global namespace, so it
    can refer to functions and variables defined at notebook level.

    Args:
        cmd: Python source of the statement to time.

    Returns:
        Whatever ``timeit.timeit`` returns for a single run.
    """
    # Handing over the module globals lets the timed statement resolve
    # notebook-level names.
    namespace = globals()
    return timeit.timeit(stmt=cmd, globals=namespace, number=1)

### Set your s3 bucket name
This should be data-scale-oreilly-{your name}   
If you don't remember it, check the [S3 console](https://s3.console.aws.amazon.com/).

In [None]:
# S3 bucket name for this lab; left empty here and meant to be filled in by the user.
MY_BUCKET_NAME = ""

### Lab 3.1 Leveraging File Types

Write functions to:
* Write out the taxi-lookup dataset to local storage as csv, json, and parquet files
* Write a function to read the files and to print/return a dataframe containing the counts of zones per borough
* Time how long it takes to write out the different file types and to perform the aggregation using each file type

In [None]:
# These should point to the paths where the data from the taxi and taxi-lookup ingests were written
# Both are HDFS locations under the section2 JSON output directories.
taxiPath = "hdfs:///tmp/data/nyc-taxi/taxi-data/output/section2/json/"
taxiLookupPath = "hdfs:///tmp/data/nyc-taxi/zone-lookup/output/section2/json/"

In [None]:
# Spark 2.4.6 docs: https://spark.apache.org/docs/2.4.6/
# Lab 3.1 exercise stubs. Each one raises until it is implemented, so the cell
# parses and a missing implementation fails with a clear message.
def write_csv(inputDF):
    """Write ``inputDF`` (the taxi-lookup dataset) to local storage as CSV."""
    raise NotImplementedError("Lab 3.1: implement write_csv")


def write_json(inputDF):
    """Write ``inputDF`` (the taxi-lookup dataset) to local storage as JSON."""
    raise NotImplementedError("Lab 3.1: implement write_json")


def write_parquet(inputDF):
    """Write ``inputDF`` (the taxi-lookup dataset) to local storage as Parquet."""
    raise NotImplementedError("Lab 3.1: implement write_parquet")


def read_test_csv():
    """Read the CSV output and print/return the counts of zones per borough."""
    raise NotImplementedError("Lab 3.1: implement read_test_csv")


def read_test_json():
    """Read the JSON output and print/return the counts of zones per borough."""
    raise NotImplementedError("Lab 3.1: implement read_test_json")


def read_test_parquet():
    """Read the Parquet output and print/return the counts of zones per borough."""
    raise NotImplementedError("Lab 3.1: implement read_test_parquet")

In [None]:
# Load the zone-lookup JSON and inspect it before timing the writers.
inputDF = spark.read.json(taxiLookupPath)
inputDF.printSchema()
inputDF.show()

# Each writer is timed once; the statement strings are resolved against the
# notebook globals, which is why inputDF must exist at top level.
print(f'CSV write time: {timer_method("write_csv(inputDF)")}')
print(f'JSON write time: {timer_method("write_json(inputDF)")}')
print(f'Parquet write time: {timer_method("write_parquet(inputDF)")}')

In [None]:
# Time the read + aggregation once for each file type.
# NOTE(review): a later cell redefines read_test_parquet with a required
# readPath argument, so re-running this cell after that one would break the
# no-argument call below.
print(f'CSV read and transform time: {timer_method("read_test_csv()")}')
print(f'JSON read and transform time: {timer_method("read_test_json()")}')
print(f'Parquet read and transform time: {timer_method("read_test_parquet()")}')

### Lab 3.2 Partitioning

Write functions to:
* Based on the class discussion, update the following functions with the most appropriate partitioning values
* Write a function to read the written files and to print/return a dataframe containing the counts of taxi rides per pickup_month
* Time how long it takes to write out the data using the different partitioning methodologies and to perform the aggregation using each strategy

In [None]:
def write_paritioned_parquet(inputDF):
    """Write ``inputDF`` as partitioned Parquet to the section3 ``partitioned`` path.

    Existing output at that path is overwritten. Nothing is returned.

    The function name is misspelled ("paritioned"); it is left as is because
    the timing cell calls it by this exact name.

    TODO(lab): ``partitionBy('')`` is an exercise placeholder — replace the
    empty string with the column to partition on.
    """
    (inputDF
        .write
        .mode("overwrite")
        .partitionBy('')
        .parquet("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section3/partitioned"))
            
def write_coalesce_parquet(inputDF):
    """Coalesce ``inputDF`` and write it as partitioned Parquet to the section3 ``coalesced`` path.

    Existing output at that path is overwritten. Nothing is returned.

    TODO(lab): both ``coalesce()`` and ``partitionBy('')`` are exercise
    placeholders. ``coalesce`` is called with no arguments here; PySpark's
    ``DataFrame.coalesce`` expects a partition count, so one has to be
    supplied (confirm against the Spark docs), and the empty string must be
    replaced with the partition column.
    """
    (inputDF
        .coalesce()
        .write     
        .mode("overwrite")
        .partitionBy('')
        .parquet("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section3/coalesced"))
            
def write_repartition_parquet(inputDF):
    """Repartition ``inputDF`` and write it as partitioned Parquet to the section3 ``repartitioned`` path.

    Existing output at that path is overwritten. Nothing is returned.

    TODO(lab): ``repartition()`` and ``partitionBy('')`` are exercise
    placeholders — supply the repartitioning arguments (a partition count
    and/or columns; confirm the accepted forms against the Spark docs) and
    replace the empty string with the partition column.
    """
    (inputDF
        .repartition()
        .write
        .mode("overwrite")
        .partitionBy('')
        .parquet("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section3/repartitioned"))
            
def write_sorted_parquet(inputDF): #RLE, need to verify that this still matters
    """Sort ``inputDF``, coalesce it and write it as partitioned Parquet to the section3 ``sorted`` path.

    Existing output at that path is overwritten. Nothing is returned.
    "RLE" in the comment on the ``def`` line presumably refers to run-length
    encoding benefiting from sorted data — TODO confirm.

    TODO(lab): ``coalesce()`` and ``partitionBy('')`` are exercise
    placeholders that still need a partition count and a partition column.

    NOTE(review): the sort references 'tpep_dropoff_datetime' and
    'tpep_pickup_datetime', while other cells in this notebook refer to
    'pickup_datetime' / 'dropoff_datetime' — confirm these columns exist in
    ``inputDF``.
    """
    (inputDF.orderBy('pickup_month','passenger_count', 'PULocationID', 'DOLocationID', 'trip_distance', 'fare_amount', 'tip_amount', 'tpep_dropoff_datetime', 'tpep_pickup_datetime')
        .coalesce()     
        .write
        .mode("overwrite")
        .partitionBy('')
        .parquet("hdfs:///tmp/data/nyc-taxi/taxi-data/output/section3/sorted"))

def read_test_parquet(readPath):
    """Load the Parquet data at ``readPath`` and return it sorted.

    Args:
        readPath: Location of the Parquet output to read.

    Returns:
        A DataFrame ordered by ``pickup_month`` and then ``passenger_count``.
    """
    loadedDF = spark.read.parquet(readPath)
    return loadedDF.sort("pickup_month", "passenger_count")
def agg_test_parquet(baseDF):
    """Print/return the counts of taxi rides per ``pickup_month`` for ``baseDF``.

    Lab 3.2 exercise stub: it raises until implemented so that the cell parses
    and a missing implementation fails with a clear message.
    """
    raise NotImplementedError("Lab 3.2: implement agg_test_parquet")

In [None]:
# We will talk through how to open the Spark UI and follow the progress of the
# running code while this cell executes.
# Load the taxi rides and derive a yyyyMM pickup_month column from pickup_datetime.
inputDF = spark.read.json(taxiPath).withColumn(
    "pickup_month",
    f.date_format("pickup_datetime", "yyyyMM"),
)
inputDF.show()

# Time each write strategy once; the statement strings are resolved against
# the notebook globals, so inputDF has to live at top level.
print(timer_method("write_paritioned_parquet(inputDF)"))
print(timer_method("write_coalesce_parquet(inputDF)"))
print(timer_method("write_repartition_parquet(inputDF)"))
print(timer_method("write_sorted_parquet(inputDF)"))

In [None]:
# Output directories targeted by the four write_*_parquet functions.
testPaths = [
    'hdfs:///tmp/data/nyc-taxi/taxi-data/output/section3/partitioned',
    'hdfs:///tmp/data/nyc-taxi/taxi-data/output/section3/coalesced',
    'hdfs:///tmp/data/nyc-taxi/taxi-data/output/section3/repartitioned',
    'hdfs:///tmp/data/nyc-taxi/taxi-data/output/section3/sorted',
]

# For every layout: time the read, then time the aggregation.
for testPath in testPaths:
    # The path is spliced into the timed statement as a quoted literal.
    print(timer_method(f"read_test_parquet('{testPath}')"))
    # Read again outside the timer so that baseDF is a notebook global the
    # next timed statement can resolve.
    baseDF = read_test_parquet(testPath)
    print(timer_method("agg_test_parquet(baseDF)"))

### Schema Management

In [None]:
# Normally you want to leverage a repository
# In Scala you could apply the types to the DataFrame to create a Dataset. That is not possible in PySpark 2.4
# You can leverage a SQL DDL string or a Spark StructType instead.
# Here the schema already attached to inputDF is reused and applied explicitly
# when reading the sorted output; the row count is printed.
taxiSchema = inputDF.schema
print(spark.read.schema(taxiSchema).parquet('hdfs:///tmp/data/nyc-taxi/taxi-data/output/section3/sorted').count())

In [None]:
# Explicit StructType definition of the taxi schema; every field is nullable.
# 'year' (string) is currently left out of the schema.
# NOTE(review): this schema spells the column 'passanger_count', while the
# sort calls elsewhere in this notebook use 'passenger_count' — confirm which
# spelling the data actually uses.
taxiSchema = t.StructType([
    t.StructField(columnName, columnType, True)
    for columnName, columnType in (
        ('DOLocationID', t.LongType()),
        ('PULocationID', t.LongType()),
        ('dropoff_datetime', t.StringType()),
        ('dropoff_date', t.StringType()),
        ('fare_amount', t.DoubleType()),
        ('ingested_on', t.StringType()),
        ('month', t.StringType()),
        ('passanger_count', t.LongType()),
        ('pickup_datetime', t.StringType()),
        ('pickup_date', t.StringType()),
        ('service', t.StringType()),
        ('tip_amount', t.DoubleType()),
        ('trip_distance', t.DoubleType()),
        ('pickup_month', t.StringType()),
    )
])
    
# Read the sorted output with the explicit schema applied and print the row count.
print(spark.read.schema(taxiSchema).parquet('hdfs:///tmp/data/nyc-taxi/taxi-data/output/section3/sorted').count())

# A schema can also be written as a SQL DDL string, e.g. "col1 STRING, col2 INT"
# Build a DataFrame from inputDF's rows using the explicit schema, with schema
# verification switched on, and display it.
spark.createDataFrame(inputDF.rdd,schema=taxiSchema,verifySchema=True).show()

### Lab 3.3 - Case Study 3: Find the average taxi rides per zip code on the 10 worst air quality days of each month

Write functions to: 
* Select the best iteration of the taxi data for this workload
* Ingest the borough zipcode mapping and explode the zip code columns
* Ingest air quality data from: https://www.airnowapi.org/aq/observation/zipCode/historical/?format=application/json&zipCode={zipCode}&date={date}T00-0000&distance=100&API_KEY={apiKey}
    * https://docs.airnowapi.org/HistoricalObservationsByZip/docs
    * Make sure you have created an account at: https://docs.airnowapi.org/account/request/
    * Join the borough zipcode mapping data to the taxi ride dataset
    * Due to API call limitations only pull 100 days of air quality for the following zipcodes (11212, 10023) starting on 2020/06/01
    * **Be aware that you only have 500 requests per hour to that API endpoint**
* Find the average taxi rides per borough on the 10 worst air quality days of each month
* Persist the resulting agg dataframe by calling .cache() on the dataframe
* Run and time the full ingest

Make sure to write the ingests with an eye on efficiency for this specific workload 

In [None]:
def get_taxi_df():
    """Return the taxi DataFrame used by the case study.

    TODO(lab): exercise stub — ``groupDF`` is not defined anywhere yet, so it
    has to be built here (the lab asks for the best iteration of the taxi data
    for this workload).
    """
    return groupDF

def get_zip_code_mapping_df():
    """Return the borough-to-zip-code mapping DataFrame.

    TODO(lab): exercise stub — ``returnZipDF`` is not defined anywhere yet, so
    it has to be built here (the lab asks to ingest the borough zipcode mapping
    and explode the zip code columns).
    """
    return returnZipDF

def get_air_quality_df(zipDF):
    """Return the air quality DataFrame for the zip codes in ``zipDF``.

    TODO(lab): exercise stub — ``returnAirDF`` is not defined anywhere yet, so
    it has to be built here (the lab asks to ingest air quality data from the
    AirNow historical observations API, which is rate limited).
    """
    return returnAirDF

def calculate_hottest_days(taxiDF, airQualityDF):
    """Return the aggregated case-study result built from the two inputs.

    TODO(lab): exercise stub — ``aggDF`` is not defined anywhere yet, so it has
    to be built here (the lab asks for the average taxi rides on the 10 worst
    air quality days of each month).

    NOTE(review): the name says "hottest days" while the lab text talks about
    worst air quality days — confirm the intended metric.
    """
    return aggDF

In [None]:
def run_case_study():
    """Run the full case-study pipeline and materialise the cached aggregate.

    Builds the taxi, zip-code and air-quality DataFrames, aggregates them with
    ``calculate_hottest_days``, caches the result and prints its row count.
    """
    taxiDF = get_taxi_df()
    zipDF = get_zip_code_mapping_df()
    airQualityDF = get_air_quality_df(zipDF)
    # calculate_hottest_days is declared with (taxiDF, airQualityDF), so both
    # inputs have to be passed through.
    aggedDF = calculate_hottest_days(taxiDF, airQualityDF)
    # .cache is lazy evaluated, so we do the count to force the action
    aggedDF.cache()
    print(aggedDF.count())

In [None]:
# Time one full run of the case study and print what timer_method returns.
print(timer_method("run_case_study()"))

### Lab 3.4 Write data out to S3 for long term storage

Write functions to:
* Write the taxi/taxi-lookup data to S3 using the most appropriate storage format/partitioning methodology
* Write the cached dataframe from the previous use case to S3 using the most appropriate storage format/partitioning methodology


In [None]:
# Normally want to avoid writing to EBS then copying to S3
# Intermediate workloads should be written to local EBS, while finished workloads/longterm storage should be read directly from/written directly to S3
# Luckily, EMRFS lets us write out directly to S3

def write_to_s3():