### SparkSession

In [None]:
# library
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Show the filesystem path of the imported pyspark package's __init__ module,
# i.e. which PySpark installation this kernel is using.
pyspark.__file__

In [None]:
# Build (or reuse) a SparkSession whose application name is 'test'.
# 'local[*]' runs Spark on this machine with as many worker threads as logical cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

### Reading

In [None]:
# download 1 Parquet file from url
! wget https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2021-01.parquet

In [None]:
# Load the downloaded Parquet file into a Spark DataFrame
df = spark.read.parquet("fhvhv_tripdata_2021-01.parquet")

In [None]:
# test some Spark DataFrame attributes & methods
# df.show(5)
# df.head()
# df.schema
# df.count()

### Writing

#### Writing the first 1000 records to a CSV file

In [None]:
# Take at most 1000 rows (no explicit ordering is applied before the limit)
df_head = df.limit(1000)

In [None]:
# Write the sample as CSV with a header row, replacing any earlier output.
# coalesce(1) collapses the data to a single partition before writing;
# note that 'results' is an output directory, not a single file.
(
    df_head
    .coalesce(1)
    .write
    .mode('overwrite')
    .option('header', 'true')
    .csv('results')
)

In [None]:
# Check that the CSV was saved by reading it back.
# The write above produced a directory named 'results' (holding part-*.csv files),
# so the directory itself is the path to read; no 'results/head.csv' is created.
# show() returns None, so keep the DataFrame in df_test and display it separately.
df_test = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('results')
)
df_test.show(5)

#### Saving to Parquet file (with partitioned data)

In [None]:
# Re-read the source Parquet file so this section starts from a fresh `df`
df = spark.read.parquet("fhvhv_tripdata_2021-01.parquet")

In [None]:
# Split the DataFrame into 24 partitions so that they can be processed in parallel
df = df.repartition(24)

In [None]:
# Write the DataFrame out as Parquet files.
# mode('overwrite') replaces output left by an earlier run; with the default save
# mode a second execution of this cell fails because the target path already exists.
df.write.mode('overwrite').parquet('fhvhv/2021/01/')

### Transformation & Action

In [None]:
# Read every Parquet file under the output directory back as a single DataFrame
df = spark.read.parquet('fhvhv/2021/01/')

In [None]:
# df.show()
# df.printSchema()

In [None]:
# Try out transformations (filter, select) followed by an action (show).
# Filter before selecting: hvfhs_license_num is not among the selected columns,
# so filtering after the select would reference a column that was just dropped.
(
    df
    .filter(df.hvfhs_license_num == 'HV0003')
    .select('pickup_datetime', 'dropoff_datetime', 'PUlocationID', 'DOlocationID')
    .show()
)

### Functions & UDF

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types

In [None]:
# derive a label from dispatching_base_num: a divisibility-based prefix plus the number in hex
def crazy_stuff(base_num):
    """Turn a base number such as 'B02884' into a prefixed hexadecimal label.

    The first character is dropped and the remainder is parsed as a decimal
    integer. The prefix is 's' when that integer is divisible by 7, otherwise
    'a' when it is divisible by 3, otherwise 'e'. The integer follows the slash
    in lowercase hex, zero-padded to at least three digits.

    Args:
        base_num: string whose characters after the first form a decimal
            integer, or None.

    Returns:
        The label (e.g. 's/b44' for 'B02884'), or None when base_num is None,
        so a null column value stays null instead of raising inside the UDF.

    Raises:
        ValueError: if the characters after the first are not a valid integer.
    """
    # Null column values arrive as None; slicing None would raise TypeError.
    if base_num is None:
        return None

    num = int(base_num[1:])
    if num % 7 == 0:
        prefix = 's'
    elif num % 3 == 0:
        prefix = 'a'
    else:
        prefix = 'e'
    return f'{prefix}/{num:03x}'

In [None]:
# unit test: 2884 is divisible by 7 and equals 0xb44, so the expected label is 's/b44'
assert crazy_stuff('B02884') == 's/b44'
crazy_stuff('B02884')

In [None]:
# Wrap the plain Python function as a Spark UDF that returns a string column
crazy_stuff_udf = F.udf(
    crazy_stuff,
    returnType=types.StringType(),
)

In [None]:
# Add the derived columns, then keep only the five columns listed in select().
# NOTE: this rebinds `df` to the narrowed frame, so the cell cannot be re-run
# until `df` has been read again (dispatching_base_num is gone after the select).
df = (
    df
    .withColumn('base_num_in_hex', crazy_stuff_udf(df['dispatching_base_num']))
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .withColumn('dropoff_date', F.to_date(df['dropoff_datetime']))
    .select('base_num_in_hex', 'pickup_date', 'dropoff_date', 'PUlocationID', 'DOlocationID')
)

In [None]:
# Action: print the first rows of the transformed DataFrame to check the result
df.show()

In [25]:
# Shut down the SparkSession, but only if one exists in this kernel's namespace
if globals().get('spark'):
    spark.stop()