# Module 5 Homework Solutions

## Import libraries

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType
from pyspark.sql import functions as F
import pandas as pd

## Create a local Spark session

In [2]:
# Build (or reuse) a Spark session that runs in local mode with master "local[*]".
spark = (
    SparkSession.builder
    .appName("spark homework")
    .master("local[*]")
    .getOrCreate()
)

24/02/25 18:12:43 WARN Utils: Your hostname, GRAD0365UBUNTU resolves to a loopback address: 127.0.1.1; using 192.168.68.103 instead (on interface wlp0s20f3)
24/02/25 18:12:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/02/25 18:12:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Question 1

**Execute `spark.version`. What's the output?**

In [3]:
# Bare last expression: the notebook displays the version string of the running Spark session.
spark.version

'3.3.3'

## Read the data and save it to Parquet

For this homework we will be using the FHV 2019-10 data found here: [FHV Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz).

Read the data into a Spark DataFrame with a schema as we did in the lessons. Repartition the DataFrame to 6 partitions and save it to Parquet.

### Download the CSV file

In [4]:
!mkdir ../data/fhv
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz \
    -O ../data/fhv/fhv_tripdata_2019-10.csv.gz

mkdir: no se puede crear el directorio «../data/fhv»: El archivo ya existe
--2024-02-25 18:12:45--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-10.csv.gz
Resolviendo github.com (github.com)... 140.82.121.3
Conectando con github.com (github.com)[140.82.121.3]:443... conectado.
Petición HTTP enviada, esperando respuesta... 302 Found
Ubicación: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/efdfcf82-6d5c-44d1-a138-4e8ea3c3a3b6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240225%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240225T171245Z&X-Amz-Expires=300&X-Amz-Signature=644da50ef163d179731b5fa95c155ba77ac3ef2578e0a45f0072ee2b0e2e4418&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dfhv_tripdata_2019-10.csv.gz&response-content-type=application%2Foctet-stream [siguiente]
--2024-02-25 18:12:46--  https://objects.

### Infer datatypes

In [5]:
# Load only the first 1000 rows with pandas and show the dtypes it infers,
# as a guide for writing the explicit Spark schema.
df_pandas = pd.read_csv(
    filepath_or_buffer="../data/fhv/fhv_tripdata_2019-10.csv.gz",
    nrows=1000,
)
df_pandas.dtypes

dispatching_base_num       object
pickup_datetime            object
dropOff_datetime           object
PUlocationID              float64
DOlocationID              float64
SR_Flag                   float64
Affiliated_base_number     object
dtype: object

In [6]:
# Preview the first rows of the pandas sample to eyeball the raw values.
df_pandas.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2019-10-01 00:23:00,2019-10-01 00:35:00,264.0,264.0,,B00009
1,B00013,2019-10-01 00:11:29,2019-10-01 00:13:22,264.0,264.0,,B00013
2,B00014,2019-10-01 00:11:43,2019-10-01 00:37:20,264.0,264.0,,B00014
3,B00014,2019-10-01 00:56:29,2019-10-01 00:57:47,264.0,264.0,,B00014
4,B00014,2019-10-01 00:23:09,2019-10-01 00:28:27,264.0,264.0,,B00014


In [7]:
# Summarize per-column non-null counts and dtypes of the pandas sample.
df_pandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   dispatching_base_num    1000 non-null   object 
 1   pickup_datetime         1000 non-null   object 
 2   dropOff_datetime        1000 non-null   object 
 3   PUlocationID            999 non-null    float64
 4   DOlocationID            999 non-null    float64
 5   SR_Flag                 0 non-null      float64
 6   Affiliated_base_number  997 non-null    object 
dtypes: float64(3), object(4)
memory usage: 54.8+ KB


In [8]:
# Explicit schema for the FHV trips DataFrame, declared as (column name, Spark type)
# pairs; every field is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("dispatching_base_num", StringType()),
        ("pickup_datetime", TimestampType()),
        ("dropoff_datetime", TimestampType()),
        ("PULocationID", IntegerType()),
        ("DOLocationID", IntegerType()),
        ("SR_Flag", StringType()),
        ("Affiliated_base_number", StringType()),
    )
])

In [9]:
# Load the gzipped CSV into a Spark DataFrame, applying the explicit schema
# rather than letting Spark choose the column types, then print the result.
df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("../data/fhv/fhv_tripdata_2019-10.csv.gz")
)
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [10]:
# Split the DataFrame into 6 partitions and write them as Parquet,
# overwriting any output left by a previous run.
(
    df.repartition(6)
    .write
    .mode("overwrite")
    .parquet("../data/fhv/2019/10")
)

                                                                                

### Question 2

**What is the average size of the Parquet (ending with .parquet extension) Files that were created (in MB)?**

In [11]:
# List the written Parquet part files with human-readable sizes (used to answer Question 2).
!ls -lh ../data/fhv/2019/10/*.parquet

-rw-r--r-- 1 sgrodriguez usuarios del dominio 6,4M feb 25 18:13 ../data/fhv/2019/10/part-00000-556011c9-d8d5-4c60-9bec-3138a466bbe7-c000.snappy.parquet
-rw-r--r-- 1 sgrodriguez usuarios del dominio 6,4M feb 25 18:13 ../data/fhv/2019/10/part-00001-556011c9-d8d5-4c60-9bec-3138a466bbe7-c000.snappy.parquet
-rw-r--r-- 1 sgrodriguez usuarios del dominio 6,4M feb 25 18:13 ../data/fhv/2019/10/part-00002-556011c9-d8d5-4c60-9bec-3138a466bbe7-c000.snappy.parquet
-rw-r--r-- 1 sgrodriguez usuarios del dominio 6,4M feb 25 18:13 ../data/fhv/2019/10/part-00003-556011c9-d8d5-4c60-9bec-3138a466bbe7-c000.snappy.parquet
-rw-r--r-- 1 sgrodriguez usuarios del dominio 6,4M feb 25 18:13 ../data/fhv/2019/10/part-00004-556011c9-d8d5-4c60-9bec-3138a466bbe7-c000.snappy.parquet
-rw-r--r-- 1 sgrodriguez usuarios del dominio 6,4M feb 25 18:13 ../data/fhv/2019/10/part-00005-556011c9-d8d5-4c60-9bec-3138a466bbe7-c000.snappy.parquet


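**Answer:** approximately 6.4 MB (each of the six files is listed as `6,4M` in the locale-formatted `ls` output above).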

### Question 3

**How many taxi trips were there on the 15th of October? Consider only trips that started on the 15th of October.**

In [12]:
# via Spark DataFrame: count the trips whose pickup timestamp falls on 2019-10-15
(
    df
    .where(F.to_date(F.col("pickup_datetime")) == "2019-10-15")
    .count()
)

                                                                                

62610

In [13]:
# via SparkSQL
# Register the trips DataFrame as the temp view "fhv_table" so it can be queried with SQL.
df.createOrReplaceTempView("fhv_table")

# Count the trips whose pickup timestamp, truncated to a date, is 2019-10-15.
spark.sql("""
SELECT COUNT(1) AS num_trips
FROM fhv_table
WHERE TO_DATE(pickup_datetime) = '2019-10-15';
""").show()

[Stage 6:>                                                          (0 + 1) / 1]

+---------+
|num_trips|
+---------+
|    62610|
+---------+



                                                                                

### Question 4

**What is the length of the longest trip in the dataset in hours?**

In [14]:
# via Spark DataFrame: longest trip duration (in hours) per pickup date,
# showing the 5 dates with the largest maxima.
(
    df
    .select(
        F.to_date(F.col("pickup_datetime")).alias("pickup_date"),
        (
            # difference of epoch seconds, converted to hours
            (F.unix_timestamp(F.col("dropoff_datetime")) - F.unix_timestamp(F.col("pickup_datetime")))
            / 3600
        ).alias("trip_duration_h"),
    )
    .groupBy("pickup_date")
    .agg(F.max("trip_duration_h").alias("longest_trip_duration_h"))
    .orderBy(F.col("longest_trip_duration_h").desc())
    .limit(5)
    .show()
)

[Stage 9:>                                                          (0 + 1) / 1]

+-----------+-----------------------+
|pickup_date|longest_trip_duration_h|
+-----------+-----------------------+
| 2019-10-28|               631152.5|
| 2019-10-11|               631152.5|
| 2019-10-31|      87672.44083333333|
| 2019-10-01|      70128.02805555555|
| 2019-10-17|                 8794.0|
+-----------+-----------------------+



                                                                                

In [15]:
# via SparkSQL
# For each pickup date, take the maximum trip duration in hours (difference of
# epoch seconds divided by 3600) and show the 5 dates with the largest maxima.
# Queries the "fhv_table" temp view.
spark.sql("""
SELECT 
    TO_DATE(pickup_datetime) AS pickup_date,
    MAX((UNIX_TIMESTAMP(dropoff_datetime) - UNIX_TIMESTAMP(pickup_datetime)) / 3600) AS longest_trip_duration_h
FROM fhv_table
GROUP BY pickup_date
ORDER BY longest_trip_duration_h DESC
LIMIT 5;
""").show()

[Stage 12:>                                                         (0 + 1) / 1]

+-----------+-----------------------+
|pickup_date|longest_trip_duration_h|
+-----------+-----------------------+
| 2019-10-28|               631152.5|
| 2019-10-11|               631152.5|
| 2019-10-31|      87672.44083333333|
| 2019-10-01|      70128.02805555555|
| 2019-10-17|                 8794.0|
+-----------+-----------------------+



                                                                                

**Answer:** 631,152.5 hours.

### Question 5

**Spark’s User Interface which shows the application's dashboard runs on which local port?**

**Answer:** 4040 (`localhost:4040`).

## Join two tables

Load the zone lookup data into a temp view in Spark: [Zone Data](https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv).

In [16]:
# download zone lookup data
# !wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv -O ../data/taxi_zone_lookup.csv

# read data into a dataframe
df_zones = spark.read \
    .option("header", True) \
    .csv("../data/taxi_zone_lookup.csv")
df_zones.show(5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



### Question 6

**Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?**

In [17]:
# Attach zone names to the trips via the pickup location ID (left join),
# then show the 5 zones with the fewest pickups.
(
    df
    .join(df_zones, on=(df["PUlocationID"] == df_zones["LocationID"]), how="left")
    .groupBy("Zone")
    .count()
    .orderBy(F.col("count").asc())
    .limit(5)
    .show()
)

[Stage 18:>                                                         (0 + 1) / 1]

+--------------------+-----+
|                Zone|count|
+--------------------+-----+
|         Jamaica Bay|    1|
|Governor's Island...|    2|
| Green-Wood Cemetery|    5|
|       Broad Channel|    8|
|     Highbridge Park|   14|
+--------------------+-----+



                                                                                

In [18]:
# via Spark SQL
# Register the zone lookup DataFrame as the temp view "zones".
df_zones.createOrReplaceTempView("zones")

# Left-join the trips ("fhv_table" temp view) to the zones on the pickup
# location ID, count trips per zone, and show the 5 zones with the fewest trips.
spark.sql("""
SELECT
    zones.Zone AS zone,
    COUNT(1) as count_trips
FROM fhv_table
LEFT JOIN zones
ON fhv_table.PUlocationID = zones.LocationID
GROUP BY zone
ORDER BY count_trips ASC
LIMIT 5;
""").show()

[Stage 22:>                                                         (0 + 1) / 1]

+--------------------+-----------+
|                zone|count_trips|
+--------------------+-----------+
|         Jamaica Bay|          1|
|Governor's Island...|          2|
| Green-Wood Cemetery|          5|
|       Broad Channel|          8|
|     Highbridge Park|         14|
+--------------------+-----------+



                                                                                

**Answer:** Jamaica Bay (1 trip).