# Convert CSV files to Parquet

## Import libraries

In [1]:
import pyspark
from pyspark.sql import SparkSession

## Create a Spark session

In [2]:
# Get (or reuse) a Spark session with master "local[*]" for this notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("parquetize datasets")
    .getOrCreate()
)

24/02/23 10:45:19 WARN Utils: Your hostname, GRAD0365UBUNTU resolves to a loopback address: 127.0.1.1; using 192.168.68.103 instead (on interface wlp0s20f3)
24/02/23 10:45:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/02/23 10:45:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read CSV files

In [3]:
# First pass: read the January 2021 green-taxi CSVs using only the header row
# for column names (no schema supplied, no inference requested).
df_green = spark.read.csv(
    "../../data/nyc_tlc/raw/green/2021/01",
    header=True,
)

In [4]:
# Inspect the column types of the header-only read; the output below shows
# every column typed as string.
df_green.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- lpep_dropoff_datetime: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- trip_type: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)



## Infer datatypes

In [5]:
import pandas as pd

In [6]:
# Load only the first 1,000 rows with pandas and look at the dtypes it infers.
df_green_pd = pd.read_csv(
    "../../data/nyc_tlc/raw/green/2021/01/green_tripdata_2021_01.csv.gz",
    nrows=1000,
)
df_green_pd.dtypes

VendorID                   int64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID                 int64
PULocationID               int64
DOLocationID               int64
passenger_count            int64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                float64
improvement_surcharge    float64
total_amount             float64
payment_type               int64
trip_type                  int64
congestion_surcharge     float64
dtype: object

In [7]:
# Compatibility shim: alias `DataFrame.iteritems` to `DataFrame.items` so that
# `spark.createDataFrame` below does not fail with
# AttributeError: 'DataFrame' object has no attribute 'iteritems'
setattr(pd.DataFrame, "iteritems", pd.DataFrame.items)

# Schema Spark derives from the pandas sample (int64 -> LongType, object -> StringType).
spark.createDataFrame(df_green_pd).schema

StructType([StructField('VendorID', LongType(), True), StructField('lpep_pickup_datetime', StringType(), True), StructField('lpep_dropoff_datetime', StringType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('RatecodeID', LongType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('ehail_fee', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('payment_type', LongType(), True), StructField('trip_type', LongType(), True), StructField('congestion_surcharge', DoubleType(), True)])

In [8]:
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, IntegerType, DoubleType

In [9]:
# Explicit schema for the green-taxi CSVs: pickup/dropoff columns as timestamps,
# identifiers and counts as integers, distances and amounts as doubles.
# Every field is nullable.
green_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in [
        ('VendorID', IntegerType()),
        ('lpep_pickup_datetime', TimestampType()),
        ('lpep_dropoff_datetime', TimestampType()),
        ('store_and_fwd_flag', StringType()),
        ('RatecodeID', IntegerType()),
        ('PULocationID', IntegerType()),
        ('DOLocationID', IntegerType()),
        ('passenger_count', IntegerType()),
        ('trip_distance', DoubleType()),
        ('fare_amount', DoubleType()),
        ('extra', DoubleType()),
        ('mta_tax', DoubleType()),
        ('tip_amount', DoubleType()),
        ('tolls_amount', DoubleType()),
        ('ehail_fee', DoubleType()),
        ('improvement_surcharge', DoubleType()),
        ('total_amount', DoubleType()),
        ('payment_type', IntegerType()),
        ('trip_type', IntegerType()),
        ('congestion_surcharge', DoubleType()),
    ]
])

In [10]:
# Re-read the same CSV directory, this time applying the explicit schema,
# and confirm the resulting column types.
df_green = (
    spark.read
    .option("header", True)
    .schema(green_schema)
    .csv("../../data/nyc_tlc/raw/green/2021/01")
)

df_green.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [11]:
# Shortcut: let Spark infer the schema instead of declaring it by hand.
# For this dataset the only difference from the explicit schema is ehail_fee,
# which is inferred as string instead of double.
df_green_infer = spark.read.csv(
    "../../data/nyc_tlc/raw/green/2021/01",
    header=True,
    inferSchema=True,
)

df_green_infer.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [12]:
# Same approach for yellow taxis: sample 1,000 rows with pandas, then show
# the schema Spark derives from that sample.
df_yellow_pd = pd.read_csv(
    "../../data/nyc_tlc/raw/yellow/2021/01/yellow_tripdata_2021_01.csv.gz",
    nrows=1000,
)
spark.createDataFrame(df_yellow_pd).schema

StructType([StructField('VendorID', LongType(), True), StructField('tpep_pickup_datetime', StringType(), True), StructField('tpep_dropoff_datetime', StringType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', LongType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True)])

In [13]:
# Explicit schema for the yellow-taxi CSVs: pickup/dropoff columns as timestamps,
# identifiers and counts as integers, distances and amounts as doubles.
# Every field is nullable.
yellow_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in [
        ('VendorID', IntegerType()),
        ('tpep_pickup_datetime', TimestampType()),
        ('tpep_dropoff_datetime', TimestampType()),
        ('passenger_count', IntegerType()),
        ('trip_distance', DoubleType()),
        ('RatecodeID', IntegerType()),
        ('store_and_fwd_flag', StringType()),
        ('PULocationID', IntegerType()),
        ('DOLocationID', IntegerType()),
        ('payment_type', IntegerType()),
        ('fare_amount', DoubleType()),
        ('extra', DoubleType()),
        ('mta_tax', DoubleType()),
        ('tip_amount', DoubleType()),
        ('tolls_amount', DoubleType()),
        ('improvement_surcharge', DoubleType()),
        ('total_amount', DoubleType()),
        ('congestion_surcharge', DoubleType()),
    ]
])

In [14]:
# Read the January 2021 yellow-taxi CSVs with the explicit schema and
# confirm the resulting column types.
df_yellow = (
    spark.read
    .option("header", True)
    .schema(yellow_schema)
    .csv("../../data/nyc_tlc/raw/yellow/2021/01")
)

df_yellow.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [15]:
# Shortcut: let Spark infer the schema instead of declaring it by hand.
df_yellow_infer = spark.read.csv(
    "../../data/nyc_tlc/raw/yellow/2021/01",
    header=True,
    inferSchema=True,
)

df_yellow_infer.printSchema()

[Stage 4:>                                                          (0 + 1) / 1]

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



                                                                                

## Save as Parquet

In [16]:
# Explicit schema to apply for each taxi type.
schemas = dict(yellow=yellow_schema, green=green_schema)

# Period to convert. The upper bound of range() is exclusive, so this is months 1-3.
year = 2021
months = range(1, 4)

In [17]:
from pyspark.sql.utils import AnalysisException

# Convert the green-taxi CSVs to Parquet, one output directory per month.
taxi_type = "green"

for month in months:
    print(f"Processing {taxi_type} taxi data for {year}/{month:02d}")
    input_path = f"../../data/nyc_tlc/raw/{taxi_type}/{year}/{month:02d}"
    output_path = f"../../data/nyc_tlc/pq/{taxi_type}/{year}/{month:02d}"

    try:
        df = (
            spark.read
            .option("header", True)
            .schema(schemas[taxi_type])
            .csv(input_path)
        )
    except AnalysisException as e:
        # Report the actual cause instead of assuming the path is missing,
        # then stop: later months are not attempted once one cannot be read.
        print(f"Could not read {input_path}: {e}")
        break

    try:
        # Four Parquet part files per month.
        df.repartition(4).write.parquet(output_path)
    except AnalysisException as e:
        # Report the actual cause instead of assuming the output already
        # exists; the month is skipped and the loop moves on to the next one.
        print(f"Skipping {output_path}: {e}")

Processing green taxi data for 2021/01


                                                                                

Processing green taxi data for 2021/02
Processing green taxi data for 2021/03


In [18]:
# Convert the yellow-taxi CSVs to Parquet, one output directory per month.
taxi_type = "yellow"

for month in months:
    print(f"Processing {taxi_type} taxi data for {year}/{month:02d}")
    input_path = f"../../data/nyc_tlc/raw/{taxi_type}/{year}/{month:02d}"
    output_path = f"../../data/nyc_tlc/pq/{taxi_type}/{year}/{month:02d}"

    try:
        df = (
            spark.read
            .option("header", True)
            .schema(schemas[taxi_type])
            .csv(input_path)
        )
    except AnalysisException as e:
        # Report the actual cause instead of assuming the path is missing,
        # then stop: later months are not attempted once one cannot be read.
        print(f"Could not read {input_path}: {e}")
        break

    try:
        # Four Parquet part files per month.
        df.repartition(4).write.parquet(output_path)
    except AnalysisException as e:
        # Report the actual cause instead of assuming the output already
        # exists; the month is skipped and the loop moves on to the next one.
        print(f"Skipping {output_path}: {e}")

Processing yellow taxi data for 2021/01


                                                                                

Processing yellow taxi data for 2021/02


                                                                                

Processing yellow taxi data for 2021/03


                                                                                