In [12]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types

In [11]:
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('zoomcamp') \
    .config('spark.executor.memory', '12g') \
    .getOrCreate()
spark

In [19]:
g_schema = types.StructType([
    types.StructField("VendorID", types.IntegerType(), True),
    types.StructField("lpep_pickup_datetime", types.TimestampType(), True),
    types.StructField("lpep_dropoff_datetime", types.TimestampType(), True),
    types.StructField("store_and_fwd_flag", types.StringType(), True),
    types.StructField("RatecodeID", types.IntegerType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("passenger_count", types.IntegerType(), True),
    types.StructField("trip_distance", types.DoubleType(), True),
    types.StructField("fare_amount", types.DoubleType(), True),
    types.StructField("extra", types.DoubleType(), True),
    types.StructField("mta_tax", types.DoubleType(), True),
    types.StructField("tip_amount", types.DoubleType(), True),
    types.StructField("tolls_amount", types.DoubleType(), True),
    types.StructField("ehail_fee", types.DoubleType(), True),
    types.StructField("improvement_surcharge", types.DoubleType(), True),
    types.StructField("total_amount", types.DoubleType(), True),
    types.StructField("payment_type", types.IntegerType(), True),
    types.StructField("trip_type", types.IntegerType(), True),
    types.StructField("congestion_surcharge", types.DoubleType(), True)
])

y_schema = types.StructType([
    types.StructField("VendorID", types.IntegerType(), True),
    types.StructField("tpep_pickup_datetime", types.TimestampType(), True),
    types.StructField("tpep_dropoff_datetime", types.TimestampType(), True),
    types.StructField("passenger_count", types.IntegerType(), True),
    types.StructField("trip_distance", types.DoubleType(), True),
    types.StructField("RatecodeID", types.IntegerType(), True),
    types.StructField("store_and_fwd_flag", types.StringType(), True),
    types.StructField("PULocationID", types.IntegerType(), True),
    types.StructField("DOLocationID", types.IntegerType(), True),
    types.StructField("payment_type", types.IntegerType(), True),
    types.StructField("fare_amount", types.DoubleType(), True),
    types.StructField("extra", types.DoubleType(), True),
    types.StructField("mta_tax", types.DoubleType(), True),
    types.StructField("tip_amount", types.DoubleType(), True),
    types.StructField("tolls_amount", types.DoubleType(), True),
    types.StructField("improvement_surcharge", types.DoubleType(), True),
    types.StructField("total_amount", types.DoubleType(), True),
    types.StructField("congestion_surcharge", types.DoubleType(), True)
])

In [17]:
year, month, end = 2020, 1, 12
while month <= end:
    ip_path = f"../data/raw/green/{year}/{month:02d}/"
    op_path = f"../data/pq/green/{year}/{month:02d}/"

    print(f'Processing data for {year}/{month}')
    df = spark.read \
        .option("header", "true") \
        .schema(g_schema) \
        .csv(ip_path)
    
    df.repartition(4) \
        .write.parquet(op_path)
    
    if month == 12:
        year, month, end = 2021, 1, 7
    month += 1


Processing data for 2020/1


                                                                                

Processing data for 2020/2


                                                                                

Processing data for 2020/3


                                                                                

Processing data for 2020/4
Processing data for 2020/5
Processing data for 2020/6
Processing data for 2020/7
Processing data for 2020/8
Processing data for 2020/9
Processing data for 2020/10
Processing data for 2020/11
Processing data for 2020/12
Processing data for 2021/2
Processing data for 2021/3
Processing data for 2021/4
Processing data for 2021/5


                                                                                

Processing data for 2021/6
Processing data for 2021/7


In [20]:
year, month, end = 2020, 1, 12
while month <= end:
    ip_path = f"../data/raw/yellow/{year}/{month:02d}/"
    op_path = f"../data/pq/yellow/{year}/{month:02d}/"

    print(f'Processing data for {year}/{month}')
    df = spark.read \
        .option("header", "true") \
        .schema(y_schema) \
        .csv(ip_path)
    
    df.repartition(4) \
        .write.parquet(op_path)
    
    if month == 12:
        year, month, end = 2021, 1, 7
    month += 1


Processing data for 2020/1


                                                                                

Processing data for 2020/2


                                                                                

Processing data for 2020/3


                                                                                

Processing data for 2020/4


                                                                                

Processing data for 2020/5


                                                                                

Processing data for 2020/6


                                                                                

Processing data for 2020/7


                                                                                

Processing data for 2020/8


                                                                                

Processing data for 2020/9


                                                                                

Processing data for 2020/10


                                                                                

Processing data for 2020/11


                                                                                

Processing data for 2020/12


                                                                                

Processing data for 2021/2


                                                                                

Processing data for 2021/3


                                                                                

Processing data for 2021/4


                                                                                

Processing data for 2021/5


                                                                                

Processing data for 2021/6


                                                                                

Processing data for 2021/7


                                                                                