To work around an issue where writing Parquet files resulted in a memory error, the raw files are read in as CSV, preprocessed, and then written out to new Parquet files.

In [1]:
# Importing and starting a spark session
from pyspark.sql import SparkSession
from pyspark import SparkContext
# Start (or reuse) the Spark session and get hold of the active Spark context
spark = SparkSession.builder.getOrCreate()
sc = SparkContext.getOrCreate()

# Keep only log messages at WARN level and above (hides INFO/DEBUG chatter)
sc.setLogLevel('WARN')

# Session options:
#   - arrow.pyspark.enabled: enable Arrow for Spark <-> pandas conversion
#   - repl.eagerEval.enabled: make Spark DataFrames present well in the notebook
for option in ('spark.sql.execution.arrow.pyspark.enabled',
               'spark.sql.repl.eagerEval.enabled'):
    spark.conf.set(option, True)

21/08/09 21:52:40 WARN Utils: Your hostname, LAPTOP-D5HGLKLK resolves to a loopback address: 127.0.1.1; using 172.23.50.214 instead (on interface eth0)
21/08/09 21:52:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/08/09 21:52:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# reference from Akira Wang's Github
#forming a schema for the dataframes
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import col

# Column names grouped by the Spark type each one should be parsed as
ints = ('VendorID', 'passenger_count', 'RateCodeID', 'RatecodeID','payment_type', 'PULocationID', 'DOLocationID')
doubles = ('trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount'
          , 'congestion_surcharge')
strings = ('store_and_fwd_flag',)
dtimes = ('tpep_pickup_datetime', 'tpep_dropoff_datetime')

# Flatten the groups into a single {column name: type instance} lookup
type_groups = (
    (ints, IntegerType),
    (doubles, DoubleType),
    (strings, StringType),
    (dtimes, TimestampType),
)
dtypes = {name: spark_type() for names, spark_type in type_groups for name in names}

taxi_dir = "../raw_data/yellow_tripdata_20"

# Read the January 2018 file with its header to obtain the column names and
# their order; the typed schema is then built from those names.
sdf = spark.read.csv(f"{taxi_dir}18-01.csv", header = True)

# Every field is nullable. A column missing from `dtypes` raises KeyError here.
schema = StructType([StructField(name, dtypes[name], True) for name in sdf.columns])

# Load each monthly CSV with the typed schema, keyed by zero-padded month ('01'..'12').
# NOTE(review): the same schema is applied to every file — confirm the 2019 CSVs
# share the column layout of the 2018-01 file the schema was derived from.
months = [str(i).zfill(2) for i in range(1, 13)]
taxi18 = {month: spark.read.csv(f"{taxi_dir}18-{month}.csv", header = True, schema = schema)
          for month in months}
taxi19 = {month: spark.read.csv(f"{taxi_dir}19-{month}.csv", header = True, schema = schema)
          for month in months}


[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [3]:
from pyspark.sql.functions import *
# Combine the twelve monthly DataFrames of each year into one DataFrame per year.
# Start from January and append February..December of the SAME year.
taxi_2018 = taxi18['01']
taxi_2019 = taxi19['01']
for i in range(2, 13):
    month = str(i).zfill(2)
    taxi_2018 = taxi_2018.union(taxi18[month])
    # Previously this unioned taxi18 here, so the "2019" frame was mostly 2018 data.
    taxi_2019 = taxi_2019.union(taxi19[month])
    

In [4]:


def preprocessing(dataset) :
    #filter out all payment types not credit card
    dataset = dataset.filter(dataset.payment_type == 1)

    #filter out all payment types that involve Newark Airport
    dataset = dataset.filter(dataset.PULocationID != 1)
    dataset = dataset.filter(dataset.DOLocationID != 1)

    #filter out all negative trip distances
    dataset = dataset.filter(dataset.trip_distance > 0)

    #remove all ratecodes not contained within New York City
    dataset = dataset.filter((dataset.RatecodeID != 3) & (dataset.RatecodeID != 4) & 
                             (dataset.RatecodeID != 99))

    #remove all LocationIDs that are unexplained (posed to be out of state)
    dataset = dataset.filter((dataset.PULocationID != 264) & (dataset.PULocationID != 265)
                            & (dataset.DOLocationID != 264) & (dataset.DOLocationID != 265))

    #remove all trips not in 2018
    dataset = dataset.filter(year(dataset.tpep_pickup_datetime) == 2018)

    #creating a new column corresponding to trip duration in minutes
    dataset = dataset.withColumn("trip_duration", 
                                (unix_timestamp(dataset.tpep_dropoff_datetime) - 
                                unix_timestamp(dataset.tpep_pickup_datetime))/60)

    #filter out all negative trip durations
    dataset = dataset.filter(dataset.trip_duration > 0)
    # drop out all columns that are not of interest
    dataset = dataset.drop(*('tolls_amount', 'fare_amount', 'extra', 
                             'mta_tax', 'tolls_amount', 'total_amount',
                            'improvement_surcharge', 'payment_type',
                            'store_and_fwd_flag'))
    dataset = dataset.withColumnRenamed('tpep_pickup_datetime', 'pickup_time') \
    .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_time')
    return dataset

In [5]:
taxi_2018 = preprocessing(taxi_2018)
taxi_2019 = preprocessing(taxi_2019)

In [6]:
# Write the preprocessed data to Parquet.
# mode('overwrite') replaces the output of a previous run; with the default save
# mode the write fails when the target path already exists, which breaks
# re-running the notebook from a fresh kernel.
taxi_2018.write.mode('overwrite').format('parquet').save('../preprocessed_data/preprocessed_taxi_2018.parquet')
taxi_2019.write.mode('overwrite').format('parquet').save('../preprocessed_data/preprocessed_taxi_2019.parquet')

21/08/09 21:52:59 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 21:52:59 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 21:54:41 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 21:54:42 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 21:54:46 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 21:54:47 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 21:54:47 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

21/08/09 22:00:39 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:00:39 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:02:29 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:02:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:02:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:02:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:02:30 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

21/08/09 22:07:53 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:07:54 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:07:54 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:07:55 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:07:55 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:09:35 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:09:35 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

21/08/09 22:18:36 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:18:36 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:18:49 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:18:49 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:18:58 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:18:58 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:18:58 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

21/08/09 22:23:42 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:23:42 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:23:44 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:23:44 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:23:44 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:23:44 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:23:44 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

21/08/09 22:29:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:29:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:29:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:29:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:29:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 22:29:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 22:29:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014