# Extracting and Unzipping Data

In [None]:
# Download the twelve monthly 2018 yellow-taxi trip CSVs from the NYC TLC S3 bucket
# into `output_dir` (approach referenced from Akira's git).
from os.path import getsize
from urllib.request import urlretrieve

output_dir = "../raw_data"
# Must be spelled exactly as it is used in the loop below; a mismatched name
# raises NameError on a fresh kernel (or silently reuses a stale value).
fname_template = "yellow_tripdata_2018"

for m in range(1, 13):
    # Months are zero-padded to two digits in the remote file names (01..12)
    month = str(m).zfill(2)
    out = f'{fname_template}-{month}.csv'
    url = f"https://s3.amazonaws.com/nyc-tlc/trip+data/{out}"
    destination = f"{output_dir}/{out}"
    urlretrieve(url, destination)

    # 1073741824 = 1024 ** 3, i.e. bytes per GiB
    print(f"Done downloading {out} to {output_dir} with size {getsize(destination) / 1073741824:.2f}GB")

In [None]:
# Fetch every monthly 2019 yellow-taxi trip CSV straight from the NYC TLC S3 bucket
# and store it under `output_dir` (approach referenced from Akira's git).
from os.path import getsize
from urllib.request import urlretrieve

output_dir = "../raw_data"
fname_template = "yellow_tripdata_2019"

for m in range(1, 13):
    # Two-digit, zero-padded month number (01..12)
    month = f"{m:02d}"
    out = f'{fname_template}-{month}.csv'
    url = f"https://s3.amazonaws.com/nyc-tlc/trip+data/{out}"
    saved_to = f"{output_dir}/{out}"
    urlretrieve(url, saved_to)

    # Report the size of the file just written (bytes divided by 1024 ** 3)
    size_in_gb = getsize(saved_to) / 1073741824
    print(f"Done downloading {out} to {output_dir} with size {size_in_gb:.2f}GB")

In [None]:
# Unpack the taxi-zone, NTA and census archives into the folder they are stored in
import zipfile

directory = "../raw_data_lite/"
taxi_zones = directory + "taxi_zones.zip"
nta = directory + "nynta_21b.zip"
census_1year = directory + "2018 1-year estimates.zip"
census_5year = directory + "2018 5-year estimates.zip"
zipfiles = (taxi_zones, nta, census_1year, census_5year)

for archive_path in zipfiles:
    # The context manager closes each archive once its contents are extracted
    with zipfile.ZipFile(archive_path) as archive:
        # Extract alongside the archives so all inputs share one directory
        archive.extractall("../raw_data_lite/")
        

# Serializing Taxi Data

In [1]:
# Create (or reuse) the Spark session and context used by the rest of the notebook
from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()
sc = SparkContext.getOrCreate()
# Suppress log messages below WARN level
sc.setLogLevel('WARN')

# Session options, both switched on:
#   - the Arrow setting for PySpark
#   - eager evaluation, so Spark DataFrames present well in the notebook
for option_name in ('spark.sql.execution.arrow.pyspark.enabled',
                    'spark.sql.repl.eagerEval.enabled'):
    spark.conf.set(option_name, True)

21/08/09 15:02:26 WARN Utils: Your hostname, LAPTOP-D5HGLKLK resolves to a loopback address: 127.0.1.1; using 172.23.50.214 instead (on interface eth0)
21/08/09 15:02:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/08/09 15:02:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Build one explicit schema and read every monthly taxi CSV with it
# (approach referenced from Akira Wang's Github)
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import col

# Column names grouped by the Spark type they are read as
ints = ('VendorID', 'passenger_count', 'RateCodeID', 'RatecodeID','payment_type', 'PULocationID', 'DOLocationID')
doubles = ('trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount'
          , 'congestion_surcharge')
strings = ('store_and_fwd_flag',)
dtimes = ('tpep_pickup_datetime', 'tpep_dropoff_datetime')

# Flatten the groups into a single {column name: Spark data type} lookup
dtypes = {}
for names, spark_type in ((ints, IntegerType),
                          (doubles, DoubleType),
                          (strings, StringType),
                          (dtimes, TimestampType)):
    for name in names:
        dtypes[name] = spark_type()

taxi_dir = "../raw_data/yellow_tripdata_20"
# A single month is read without a schema only to obtain the column names and their order
sdf = spark.read.csv(f"{taxi_dir}18-01.csv", header = True)

# One nullable field per column of that file, typed via the lookup above
schema = StructType([StructField(column, dtypes[column], True) for column in sdf.columns])

# Monthly DataFrames per year, keyed by zero-padded month ('01'..'12'),
# all read with the same schema
months = [f"{i:02d}" for i in range(1, 13)]
taxi18 = {mm: spark.read.csv(f"{taxi_dir}18-{mm}.csv", header = True, schema = schema)
          for mm in months}
taxi19 = {mm: spark.read.csv(f"{taxi_dir}19-{mm}.csv", header = True, schema = schema)
          for mm in months}


In [3]:
# NOTE(review): this wildcard import shadows Python builtins such as sum/min/max;
# it is kept because other cells may rely on the names it brings in.
from pyspark.sql.functions import *
# Combine the twelve monthly DataFrames of each year into one DataFrame per year.
# Start from January, then append February..December of the SAME year.
taxi_2018 = taxi18['01']
taxi_2019 = taxi19['01']
for i in range(2, 13):
    month = str(i).zfill(2)  # dictionary keys are zero-padded: '02'..'12'
    taxi_2018 = taxi_2018.union(taxi18[month])
    # Must read from taxi19: unioning taxi18 here would fill the 2019 frame
    # with 2018 trips for every month after January.
    taxi_2019 = taxi_2019.union(taxi19[month])
    

The 2019 data has an additional congestion surcharge attribute. Since the analysis does not involve this attribute, the column is dropped so that the 2019 data is consistent with the 2018 data.

In [5]:
# Serialize each yearly DataFrame to its own parquet output under ../preprocessed_data
parquet_outputs = (
    (taxi_2018, '../preprocessed_data/taxi_2018.parquet'),
    (taxi_2019, '../preprocessed_data/taxi_2019.parquet'),
)
for yearly_sdf, parquet_path in parquet_outputs:
    yearly_sdf.write.format('parquet').save(parquet_path)

21/08/09 15:02:51 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:02:51 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:04:02 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:04:02 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:04:05 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:04:05 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:04:05 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

21/08/09 15:09:47 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:09:48 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:09:48 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:09:48 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:09:48 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:09:48 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:11:07 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

21/08/09 15:15:02 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:15:03 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:15:04 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:15:04 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:15:04 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:15:04 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:15:04 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

21/08/09 15:20:24 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:20:24 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:20:24 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:20:24 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:20:24 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:21:47 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:21:47 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

21/08/09 15:26:21 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:26:21 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:26:21 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:26:21 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:26:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:26:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:26:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014

21/08/09 15:30:14 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:30:14 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:30:14 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:30:14 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:30:15 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
21/08/09 15:30:15 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
21/08/09 15:31:24 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014