In [1]:
import os
import zipfile
from urllib.request import urlretrieve

from pyspark.sql import SparkSession

# Start from a builder carrying the application name, then layer each
# session option on top before creating (or reusing) the session.
_spark_builder = SparkSession.builder.appName("MAST30034 Project 1 Raw Data Collection")

for _option, _setting in [
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),   # session time zone pinned to UTC
    ("spark.driver.memory", "4g"),
    ("spark.executor.memory", "2g"),
]:
    _spark_builder = _spark_builder.config(_option, _setting)

spark = _spark_builder.getOrCreate()

24/08/26 01:04:31 WARN Utils: Your hostname, DESKTOP-H6V94HM resolves to a loopback address: 127.0.1.1; using 192.168.0.100 instead (on interface eth0)
24/08/26 01:04:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/26 01:04:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Make sure the raw data folder exists before anything is downloaded into it
output_dir = '../data/raw'

# exist_ok=True creates the folder (and any missing parents) only when needed,
# without a separate check-then-create step
os.makedirs(output_dir, exist_ok=True)

# TLC Trips Records Data


In [3]:
def trips_data_retrieval(data_range, service_types, dir):
    """
        Download monthly TLC trip-record parquet files into per-service folders.

        For every requested service a sub-folder `<dir>/<service>` is created and each
        requested month is stored there as `<YEAR>-<MM>.parquet`. A month whose file is
        already present is skipped, so the function can be re-run without downloading
        everything again. Unknown service types are reported and skipped.

            `data_range`: dictionary mapping a year to either a single month or an
                          iterable of months - {'2023': 3, '2024': range(1,6)}
            `service_types`: list - ['yellow', 'green', 'fhv', 'fhvhv'] (case-insensitive)
            `dir`: string - folder that receives the per-service sub-folders

        Any error raised by `urlretrieve` (e.g. a network failure) propagates to the caller.
    """

    URL_TEMPLATE = {
        'yellow': "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_",
        'green': "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_",
        'fhv': "https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_",
        'fhvhv': "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_"
    }

    for service in service_types:
        # Normalise once so the lookup, the folder name and the URL all agree
        service_key = service.lower()

        if service_key not in URL_TEMPLATE:
            print(f"The following service(s) does not exist: {service}")
            continue

        service_dir = f"{dir}/{service_key}"
        os.makedirs(service_dir, exist_ok=True)

        for YEAR, MONTHS in data_range.items():
            # A single month (as in the docstring example) is treated as a one-item list
            if isinstance(MONTHS, (int, str)):
                MONTHS = [MONTHS]

            for month in MONTHS:
                month = str(month).zfill(2)

                # Final location of this month's file
                output_dir = f"{service_dir}/{YEAR}-{month}.parquet"

                # Already downloaded on a previous run
                if os.path.exists(output_dir):
                    continue

                # Generate URL to retrieve data
                url = f"{URL_TEMPLATE[service_key]}{YEAR}-{month}.parquet"

                # Download next to the target and rename afterwards, so an interrupted
                # download never leaves a truncated file that a later run would skip
                partial_path = f"{output_dir}.part"
                urlretrieve(url, partial_path)
                os.replace(partial_path, output_dir)

In [23]:
# Months to collect per year: 3-12 of 2023 and 1-5 of 2024
data_range = {
    '2023': range(3, 13),
    '2024': range(1, 6),
}

# Download the yellow and green trip records into `output_dir`
trips_data_retrieval(
    data_range=data_range,
    service_types=["yellow", "green"],
    dir=output_dir,
)

In [4]:
# Retrieve taxi zone lookup data and the zone shape files
output_dir = '../data/raw'

# Target locations, all derived from `output_dir`
zone_shape_dir = f"{output_dir}/taxi_zone_shape"
zone_zip_path = f"{output_dir}/taxi_zones.zip"

# Creates the shape folder (and `output_dir` itself if it is missing)
os.makedirs(zone_shape_dir, exist_ok=True)

urlretrieve("https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv",
            f"{output_dir}/taxi_zone_lookup.csv")

# Retrieve the shape files of the zones
urlretrieve("https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip",
            zone_zip_path)

try:
    with zipfile.ZipFile(zone_zip_path, 'r') as zip_ref:
        zip_ref.extractall(zone_shape_dir)
finally:
    # Remove the zip file even when extraction fails, so no stale archive is left behind
    os.remove(zone_zip_path)

# Weather Data


Retrieving weather data from the National Centers for Environmental Information's Integrated Surface Dataset for the period from March 2023 to May 2024.

In [5]:
# Folder that receives the downloaded weather CSV files
output_dir = "../data/raw/weather"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# One source file per year; both URLs point at the same file name (72503014732.csv)
WEATHER_URL = {
    year: f"https://www.ncei.noaa.gov/data/global-hourly/access/{year}/72503014732.csv"
    for year in ("2023", "2024")
}

# Save each yearly file as `<year>-weather.csv`
for year_label, source_url in WEATHER_URL.items():
    urlretrieve(source_url, f"{output_dir}/{year_label}-weather.csv")


Transforming the files from `.csv` to `.parquet`

In [6]:
# Convert every downloaded weather CSV to parquet, then clean up the CSV folder
weather_dir = "../data/raw/weather"

for file in os.listdir(weather_dir):
    csv_path = f"{weather_dir}/{file}"

    # Name the parquet output after the CSV without its extension
    # ("2023-weather.csv" -> "2023-weather") instead of a fixed-length slice
    parquet_path = f"../data/raw/{os.path.splitext(file)[0]}"

    # Only read the CSV when its parquet output does not exist yet
    if not os.path.exists(parquet_path):
        # header=True takes column names from the first row; no schema inference
        df = spark.read.csv(csv_path, header=True, inferSchema=False)
        df.write.parquet(parquet_path)

    os.remove(csv_path)

# Remove only the (now empty) weather folder; unlike os.removedirs this never
# prunes parent folders
os.rmdir(weather_dir)

24/08/26 01:13:27 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

---


# Socio-Economic Data

We manually downloaded the unemployment-rate data for New York City from the [U.S. BUREAU OF LABOR STATISTICS](https://data.bls.gov/dataViewer/view/timeseries/LAUBS360000000000003) website, as there is no retrievable URL.