In [0]:
# ====================Step 1: Data Acquisition AND Data Load to Azure Blob Storage.=================================
# I load the NYC Yellow Taxi Trip Records data from the source webpage. All the records from 2009 to date (2024) have been loaded.
# setting up the logging and spark session initialization
import logging
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession


# Initialize logging.
# The root logger is set to INFO so this notebook's own messages are shown.
logging.basicConfig(level=logging.INFO)
# py4j (the Python <-> JVM bridge used by PySpark) logs every JVM call at INFO
# ("Received command c on object id ..."). With the root level at INFO those
# lines flood every cell output, so only let its warnings and errors through.
logging.getLogger("py4j").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

def _init_spark(self=None):
    """Return a SparkSession for this notebook, creating one if none exists.

    Args:
        self: Unused. The function lives at module level and never reads it;
            the parameter is kept (now optional) only so that existing calls
            passing a positional argument keep working.

    Returns:
        The session produced by ``SparkSession.builder.getOrCreate()`` with
        the application name "NYC Taxi Data Processor".
    """
    logger.info("Initializing Spark Session")
    spark = (
        SparkSession.builder
        .appName("NYC Taxi Data Processor")
        .getOrCreate()
    )
    return spark

INFO:py4j.clientserver:Received command c on object id p1
INFO:py4j.clientserver:Received command c on object id p0


In [0]:
# Step 1.1: Fetch the "Yellow Taxi Trip" records of the NYC Taxi dataset from the source URL (https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page).
# The class "ParqueDownloader" has 3 methods:
# method 1: "get_parque_links": fetches all the parquet links from the URL.
# method 2: "display_parque_links": displays the parquet links fetched by method 1 (quick validation that parquet links are present).
# method 3: "download_parquet_files": downloads all the parquet files.

class ParqueDownloader:
    """Find and download the "Yellow Taxi Trip Records" parquet files linked from a web page."""

    def __init__(self, url):
        """Remember the page to scrape.

        Args:
            url: Address of the HTML page that lists the parquet download links.
        """
        # Page that is fetched and scanned for parquet links.
        self.url = url
        print(f"Initialized ParqueDownloader with URL: {url}")

    def get_parque_links(self, timeout=30):
        """Fetch ``self.url`` and collect the Yellow Taxi parquet links on it.

        A link is kept when its anchor text contains "Yellow Taxi Trip Records"
        and its href (after stripping surrounding whitespace) ends with
        ".parquet". Relative hrefs are resolved against ``self.url``.

        Args:
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook indefinitely.

        Returns:
            A list of absolute URLs; an empty list when the page cannot be
            fetched or contains no matching link.
        """
        print(f"Fetching content from URL: {self.url}")
        try:
            response = requests.get(self.url, timeout=timeout)
            response.raise_for_status()
            print("Successfully fetched content.")
        except requests.RequestException as e:
            print(f"Failed to retrieve content from the URL. Error: {e}")
            return []

        # Parse the content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        print("Parsed the content using BeautifulSoup.")

        # Extract links that contain "Yellow Taxi Trip Records" and end with ".parquet".
        # The href is stripped first: a trailing space in the page markup would
        # otherwise make the ".parquet" suffix check silently drop the link.
        parquet_links = []
        for a in soup.find_all('a', href=True):
            href = a['href'].strip()
            if 'Yellow Taxi Trip Records' in a.text and href.endswith('.parquet'):
                parquet_links.append(urljoin(self.url, href))

        if parquet_links:
            print(f"Found {len(parquet_links)} PARQUET links.")
        else:
            print("No PARQUET links found.")
        return parquet_links

    def display_parque_links(self):
        """Print every parquet link found on the page.

        Returns:
            The list of links that was printed (empty when none were found),
            so callers can keep the result instead of fetching the page again.
        """
        print("Getting PARQUET links...")
        parquet_links = self.get_parque_links()
        if parquet_links:
            print("PARQUET files found:")
            for link in parquet_links:
                print(link)
        else:
            print("No PARQUET files found.")
        return parquet_links

    def download_parquet_files(self, save_dir, timeout=60, chunk_size=1024 * 1024):
        """Download every parquet file linked from the page into ``save_dir``.

        Each file is streamed to disk in chunks rather than held in memory as
        a whole. A download that fails (HTTP error, network error, disk error)
        is reported and skipped; the remaining files are still attempted.

        Args:
            save_dir: Directory to write the files into; created if missing.
            timeout: Seconds to wait for the server on connect and on each read.
            chunk_size: Number of bytes requested per streamed chunk.

        Returns:
            A list with the local paths of the files that were downloaded
            successfully; an empty list when there was nothing to download.
        """
        print("Downloading PARQUET files...")
        parquet_links = self.get_parque_links()
        if not parquet_links:
            print("No PARQUET files found to download.")
            return []

        os.makedirs(save_dir, exist_ok=True)
        downloaded = []
        for link in parquet_links:
            filename = os.path.join(save_dir, link.split('/')[-1])
            # Write to a side file first so that a failed or interrupted
            # download never leaves a truncated file under the final name and
            # never clobbers a good file from an earlier run.
            part_path = filename + ".part"
            print(f"Downloading {link} to {filename}")
            try:
                with requests.get(link, stream=True, timeout=timeout) as response:
                    # Without this check an HTTP error page would be saved as a ".parquet" file.
                    response.raise_for_status()
                    with open(part_path, 'wb') as file:
                        for chunk in response.iter_content(chunk_size=chunk_size):
                            file.write(chunk)
                os.replace(part_path, filename)
            except (requests.RequestException, OSError) as e:
                print(f"Failed to download {link}. Error: {e}")
                if os.path.exists(part_path):
                    os.remove(part_path)
                continue
            print(f"Downloaded {filename}")
            downloaded.append(filename)
        return downloaded

INFO:py4j.clientserver:Received command c on object id p1


In [0]:
from urllib.parse import urljoin

# Page that lists the trip-record files, e.g.
# https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
# Whitespace around a pasted URL is stripped so it does not end up in the request.
url = input("Enter the URL from where you want to download the dataset").strip()

# creating a class instance
fetcher = ParqueDownloader(url)

# Print the parquet links found on the page as a quick check before downloading.
parque_links = fetcher.display_parque_links()

# Step 1.2: Create a temporary storage in databricks to store the downloaded Yellow Taxi Trip Records
# Use DBFS for temporary storage in Databricks
save_dir = "/dbfs/tmp/nyc_taxi_data"

# Create the directory if it doesn't exist. exist_ok=True keeps the cell
# re-runnable and avoids the gap between an "exists" check and the creation.
os.makedirs(save_dir, exist_ok=True)
print("make directory succeeded!")

# Fetch all the files from all the years starting 2009. This is a huge dataset,
# make sure you download only those files for next steps as needed.
downloaded_files = fetcher.download_parquet_files(save_dir)
# Only report success when something was actually downloaded.
if downloaded_files:
    print("Saved")
else:
    print("No files were downloaded.")

In [0]:
# Step 1.3: Upload the downloaded parquet files to my Azure Storage >> Container >> Blobs

# Azure Blob Storage configurations
storage_account_name = "shilsdemostorage"
container_name = "nycyellowtaxidata-raw"
# SECURITY: a SAS token is a credential and must not be stored in the notebook.
# It is read from a Databricks secret scope instead. The scope and key names
# below are placeholders -- create them (or adjust the names) before running.
# Any token that was ever pasted into this notebook should be treated as
# leaked and revoked.
sas_token = dbutils.secrets.get(scope="nyc-taxi", key="blob-sas-token")

# Both the mount and the direct wasbs:// write need these two strings.
wasbs_host = f"{container_name}@{storage_account_name}.blob.core.windows.net"
sas_conf_key = f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net"

# Mount the Azure Blob Storage to Databricks File System (DBFS)
# We use the dbutils.fs.mount command here
mount_point = "/mnt/nyc_taxi_data"

# The mount is best-effort: the write further down goes straight to the
# wasbs:// URL, so a mount failure is reported but does not stop the cell.
try:
    # Mounting an already-mounted path raises, so skip it on re-runs.
    if any(m.mountPoint == mount_point for m in dbutils.fs.mounts()):
        print("already mounted, skipping mount")
    else:
        dbutils.fs.mount(
            source = f"wasbs://{wasbs_host}",
            mount_point = mount_point,
            extra_configs = {sas_conf_key: sas_token}
        )
        print("mount succeeded!")
except Exception as e:
    print("mount exception", e)
print('mount completed')

# List all files in the save directory
all_files = os.listdir(save_dir)
print(all_files)

year_to_upload = 2022
# Keep only finished parquet files of the selected year.
filtered_files = sorted(
    file for file in all_files
    if f"yellow_tripdata_{year_to_upload}-" in file and file.endswith(".parquet")
)
print(f"Found {len(filtered_files)} files for the year {year_to_upload}.")

# Fail with a clear message instead of handing Spark an empty list of paths.
if not filtered_files:
    raise FileNotFoundError(
        f"No yellow_tripdata parquet files for {year_to_upload} found in {save_dir}; download them first."
    )

# Spark addresses DBFS as "dbfs:/..." while local file APIs see it under "/dbfs/...";
# derive the Spark form from save_dir so the two locations cannot drift apart.
spark_dir = "dbfs:" + save_dir[len("/dbfs"):] if save_dir.startswith("/dbfs/") else save_dir
dbfs_paths = [f"{spark_dir}/{file}" for file in filtered_files]

# Read all the filtered files using their absolute DBFS paths and make a single combined dataframe. This dataframe is further read.
combine_df = spark.read.parquet(*dbfs_paths)
combine_df.show()

# Upload the filtered PARQUET files to Azure Blob Storage under a year-specific directory
blob_container = container_name

# Write the final output stored as a new partition on Azure Blob Storage.
# The directory name follows year_to_upload so changing the year above is enough.
output_filePath = f"wasbs://{wasbs_host}/nycyellowtaxidata-raw/{year_to_upload}"

spark.conf.set(sas_conf_key, sas_token)

# Write the DataFrame to the year-specific directory on Azure blob storage.
# NOTE: mode("append") adds to whatever already exists under the path, so
# re-running this cell for the same year writes the same rows again.
combine_df.write.mode("append").parquet(output_filePath)

print(f"Data has been written to Azure Blob Storage at {output_filePath}")



In [0]:
# Number of rows in the combined DataFrame; shown as the cell's output.
combine_df.count()

INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0
INFO:py4j.clientserver:Received command c on object id p0


39656098

INFO:py4j.clientserver:Received command c on object id p0
