## Ingesting Data to Storage Account - Bronze Layer
- Load data based on the date provided by the user
- Define the schema for the data
- Write data to an external location (bronze layer) using partitioning (year-month) and apply the schema
- Create an external table in Unity Catalog pointing to this location


In [0]:
import os
import re

import requests
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Notebook parameter: the file date to ingest. The default is used when the
# caller does not supply a value for "p_file_date".
dbutils.widgets.text("p_file_date", "2025-07-01")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
# Echo the widget value so the cell output shows which date this run uses.
v_file_date

In [0]:
# The first seven characters of the file date ("YYYY-MM") select the monthly
# source file and are later stored as the partition value.
year_month = v_file_date[:7]

# Fail fast on malformed input: the value is interpolated into the download
# URL and file names below, so a bad date would otherwise only surface later
# as a confusing download or read failure.
if not re.fullmatch(r"[0-9]{4}-(0[1-9]|1[0-2])", year_month):
    raise ValueError(
        f"p_file_date must start with YYYY-MM (e.g. 2025-07-01); got {v_file_date!r}"
    )

print(year_month)

##### 1. Downloading the source files for the selected year-month

In [0]:
# Download the yellow-taxi trip file for the selected month into the DBFS
# temp folder, where the read cell picks it up.
url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year_month}.parquet"
dbfs_tmp_dir = "/dbfs/tmp"

# Create the folder if it doesn't exist
os.makedirs(dbfs_tmp_dir, exist_ok=True)

local_path = f"{dbfs_tmp_dir}/yellow_taxi_{year_month}.parquet"

# Fetch first and verify the HTTP status before touching the target file:
# without this check an error response (e.g. a month that is not published)
# would be saved as a ".parquet" file and only fail later in the Spark read.
# timeout=(connect, read) in seconds keeps the cell from hanging indefinitely.
response = requests.get(url, timeout=(10, 300))
response.raise_for_status()

# Write the downloaded bytes to the temp directory
with open(local_path, "wb") as f:
    f.write(response.content)

In [0]:
# Download the green-taxi trip file for the selected month into the DBFS
# temp folder, where the read cell picks it up.
url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_{year_month}.parquet"
dbfs_tmp_dir = "/dbfs/tmp"

# Create the folder if it doesn't exist
os.makedirs(dbfs_tmp_dir, exist_ok=True)

local_path = f"{dbfs_tmp_dir}/green_taxi_{year_month}.parquet"

# Fetch first and verify the HTTP status before touching the target file:
# without this check an error response (e.g. a month that is not published)
# would be saved as a ".parquet" file and only fail later in the Spark read.
# timeout=(connect, read) in seconds keeps the cell from hanging indefinitely.
response = requests.get(url, timeout=(10, 300))
response.raise_for_status()

# Write the downloaded bytes to the temp directory
with open(local_path, "wb") as f:
    f.write(response.content)


##### 2. Reading data 

In [0]:
# Explicit schema applied when reading the yellow-taxi parquet file.
# Each entry is (column name, Spark type); every column is nullable.
yellow_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('VendorID', IntegerType()),
        ('tpep_pickup_datetime', TimestampType()),
        ('tpep_dropoff_datetime', TimestampType()),
        ('passenger_count', LongType()),
        ('trip_distance', DoubleType()),
        ('RatecodeID', LongType()),
        ('store_and_fwd_flag', StringType()),
        ('PULocationID', IntegerType()),
        ('DOLocationID', IntegerType()),
        ('payment_type', LongType()),
        ('fare_amount', DoubleType()),
        ('extra', DoubleType()),
        ('mta_tax', DoubleType()),
        ('tip_amount', DoubleType()),
        ('tolls_amount', DoubleType()),
        ('improvement_surcharge', DoubleType()),
        ('total_amount', DoubleType()),
        ('congestion_surcharge', DoubleType()),
        ('Airport_fee', DoubleType()),
        ('cbd_congestion_fee', DoubleType()),
    ]
])

In [0]:
# Explicit schema applied when reading the green-taxi parquet file.
# Each entry is (column name, Spark type); every column is nullable.
green_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('VendorID', IntegerType()),
        ('lpep_pickup_datetime', TimestampType()),
        ('lpep_dropoff_datetime', TimestampType()),
        ('store_and_fwd_flag', StringType()),
        ('RatecodeID', LongType()),
        ('PULocationID', IntegerType()),
        ('DOLocationID', IntegerType()),
        ('passenger_count', LongType()),
        ('trip_distance', DoubleType()),
        ('fare_amount', DoubleType()),
        ('extra', DoubleType()),
        ('mta_tax', DoubleType()),
        ('tip_amount', DoubleType()),
        ('tolls_amount', DoubleType()),
        ('ehail_fee', DoubleType()),
        ('improvement_surcharge', DoubleType()),
        ('total_amount', DoubleType()),
        ('payment_type', LongType()),
        ('trip_type', LongType()),
        ('congestion_surcharge', DoubleType()),
        ('cbd_congestion_fee', DoubleType()),
    ]
])

In [0]:
# Read the downloaded yellow-taxi file with the explicit schema.
# NOTE(review): "/tmp/..." here presumably resolves to the same DBFS file the
# download cell wrote under "/dbfs/tmp/..." — confirm for this workspace.
yellow_taxi_df = (
    spark.read
    .schema(yellow_schema)
    .parquet(f"/tmp/yellow_taxi_{year_month}.parquet")
)


In [0]:
# Row count of the yellow-taxi frame, shown as the cell output.
yellow_taxi_df.count()

In [0]:
# Preview the first five yellow-taxi rows.
yellow_taxi_df.show(5)

In [0]:
# Read the downloaded green-taxi file with the explicit schema.
# NOTE(review): "/tmp/..." here presumably resolves to the same DBFS file the
# download cell wrote under "/dbfs/tmp/..." — confirm for this workspace.
green_taxi_df = (
    spark.read
    .schema(green_schema)
    .parquet(f"/tmp/green_taxi_{year_month}.parquet")
)


In [0]:
# Row count of the green-taxi frame, shown as the cell output.
green_taxi_df.count()

In [0]:
# Preview the first five green-taxi rows.
green_taxi_df.show(5)

In [0]:
# Tag every row with the month being ingested; the bronze write partitions
# the output on this "file_year_month" column.
file_year_month_col = lit(year_month)
yellow_taxi_df = yellow_taxi_df.withColumn("file_year_month", file_year_month_col)
green_taxi_df = green_taxi_df.withColumn("file_year_month", file_year_month_col)

In [0]:
# Use dynamic partition overwrite so that the "overwrite" writes below replace
# only the partitions present in the data being written, leaving partitions
# from previously ingested months in place.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

In [0]:
# Persist both datasets to the bronze container as parquet, partitioned by
# the ingested month ("file_year_month").
(
    yellow_taxi_df.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("file_year_month")
    .save("abfss://bronze@nyctaxitrips2025.dfs.core.windows.net/yellow_taxi/")
)
(
    green_taxi_df.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("file_year_month")
    .save("abfss://bronze@nyctaxitrips2025.dfs.core.windows.net/green_taxi/")
)

In [0]:
# List the yellow-taxi bronze folder to check what the write produced.
dbutils.fs.ls("abfss://bronze@nyctaxitrips2025.dfs.core.windows.net/yellow_taxi/")

In [0]:
%sql
-- Re-register the yellow-taxi table over the bronze folder on every run.
-- No columns are declared, so the table definition is taken from the parquet
-- files found at LOCATION.
-- NOTE(review): because the DROP always runs first, IF NOT EXISTS on the
-- CREATE has no practical effect here.
DROP TABLE IF EXISTS taxi_trips_2025.bronze.yellow_taxi;

CREATE TABLE IF NOT EXISTS taxi_trips_2025.bronze.yellow_taxi
USING PARQUET
LOCATION 'abfss://bronze@nyctaxitrips2025.dfs.core.windows.net/yellow_taxi';

In [0]:
%sql
-- Re-register the green-taxi table over the bronze folder on every run.
-- No columns are declared, so the table definition is taken from the parquet
-- files found at LOCATION.
DROP TABLE IF EXISTS taxi_trips_2025.bronze.green_taxi;

CREATE TABLE taxi_trips_2025.bronze.green_taxi
USING PARQUET
LOCATION 'abfss://bronze@nyctaxitrips2025.dfs.core.windows.net/green_taxi';

In [0]:
%sql
-- Sanity check: total number of rows visible through the yellow-taxi table.
-- Uncomment the filter below to count a single month's partition instead.
select count(1)
from taxi_trips_2025.bronze.yellow_taxi
-- where file_year_month = '2025-08';

##### 3. Registering the taxi zone lookup data as a table

In [0]:
%sql
-- Register the CSV data at this bronze location as a table. The first line of
-- each file is treated as the header and column types are inferred from the
-- data. With IF NOT EXISTS, an already-registered table is left untouched.
CREATE TABLE IF NOT EXISTS taxi_trips_2025.bronze.taxi_zone_lookup
USING CSV
OPTIONS (header = "true", inferSchema = "true")
LOCATION 'abfss://bronze@nyctaxitrips2025.dfs.core.windows.net/taxi_zone_lookup/';

In [0]:
# End the notebook run and hand "Success" back as its exit value.
dbutils.notebook.exit("Success")