In [0]:
import json

# Reference values for interactive debugging only. They are kept as real
# comments (not a string literal) so they can never execute; use them in place
# of the widget parsing below when running outside the Data Factory pipeline.
#
# from datetime import date, timedelta
# start_date = date.today() - timedelta(1)
# bronze_adls = "abfss://bronze@earthquakeadlstorage.dfs.core.windows.net/"
# silver_adls = "abfss://silver@earthquakeadlstorage.dfs.core.windows.net/"

# Keys this notebook cannot run without: they are used to build the bronze
# input path and the silver output path.
_REQUIRED_PARAMS = ("start_date", "bronze_adls", "silver_adls")


def _parse_bronze_params(raw_params):
    """Decode the ``bronze_params`` widget value and validate it.

    Args:
        raw_params: JSON text expected to decode to an object (dict).

    Returns:
        The decoded dict.

    Raises:
        ValueError: if the text is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError``), does not decode to an object, or is missing /
            has a blank value for any key in ``_REQUIRED_PARAMS``.
    """
    parsed = json.loads(raw_params)
    if not isinstance(parsed, dict):
        raise ValueError(
            f"bronze_params must be a JSON object, got {type(parsed).__name__}"
        )

    def _is_blank(value):
        return value is None or (isinstance(value, str) and not value.strip())

    missing = [key for key in _REQUIRED_PARAMS if _is_blank(parsed.get(key))]
    if missing:
        # Fail fast: an empty value would otherwise turn the ADLS paths built
        # later into relative paths instead of raising an error here.
        raise ValueError(
            f"bronze_params is missing required value(s): {', '.join(missing)}"
        )
    return parsed


def _ensure_trailing_slash(base_path):
    """Return ``base_path`` ending in '/' so file names can be appended to it.

    Empty and non-string values are returned unchanged.
    """
    if isinstance(base_path, str) and base_path and not base_path.endswith("/"):
        return f"{base_path}/"
    return base_path


# Retrieve the bronze_params directly as a widget
bronze_params = dbutils.widgets.get("bronze_params")
print(f"Raw bronze_params: {bronze_params}")

# Parse and validate the JSON string
output_data = _parse_bronze_params(bronze_params)

# Access individual variables. end_date and gold_adls stay optional and keep
# their original "" default.
start_date = output_data["start_date"]
end_date = output_data.get("end_date", "")
bronze_adls = _ensure_trailing_slash(output_data["bronze_adls"])
silver_adls = _ensure_trailing_slash(output_data["silver_adls"])
gold_adls = _ensure_trailing_slash(output_data.get("gold_adls", ""))

print(f"Start Date: {start_date}, Bronze ADLS: {bronze_adls}")

In [0]:

from pyspark.sql.functions import col, isnull, when
from pyspark.sql.types import TimestampType
from datetime import date, timedelta

In [0]:
# Read the bronze JSON file for start_date into a Spark DataFrame.
# The multiline option lets Spark parse JSON records that span several lines.
bronze_input_path = f"{bronze_adls}{start_date}_earthquake_data.json"
bronze_reader = spark.read.option("multiline", "true")
df = bronze_reader.json(bronze_input_path)

In [0]:
# Display the raw DataFrame's repr (column names and nested types).
df

DataFrame[geometry: struct<coordinates:array<double>,type:string>, id: string, properties: struct<alert:string,cdi:double,code:string,detail:string,dmin:double,felt:bigint,gap:double,ids:string,mag:double,magType:string,mmi:double,net:string,nst:bigint,place:string,rms:double,sig:bigint,sources:string,status:string,time:bigint,title:string,tsunami:bigint,type:string,types:string,tz:string,updated:bigint,url:string>, type: string]

In [0]:
# Peek at the first raw record to see the nested geometry/properties structs.
df.head()

Row(geometry=Row(coordinates=[-122.817001342773, 38.8306655883789, 1.42999994754791], type='Point'), id='nc75270656', properties=Row(alert=None, cdi=None, code='75270656', detail='https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=nc75270656&format=geojson', dmin=0.01597, felt=None, gap=74.0, ids=',nc75270656,', mag=0.25, magType='md', mmi=None, net='nc', nst=9, place='8 km NW of The Geysers, CA', rms=0.02, sig=1, sources=',nc,', status='automatic', time=1764287496580, title='M 0.3 - 8 km NW of The Geysers, CA', tsunami=0, type='earthquake', types=',nearby-cities,origin,phase-data,', tz=None, updated=1764287594070, url='https://earthquake.usgs.gov/earthquakes/eventpage/nc75270656'), type='Feature')

In [0]:
# Flatten each nested feature into one flat row per event.
#
# Gotcha: this cell rebinds `df`, so re-running it without first re-running
# the load cell fails (geometry/properties no longer exist on the flat frame).
#
# NOTE(review): the USGS GeoJSON format documents the third coordinate as
# depth (km); the 'elevation' alias is kept unchanged here - confirm with
# downstream consumers before renaming.
coordinates = col('geometry.coordinates')
coordinate_columns = [
    coordinates.getItem(position).alias(axis_name)
    for position, axis_name in enumerate(('longitude', 'latitude', 'elevation'))
]

# (field under `properties`, output column name) in output order.
# `time` and `updated` are passed through as-is; no timestamp conversion
# happens in this cell.
property_columns = [
    col(f'properties.{source_field}').alias(output_name)
    for source_field, output_name in (
        ('title', 'title'),
        ('place', 'place_description'),
        ('sig', 'sig'),
        ('mag', 'mag'),
        ('magType', 'magType'),
        ('time', 'time'),
        ('updated', 'updated'),
    )
]

df = df.select('id', *coordinate_columns, *property_columns)

In [0]:
# Display the flattened DataFrame's repr to confirm the selected columns and types.
df

DataFrame[id: string, longitude: double, latitude: double, elevation: double, title: string, place_description: string, sig: bigint, mag: double, magType: string, time: bigint, updated: bigint]

In [0]:
# Peek at the first flattened record.
df.head()

Row(id='nc75270656', longitude=-122.817001342773, latitude=38.8306655883789, elevation=1.42999994754791, title='M 0.3 - 8 km NW of The Geysers, CA', place_description='8 km NW of The Geysers, CA', sig=1, mag=0.25, magType='md', time=1764287496580, updated=1764287594070)

In [0]:
# Build the destination folder path inside the Silver container.
# Nothing is written here; this cell only composes the path string.
silver_folder_name = "earthquake_events_silver/"
silver_output_path = f"{silver_adls}{silver_folder_name}"

In [0]:
# Write the flattened events to the Silver folder as Parquet, adding to
# whatever is already stored there.
# NOTE(review): because the mode is 'append', running this again for the same
# input adds the same rows a second time - confirm that is acceptable.
df.write.parquet(silver_output_path, mode='append')

In [0]:
# End the notebook run and hand silver_output_path back as its exit value.
# NOTE(review): presumably the calling pipeline passes this on to the next
# stage - confirm against the pipeline definition.
dbutils.notebook.exit(silver_output_path)