In [0]:
# Authenticate Spark against the ADLS Gen2 account covid2025storage using the
# OAuth 2.0 client-credentials flow of an Azure AD service principal.
# The client secret is not stored in the notebook; it is read from the
# Databricks secret scope "myscope".
#
# Each entry below is a (Spark conf key, value) pair. The secret comes first so
# the loop variables no longer refer to it once the loop has finished.
for conf_key, conf_value in (
    # Service principal client secret, fetched from Databricks secrets.
    ("fs.azure.account.oauth2.client.secret.covid2025storage.dfs.core.windows.net",
     dbutils.secrets.get(scope="myscope", key="sp-secret")),
    # Authenticate to this storage account with OAuth.
    ("fs.azure.account.auth.type.covid2025storage.dfs.core.windows.net",
     "OAuth"),
    # Token provider class implementing the client-credentials flow.
    ("fs.azure.account.oauth.provider.type.covid2025storage.dfs.core.windows.net",
     "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    # Client (application) ID of the Azure AD registered app.
    ("fs.azure.account.oauth2.client.id.covid2025storage.dfs.core.windows.net",
     "af9d0047-d4cd-4d32-81a0-3e2ebeff1343"),
    # Token endpoint of the Azure tenant, used to obtain access tokens.
    ("fs.azure.account.oauth2.client.endpoint.covid2025storage.dfs.core.windows.net",
     "https://login.microsoftonline.com/fdae273f-8a49-469c-b9bc-444ce8f28607/oauth2/token"),
):
    spark.conf.set(conf_key, conf_value)



In [0]:
# Load the raw country-wise CSV from the bronze container. The first row is
# treated as the header and Spark infers the column types.
bronze_csv_path = "abfss://bronze@covid2025storage.dfs.core.windows.net/countrywise.csv"
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(bronze_csv_path)
)


In [0]:
# Preview the raw data: first 20 rows with long values truncated
# (the defaults of DataFrame.show, spelled out explicitly).
df.show(n=20, truncate=True)

In [0]:
# Sanity-check the load: report the number of rows read, then print the
# column names and types of the raw frame.
row_count = df.count()
print(f"Row count: {row_count}")
df.printSchema()

In [0]:
# Rename the source columns to snake_case names for the silver layer.
# Columns not listed here keep their original names. withColumnRenamed is a
# no-op when the source column does not exist, so a missing column will not
# raise an error here.
silver_column_names = {
    "Country/Region": "country",
    "New cases": "new_cases",
    "New deaths": "new_deaths",
    "New recovered": "new_recovered",
    "Deaths / 100 Cases": "deaths_per_100_cases",
    "Recovered / 100 Cases": "recovered_per_100_cases",
    "Deaths / 100 Recovered": "deaths_per_100_recovered",
    "Confirmed last week": "confirmed_last_week",
    "1 week change": "one_week_change",
    "1 week % increase": "one_week_percent_increase",
    "WHO Region": "who_region",
}

df_silver = df
for source_name, silver_name in silver_column_names.items():
    df_silver = df_silver.withColumnRenamed(source_name, silver_name)

In [0]:
# Drop the per-100 ratio columns, which this pipeline treats as unnecessary.
# The trimmed frame replaces df_silver.
cols_to_drop = [
    "deaths_per_100_cases",
    "recovered_per_100_cases",
    "deaths_per_100_recovered",
]
df_silver = df_silver.drop(*cols_to_drop)

In [0]:
# Keep only complete records: discard any row with a null in any column
# (how="any" is the default of dropna, spelled out explicitly).
df_silver = df_silver.dropna(how="any")

In [0]:
# Summary statistics (count, mean, stddev, min, max) for the cleaned data.
# describe() only returns a new DataFrame; without .show() the cell would
# display just the DataFrame repr and never print the statistics.
df_silver.describe().show()

In [0]:
# Print the column names and types of the silver frame.
df_silver.printSchema()

In [0]:
# Preview the cleaned data. show() prints the table itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
df_silver.show()

In [0]:
# Destination folder in the silver container. This is a plain string: the
# original f-string prefix had no placeholders to interpolate.
output_path = "abfss://silver@covid2025storage.dfs.core.windows.net/countrywise/"

# Write the cleaned data as CSV with a header row, replacing any previous
# output at this location. repartition(1) brings the data into a single
# partition before the write.
(
    df_silver.repartition(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv(output_path)
)

# List the contents of the output folder.
display(dbutils.fs.ls(output_path))