In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Target ADLS Gen2 storage account.
storage_account_name = "covidreportingdladf"

# Service-principal credentials are read from the Databricks secret scope,
# so no secret value is written into the notebook itself.
client_id = dbutils.secrets.get(scope="covid-reporting-scope", key="client-id")
client_secret = dbutils.secrets.get(scope="covid-reporting-scope", key="client-secret")
tenant_id = dbutils.secrets.get(scope="covid-reporting-scope", key="tenant-id")

# Session-level Spark settings that switch the ABFS driver to OAuth
# (client-credentials token provider) for this storage account.
# Kept as an ordered list so the settings are applied in the listed order.
_adls_oauth_settings = [
    (
        f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net",
        "OAuth",
    ),
    (
        f"fs.azure.account.oauth.provider.type.{storage_account_name}.dfs.core.windows.net",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    ),
    (
        f"fs.azure.account.oauth2.client.id.{storage_account_name}.dfs.core.windows.net",
        client_id,
    ),
    (
        f"fs.azure.account.oauth2.client.secret.{storage_account_name}.dfs.core.windows.net",
        client_secret,
    ),
    (
        f"fs.azure.account.oauth2.client.endpoint.{storage_account_name}.dfs.core.windows.net",
        f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
    ),
]

for _setting_key, _setting_value in _adls_oauth_settings:
    spark.conf.set(_setting_key, _setting_value)

### 1) Ingestion

Reading Country Lookup File

In [0]:
# Load the country lookup CSV from the `lookup` container.
# The first line is treated as the header and column types are inferred by Spark.
df_country_lookup = (
    spark.read
    .options(header='true', inferSchema='true')
    .csv('abfss://lookup@covidreportingdladf.dfs.core.windows.net/country_lookup/country_lookup.csv')
)

Reading Dim Date File

In [0]:
# Load the date dimension CSV from the `lookup` container.
# The first line is treated as the header and column types are inferred by Spark.
df_dim_date = (
    spark.read
    .options(header='true', inferSchema='true')
    .csv('abfss://lookup@covidreportingdladf.dfs.core.windows.net/dim_date/dim_date.csv')
)

Reading the Testing File

In [0]:
# Load the raw ECDC testing CSV from the `raw` container.
# The first line is treated as the header and column types are inferred by Spark.
df_testing_raw = (
    spark.read
    .options(header='true', inferSchema='true')
    .csv('abfss://raw@covidreportingdladf.dfs.core.windows.net/ecdc/testing/testing.csv')
)

### 2) Transformation 

Removing Unnecessary Columns from Dim Date

In [0]:
# Keep only the three date-dimension columns used by the later cells;
# every other column of df_dim_date is dropped from the working frame.
df_dim_date_new = df_dim_date.select(["date", "year", "week_of_year"])

Deriving the Week of Year Column in `<year>-W<two-digit week>` Format (e.g. `2020-W05`)

In [0]:
# Rebuild `week_of_year` as "<year>-W<two-digit week>" (e.g. 2020-W05) by
# concatenating the year, the literal "-W" and the week number left-padded to
# two characters.
# lpad's pad argument is a string, so the padding character is passed as "0"
# rather than the integer 0.
# NOTE: the column is overwritten in place, so run this cell once per freshly
# selected df_dim_date_new; re-running it would pad the already formatted value.
df_dim_date_new = df_dim_date_new.withColumn(
    "week_of_year",
    concat(col("year"), lit("-W"), lpad(col("week_of_year"), 2, "0")),
)

Deriving Week Min Date and Week Max Date based on Week of Year Column

In [0]:
# Collapse the date dimension to one row per `week_of_year`, keeping the
# earliest and latest `date` of each week.
# `min` / `max` here are the Spark aggregate functions brought in by the
# wildcard import from pyspark.sql.functions, not the Python built-ins.
df_dim_date_new = (
    df_dim_date_new
    .groupBy("week_of_year")
    .agg(
        min("date").alias("week_min_date"),
        max("date").alias("week_max_date"),
    )
)

Renaming columns from Country Lookup

In [0]:
# Give the lookup's `country` and `population` columns a `_lookup` suffix.
# Presumably this avoids a name clash with same-named columns in the testing
# data once the frames are joined; confirm against the file schemas.
df_country_lookup = (
    df_country_lookup
    .withColumnRenamed("country", "country_lookup")
    .withColumnRenamed("population", "population_lookup")
)

Joining the Dim Date New and Country Lookup with Testing Data

In [0]:
# Enrich each testing record with the date range of its reporting week and
# with the country codes from the lookup file.
#   * Week join: equality on the week key, so a testing row is paired only
#     with its own week. A `>=` comparison would also pair it with every
#     earlier week and multiply the rows.
#   * Country join: testing `country_code` against the lookup's 2-digit code.
# Both are inner joins, so rows without a matching week or country are dropped.
df_testing_processed = (
    df_testing_raw
    .join(
        df_dim_date_new,
        df_testing_raw.year_week == df_dim_date_new.week_of_year,
        "inner",
    )
    .join(
        df_country_lookup,
        df_testing_raw.country_code == df_country_lookup.country_code_2_digit,
        "inner",
    )
)

Selecting Required Columns

In [0]:
# Project the joined frame down to the output columns, in output order.
# Presumably `country` and `population` resolve to the testing file's columns,
# since the lookup's copies were renamed with a `_lookup` suffix; confirm
# against the schemas.
df_testing_processed = df_testing_processed.select(
    [
        "country",
        "country_code_2_digit",
        "country_code_3_digit",
        "week_of_year",
        "week_min_date",
        "week_max_date",
        "new_cases",
        "tests_done",
        "population",
        "testing_rate",
        "positivity_rate",
        "testing_data_source",
    ]
)

### 3) Writing

In [0]:
# Write the processed testing data as CSV with a header row into the
# `processed` container, replacing whatever is already at the target path.
(
    df_testing_processed.write
    .mode("overwrite")
    .option("header", "true")
    .csv("abfss://processed@covidreportingdladf.dfs.core.windows.net/ecdc/testing/")
)