### Loading the functions

In [0]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F

### Creating Schema

In [0]:
# Storage account and container that are interpolated into the abfss:// paths
# of the gold-table writes further down in this notebook.
storage_account = "storage12092004"
container = "veersacontainer"

In [0]:
# Create the gold schema if it does not exist yet. The managed location is
# built from the `container` / `storage_account` values defined in the config
# cell (the same ones the table writes use) instead of repeating them as
# literals, so the schema location cannot drift from the write paths.
spark.sql(
    f"""
    CREATE SCHEMA IF NOT EXISTS veersadatabricks.gold
    MANAGED LOCATION 'abfss://{container}@{storage_account}.dfs.core.windows.net/gold/'
    """
)

### Join of search trend and time table
- Joined on `date` (inner join)
- The join is clean
- No further changes are needed before writing to gold

In [0]:
# Read the silver time table and drop the listed columns (they look like
# ingestion bookkeeping) so they are not carried into the gold layer.
df_time = (
    spark.table("veersadatabricks.silver_final.time_silver")
    .drop("_source_file", "ingested_at", "_source_file_mod_ts", "is_current")
)

In [0]:
# Read the silver search-trend table and drop the listed columns (they look
# like ingestion bookkeeping) so they are not carried into the gold layer.
df_search_trend = (
    spark.table("veersadatabricks.silver_final.searchtrend_silver")
    .drop("_source_file", "ingested_at", "_source_file_mod_ts", "is_current")
)

In [0]:
# Inner-join the time and search-trend frames on their shared `date` column;
# joining by column name keeps a single `date` column in the result.
df_time_search_trend = df_time.join(df_search_trend, on=["date"], how="inner")
# Optional row-count sanity check:
# print(df_time_search_trend.count())

In [0]:
# df_time_search_trend = df_time_search_trend.drop("_source_file")

In [0]:
# Write the joined frame as a Delta table at the explicit abfss path and
# register it as veersadatabricks.gold.time_searchtrend; "overwrite" replaces
# whatever the table held before.
(
    df_time_search_trend.write
    .format("delta")
    .mode("overwrite")
    .option("path", f"abfss://{container}@{storage_account}.dfs.core.windows.net/gold/time_searchtrend")
    .saveAsTable("veersadatabricks.gold.time_searchtrend")
)

### Time Tables
- Adds a `mortality_rate` column (`deceased / confirmed`, rounded to 3 decimals) to each time table

#### Time Province

In [0]:
# Build the gold time-province frame from the silver table:
#   1. drop the `time` column,
#   2. sort by province, then date,
#   3. add `mortality_rate` = deceased / confirmed, rounded to 3 decimals.
df_time_province = (
    spark.table("veersadatabricks.silver_final.timeprovince_silver")
    .drop("time")
    .orderBy("province", "date")
    .withColumn(
        "mortality_rate",
        # F.round is spelled out so it can never be confused with Python's
        # builtin round (the star import above shadows it).
        F.round(
            # The rate is undefined when there are no confirmed cases (or the
            # count is missing), so it is left NULL rather than reported as a
            # misleading 0 -- consistent with the time-age and time-gender
            # tables in this notebook.
            F.when(
                F.col("confirmed") > 0,
                F.col("deceased") / F.col("confirmed"),
            ).otherwise(None),
            3,
        ),
    )
)
# df_time_province.display()
#df_time_province.display()

In [0]:
# Write the time-province frame as a Delta table at the explicit abfss path
# and register it as veersadatabricks.gold.timeprovince; "overwrite" replaces
# the previous contents.
(
    df_time_province.write
    .format("delta")
    .mode("overwrite")
    .option("path", f"abfss://{container}@{storage_account}.dfs.core.windows.net/gold/TimeProvince")
    .saveAsTable("veersadatabricks.gold.timeprovince")
)

#### Time Age

In [0]:
# Read the silver time-age table and add `mortality_rate`
# (deceased / confirmed, rounded to 3 decimals). Rows where `confirmed` is not
# greater than 0 get NULL, because the rate is undefined there.
df_time_age = spark.table("veersadatabricks.silver_final.timeage_silver").withColumn(
    "mortality_rate",
    F.round(
        F.when(
            F.col("confirmed") > 0,
            F.col("deceased") / F.col("confirmed"),
        ).otherwise(None),
        3,
    ),
)
# df_time_age.display()
#df_time_age.display()

In [0]:
# Write the time-age frame as a Delta table at the explicit abfss path and
# register it as veersadatabricks.gold.timeage; "overwrite" replaces the
# previous contents.
(
    df_time_age.write
    .format("delta")
    .mode("overwrite")
    .option("path", f"abfss://{container}@{storage_account}.dfs.core.windows.net/gold/TimeAge")
    .saveAsTable("veersadatabricks.gold.timeage")
)

#### Time Gender

In [0]:
# Read the silver time-gender table and add `mortality_rate`
# (deceased / confirmed, rounded to 3 decimals). Rows where `confirmed` is not
# greater than 0 get NULL, because the rate is undefined there.
df_time_gender = spark.table("veersadatabricks.silver_final.timegender_silver").withColumn(
    "mortality_rate",
    F.round(
        F.when(
            F.col("confirmed") > 0,
            F.col("deceased") / F.col("confirmed"),
        ).otherwise(None),
        3,
    ),
)
# df_time_gender.display()
#df_time_gender.display()

In [0]:
# Write the time-gender frame as a Delta table at the explicit abfss path and
# register it as veersadatabricks.gold.timegender. "overwrite" replaces the
# previous contents; overwriteSchema=true lets the write replace the table
# schema as well.
(
    df_time_gender.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", f"abfss://{container}@{storage_account}.dfs.core.windows.net/gold/TimeGender")
    .saveAsTable("veersadatabricks.gold.timegender")
)

### Region and Weather Table Join
- The inner join on `code` keeps only rows that have a match in both the region and weather tables, so unmatched or incomplete records are excluded

In [0]:
# Read the silver region table and drop the listed columns (they look like
# ingestion bookkeeping) so they are not carried into the gold layer.
df_region = (
    spark.table("veersadatabricks.silver_final.region_silver")
    .drop("_source_file", "ingested_at", "_source_file_mod_ts", "is_current")
)

In [0]:
# Read the silver weather table and drop the listed columns (they look like
# ingestion bookkeeping) so they are not carried into the gold layer.
df_weather = (
    spark.table("veersadatabricks.silver_final.weather_silver")
    .drop("_source_file", "ingested_at", "_source_file_mod_ts", "is_current")
)

In [0]:
# Inner-join region and weather rows that share the same `code`. Because the
# join uses an expression (not a column name), both `code` columns survive in
# the result and are de-duplicated in the next cell.
df_region_weather = df_region.join(
    df_weather,
    on=df_region["code"] == df_weather["code"],
    how="inner",
)

In [0]:
# The expression join leaves `code` and `province` present from both sides;
# keep the region copies and drop the weather ones.
df_region_weather = (
    df_region_weather
    .drop(df_weather["code"])
    .drop(df_weather["province"])
)
# The original cell also called orderBy(...) without assigning or displaying
# the result, which had no effect on the frame that gets written. It is kept
# here only as an optional inspection helper:
# df_region_weather.orderBy(df_region.code, df_weather.date).display()
# df_region_weather.count()

In [0]:
# Write the region-weather frame as a Delta table at the explicit abfss path
# and register it as veersadatabricks.gold.regionweather; "overwrite" replaces
# the previous contents.
(
    df_region_weather.write
    .format("delta")
    .mode("overwrite")
    .option("path", f"abfss://{container}@{storage_account}.dfs.core.windows.net/gold/RegionWeather")
    .saveAsTable("veersadatabricks.gold.regionweather")
)

### Case Table
- `latitude` and `longitude` are dropped: they vary, and their values are unknown for about half of the data points anyway

In [0]:
# Read the silver case table and drop the coordinate columns together with the
# listed bookkeeping-style columns before it goes to gold.
df_case = (
    spark.table("veersadatabricks.silver_final.case_table_silver")
    .drop(
        "latitude",
        "longitude",
        "_source_file",
        "ingested_at",
        "_source_file_mod_ts",
        "is_current",
    )
)

In [0]:
# Write the case frame as a Delta table at the explicit abfss path and
# register it as veersadatabricks.gold.case_table_gold. "overwrite" replaces
# the previous contents; overwriteSchema=true lets the write replace the table
# schema as well.
(
    df_case.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", f"abfss://{container}@{storage_account}.dfs.core.windows.net/gold/FinalCaseTable")
    .saveAsTable("veersadatabricks.gold.case_table_gold")
)

## Join the patient_info_silver table with the case table
- Left join on `infection_case`, `province` and `city`
- `broadcast(df_case)` is a Spark optimization hint that tells Spark to send the entire `df_case` DataFrame to all worker nodes, so it can be joined efficiently with a much larger DataFrame such as `df_patient_info`.

In [0]:
# Read the silver patient-info table and drop the listed columns (they look
# like ingestion bookkeeping) so they are not carried into the gold layer.
df_patient_info = (
    spark.table("veersadatabricks.silver_final.patientinfo_silver")
    .drop("_source_file", "ingested_at", "_source_file_mod_ts", "is_current")
)

In [0]:
# Left-join every patient row to the case table on infection_case, province
# and city. df_case is wrapped in a broadcast hint. After the join, the
# case-side copies of the key columns and `confirmedCases` are dropped, so the
# patient-side columns are the ones that remain.
patient_case_df = (
    df_patient_info.join(
        F.broadcast(df_case),
        on=(
            (df_patient_info["infection_case"] == df_case["infection_case"])
            & (df_patient_info["province"] == df_case["province"])
            & (df_patient_info["city"] == df_case["city"])
        ),
        how="left",
    )
    .drop(
        df_case["province"],
        df_case["city"],
        df_case["infection_case"],
        df_case["confirmedCases"],
    )
)

In [0]:
# Remove only the rows that carry no usable detail: sex AND age are missing
# AND infection_case is the catch-all "Etc".
#
# eqNullSafe is used instead of `==` because `NULL == "Etc"` evaluates to NULL,
# which makes the negated predicate NULL as well, and filter() discards NULL
# predicates. With plain `==`, rows whose sex, age and infection_case are all
# NULL were silently dropped even though they are not "Etc" rows.
patient_case_df = patient_case_df.filter(
    ~(
        F.col("sex").isNull()
        & F.col("age").isNull()
        & F.col("infection_case").eqNullSafe("Etc")
    )
)

In [0]:
# Write the patient/case frame as a Delta table at the explicit abfss path and
# register it as veersadatabricks.gold.patientcase. "overwrite" replaces the
# previous contents; overwriteSchema=true lets the write replace the table
# schema as well.
(
    patient_case_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", f"abfss://{container}@{storage_account}.dfs.core.windows.net/gold/PatientCase")
    .saveAsTable("veersadatabricks.gold.patientcase")
)