In [0]:
from pyspark.sql.functions import monotonically_increasing_id, current_timestamp, when, col

# Build the cleaned game DataFrame from the bronze layer:
# dedupe -> fill nulls -> normalise Rating -> add audit columns.

# Replacement values for nulls, per column.
null_defaults = {
    "Publisher": "N/A",
    "Genre": "N/A",
    "Game_Length": 0,
    "source_file": "unknown",
}

# Rating code -> descriptive label. Branch order follows the dict order.
rating_labels = {
    "E": "Everyone",
    "E10+": "Everyone 10 and older",
    "T": "Teen",
    "M": "Mature",
    "RP": "Rating Pending",
}

# Assemble the CASE WHEN chain one branch at a time; any value not listed
# above (a null Rating included) falls through to "NoRating".
(first_code, first_label), *other_ratings = rating_labels.items()
rating_expr = when(col("Rating") == first_code, first_label)
for rating_code, rating_label in other_ratings:
    rating_expr = rating_expr.when(col("Rating") == rating_code, rating_label)
rating_expr = rating_expr.otherwise("NoRating")

# NOTE(review): dropDuplicates() with no column list only removes rows that
# are identical in every column. Rows sharing a GameID but differing elsewhere
# are kept -- confirm GameID is unique in bronze, since the downstream upsert
# is keyed on it.
df = (
    spark.table("dev_catalog.bronze.game")
    .dropDuplicates()
    .fillna(null_defaults)
    .withColumn("Rating", rating_expr)
    # Audit columns: expose dt_insert as insert_date and stamp this run's time.
    .withColumnRenamed("dt_insert", "insert_date")
    .withColumn("update_date", current_timestamp())
)

# Upsert the cleaned game data into dev_catalog.silver.game, keyed on GameID.
# The DataFrame is registered as a temp view so the SQL statements below can
# read it as `temp_view`.
df.createOrReplaceTempView("temp_view")

# Create the target table if it does not exist. MERGE INTO needs an existing
# target, so on a first run the table is created empty (WHERE 1 = 0) with the
# column names and types of the view; the merge below then inserts every row.
# When the table already exists this statement does nothing.
# NOTE(review): no USING clause is given, so the table format is the catalog
# default (Delta on Databricks) -- confirm that matches how this table is
# expected to be provisioned.
create_query = """
CREATE TABLE IF NOT EXISTS dev_catalog.silver.game AS
SELECT GameID, Publisher, Genre, Game_Length, Rating, insert_date, update_date, source_file
FROM temp_view
WHERE 1 = 0
"""

spark.sql(create_query)

# Matched rows are refreshed from the source; insert_date is not in the
# UPDATE SET list, so the target keeps its existing value. Unmatched source
# rows are inserted with all eight columns.
merge_query = """
MERGE INTO dev_catalog.silver.game AS target
USING temp_view AS source
ON target.GameID = source.GameID
WHEN MATCHED THEN
  UPDATE SET target.GameID = source.GameID,
             target.Publisher = source.Publisher,
             target.Genre = source.Genre,
             target.Game_Length = source.Game_Length,
             target.Rating = source.Rating,
             target.update_date = source.update_date,
             target.source_file = source.source_file
WHEN NOT MATCHED THEN
  INSERT (GameID, Publisher, Genre, Game_Length, Rating, insert_date, update_date, source_file)
  VALUES (source.GameID, source.Publisher, source.Genre, source.Game_Length, source.Rating, source.insert_date, source.update_date, source.source_file)
"""

spark.sql(merge_query)

#display(df)