In [0]:
# Unity Catalog coordinates used by every cell below: one catalog and the
# bronze / silver / gold schema names inside it.
catalog = "movie"
b_schema, s_schema, g_schema = "movie_bronze", "movie_silver", "movie_gold"


In [0]:
# Read the Genre column of the bronze movies table, sorted by genre name.
genre_query = f"""SELECT Genre FROM {catalog}.{b_schema}.movies order by Genre"""
df_genre = spark.sql(genre_query)


In [0]:
# Preview the raw (non-deduplicated) genre values read from bronze.
display(df_genre)

Genre
Action & Adventure
Action & Adventure
Action & Adventure
Action & Adventure
Action & Adventure
Action & Adventure
Action & Adventure
Action & Adventure
Action & Adventure
Action & Adventure


In [0]:
# DDL for the silver-layer genre lookup table. IF NOT EXISTS makes this cell
# safe to re-run: an existing table (and its rows) is left untouched.
create_genre_master_sql = f"""CREATE TABLE IF NOT EXISTS {catalog}.{s_schema}.movie_genre_master (
    Genre_id STRING,
    Genre STRING,
    PRIMARY KEY (Genre_id)) USING DELTA"""
spark.sql(create_genre_master_sql).display()


In [0]:
from pyspark.sql import functions as F

# Reduce the bronze genre column to its unique, non-null values.
distinct_genre = df_genre.select("Genre").na.drop().distinct()

distinct_genre.display()

Genre
Action & Adventure
Animation
Biography
Comedy
Crime
Documentary
Drama
Family
Fantasy
Horror


In [0]:
# Current contents of the silver genre master table (the lookup being extended).
genre_master_name = f"""{catalog}.{s_schema}.movie_genre_master"""
genre_table = spark.table(genre_master_name)
genre_table.display()

Genre_id,Genre
1,Horror
2,Thriller
3,Romance
4,Comedy
5,Fantasy
6,Crime
7,Animation
8,Mystery
9,Biography
10,Anime


In [0]:
# Keep only the genres that have no match in the master table yet
# (left anti join: rows of the left side without a partner on the right).
new_genres = distinct_genre.join(
    genre_table,
    on=distinct_genre["Genre"] == genre_table["Genre"],
    how="left_anti",
)
new_genres.display()

Genre


In [0]:
from pyspark.sql.window import Window

# Find the highest Genre_id already in the master table.
# Genre_id is a STRING column, so each value must be cast to int BEFORE
# aggregating: max() over strings compares lexicographically ("9" > "10"),
# which would under-report the maximum once ids reach two digits and cause
# newly assigned ids to collide with existing ones.
max_id = (
    genre_table
    .agg(F.max(F.col("Genre_id").cast("int")).alias("max_id"))
    .collect()[0]["max_id"]
)

# An empty master table (or one with no numeric ids) yields None: start from 0.
if max_id is None:
    max_id = 0

# Number the new genres max_id+1, max_id+2, ... Ordering by Genre (unique in
# new_genres after de-duplication) makes the assignment deterministic across
# runs, unlike ordering by monotonically_increasing_id().
window = Window.orderBy("Genre")
new_genres = new_genres.withColumn(
    "Genre_id", (F.row_number().over(window) + max_id).cast("int")
)

# Expose the result to SQL under the view name used by the INSERT cell.
new_genres.select("Genre_id", "Genre").createOrReplaceTempView("new_genres")



In [0]:
# Append the newly numbered genres to the silver genre master table.
# The target is built from `catalog` / `s_schema` like the other cells,
# rather than being hard-coded, so changing the config cell retargets the
# whole notebook. Genre_id in the new_genres view is an integer; it is cast
# explicitly to STRING to match the target column type.
spark.sql(
    f"""
    INSERT INTO {catalog}.{s_schema}.movie_genre_master (Genre_id, Genre)
    SELECT
      CAST(Genre_id AS STRING) AS Genre_id,
      Genre
    FROM new_genres
    WHERE Genre_id IS NOT NULL
    """
)

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
%sql
-- Show the genre master table so the result of the preceding INSERT can be checked.
SELECT * FROM movie.movie_silver.movie_genre_master

Genre_id,Genre
1,Horror
2,Thriller
3,Romance
4,Comedy
5,Fantasy
6,Crime
7,Animation
8,Mystery
9,Biography
10,Anime
