# Translate Kaggle data into Source table

In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F
from snowflake.snowpark.window import Window
# Active Snowpark session of the notebook; every table handle below is bound to it.
session = get_active_session()


# Raw per-platform catalog tables.
# Fix: df_amazon used to read "hulu_MOVIES_TV", so the Hulu catalog was loaded
# twice and the Amazon catalog never entered the pipeline.
# NOTE(review): assumes the Amazon export follows the same <platform>_MOVIES_TV
# naming as the other three tables -- confirm the table name in the schema.
df_amazon = session.table("amazon_MOVIES_TV")
df_disney = session.table("disney_MOVIES_TV")
df_hulu = session.table("HULU_MOVIES_TV")
df_netflix = session.table("netflix_MOVIES_TV")

# Supplementary tables joined onto the catalog further down.
df_industry = session.table("INDUSTRY_MOVIES")
df_IMDB = session.table("IMDB_MOVIE_REVIEWS")



# The same ten columns are projected from every platform catalog so the four
# frames line up for the union below.
_catalog_columns = [
    "TYPE", "TITLE", "DIRECTOR", "CAST", "COUNTRY", "RATING", "DATE_ADDED",
    "RELEASE_YEAR", "DURATION", "LISTED_IN",
]

df_amazon_with_columns = df_amazon.select(*_catalog_columns)
df_disney_with_columns = df_disney.select(*_catalog_columns)
df_hulu_with_columns = df_hulu.select(*_catalog_columns)
df_netflix_with_columns = df_netflix.select(*_catalog_columns)

# Stack the catalogs into df_movie: Amazon first, then Disney, Hulu, Netflix.
df_movie = df_amazon_with_columns
for _platform_catalog in (
    df_disney_with_columns,
    df_hulu_with_columns,
    df_netflix_with_columns,
):
    df_movie = df_movie.union(_platform_catalog)

# Collapse the unioned catalog to one row per TITLE.
# MAX is taken per column independently, so the surviving row holds the greatest
# non-null value of each attribute and may combine values that came from
# different source rows.
# The former COALESCE(MAX(col), NULL) wrapper was a no-op (falling back to NULL
# changes nothing) and has been dropped.
_merged_columns = [
    "TYPE", "DIRECTOR", "CAST", "COUNTRY", "RATING", "DATE_ADDED",
    "RELEASE_YEAR", "DURATION", "LISTED_IN",
]
df_movie = df_movie.group_by("TITLE").agg(
    *[F.max(column_name).alias(column_name) for column_name in _merged_columns]
)

# Add the surrogate key (row number).
# Fix: the key used to be ROW_NUMBER() OVER (ORDER BY 1), i.e. ordered by a
# constant. df_movie is evaluated again by every later write/show, so an
# unordered numbering could hand the same title a different key in each table.
# TITLE is unique after the group-by above, so ordering by it makes the
# numbering reproducible.
df_movie = df_movie.with_column(
    "Movies_key",
    F.row_number().over(Window.order_by(F.col("TITLE"))),
)

# Save to table (assuming Snowflake environment is set up)
df_movie.write \
    .mode("overwrite") \
    .save_as_table("MOVIES_TV")


### adding streaming tags
# One (TITLE, <platform>=True) frame per platform. Only the boolean flag is
# added here; no rating column is involved.
# Fix: titles are de-duplicated first. A title listed twice in a platform
# catalog would otherwise multiply the matching df_movie row in the left joins
# that consume these frames, producing several rows with the same Movies_key.

# Amazon availability flag
df_amazon_with_rating = df_amazon \
    .select("TITLE") \
    .distinct() \
    .with_column("amazon", F.lit(True))

# Disney availability flag
df_disney_with_rating = df_disney \
    .select("TITLE") \
    .distinct() \
    .with_column("disney", F.lit(True))

# Hulu availability flag
df_hulu_with_rating = df_hulu \
    .select("TITLE") \
    .distinct() \
    .with_column("hulu", F.lit(True))

# Netflix availability flag
df_netflix_with_rating = df_netflix \
    .select("TITLE") \
    .distinct() \
    .with_column("netflix", F.lit(True))

# Attach the four platform flags to df_movie with left joins on TITLE, in the
# order amazon, disney, hulu, netflix. A title absent from a platform keeps a
# NULL in that platform's flag column.
for _flag_frame in (
    df_amazon_with_rating,
    df_disney_with_rating,
    df_hulu_with_rating,
    df_netflix_with_rating,
):
    df_movie = df_movie.join(_flag_frame, on="TITLE", how="left")


# Overwrite MOVIES_TV with the flagged catalog.
_movies_writer = df_movie.write.mode("overwrite")
_movies_writer.save_as_table("MOVIES_TV")

# Prepare the INDUSTRY and IMDB tables for joining: both get their NAME column
# renamed to TITLE, and their score/vote columns get source-specific names.
_industry_renamed = [
    df_industry[source_name].alias(target_name)
    for source_name, target_name in (
        ("NAME", "TITLE"),
        ("SCORE", "INDUSTRY_SCORE"),
        ("VOTES", "INDUSTRY_VOTES"),
    )
]
# Financial columns are carried over under their existing names.
_industry_kept = [df_industry[name] for name in ("Budget", "GROSS", "COMPANY")]
df_industry = df_industry.select(*_industry_renamed, *_industry_kept)

_imdb_source = df_IMDB
df_IMDB = _imdb_source.select(
    _imdb_source["NAME"].alias("TITLE"),
    _imdb_source["IMBD_ID"].alias("IMBD"),
    _imdb_source["RATING"].alias("IMBD_RATING"),
    _imdb_source["METASCORE"],
    _imdb_source["VOTES"].alias("IMBD_VOTES"),
)

# Inner join on the "TITLE" column: only titles present in BOTH the industry
# table and the IMDB table are kept.
df_INDUSTRY_JOIN = df_industry.join(df_IMDB, on="TITLE", how="inner")

# Persist the combined industry/IMDB rows as MOVIES_TV_REVIEWS, replacing any
# previous contents of that table.
df_INDUSTRY_JOIN.write \
    .mode("overwrite") \
    .save_as_table("MOVIES_TV_REVIEWS")

# Left join on the "TITLE" column: every catalog title is kept, and the
# industry/IMDB columns stay NULL for titles without a match.
# Fix: how="" is not a valid join type; "left" is what the original comment
# described.
df_streaming = df_movie.join(df_INDUSTRY_JOIN, on="TITLE", how="left")
df_streaming.show()


# Normalise DATE_ADDED to an 'MM/DD/YYYY' string:
#   1. trim surrounding whitespace, parse with 'MON DD, YYYY', re-format;
#   2. where that yields NULL, fall back to January 1st of RELEASE_YEAR.
_added_on_text = F.to_char(
    F.to_date(F.trim(F.col("DATE_ADDED")), "MON DD, YYYY"),
    "MM/DD/YYYY",
)
_release_year_fallback = F.concat(
    F.lit("01/01/"),
    F.col("RELEASE_YEAR").cast("string"),
)

df_streaming_filled = df_streaming.with_column(
    "DATE_ADDED",
    F.coalesce(_added_on_text, _release_year_fallback),
)

# Fill missing RATING / COUNTRY values with fixed defaults.
for _column_name, _default_value in (
    ("RATING", "TV-PG"),
    ("COUNTRY", "United States"),
):
    df_streaming_filled = df_streaming_filled.with_column(
        _column_name,
        F.coalesce(F.col(_column_name), F.lit(_default_value)),
    )

# Reduce DURATION to a digits-only string.
#   * "<n> min"       -> "<n>"    (the " min" suffix is stripped)
#   * "<n> Season(s)" -> "<n>00"  (the suffix is replaced by "00", so season
#                                  counts end up as multiples of 100)
#   * NULL            -> "90"
# NOTE(review): the "00" replacement looks like a deliberate way to keep season
# counts apart from minute values -- confirm this encoding is intended.
# Removed: a trailing F.concat(F.col("DURATION")) step. Concatenating a single
# column is a no-op, and its comment ("keeping the 'min' part") contradicted
# the stripping done here.
_duration_digits = F.regexp_replace(F.col("DURATION"), r"\s*min", "")
_duration_digits = F.regexp_replace(_duration_digits, r"\s*Season\s*s*", "00")

df_streaming_filled = df_streaming_filled.with_column(
    "DURATION",
    F.coalesce(_duration_digits, F.lit("90")),
)

# Persist the cleaned frame as STREAMING_MOVIES_TV (replacing the table), then
# print a preview of it.
_streaming_writer = df_streaming_filled.write.mode("overwrite")
_streaming_writer.save_as_table("STREAMING_MOVIES_TV")

df_streaming_filled.show()

# Creating normalized tables from STREAMING_MOVIES_TV


## Entities

### 1. **nf_movies_tv_shows**
| **Field**       | **Type**  | **Description**                                        |
|-----------------|-----------|--------------------------------------------------------|
| MOVIES_KEY      | INT       | Primary Key. Unique identifier for each movie/show.    |
| TITLE           | VARCHAR   | Title of the movie or TV show.                         |
| TYPE            | VARCHAR   | Type of content (e.g., Movie, TV Show).                |
| RATING          | VARCHAR   | Rating (e.g., TV-PG, R, NULL for some).                |
| DATE_ADDED      | DATE      | Date when the movie/show was added.                    |
| RELEASE_YEAR    | INT       | Year of release.                                       |
| DURATION        | VARCHAR   | Duration of the movie/show (e.g., 88 min, 1 Season).   |
| LISTED_IN       | VARCHAR   | Categories of the movie/show (e.g., Horror, Thriller). |

### 2. **nf_cast**
| **Field**       | **Type**  | **Description**                                        |
|-----------------|-----------|--------------------------------------------------------|
| MOVIES_KEY      | INT       | Foreign Key. References MOVIES_KEY in `nf_movies_tv_shows`. |
| CAST            | VARCHAR   | Name(s) of the cast members (titles without cast data have no row here). |

### 3. **nf_countries**
| **Field**       | **Type**  | **Description**                                        |
|-----------------|-----------|--------------------------------------------------------|
| MOVIES_KEY      | INT       | Foreign Key. References MOVIES_KEY in `nf_movies_tv_shows`. |
| COUNTRY         | VARCHAR   | Country/Region where the movie/show is available.      |

### 4. **nf_platforms**
| **Field**       | **Type**  | **Description**                                        |
|-----------------|-----------|--------------------------------------------------------|
| MOVIES_KEY      | INT       | Foreign Key. References MOVIES_KEY in `nf_movies_tv_shows`. |
| AMAZON          | BOOLEAN   | Availability on Amazon (True/False).                   |
| DISNEY          | BOOLEAN   | Availability on Disney+ (True/False).                  |
| HULU            | BOOLEAN   | Availability on Hulu (True/False).                     |
| NETFLIX         | BOOLEAN   | Availability on Netflix (True/False).                  |

### 5. **nf_ratings**
| **Field**       | **Type**  | **Description**                                        |
|-----------------|-----------|--------------------------------------------------------|
| MOVIES_KEY      | INT       | Foreign Key. References MOVIES_KEY in `nf_movies_tv_shows`. |
| IMBD            | INT       | IMDB ID for the movie/show.                            |
| IMBD_RATING     | FLOAT     | IMDB rating of the movie/show.                         |
| METASCORE       | INT       | Metascore rating of the movie/show.                    |
| IMBD_VOTES      | INT       | Number of IMDB votes for the movie/show.               |

### 6. **nf_industry**
| **Field**       | **Type**  | **Description**                                        |
|-----------------|-----------|--------------------------------------------------------|
| MOVIES_KEY      | INT       | Foreign Key. References MOVIES_KEY in `nf_movies_tv_shows`. |
| BUDGET          | DECIMAL   | Budget of the movie/show.                              |
| GROSS           | DECIMAL   | Gross earnings of the movie/show.                      |
| COMPANY         | VARCHAR   | Production company of the movie/show.                  |



In [None]:

-- Remove duplicate rows, keeping exactly one row per MOVIES_KEY.
-- Fix: the previous DELETE ... WHERE MOVIES_KEY IN (keys with COUNT(*) > 1)
-- removed every copy of a duplicated key, so affected titles vanished from
-- the table (and from all nf_* tables derived from it) instead of being
-- de-duplicated.
-- Which of the duplicate rows survives is arbitrary.
INSERT OVERWRITE INTO STREAMING_MOVIES_TV
SELECT *
FROM STREAMING_MOVIES_TV
QUALIFY ROW_NUMBER() OVER (PARTITION BY MOVIES_KEY ORDER BY TITLE) = 1;

-- Core entity: one row per title with its descriptive attributes.
-- MOVIES_KEY is declared as the primary key that the other nf_* tables reference.
CREATE OR REPLACE TABLE nf_movies_tv_shows AS
SELECT
    src.MOVIES_KEY,
    src.TITLE,
    src.TYPE,
    src.RATING,
    src.DATE_ADDED,
    src.RELEASE_YEAR,
    src.DURATION,
    src.LISTED_IN
FROM STREAMING_MOVIES_TV AS src;

ALTER TABLE nf_movies_tv_shows ADD PRIMARY KEY (MOVIES_KEY);


-- Cast per title; titles without cast information get no row.
CREATE OR REPLACE TABLE nf_cast AS
SELECT 
    MOVIES_KEY, 
    "CAST"
FROM STREAMING_MOVIES_TV
WHERE "CAST" IS NOT NULL;
-- Fix: the foreign key was being added to nf_countries (which is only created
-- by the next statement) instead of to nf_cast.
ALTER TABLE nf_cast
ADD CONSTRAINT fk_movies_key FOREIGN KEY (MOVIES_KEY) REFERENCES nf_movies_tv_shows(MOVIES_KEY);


-- Country per title; rows with a NULL country are left out.
CREATE OR REPLACE TABLE nf_countries AS
SELECT src.MOVIES_KEY, src.COUNTRY
FROM STREAMING_MOVIES_TV AS src
WHERE src.COUNTRY IS NOT NULL;

ALTER TABLE nf_countries
    ADD CONSTRAINT fk_movies_key
    FOREIGN KEY (MOVIES_KEY) REFERENCES nf_movies_tv_shows (MOVIES_KEY);


-- Platform availability flags per title.
CREATE OR REPLACE TABLE nf_platforms AS
SELECT src.MOVIES_KEY, src.amazon, src.disney, src.hulu, src.netflix
FROM STREAMING_MOVIES_TV AS src;

ALTER TABLE nf_platforms
    ADD CONSTRAINT fk_movies_key
    FOREIGN KEY (MOVIES_KEY) REFERENCES nf_movies_tv_shows (MOVIES_KEY);


-- IMDB identifiers, ratings, metascore and vote counts per title.
CREATE OR REPLACE TABLE nf_ratings AS
SELECT src.MOVIES_KEY, src.IMBD, src.IMBD_RATING, src.METASCORE, src.IMBD_VOTES
FROM STREAMING_MOVIES_TV AS src;

ALTER TABLE nf_ratings
    ADD CONSTRAINT fk_movies_key
    FOREIGN KEY (MOVIES_KEY) REFERENCES nf_movies_tv_shows (MOVIES_KEY);


-- Budget, gross and production company per title.
CREATE OR REPLACE TABLE nf_industry AS
SELECT src.MOVIES_KEY, src.BUDGET, src.GROSS, src.COMPANY
FROM STREAMING_MOVIES_TV AS src;

ALTER TABLE nf_industry
    ADD CONSTRAINT fk_movies_key
    FOREIGN KEY (MOVIES_KEY) REFERENCES nf_movies_tv_shows (MOVIES_KEY);


# ETL Transformation and dimension modeling

### 1. **DIM_DATES**
| **Field**        | **Type**  | **Description**                                             |
|------------------|-----------|-------------------------------------------------------------|
| `date_key`       | INT       | Primary Key. Unique identifier for each date.               |
| `year_value`     | INT       | Year of the date.                                           |
| `month_value`    | INT       | Month of the date.                                          |
| `day_value`      | INT       | Day of the month.                                           |
| `quarter_fiscal` | INT       | Fiscal quarter of the year (1 to 4).                        |
| `is_week_day`    | BOOLEAN   | Whether the day is a weekday (TRUE) or weekend (FALSE).     |

---

### 2. **DIM_PERFORMANCE**
| **Field**        | **Type**  | **Description**                                             |
|------------------|-----------|-------------------------------------------------------------|
| `PERFORMANCE_KEY`| INT       | Primary Key. Unique identifier for the performance record.  |
| `GROSS`          | INT       | Gross earnings of the movie/show.                           |
| `BUDGET`         | INT       | Budget of the movie/show.                                   |
| `IMBD_NUM`       | INT       | IMDb ID for the movie/show.                                 |
| `METASCORE`      | VARCHAR   | Metascore rating of the movie/show.                         |
| `VOTES`          | INT       | Number of IMDb votes for the movie/show.                    |

---

### 3. **DIM_JUNK**
| **Field**        | **Type**  | **Description**                                             |
|------------------|-----------|-------------------------------------------------------------|
| `JUNK_KEY`       | INT       | Primary Key. Unique identifier for the junk record.         |
| `LISTED_IN`      | VARCHAR   | Categories of the movie/show (e.g., Horror, Thriller).      |
| `DURATION`       | INT       | Duration of the movie/show (e.g., in minutes or seasons).   |
| `CAST`           | VARCHAR   | Names of the cast members.                                  |
| `DIRECTOR`       | VARCHAR   | Director(s) of the movie/show (optional, may be NULL).      |
| `RATING`         | VARCHAR   | Rating of the movie/show (e.g., PG, R).                     |
| `AMAZON`         | VARCHAR   | Availability on Amazon (e.g., Yes, No).                     |
| `DISNEY`         | VARCHAR   | Availability on Disney+ (e.g., Yes, No).                    |
| `HULU`           | VARCHAR   | Availability on Hulu (e.g., Yes, No).                       |
| `NETFLIX`        | VARCHAR   | Availability on Netflix (e.g., Yes, No).                    |

---

### 4. **FACT_MOVIE_PERFORMANCE**
| **Field**        | **Type**  | **Description**                                             |
|------------------|-----------|-------------------------------------------------------------|
| `MOVIE_ID`       | INT       | Primary Key. Unique identifier for the movie/show.          |
| `JUNK_KEY`       | INT       | Foreign Key. References `JUNK_KEY` in `DIM_JUNK`.           |
| `PERFORMANCE_KEY`| INT       | Foreign Key. References `PERFORMANCE_KEY` in `DIM_PERFORMANCE`. |
| `DATE_KEY`       | INT       | Foreign Key. References `DATE_KEY` in `DIM_DATES`.          |

---


In [None]:
-- Date dimension: year / month / day / quarter parts plus a weekday flag.
CREATE OR REPLACE TABLE DIM_DATES (
    date_key       INT,
    year_value     INT,
    month_value    INT,
    day_value      INT,
    quarter_fiscal INT,
    is_week_day    BOOLEAN,
    PRIMARY KEY (date_key)
);

-- Performance dimension: financial figures and IMDB/metascore values.
CREATE OR REPLACE TABLE DIM_PERFORMANCE (
    PERFORMANCE_KEY INT,
    GROSS           INT,
    BUDGET          INT,
    IMBD_NUM        INT,
    METASCORE       VARCHAR,
    VOTES           INT,
    PRIMARY KEY (PERFORMANCE_KEY)
);

-- Junk dimension: descriptive attributes and platform availability.
CREATE OR REPLACE TABLE DIM_JUNK (
    JUNK_KEY  INT,
    LISTED_IN VARCHAR,
    DURATION  INT,
    CAST      VARCHAR,
    DIRECTOR  VARCHAR,
    RATING    VARCHAR,
    AMAZON    VARCHAR,
    DISNEY    VARCHAR,
    HULU      VARCHAR,
    NETFLIX   VARCHAR,
    PRIMARY KEY (JUNK_KEY)
);

-- Fact table: one movie/show with references into the three dimensions.
CREATE OR REPLACE TABLE FACT_MOVIE_PERFORMANCE (
    MOVIE_ID        INT,
    MOVIE_TITLE     VARCHAR,  -- title carried on the fact row
    JUNK_KEY        INT,
    PERFORMANCE_KEY INT,
    DATE_KEY        INT,
    PRIMARY KEY (MOVIE_ID),
    FOREIGN KEY (JUNK_KEY)        REFERENCES DIM_JUNK (JUNK_KEY),
    FOREIGN KEY (PERFORMANCE_KEY) REFERENCES DIM_PERFORMANCE (PERFORMANCE_KEY),
    FOREIGN KEY (DATE_KEY)        REFERENCES DIM_DATES (date_key)
);


-- Populate DIM_DATES from the DATE_ADDED of every title.
-- date_key is the title's MOVIES_KEY, so there is one date row per title.
-- DATE_ADDED is an 'MM/DD/YYYY' string; it is parsed once in the subquery.
-- quarter_fiscal holds the calendar quarter (Jan-Mar = 1 ... Oct-Dec = 4).
-- Fix: the weekend test used TO_CHAR(<date>, 'D') IN ('1', '7'). 'D' is not a
-- documented Snowflake date format element, so the test could not be relied on
-- to identify weekends. DAYOFWEEKISO always returns 1 (Monday) .. 7 (Sunday),
-- independent of session parameters, so 6 and 7 are Saturday and Sunday.
-- If DATE_ADDED cannot be read (NULL), all derived columns are NULL.
INSERT INTO DATAWHAREHOUSEPROJECT.PUBLIC.DIM_DATES (date_key, year_value, month_value, day_value, quarter_fiscal, is_week_day)
SELECT
    d.movies_key                          AS date_key,
    EXTRACT(YEAR FROM d.added_on)         AS year_value,
    EXTRACT(MONTH FROM d.added_on)        AS month_value,
    EXTRACT(DAY FROM d.added_on)          AS day_value,
    QUARTER(d.added_on)                   AS quarter_fiscal,
    DAYOFWEEKISO(d.added_on) NOT IN (6, 7) AS is_week_day
FROM (
    SELECT
        MM.MOVIES_KEY                        AS movies_key,
        TO_DATE(MM.DATE_ADDED, 'MM/DD/YYYY') AS added_on
    FROM DATAWHAREHOUSEPROJECT.PUBLIC.NF_MOVIES_TV_SHOWS MM
) d;

-- Populate DIM_PERFORMANCE for titles that have both an industry row and a
-- ratings row (inner joins). Thousands separators (',') are stripped before
-- the values are converted to numbers; PERFORMANCE_KEY is the title's MOVIES_KEY.
INSERT INTO DATAWHAREHOUSEPROJECT.PUBLIC.DIM_PERFORMANCE
    (PERFORMANCE_KEY, GROSS, BUDGET, IMBD_NUM, METASCORE, VOTES)
SELECT
    shows.MOVIES_KEY                                AS PERFORMANCE_KEY,
    TO_NUMBER(REPLACE(industry.GROSS, ',', ''))     AS GROSS,
    TO_NUMBER(REPLACE(industry.BUDGET, ',', ''))    AS BUDGET,
    TO_NUMBER(REPLACE(ratings.IMBD, ',', ''))       AS IMBD_NUM,
    ratings.METASCORE,
    TO_NUMBER(REPLACE(ratings.IMBD_VOTES, ',', '')) AS VOTES
FROM DATAWHAREHOUSEPROJECT.PUBLIC.NF_MOVIES_TV_SHOWS AS shows
INNER JOIN DATAWHAREHOUSEPROJECT.PUBLIC.NF_INDUSTRY AS industry
    ON shows.MOVIES_KEY = industry.MOVIES_KEY
INNER JOIN DATAWHAREHOUSEPROJECT.PUBLIC.NF_RATINGS AS ratings
    ON shows.MOVIES_KEY = ratings.MOVIES_KEY;

-- Populate DIM_JUNK for titles that have a cast row and a platforms row
-- (inner joins). DIRECTOR is not in the column list, so it stays NULL.
-- JUNK_KEY is the title's MOVIES_KEY; DURATION is converted to a number.
INSERT INTO DATAWHAREHOUSEPROJECT.PUBLIC.DIM_JUNK
    (JUNK_KEY, LISTED_IN, DURATION, CAST, RATING, AMAZON, DISNEY, HULU, NETFLIX)
SELECT
    shows.MOVIES_KEY          AS JUNK_KEY,
    shows.LISTED_IN,
    TO_NUMBER(shows.DURATION) AS DURATION,
    credits.CAST,
    shows.RATING,
    platforms.AMAZON,
    platforms.DISNEY,
    platforms.HULU,
    platforms.NETFLIX
FROM DATAWHAREHOUSEPROJECT.PUBLIC.NF_MOVIES_TV_SHOWS AS shows
INNER JOIN DATAWHAREHOUSEPROJECT.PUBLIC.NF_CAST AS credits
    ON shows.MOVIES_KEY = credits.MOVIES_KEY
INNER JOIN DATAWHAREHOUSEPROJECT.PUBLIC.NF_PLATFORMS AS platforms
    ON shows.MOVIES_KEY = platforms.MOVIES_KEY;

-- Populate the fact table: one row per title that exists in all three
-- dimensions. Every dimension key is the title's MOVIES_KEY.
-- Fix: DIM_DATES used to be joined on "release year = DD.year_value", which
-- linked each title to every date row of that year (one fact row per matching
-- date row, i.e. repeated MOVIE_ID values with unrelated DATE_KEYs). DIM_DATES
-- is loaded with date_key = MOVIES_KEY, so the title's own date row is found
-- by key.
INSERT INTO DATAWHAREHOUSEPROJECT.PUBLIC.FACT_MOVIE_PERFORMANCE (MOVIE_ID, MOVIE_TITLE, JUNK_KEY, PERFORMANCE_KEY, DATE_KEY)
SELECT DISTINCT
    MM.MOVIES_KEY AS MOVIE_ID,
    MM.TITLE AS MOVIE_TITLE,
    DJ.JUNK_KEY,
    DP.PERFORMANCE_KEY,
    DD.date_key
FROM DATAWHAREHOUSEPROJECT.PUBLIC.NF_MOVIES_TV_SHOWS MM
JOIN DATAWHAREHOUSEPROJECT.PUBLIC.DIM_JUNK DJ ON MM.MOVIES_KEY = DJ.JUNK_KEY
JOIN DATAWHAREHOUSEPROJECT.PUBLIC.DIM_PERFORMANCE DP ON MM.MOVIES_KEY = DP.PERFORMANCE_KEY
JOIN DATAWHAREHOUSEPROJECT.PUBLIC.DIM_DATES DD ON MM.MOVIES_KEY = DD.date_key
WHERE DD.year_value IS NOT NULL;  -- as before, skip titles whose date row has no year



In [None]:
# Snowpark handles for the star-schema tables used by the charts below.
(
    df_dim_dates,
    df_dim_performance,
    df_dim_junk,
    df_FACT_MOVIE_PERFORMANCE,
) = (
    session.table(_table_name)
    for _table_name in (
        "DIM_DATES",
        "DIM_performance",
        "DIM_junk",
        "FACT_MOVIE_PERFORMANCE",
    )
)



In [None]:
from snowflake.snowpark.functions import col, avg
import matplotlib.pyplot as plt

# Average GROSS per quarter_fiscal value.
# DIM_DATES and DIM_PERFORMANCE are matched key-to-key (date_key == PERFORMANCE_KEY).
_dates_with_performance = df_dim_dates.join(
    df_dim_performance,
    df_dim_dates.date_key == df_dim_performance.PERFORMANCE_KEY,
)
df_avg_gross_by_quarter = (
    _dates_with_performance
    .group_by(df_dim_dates.quarter_fiscal)
    .agg(avg(df_dim_performance.GROSS).alias("avg_gross"))
    .sort(col("quarter_fiscal"))
)

# Pull the aggregated rows into pandas; the column names arrive upper-cased.
gross_data = df_avg_gross_by_quarter.to_pandas()
_quarters = gross_data['QUARTER_FISCAL']

# Bar chart with one bar and one tick per quarter.
plt.figure(figsize=(8, 6))
plt.bar(_quarters, gross_data['AVG_GROSS'], color='skyblue')
plt.title('Average Gross by Fiscal Quarter')
plt.xlabel('Fiscal Quarter')
plt.ylabel('Average Gross')
plt.xticks(_quarters)
plt.show()


In [None]:
from snowflake.snowpark.functions import col, avg
import matplotlib.pyplot as plt

# Average GROSS per year_value. The result variable keeps the name
# df_avg_gross_by_quarter, but the grouping in this cell is by year.
_gross_join_condition = df_dim_dates.date_key == df_dim_performance.PERFORMANCE_KEY
df_avg_gross_by_quarter = (
    df_dim_dates
    .join(df_dim_performance, _gross_join_condition)
    .group_by(df_dim_dates.year_value)
    .agg(avg(df_dim_performance.GROSS).alias("avg_gross"))
    .sort(col("year_value"))
)

# Materialise the aggregate as a pandas DataFrame.
gross_data = df_avg_gross_by_quarter.to_pandas()
_gross_years = gross_data['YEAR_VALUE']

# Bar chart with one bar and one tick per year.
plt.figure(figsize=(8, 6))
plt.bar(_gross_years, gross_data['AVG_GROSS'], color='skyblue')
plt.title('Average Gross by YEAR')
plt.xlabel('YEAR_VALUE')
plt.ylabel('Average Gross')
plt.xticks(_gross_years)
plt.show()


In [None]:
from snowflake.snowpark.functions import col, avg
import matplotlib.pyplot as plt

# Average BUDGET per year_value, with DIM_DATES and DIM_PERFORMANCE matched
# key-to-key (date_key == PERFORMANCE_KEY).
_budget_join_condition = df_dim_dates.date_key == df_dim_performance.PERFORMANCE_KEY
_budget_by_year = df_dim_dates.join(df_dim_performance, _budget_join_condition)
_budget_by_year = _budget_by_year.group_by(df_dim_dates.year_value)
df_avg_budget_by_year = (
    _budget_by_year
    .agg(avg(df_dim_performance.BUDGET).alias("avg_BUDGET"))
    .sort(col("year_value"))
)

# Materialise the aggregate as a pandas DataFrame.
budget_data = df_avg_budget_by_year.to_pandas()
_budget_years = budget_data['YEAR_VALUE']

# Bar chart; the year ticks are rotated 45 degrees so they do not overlap.
plt.figure(figsize=(8, 6))
plt.bar(_budget_years, budget_data['AVG_BUDGET'], color='skyblue')
plt.title('Average Budget by Year')
plt.xlabel('Year')
plt.ylabel('Average Budget')
plt.xticks(_budget_years, rotation=45)
plt.show()


In [None]:
from snowflake.snowpark.functions import col, avg
import matplotlib.pyplot as plt

# Average DIM_JUNK.DURATION per year_value, with DIM_DATES and DIM_JUNK
# matched key-to-key (date_key == junk_key).
_dates_with_junk = df_dim_dates.join(
    df_dim_junk,
    df_dim_dates.date_key == df_dim_junk.junk_key,
)
df_avg_duration_by_year = (
    _dates_with_junk
    .group_by(df_dim_dates.year_value)
    .agg(avg(df_dim_junk.DURATION).alias("avg_DURATION"))
    .sort(col("year_value"))
)

# Materialise the aggregate as a pandas DataFrame.
duration_data = df_avg_duration_by_year.to_pandas()
_duration_years = duration_data['YEAR_VALUE']

# Bar chart; the year ticks are rotated 45 degrees so they do not overlap.
plt.figure(figsize=(8, 6))
plt.bar(_duration_years, duration_data['AVG_DURATION'], color='skyblue')
plt.title('Average Duration by Year')
plt.xlabel('Year')
plt.ylabel('Average Duration')
plt.xticks(_duration_years, rotation=45)
plt.show()


In [None]:
from snowflake.snowpark.functions import col, avg

# Pair every DIM_JUNK row with the DIM_PERFORMANCE row that has the same key.
_same_key = df_dim_junk["JUNK_KEY"] == df_dim_performance["PERFORMANCE_KEY"]
df_joined = df_dim_junk.join(df_dim_performance, on=_same_key, how="inner")

# Average METASCORE and IMBD_NUM for each distinct LISTED_IN value.
# NOTE(review): DIM_PERFORMANCE.IMBD_NUM is loaded from the IMBD id column, so
# "avg_imdb" appears to average identifiers rather than ratings -- confirm
# which column was intended.
_score_averages = [
    avg("METASCORE").alias("avg_metascore"),
    avg("IMBD_NUM").alias("avg_imdb"),
]
df_avg_score = df_joined.group_by("LISTED_IN").agg(*_score_averages)

# Print a preview of the aggregated rows.
df_avg_score.show()

import pandas as pd
import matplotlib.pyplot as plt

# Bring the per-genre averages into pandas for plotting.
df_avg_score_pd = df_avg_score.to_pandas()

# Two y-axes over one genre axis: bars (left axis) for the metascore average,
# a marked line (right axis) for the IMDB average.
fig, ax1 = plt.subplots(figsize=(12, 6))
df_avg_score_pd.plot(
    kind='bar', x='LISTED_IN', y='AVG_METASCORE',
    ax=ax1, color='b', alpha=0.6, legend=False,
)

ax2 = ax1.twinx()
df_avg_score_pd.plot(
    kind='line', x='LISTED_IN', y='AVG_IMDB',
    ax=ax2, color='r', marker='o', legend=False,
)

# Axis labels are coloured to match their series.
ax1.set_xlabel('Genre')
ax1.set_ylabel('Average Metascore', color='b')
ax2.set_ylabel('Average IMDB', color='r')
plt.title('Average Metascore and IMDB per Genre')

# Rotate the genre labels, tighten the layout, and render.
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
