In [0]:
%run "/Workspace/Users/suhmacc@fastmail.com/00 - DDL"

In [0]:
# Pin both the current catalog and the current schema for this session.
# The CREATE / INSERT statements in this notebook write to unqualified table
# names, while their reads use explicit CATALOG.SCHEMA.table names; setting the
# catalog here makes both resolve against the same catalog instead of relying
# on whatever catalog the session happened to have selected.
spark.sql(f"USE CATALOG {CATALOG}")
spark.sql(f"USE SCHEMA {SILVER_SCHEMA}")

### GAMES

In [0]:
# (Re)create the silver games table with an explicit schema and per-column
# comments. The table name is unqualified, so it is created in the session's
# current catalog/schema.
# The game_type comment lists 'Other' as well, because the loader's CASE
# expression falls back to 'Other' for any unmapped game type code.
spark.sql(f"""
CREATE OR REPLACE TABLE {ALL_GAMES_CLEAN} (
    gamePk BIGINT COMMENT 'Unique game identifier assigned by the MLB Stats API.',
    game_date DATE COMMENT 'Official calendar date of the game.',
    season INT COMMENT 'MLB season year (e.g., 2024).',
    game_type STRING COMMENT 'Normalized game type: Regular Season, Postseason, Spring Training, Exhibition, or Other.',
    home_team_name STRING COMMENT 'Full name of the home team.',
    away_team_name STRING COMMENT 'Full name of the visiting team.',
    venue_name STRING COMMENT 'Name of the ballpark or stadium.',
    home_score INT COMMENT 'Total number of runs scored by the home team.',
    away_score INT COMMENT 'Total number of runs scored by the away team.',
    home_is_winner BOOLEAN COMMENT 'True if the home team won the game.',
    away_is_winner BOOLEAN COMMENT 'True if the away team won the game.',
    attendance INT COMMENT 'Official attendance count for the game.',
    attendance_missing BOOLEAN COMMENT 'True if attendance is missing or reported as zero.',
    dayNight STRING COMMENT 'Indicates whether the game was played during the day or at night.',
    day_of_week STRING COMMENT 'Day of the week the game took place (e.g., Monday, Friday).',
    is_weekend BOOLEAN COMMENT 'True if the game was played on a Saturday or Sunday.',
    has_promotion BOOLEAN COMMENT 'True if the game featured any promotion, giveaway, or themed event.',
    num_promotions INT COMMENT 'Number of distinct promotions linked to this game.',
    offer_names ARRAY<STRING> COMMENT 'Array of promotional offer names tied to the game (e.g., "Bobblehead Giveaway").',
    promotion_types_array ARRAY<STRING> COMMENT 'Array of general promotion categories (e.g., "Giveaway", "Theme Game").',
    offer_types_array ARRAY<STRING> COMMENT 'Array of specific promotional type labels (e.g., "Day of Game Highlight").',
    seriesGameNumber INT COMMENT 'Index of the game within a multi-game series.',
    gamesInSeries INT COMMENT 'Total number of games scheduled in the series.',
    is_doubleheader_date BOOLEAN COMMENT 'True if multiple games were played between the same teams on this date.',
    doubleHeader STRING COMMENT 'Marks whether the game was part of a doubleheader (Y or N).',
    gameNumber BIGINT COMMENT 'Identifies which game in a doubleheader this record represents (1 or 2).',
    description STRING COMMENT 'Text description of the game (e.g., "home opener", "in Seoul, South Korea").'
)
COMMENT 'Silver-level cleaned version of all MLB games. Each record represents one game, with standardized fields and promotion arrays.';
""")

In [0]:
# Load the silver games table from the bronze games table, overwriting any
# existing rows. Columns are cast to the types declared in the table DDL and
# lightly normalized (game type labels, casing, de-duplicated promo arrays).
#
# Derived flags are computed from the same normalized expression that is
# stored, so a flag can never disagree with its companion column:
#   * attendance_missing tests the CAST value, so an attendance that casts to
#     NULL is reported as missing.
#   * is_weekend tests INITCAP(day_of_week). A bare `day_of_week` inside the
#     SELECT list resolves to the bronze column, not to the alias defined one
#     line earlier, so differently-cased day names would otherwise yield FALSE.
#   * num_promotions falls back to the size of the de-duplicated, NULL-safe
#     offer_names array. size() of a NULL array returns -1 under default
#     (non-ANSI) settings, and the stored array is de-duplicated, so counting
#     the raw array could produce -1 or a count larger than the stored array.
#
# NOTE(review): every gameType code other than R / P / S / E is labelled
# 'Other'. Confirm the bronze feed does not carry round-specific postseason
# codes that should map to 'Postseason'.
spark.sql(f"""
INSERT OVERWRITE {ALL_GAMES_CLEAN}
SELECT
    CAST(gamePk AS BIGINT) AS gamePk,
    CAST(game_date AS DATE) AS game_date,
    CAST(season AS INT) AS season,
    CASE gameType
        WHEN 'R' THEN 'Regular Season'
        WHEN 'P' THEN 'Postseason'
        WHEN 'S' THEN 'Spring Training'
        WHEN 'E' THEN 'Exhibition'
        ELSE 'Other'
    END AS game_type,
    home_team_name,
    away_team_name,
    venue_name,
    CAST(home_score AS INT) AS home_score,
    CAST(away_score AS INT) AS away_score,
    CAST(home_is_winner AS BOOLEAN) AS home_is_winner,
    CAST(away_is_winner AS BOOLEAN) AS away_is_winner,
    CAST(attendance AS INT) AS attendance,
    CASE WHEN COALESCE(CAST(attendance AS INT), 0) = 0 THEN TRUE ELSE FALSE END AS attendance_missing,
    LOWER(dayNight) AS dayNight,
    INITCAP(day_of_week) AS day_of_week,
    CASE WHEN INITCAP(day_of_week) IN ('Saturday', 'Sunday') THEN TRUE ELSE FALSE END AS is_weekend,
    CAST(has_promotion AS BOOLEAN) AS has_promotion,
    CAST(
        COALESCE(num_promotions, size(array_distinct(COALESCE(offer_names, ARRAY()))))
        AS INT
    ) AS num_promotions,
    array_distinct(COALESCE(offer_names, ARRAY())) AS offer_names,
    array_distinct(COALESCE(promotion_types_array, ARRAY())) AS promotion_types_array,
    array_distinct(COALESCE(offer_types_array, ARRAY())) AS offer_types_array,
    CAST(seriesGameNumber AS INT) AS seriesGameNumber,
    CAST(gamesInSeries AS INT) AS gamesInSeries,
    CAST(is_doubleheader_date AS BOOLEAN) AS is_doubleheader_date,
    doubleHeader,
    gameNumber,
    description
FROM {CATALOG}.{BRONZE_SCHEMA}.{ALL_GAMES_TBL};"""
)

In [0]:
# Show the silver games table's columns, column comments, and table metadata.
# The name is assembled from the notebook's config constants (the same
# three-part form used by the promotions load) rather than a hard-coded
# literal, so this check keeps pointing at the table the notebook manages.
display(
    spark.sql(f"DESCRIBE EXTENDED {CATALOG}.{SILVER_SCHEMA}.{ALL_GAMES_CLEAN}")
)


In [0]:
# DDL for the silver promotions table: per its table comment, one row per
# promotion tied to a game, carrying game context columns alongside the
# promotion attributes. CREATE OR REPLACE drops any previous definition.
promos_clean_ddl = f"""
CREATE OR REPLACE TABLE {ONLY_GAMES_WITH_PROMOS_CLEAN} (
    gamePk BIGINT COMMENT 'Unique game identifier assigned by the MLB Stats API. Links to silver.all_games_clean.',
    game_date DATE COMMENT 'Official calendar date of the game.',
    season INT COMMENT 'MLB season year (e.g., 2024).',
    day_of_week STRING COMMENT 'Day of the week the game took place (e.g., Monday, Saturday).',
    is_weekend BOOLEAN COMMENT 'True if the game was played on a Saturday or Sunday.',
    dayNight STRING COMMENT 'Indicates whether the game was played during the day or at night.',
    home_team_name STRING COMMENT 'Full name of the home team.',
    away_team_name STRING COMMENT 'Full name of the visiting team.',
    venue_name STRING COMMENT 'Name of the ballpark or stadium.',
    attendance INT COMMENT 'Official attendance count for the game.',
    attendance_missing BOOLEAN COMMENT 'True if attendance is missing or zero.',
    offer_name STRING COMMENT 'Specific promotion or offer name (e.g., "Bobblehead Giveaway").',
    promotion_types STRING COMMENT 'High-level category of the promotion (e.g., "Misc. Giveaway", "Theme Game").',
    offer_type STRING COMMENT 'Marketing or operational classification of the promotion (e.g., "Giveaway", "Day of Game Highlight").',
    distribution STRING COMMENT 'Target audience or distribution type for the offer (e.g., "All fans", "First 10,000 fans").',
    has_multiple_promos BOOLEAN COMMENT 'True if the game featured more than one promotion.',
    description STRING COMMENT 'Optional free-text description or event label for the game (e.g., "Home Opener").'
)
COMMENT 'Silver-level promotion table. Each record represents a single promotion tied to a specific MLB game. Includes contextual fields for analysis.';
          """
spark.sql(promos_clean_ddl)

In [0]:
# Load statement for the silver promotions table.
# Bronze promotion rows (alias p) are inner-joined to the silver games table
# (alias gc) on gamePk, so promotion rows without a matching game are dropped.
# Game context comes from gc; promotion attributes come from p.
# has_multiple_promos is derived from the game's num_promotions in gc.
promos_clean_load = f"""
INSERT OVERWRITE {ONLY_GAMES_WITH_PROMOS_CLEAN}
SELECT
    p.gamePk,
    gc.game_date,
    gc.season,
    gc.day_of_week,
    gc.is_weekend,
    gc.dayNight,
    gc.home_team_name,
    gc.away_team_name,
    gc.venue_name,
    gc.attendance,
    gc.attendance_missing,
    p.offer_name,
    p.promotion_types,
    p.offer_type,
    p.distribution,
    CASE WHEN gc.num_promotions > 1 THEN TRUE ELSE FALSE END AS has_multiple_promos,
    gc.description
FROM {CATALOG}.{BRONZE_SCHEMA}.{ONLY_GAMES_WITH_PROMOS_TBL} p
JOIN {CATALOG}.{SILVER_SCHEMA}.{ALL_GAMES_CLEAN} gc
  ON p.gamePk = gc.gamePk;
    """
spark.sql(promos_clean_load)

In [0]:
# Spot-check: the ten earliest promotion rows together with their game context.
# The table name comes from the notebook's config constants instead of a
# hard-coded literal. gamePk and offer_name are added as tie-breakers because
# many rows share a game_date, which made the LIMIT 10 sample nondeterministic.
display(
    spark.sql(f"""
SELECT gamePk, home_team_name, offer_name, promotion_types, offer_type, attendance
FROM {CATALOG}.{SILVER_SCHEMA}.{ONLY_GAMES_WITH_PROMOS_CLEAN}
ORDER BY game_date, gamePk, offer_name
LIMIT 10
""")
)

In [0]:
# Promotion count and average attendance per offer_type.
# The table name comes from the notebook's config constants instead of a
# hard-coded literal.
# Zero attendance is treated as "missing" by the silver tables
# (attendance_missing), so NULLIF turns zeros into NULLs, which AVG skips;
# otherwise unreported attendance would drag the averages down.
# COUNT(*) still counts every promotion row, including those without attendance.
display(
    spark.sql(f"""
SELECT
    offer_type,
    COUNT(*) AS num_promotions,
    ROUND(AVG(NULLIF(attendance, 0)), 0) AS avg_attendance
FROM {CATALOG}.{SILVER_SCHEMA}.{ONLY_GAMES_WITH_PROMOS_CLEAN}
GROUP BY offer_type
ORDER BY avg_attendance DESC
""")
)