In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [0]:
# ---------------------------------------------------------------------------
# Notebook configuration: Unity Catalog names, landing paths and table names.
# ---------------------------------------------------------------------------

# Catalog and medallion-layer schema names.
CATALOG = 'mlb_tech_summit'
BRONZE_SCHEMA = 'bronze'
SILVER_SCHEMA = 'silver'
GOLD_SCHEMA = 'gold'
SEMANTIC_SCHEMA = 'semantic'

# Volume (inside the bronze schema) where raw files land.
LANDING_ZONE = 'raw'
# Derived from CATALOG / BRONZE_SCHEMA so that renaming either keeps the path
# in sync. The trailing slash is kept so the value is unchanged for any code
# that already concatenates onto RAW_VOL.
RAW_VOL = f'/Volumes/{CATALOG}/{BRONZE_SCHEMA}/{LANDING_ZONE}/'

# Bronze table names and the raw directories that feed them.
ALL_GAMES_TBL = 'all_games'
ONLY_GAMES_WITH_PROMOS_TBL = 'only_games_with_promos'
# RAW_VOL already ends with '/', so no extra separator is added here
# (adding one produced '.../raw//all_games').
ALL_GAMES_DIR = f'{RAW_VOL}all_games'
ONLY_GAMES_WITH_PROMOS_DIR = f'{RAW_VOL}only_games_with_promos'

# Names of the cleaned counterparts of the two tables above.
ALL_GAMES_CLEAN = 'all_games_clean'
ONLY_GAMES_WITH_PROMOS_CLEAN = 'only_games_with_promos_clean'

# Gold-layer object name.
GOLD_ATTENDANCE_BY_PROMO = "attendance_by_promo_type"

# Semantic-layer object name.
SEMANTIC_ATTENDANCE_IMPACT = "mlb_attendance_impact"

In [0]:
# Make sure the training catalog exists, then select it as the session's
# current catalog so later unqualified names resolve inside it.
create_catalog_stmt = f"""
          CREATE CATALOG IF NOT EXISTS {CATALOG}
          COMMENT 'Catalog for storing and processing all MLB Tech Summit Training data'
          """
use_catalog_stmt = f"""
          USE CATALOG {CATALOG}
          """

spark.sql(create_catalog_stmt)
spark.sql(use_catalog_stmt)

In [0]:
# Create one schema per layer (bronze, silver, gold, semantic) if missing.
# Every schema is addressed as <catalog>.<schema>, so these statements do not
# depend on the session's current catalog.
spark.sql(f"""
          CREATE SCHEMA IF NOT EXISTS {CATALOG}.{BRONZE_SCHEMA}
          COMMENT 'Bronze database landing all Tech Summit training data';
          """)

spark.sql(f"""
          CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SILVER_SCHEMA}
          COMMENT 'Silver database for storing clean data for analysis, with quality constraints met';
          """)

spark.sql(f"""
          CREATE SCHEMA IF NOT EXISTS {CATALOG}.{GOLD_SCHEMA}
          COMMENT 'Gold database for serving aggregated data, KPIs to consumption layer';
          """)

# The semantic schema previously reused the gold schema's comment verbatim,
# which mislabelled it in the catalog metadata.
# NOTE: IF NOT EXISTS means an already-created schema keeps its old comment.
spark.sql(f"""
          CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SEMANTIC_SCHEMA}
          COMMENT 'Semantic database for serving the semantic layer (business-friendly models and metrics) built on top of gold data';
          """)

# Make bronze the current schema for the statements that follow.
spark.sql(f"""
          USE SCHEMA {BRONZE_SCHEMA};
          """)

In [0]:
# Create the raw landing volume if it does not exist yet.
# The name is fully qualified so the volume is always created in the bronze
# schema, where RAW_VOL expects it, instead of in whatever catalog/schema
# happens to be current for the session when this cell runs.
spark.sql(f"""
          CREATE VOLUME IF NOT EXISTS {CATALOG}.{BRONZE_SCHEMA}.{LANDING_ZONE}
          COMMENT 'Raw data volume for bronze schema';
          """)

In [0]:
# Create the per-dataset landing directories inside the raw volume.
# The constants are already strings, so they are passed directly.
dbutils.fs.mkdirs(ALL_GAMES_DIR)
dbutils.fs.mkdirs(ONLY_GAMES_WITH_PROMOS_DIR)

In [0]:
# Create the bronze `all_games` Delta table (one row per game) if it is missing.
# The table name is fully qualified so the table always lands in the bronze
# schema, regardless of the session's current catalog/schema when the cell runs.
# NOTE: IF NOT EXISTS means an existing table is left untouched.
spark.sql(
    f"""
    CREATE TABLE IF NOT EXISTS {CATALOG}.{BRONZE_SCHEMA}.{ALL_GAMES_TBL} (
      gamePk BIGINT COMMENT 'Unique game identifier assigned by the MLB Stats API.',
      home_is_winner BOOLEAN COMMENT 'Indicates whether the home team won the game (true if home team won).',
      away_is_winner BOOLEAN COMMENT 'Indicates whether the away team won the game (true if away team won).',
      home_score DOUBLE COMMENT 'Total number of runs scored by the home team.',
      away_score DOUBLE COMMENT 'Total number of runs scored by the away team.',
      doubleHeader STRING COMMENT 'Marks whether the game was part of a doubleheader (Y or N).',
      gameNumber BIGINT COMMENT 'Identifies which game in a doubleheader this record represents (e.g., 1 or 2).',
      description STRING COMMENT 'Text description of the game, often noting location or event (e.g., “in Seoul, South Korea”).',
      seriesGameNumber BIGINT COMMENT 'Sequence number of this game within a multi-game series between the same teams.',
      gamesInSeries BIGINT COMMENT 'Total number of games scheduled in the series.',
      is_doubleheader_date BOOLEAN COMMENT 'Indicates if multiple games were played between the same teams on this date.',
      game_date DATE COMMENT 'Official calendar date on which the game was played.',
      season STRING COMMENT 'MLB season year (e.g., 2024).',
      home_team_name STRING COMMENT 'Full name of the home team.',
      away_team_name STRING COMMENT 'Full name of the visiting (away) team.',
      venue_name STRING COMMENT 'Name of the ballpark or stadium where the game took place.',
      attendance DOUBLE COMMENT 'Official attendance count for the game.',
      dayNight STRING COMMENT 'Indicates whether the game was played during the day or at night.',
      gameType STRING COMMENT 'Type of game: Regular (R), Postseason (P), Spring Training (S), or Exhibition (E).',
      offer_names ARRAY<STRING> COMMENT 'Array of promotional offer names tied to the game (e.g., “Bobblehead Giveaway”).',
      promotion_types_array ARRAY<STRING> COMMENT 'Array of promotion categories (e.g., “Giveaway,” “Theme Game”).',
      offer_types_array ARRAY<STRING> COMMENT 'Array of specific promotional type labels (e.g., “Day of Game Highlight”).',
      num_promotions BIGINT COMMENT 'Total number of unique promotions linked to the game.',
      away_cap STRING COMMENT 'Description of the away team’s cap worn during the game.',
      away_jersey STRING COMMENT 'Description of the away team’s jersey style or color.',
      away_pants STRING COMMENT 'Description of the away team’s pants style or color.',
      away_jersey_code STRING COMMENT 'Internal or vendor code representing the away team’s jersey design.',
      home_cap STRING COMMENT 'Description of the home team’s cap worn during the game.',
      home_jersey STRING COMMENT 'Description of the home team’s jersey style or color.',
      home_pants STRING COMMENT 'Description of the home team’s pants style or color.',
      home_jersey_code STRING COMMENT 'Internal or vendor code representing the home team’s jersey design.',
      has_promotion BOOLEAN COMMENT 'Boolean flag indicating whether the game featured any promotion or giveaway.',
      day_of_week STRING COMMENT 'Day of the week on which the game was played (e.g., Monday, Friday).'
    )
    USING DELTA
    COMMENT 'Comprehensive game-level dataset integrating on-field results, attendance metrics, promotional details, and uniform metadata for Major League Baseball (MLB) games from the 2024 and 2025 seasons. Each record represents a single scheduled or completed MLB game, linking competitive outcomes (scores, winners, series context) with fan-engagement attributes such as giveaways, theme nights, and attendance. The dataset also includes uniform configurations for both teams and supports analysis across seasons, venues, and promotional effectiveness.'
    """
)

In [0]:
# Create the bronze `only_games_with_promos` Delta table if it is missing.
# Changes relative to the earlier definition:
#   * the table name is fully qualified, so it always lands in the bronze schema
#     regardless of the session's current catalog/schema;
#   * `is_doubleheader_date` now has a COMMENT like every other column (same
#     wording as in the `all_games` table);
#   * LONG is spelled BIGINT (same type, matches the `all_games` DDL).
# NOTE: IF NOT EXISTS means an existing table is left untouched.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {CATALOG}.{BRONZE_SCHEMA}.{ONLY_GAMES_WITH_PROMOS_TBL} (
  gamePk BIGINT COMMENT 'Unique game identifier assigned by MLB’s Stats API. Use as a primary key and join key to other MLB datasets (e.g., game outcomes, weather).',
  game_date DATE COMMENT 'Scheduled date of the game. Temporal dimension for season or month-over-month analysis.',
  season STRING COMMENT 'MLB season year. Enables time-based grouping (multi-season extension possible).',
  gameType STRING COMMENT 'Type of game: regular season (R), postseason (P), spring training (S), or exhibition (E). Useful for filtering by competition phase.',
  home_is_winner BOOLEAN COMMENT 'Boolean flag showing whether the home team won the game (true if home team won, false otherwise).',
  away_is_winner BOOLEAN COMMENT 'Boolean flag showing whether the away team won the game (true if away team won, false otherwise).',
  home_score DOUBLE COMMENT 'Total number of runs scored by the home team in the game.',
  away_score DOUBLE COMMENT 'Total number of runs scored by the away team in the game.',
  doubleHeader STRING COMMENT 'Indicates whether the game was part of a doubleheader — a pair of games between the same teams on the same day. Values typically Y or N.',
  gameNumber BIGINT COMMENT 'Identifies which game in the doubleheader this record represents (e.g., 1 or 2). For non-doubleheader games, this is usually 1.',
  description STRING COMMENT 'Free-text description of the game or special context — for instance, “in Seoul, South Korea” or “Opening Day.”',
  seriesGameNumber BIGINT COMMENT 'The sequential number of this game within the current series between the two teams (e.g., 1 for the first game of a three-game series).',
  gamesInSeries BIGINT COMMENT 'Total number of games scheduled in the series between these two teams (e.g., 3 for a standard weekend series).',
  is_doubleheader_date BOOLEAN COMMENT 'Indicates if multiple games were played between the same teams on this date.',
  home_team_name STRING COMMENT 'Full name of the home team. Key for team-level rollups, leaderboards, or filtering.',
  away_team_name STRING COMMENT 'Full name of the visiting team. Useful for conversational joins (e.g., “Which visiting teams draw large crowds?”).',
  venue_name STRING COMMENT 'Full name of the ballpark or stadium hosting the game. Enables geographic and capacity-based insights. Consider linking to a venue dimension table.',
  attendance DOUBLE COMMENT 'Official attendance reported for the game. Target metric for analysis; dependent variable in attendance-impact modeling.',
  dayNight STRING COMMENT 'Indicator of whether the game was played during the day or at night. Strong explanatory factor for attendance; often interacts with promotion type.',
  offer_name STRING COMMENT 'Name or title of the promotional event associated with the game. Core feature for natural-language agent queries (e.g., “When were rally towels given away?”).',
  promotion_types STRING COMMENT 'Broad category of the promotion, describing its nature (e.g., giveaway, event, show). Good feature for grouping analysis of promotion effectiveness.',
  offer_type STRING COMMENT 'The marketing classification of the offer, representing how the promotion is delivered. Useful for segmentation in dashboards (“Which offer type drives the most attendance?”).',
  distribution STRING COMMENT 'Describes how the giveaway or promotion was distributed (e.g., “First 10,000 fans,” “All attendees,” or “Season-ticket holders only”). Useful for understanding reach and scale of a promotional event.',
  away_cap STRING COMMENT 'Name or description of the hat style worn by the away team during the game. Can include color or design (e.g., “navy with white logo”).',
  away_jersey STRING COMMENT 'Name or description of the away team’s jersey. Often includes details like colorway or special-event uniforms (e.g., “road gray,” “City Connect,” “throwback”).',
  away_pants STRING COMMENT 'Name or description of the away team’s pants worn in that game. Usually corresponds to the uniform style (e.g., “gray,” “white pinstripe”).',
  away_jersey_code STRING COMMENT 'Uniform code or internal identifier representing the away team’s jersey type. Typically used for consistent tracking across games (e.g., “A-01,” “CC-23”).',
  home_cap STRING COMMENT 'Name or description of the hat style worn by the home team. Captures variation across special uniforms or theme nights.',
  home_jersey STRING COMMENT 'Name or description of the home team’s jersey worn in that game (e.g., “white home,” “Independence Day,” “Pride Weekend”).',
  home_pants STRING COMMENT 'Name or description of the pants worn by the home team during the game. Complements jersey and cap data for full uniform details.',
  home_jersey_code STRING COMMENT 'Uniform code or internal identifier representing the home team’s jersey type, used for consistent mapping across datasets or seasons.',
  day_of_week STRING COMMENT 'Day of the week on which the game was played. Strong attendance driver; use for trend and regression analysis.'
) USING DELTA
COMMENT 'Designed to analyze how various promotional events (e.g., giveaways, fireworks, theme games) impact attendance across Major League Baseball games. This dataset supports BI dashboards for trend visualization and conversational AI agents that answer natural-language questions about promotions, attendance patterns, and team engagement.'
""")