# View Design: Integrated Performance & Attendance Analysis  
> Collaborative Exploration: Linda & Ronald

**Context:**  
Ronald’s analysis revealed a gap — performance metrics are isolated from attendance data.

**Goal:**  
Create analysis-ready **views** combining performance, attendance, and contextual data to answer: **“What drives attendance?”**

**Approach:**  
Systematically inventory all ESPN JSON data and convert it into structured datasets through joint exploration.

- Inspect the raw JSON and note key analytical categories (Linda + Ronald)  
- Categorize fields using automation (Linda) and feedback (Ronald)  
- Infer and display a candidate schema with `infer_schema_from_json()` (Linda)  
- Brainstorm analytical questions from available fields (Ronald)  
- Translate questions into the **business logic** for each view (Linda)  
- Validate each proposed view with sample queries and extracts (Linda + Ronald)

*This notebook documents a simulated iterative design process for the integrated attendance–performance model.*

---

### 0. Setup

In [1]:
# Standard library
import sys
import json
import random
from pathlib import Path
from datetime import datetime
from pprint import pprint
import ipywidgets as widgets
from IPython.display import display, clear_output

# Third-party
import pandas as pd

# Local
PROJECT_ROOT = Path('../').resolve()
sys.path.insert(0, str(PROJECT_ROOT))
from src.utils.config import *  # schema defined here

# Data Path
ESPN_FILE = Path(f"{PROJECT_ROOT}/data/raw/espn_games.json")

# Load JSON
with open(ESPN_FILE, "r") as f:
    espn_games = json.load(f)

### 1. Visual Inspection

- Read a few raw JSON entries from `espn_games.json`
- What does the data actually look like?
- Note: what might drive attendance?

In [2]:
# Print one entry (pretty formatted)
pprint(espn_games[0], width=100)


{'competitions': [{'attendance': 71699,
                   'broadcast': 'ESPN/ABC',
                   'broadcasts': [{'market': 'national', 'names': ['ESPN', 'ABC']}],
                   'competitors': [{'homeAway': 'home',
                                    'id': '34',
                                    'linescores': [{'displayValue': '0', 'period': 1, 'value': 0.0},
                                                   {'displayValue': '0', 'period': 2, 'value': 0.0},
                                                   {'displayValue': '8', 'period': 3, 'value': 8.0},
                                                   {'displayValue': '11',
                                                    'period': 4,
                                                    'value': 11.0},
                                                   {'displayValue': '3',
                                                    'period': 5,
                                                    'value': 3.0}],
           

Are there some categories we can exclude because they are always the same?

In [33]:
# Check for unique regions and language broadcasts
print("\n🔍 Checking for non-US regions and non-English broadcasts:")
unique_langs = set()
unique_regions = set()

for game in espn_games:
    try:
        geo_broadcasts = game.get('competitions', [{}])[0].get('geoBroadcasts', [])
        for broadcast in geo_broadcasts:
            unique_langs.add(broadcast.get('lang'))
            unique_regions.add(broadcast.get('region'))
    except:
        pass

print(f"   Unique languages: {unique_langs}")
print(f"   Unique regions: {unique_regions}")

# Check uniqueness of broadcast-related fields
print("\n🔍 Checking broadcast field cardinality:")

broadcast_fields = {
    'geoBroadcasts_type_shortName': [],
    'geoBroadcasts_market_type': [],
    'geoBroadcasts_market_id': [],
    'geoBroadcasts_type_id': [],
    'broadcasts_market': [],
    'geoBroadcasts_lang': [],
}

for game in espn_games:
    try:
        geo_broadcasts = game.get('competitions', [{}])[0].get('geoBroadcasts', [])
        for broadcast in geo_broadcasts:
            broadcast_fields['geoBroadcasts_type_shortName'].append(
                broadcast.get('type', {}).get('shortName')
            )
            broadcast_fields['geoBroadcasts_market_type'].append(
                broadcast.get('market', {}).get('type')
            )
            broadcast_fields['geoBroadcasts_market_id'].append(
                broadcast.get('market', {}).get('id')
            )
            broadcast_fields['geoBroadcasts_type_id'].append(
                broadcast.get('type', {}).get('id')
            )
            broadcast_fields['geoBroadcasts_lang'].append(
                broadcast.get('lang')
            )
        
        broadcasts = game.get('competitions', [{}])[0].get('broadcasts', [])
        for broadcast in broadcasts:
            broadcast_fields['broadcasts_market'].append(
                broadcast.get('market')
            )
    except:
        pass

# Report uniqueness
for field, values in broadcast_fields.items():
    unique_vals = set(v for v in values if v is not None)
    print(f"\n   {field}:")
    print(f"      {len(unique_vals)} unique values: {sorted(unique_vals)}")
    if len(unique_vals) == 1:
        print(f"      ✅ CONSTANT - candidate for exclusion")

print("\n🔍 Analyzing uncategorized fields:")

uncategorized_checks = {
    'id': [],
    'name': [],
    'attendance': [],
    'timeValid': [],
    'conferenceCompetition': [],
    'notes.0.type': []
}

for game in espn_games:
    uncategorized_checks['id'].append(game.get('id'))
    uncategorized_checks['name'].append(game.get('name'))
    
    comp = game.get('competitions', [{}])[0]
    uncategorized_checks['attendance'].append(comp.get('attendance'))
    uncategorized_checks['timeValid'].append(comp.get('timeValid'))
    uncategorized_checks['conferenceCompetition'].append(comp.get('conferenceCompetition'))
    
    notes = comp.get('notes', [{}])
    if notes:
        uncategorized_checks['notes.0.type'].append(notes[0].get('type'))

for field, values in uncategorized_checks.items():
    unique = set(v for v in values if v is not None)
    print(f"\n   {field}:")
    print(f"      {len(unique)} unique values")
    if len(unique) <= 10:
        print(f"      Values: {sorted(unique)}")
    else:
        print(f"      Sample: {list(sorted(unique))[:5]}...")


🔍 Checking for non-US regions and non-English broadcasts:
   Unique languages: {'en', 'es'}
   Unique regions: {'us'}

🔍 Checking broadcast field cardinality:

   geoBroadcasts_type_shortName:
      3 unique values: ['Radio', 'Streaming', 'TV']

   geoBroadcasts_market_type:
      2 unique values: ['Home', 'National']

   geoBroadcasts_market_id:
      2 unique values: ['1', '2']

   geoBroadcasts_type_id:
      3 unique values: ['1', '4', '5']

   broadcasts_market:
      3 unique values: ['away', 'home', 'national']

   geoBroadcasts_lang:
      2 unique values: ['en', 'es']

🔍 Analyzing uncategorized fields:

   id:
      1452 unique values
      Sample: ['401131036', '401131037', '401131038', '401131039', '401131040']...

   name:
      816 unique values
      Sample: ['AFC  at NFC ', 'Arizona Cardinals at Atlanta Falcons', 'Arizona Cardinals at Buffalo Bills', 'Arizona Cardinals at Carolina Panthers', 'Arizona Cardinals at Chicago Bears']...

   attendance:
      1118 unique valu

- exclude broadcast region (it's always US)
- but include language (because sometimes it's english and sometimes it's spanish)

### 2. Semantic Modeling - Brainstorm Predictors/Features for Attendance

> Group by Analytical Theme

> Capture business logic and hypotheses

- publicity
    - Does attendance in the next 1–3 games change after a **nationally televised broadcast**?
    - Does having a **game highlight or headline** increase subsequent attendance?
    - Can **headline text** (e.g., “comeback,” “overtime,” “victory”) be scored for “excitement”?
- stats/data
    - Do high individual stats (passing/rushing/receiving leaders) drive fan engagement in later games?
    - How do data availability flags (e.g., play-by-play available) relate to game interest?
- matchups
    - Do games between **high- vs low**-ranking teams influence attendance?
    - Do **evenly matched** teams (similar records) draw more fans?
- score/wins
    - Do recent win/loss summaries or running averages correlate with attendance?
    - Does a “close score” or “exciting” game (derived from linescores) increase future attendance?
- marketing
    - Do team colors (e.g., red vs blue) influence fan turnout?
    - Could team name or logo characteristics correlate with attendance or popularity?
    - Are some short names (e.g., “BUF @ HOU”) more recognizable or appealing than others?
- location
    - Do larger metro areas, regions, or weather patterns influence attendance?
    - How does venue type (indoor/outdoor, neutral site) affect turnout?
- time
    - Does attendance vary by time of year, month, or week number?
    - Are weekend or holiday games more popular?
    - Do night vs day games differ in attendance?
    - Is the “slug” field meaningful? (unique string identify particular phase of NFL season, in contrast to the regular season)

### 3. View Design
> Map "Theme" -> View Name -> JSON source(s) -> Key Columns

> Seed with some automated guesses (`infer_schema(json_data)`, `categorize_fields()`)

Steps
1. Flatten the JSON using a recursive helper (!!!pandas.json_normalize() will only go 1 record deep)
2. Build a keyword → theme dictionary from our semantic analysis
3. Auto-tag each flattened field by matching keywords in the field path
4. Review & refine using csv output for Ronald
5. Export theme-grouped views (one DataFrame per theme, or a mapping dict)

 

In [41]:
import pandas as pd
import json
from pathlib import Path
import random
import re
from datetime import datetime
from collections import OrderedDict

# ─────────────────────────────────────────────────────────────────────
# Load JSON with ORDER PRESERVED
# ─────────────────────────────────────────────────────────────────────
with open(RAW_DATA_PATH / 'espn_games.json', 'r') as f:
    games = json.load(f, object_pairs_hook=OrderedDict)

print(f"Loaded {len(games)} games from JSON (order preserved)")

# ─────────────────────────────────────────────────────────────────────
# Helper: Extract leaf paths IN ORDER from the first game
# ─────────────────────────────────────────────────────────────────────
def extract_leaf_paths_ordered(data, parent_path=''):
    """Extract leaf paths in the ORDER they appear in the JSON file."""
    paths = []
    
    if isinstance(data, (dict, OrderedDict)):
        for key, value in data.items():
            new_path = f"{parent_path}.{key}" if parent_path else key
            paths.extend(extract_leaf_paths_ordered(value, new_path))
    
    elif isinstance(data, list):
        for idx, item in enumerate(data):
            new_path = f"{parent_path}.{idx}"
            paths.extend(extract_leaf_paths_ordered(item, new_path))
    
    else:
        depth = parent_path.count('.') if parent_path else 0
        paths.append((parent_path, depth))
    
    return paths

# Get ordered paths from first game
first_game_paths = extract_leaf_paths_ordered(games[0])
path_order = {path: idx for idx, (path, _) in enumerate(first_game_paths)}
max_nesting_depth = max(depth for _, depth in first_game_paths)

print(f"Found {len(first_game_paths)} leaf paths from first game")
print(f"Maximum nesting depth: {max_nesting_depth}")

# Collect ALL unique paths across all games
all_paths_set = set()
for game in games:
    paths = extract_leaf_paths_ordered(game)
    all_paths_set.update(path for path, _ in paths)

# Paths that exist in other games but not first game
new_paths = sorted(all_paths_set - set(path_order.keys()))
for new_path in new_paths:
    path_order[new_path] = len(path_order)

print(f"Total unique paths across all games: {len(all_paths_set)}")
if new_paths:
    print(f"  ({len(new_paths)} paths found in other games not in first game)")

# ─────────────────────────────────────────────────────────────────────
# Select sample records: first, random middle, last
# ─────────────────────────────────────────────────────────────────────
sample_indices = [0, random.randint(1, len(games) - 2), len(games) - 1]
sample_games = [games[i] for i in sample_indices]
print(f"Sample records: indices {sample_indices}")

# ─────────────────────────────────────────────────────────────────────
# Helper: Get value at path from a game
# ─────────────────────────────────────────────────────────────────────
def get_value_at_path(data, path):
    """Navigate through nested structure using dot-separated path."""
    parts = path.split('.')
    current = data
    
    try:
        for part in parts:
            if part.isdigit():
                current = current[int(part)]
            else:
                current = current[part]
        
        if current is None:
            return ""
        elif isinstance(current, bool):
            return str(current).upper()
        elif isinstance(current, str) and len(current) > 50:
            return current[:47] + "..."
        else:
            return str(current)
    
    except (KeyError, IndexError, TypeError):
        return ""

# ─────────────────────────────────────────────────────────────────────
# Helper: Split path into nesting level columns
# ─────────────────────────────────────────────────────────────────────
def split_path_by_levels(path, max_levels):
    """Split path into separate level columns."""
    parts = path.split('.')
    return parts + [''] * (max_levels - len(parts))

# ─────────────────────────────────────────────────────────────────────
# Helper: Suggest human-readable column name
# ─────────────────────────────────────────────────────────────────────
def suggest_column_name(path):
    """Convert 'competitions.0.venue.address.city' → 'venue_address_city'"""
    clean = re.sub(r'\.\d+', '', path)
    parts = [p for p in clean.split('.') if p not in ['competitions', 'competitors']]
    
    if len(parts) == 0:
        return clean.replace('.', '_')
    elif len(parts) <= 2:
        return '_'.join(parts)
    else:
        return '_'.join(parts[-3:])

# ─────────────────────────────────────────────────────────────────────
# Helper: Add semantic labels for array indices
# ─────────────────────────────────────────────────────────────────────
def add_semantic_suffix(path):
    """Add meaningful suffix for array elements."""
    suffix_parts = []
    
    # Check for period number (in linescores)
    match = re.search(r'\.linescores\.(\d+)\.', path)
    if match:
        period = int(match.group(1)) + 1
        suffix_parts.append(f'P{period}')
    
    # Check for team (home/away)
    if '.competitors.0.' in path:
        suffix_parts.append('home')
    elif '.competitors.1.' in path:
        suffix_parts.append('away')
    
    # Check for leader type
    match = re.search(r'\.leaders\.(\d+)\.', path)
    if match:
        idx = int(match.group(1))
        leader_types = ['passing', 'rushing', 'receiving']
        if idx < len(leader_types):
            suffix_parts.append(leader_types[idx])
    
    # Check for geoBroadcasts - label by type (TV/Radio/Streaming) + index
    match = re.search(r'\.geoBroadcasts\.(\d+)\.', path)
    if match:
        idx = int(match.group(1))
        # Extract type from the path if available, otherwise just use index
        # We'll need to look at actual data to determine type
        # For now, use generic labeling
        suffix_parts.append(f'broadcast{idx + 1}')
    
    # Check for broadcasts.names array (network names like ESPN, ABC, FOX)
    match = re.search(r'\.broadcasts\.(\d+)\.names\.(\d+)', path)
    if match:
        broadcast_idx = int(match.group(1))
        name_idx = int(match.group(2))
        suffix_parts.append(f'market{broadcast_idx + 1}_network{name_idx + 1}')
    elif re.search(r'\.broadcasts\.(\d+)\.', path):
        # For non-names fields in broadcasts (like market)
        match = re.search(r'\.broadcasts\.(\d+)\.', path)
        broadcast_idx = int(match.group(1))
        suffix_parts.append(f'market{broadcast_idx + 1}')
    
    # Combine all parts
    return '_' + '_'.join(suffix_parts) if suffix_parts else ''

# ─────────────────────────────────────────────────────────────────────
# Build Ronald's DataFrame IN DOCUMENT ORDER
# ─────────────────────────────────────────────────────────────────────
rows = []

for path in sorted(path_order.keys(), key=lambda p: path_order[p]):
    json_idx = path_order[path]
    level_parts = split_path_by_levels(path, max_nesting_depth + 1)
    samples = [get_value_at_path(game, path) for game in sample_games]
    
    row = {
        'json_idx': json_idx,
        **{f'json_L{i+1}': level_parts[i] for i in range(max_nesting_depth + 1)},
        'ref_column': path,
        'column_name': suggest_column_name(path) + add_semantic_suffix(path),
        'include_YN': 'Y',
        'sample_data_1': samples[0],
        'sample_data_2': samples[1],
        'sample_data_3': samples[2],
    }
    
    rows.append(row)

ronald_csv = pd.DataFrame(rows)

# ─────────────────────────────────────────────────────────────────────
# Auto-exclude known garbage (Linda's patterns from Ronald's feedback)
# ─────────────────────────────────────────────────────────────────────
auto_exclude_patterns = [
    r'uid$',                # UID fields (id is sufficient)
    r'\.href$',             # URLs we don't need
    r'\blinks\b',           # Team link metadata (boolean flags about links, paywall indicators, link relationships)
    r'\bheadshots\b',       # URL to headshots
    r'\brecords\b',         # Records (this is a score summary for human consumption - ronald will make his own)
    r'\bstatus\b',          # Ronald will only include completed games, and he can calculate OT, etc
    r'\btype.id\b',         # Broadcast ids - Ronald will use nominal values
    r'\bmarket.id\b',
    r'\bmedia\b',           # Broadcast media logo links
    r'\.displayValue$',                          # Display values (numeric values are sufficient)
    r'\.linescores\.\d+\.period$',               # Period numbers (redundant with column naming)
    r'\.leaders\.\d+\.name$',                    # Leader stat type names (e.g., "passingYards")
    r'\.leaders\.\d+\.displayName$',             # Leader display labels (e.g., "Passing Leader")
    r'\.leaders\.\d+\.shortDisplayName$',        # Leader stat type labels
    r'\.leaders\.\d+\.abbreviation$',            # Leader category abbreviations
    r'\.athlete\.displayName$',                  # Athlete display names (use fullName instead)
    r'\.athlete\.shortName$',                    # Athlete short names (use fullName instead)
    r'\.position\.abbreviation$',                # Player positions
    r'\.geoBroadcasts\.region$',                 # Always "us"
]

def should_auto_exclude(ref_column):
    for pattern in auto_exclude_patterns:
        if re.search(pattern, ref_column):
            return True
    return False

ronald_csv.loc[ronald_csv['ref_column'].apply(should_auto_exclude), 'include_YN'] = 'N'

excluded_count = (ronald_csv['include_YN'] == 'N').sum()
print(f"\nAuto-excluded {excluded_count} fields based on patterns")

# ─────────────────────────────────────────────────────────────────────
# Auto-suggest analytical themes based on keywords
# ─────────────────────────────────────────────────────────────────────
theme_keywords = {
    'publicity': ['broadcast', 'headline', 'highlight', 'geoBroadcast', 'media'],
    'stats/data': ['leader', 'statistics', 'playByPlay', 'linescore', 'score'],
    'matchups': ['competitor', 'record', 'winner', 'homeAway', 'team', 'conference'],
    'score/wins': ['score', 'winner', 'linescore', 'period'],
    'marketing': ['color', 'logo', 'abbreviation', 'displayName', 'shortName'],
    'location': ['venue', 'address', 'city', 'state', 'indoor', 'neutralSite'],
    'time': ['time', 'date', 'startDate', 'week', 'season', 'slug', 'clock']
}

def suggest_themes(column_name, theme_kw_dict):
    """Returns list of themes whose keywords appear in the column name."""
    matches = []
    col_lower = column_name.lower()
    for theme, keywords in theme_kw_dict.items():
        if any(kw.lower() in col_lower for kw in keywords):
            matches.append(theme)
    return matches if matches else ['uncategorized']

ronald_csv['suggested_themes'] = ronald_csv['ref_column'].apply(
    lambda col: ', '.join(suggest_themes(col, theme_keywords))
)

# ─────────────────────────────────────────────────────────────────────
# Override with CORE for designated columns
# ─────────────────────────────────────────────────────────────────────
core_columns = ['id', 'name', 'attendance']

ronald_csv.loc[ronald_csv['column_name'].isin(core_columns), 'suggested_themes'] = 'CORE'

print(f"\n🎯 Auto-suggested theme distribution:")
theme_counts = ronald_csv['suggested_themes'].value_counts()
for theme, count in theme_counts.head(10).items():
    print(f"   {theme}: {count} fields")

# ─────────────────────────────────────────────────────────────────────
# Document auto-excluded columns for transparency
# ─────────────────────────────────────────────────────────────────────
excluded_df = ronald_csv[ronald_csv['include_YN'] == 'N'][
    ['json_idx', 'ref_column', 'column_name', 'sample_data_1', 'suggested_themes']
].copy()

excluded_df['exclusion_reason'] = excluded_df['ref_column'].apply(
    lambda col: next(
        (pattern for pattern in auto_exclude_patterns if re.search(pattern, col)),
        'manual'
    )
)

# ─────────────────────────────────────────────────────────────────────
# Disambiguate duplicate column names
# ─────────────────────────────────────────────────────────────────────
def disambiguate_column_names(df, column_to_fix='column_name'):
    """Add suffixes to duplicate column names."""
    name_counts = {}
    new_names = []
    
    for name in df[column_to_fix]:
        if name in name_counts:
            count = name_counts[name]
            new_names.append(f"{name}_dup{count}")
            name_counts[name] += 1
        else:
            new_names.append(name)
            name_counts[name] = 1
    
    return new_names

ronald_csv['column_name'] = disambiguate_column_names(ronald_csv, 'column_name')

duplicate_bases = ronald_csv[ronald_csv['column_name'].str.contains('_dup', na=False)]['column_name'].str.replace(r'_dup\d+', '', regex=True).unique()

if len(duplicate_bases) > 0:
    print(f"\n⚠️  RONALD: {len(duplicate_bases)} column names need review (appeared multiple times):")
    print(f"   Filter column_name by these base names to rename each variant:\n")
    for base in sorted(duplicate_bases):
        count = len(ronald_csv[ronald_csv['column_name'].str.startswith(base + '_dup')]) + 1
        print(f"   • {base} ({count} occurrences)")
    print(f"\n   💡 TIP: In Excel, filter column_name for each base name above, then rename variants")
    print(f"          based on their ref_column context (e.g., id → game_id, team_id, venue_id)")
else:
    print(f"\n✅ No duplicate column names detected")

# ─────────────────────────────────────────────────────────────────────
# Export to CSV
# ─────────────────────────────────────────────────────────────────────
notebook_dir = Path.cwd()
timestamp = datetime.now().strftime('%Y%m%d')

baseline_file = notebook_dir / 'schema_automated_suggestions.csv'
working_file = notebook_dir / f'schema_reviewed_ronald_{timestamp}.csv'
excluded_file = notebook_dir / 'schema_auto_excluded_columns.csv'

ronald_csv.to_csv(baseline_file, index=False)
excluded_df.to_csv(excluded_file, index=False)

if not working_file.exists():
    ronald_csv.to_csv(working_file, index=False)
    print(f"\n✅ Created: {working_file.name}")
else:
    print(f"\n⚠️  {working_file.name} already exists (not overwriting)")

print(f"\n📊 Generated CSV with:")
print(f"   {len(ronald_csv)} total rows")
print(f"   {(ronald_csv['include_YN'] == 'Y').sum()} included fields")
print(f"   {excluded_count} auto-excluded fields (see {excluded_file.name})")
print(f"   {max_nesting_depth + 1} nesting level columns")
print(f"   3 sample data columns from game indices: {sample_indices}")

print(f"\n📋 First 10 fields (in ESPN's original order):")
print(ronald_csv[['json_idx', 'ref_column', 'column_name', 'sample_data_1']].head(10).to_string(index=False))

Loaded 1452 games from JSON (order preserved)
Found 318 leaf paths from first game
Maximum nesting depth: 10
Total unique paths across all games: 339
  (21 paths found in other games not in first game)
Sample records: indices [0, 14, 1451]

Auto-excluded 222 fields based on patterns

🎯 Auto-suggested theme distribution:
   matchups: 95 fields
   uncategorized: 65 fields
   stats/data: 37 fields
   stats/data, matchups, score/wins: 32 fields
   publicity: 26 fields
   stats/data, marketing: 18 fields
   matchups, marketing: 14 fields
   time: 12 fields
   publicity, marketing: 12 fields
   location: 9 fields

⚠️  RONALD: 40 column names need review (appeared multiple times):
   Filter column_name by these base names to rename each variant:

   • athlete_links_rel_passing (3 occurrences)
   • athlete_links_rel_receiving (3 occurrences)
   • athlete_links_rel_rushing (3 occurrences)
   • date (2 occurrences)
   • id (2 occurrences)
   • links_href (5 occurrences)
   • links_isExternal (5 

### 5. Ronald's Schema Review

**Method**
1. Open `schema_reviewed_ronald_YYYYMMDD.csv`
2. Exclude and Filter — Set a regex pattern with Linda, or set `include_YN` = `N` for fields you don't need
4. Rename — Fix remaining rows `column_name` values, resolve any "_dup" suffixes
5. Verify themes — Edit `suggested_themes` if needed, a single theme is best practice
6. Save and run next cell for verification scripts

In [45]:
print("🔍 Ronald's Schema Review Report\n")

# ─────────────────────────────────────────────────────────────────────
# 1. Multi-theme fields that need primary theme selection
# ─────────────────────────────────────────────────────────────────────
ronald_final = pd.read_csv('schema_reviewed_ronald_20251017.csv')
included = ronald_final[ronald_final['include_YN'] == 'Y']
multi_theme = included[included['suggested_themes'].str.contains(',', na=False)]

if len(multi_theme) > 0:
    print(f"⚠️  {len(multi_theme)} fields tagged with multiple themes (pick primary):\n")
    for _, row in multi_theme.iterrows():
        print(f"   {row['column_name']}: {row['suggested_themes']}")
    print()
else:
    print("✅ No multi-theme fields\n")

# ─────────────────────────────────────────────────────────────────────
# 2. Uncategorized fields still included
# ─────────────────────────────────────────────────────────────────────
uncategorized = included[included['suggested_themes'] == 'uncategorized']

if len(uncategorized) > 0:
    print(f"⚠️  {len(uncategorized)} fields still marked 'uncategorized' and included:\n")
    for _, row in uncategorized.iterrows():
        print(f"   {row['ref_column']} → {row['column_name']}")
    print()
else:
    print("✅ No uncategorized fields in included set\n")

# ─────────────────────────────────────────────────────────────────────
# 3. Invalid include_YN values
# ─────────────────────────────────────────────────────────────────────
valid_values = {'Y', 'N'}
invalid = included[~included['include_YN'].isin(valid_values)]

if len(invalid) > 0:
    print(f"❌ {len(invalid)} fields with invalid include_YN values:\n")
    for _, row in invalid.iterrows():
        print(f"   {row['column_name']}: include_YN = '{row['include_YN']}'")
    print()
else:
    print("✅ All include_YN values are valid (Y or N)\n")

# ─────────────────────────────────────────────────────────────────────
# 4. Check for constant-value fields still included (across all games)
# ─────────────────────────────────────────────────────────────────────
print("🔍 Checking constant values across all games (this may take a moment)...\n")

included = included[included['include_YN'] == 'Y']
constant_fields = []

for _, row in included.iterrows():
    ref_col = row['ref_column']
    col_name = row['column_name']
    
    # Collect values from ALL games
    all_values = set()
    for game in games:
        value = get_value_at_path(game, ref_col)
        if value:  # Skip empty values
            all_values.add(value)
    
    # Check if only one unique value exists
    if len(all_values) == 1:
        constant_fields.append((col_name, list(all_values)[0], len(games)))

if len(constant_fields) > 0:
    print(f"⚠️  {len(constant_fields)} fields with constant values across all {len(games)} games:\n")
    for col, value, count in constant_fields:
        # Truncate long values for display
        display_val = value if len(value) <= 50 else value[:47] + "..."
        print(f"   {col}: always '{display_val}'")
    print()
else:
    print(f"✅ No constant-value fields in included set (checked {len(games)} games)\n")

# Update summary to use actual constant count
print("=" * 70)
print("SUMMARY:")
print(f"   Total fields: {len(ronald_final)}")
print(f"   Included: {(ronald_final['include_YN'] == 'Y').sum()}")
print(f"   Excluded: {(ronald_final['include_YN'] == 'N').sum()}")
print(f"   Issues to resolve: {len(multi_theme) + len(uncategorized) + len(invalid) + len(constant_fields)}")
print(f"\n   {len(games)} games → {len(included)} cleaned and flattened columns!")

🔍 Ronald's Schema Review Report

✅ No multi-theme fields

✅ No uncategorized fields in included set

✅ All include_YN values are valid (Y or N)

🔍 Checking constant values across all games (this may take a moment)...

⚠️  9 fields with constant values across all 1452 games:

   format_regulation_periods: always '4'
   team_logo_home: always 'https://a.espncdn.com/i/teamlogos/nfl/500/score...'
   team_logo_away: always 'https://a.espncdn.com/i/teamlogos/nfl/500/score...'
   geoBroadcasts_lang_broadcast1: always 'en'
   geoBroadcasts_region_broadcast1: always 'us'
   geoBroadcasts_region_broadcast2: always 'us'
   geoBroadcasts_lang_broadcast3: always 'en'
   geoBroadcasts_market_type_broadcast3: always 'National'
   geoBroadcasts_region_broadcast3: always 'us'

SUMMARY:
   Total fields: 339
   Included: 98
   Excluded: 241
   Issues to resolve: 9

   1452 games → 98 cleaned and flattened columns!


### Export Refined Schema (The handoff to production)

In [47]:
# ─────────────────────────────────────────────────────────────────────
# Generate production schemas from Ronald's approved CSV
# ─────────────────────────────────────────────────────────────────────
ronald_final = pd.read_csv('schema_reviewed_ronald_20251017.csv')
included = ronald_final[ronald_final['include_YN'] == 'Y']

print("=" * 70)
print("COPY THIS TO config.py")
print("=" * 70)

# Group by theme
for theme in sorted(included['suggested_themes'].unique()):
    theme_fields = included[included['suggested_themes'] == theme].sort_values('json_idx')
    
    schema_var = f"ESPN_GAMES_{theme.upper().replace('/', '_').replace('-', '_')}_SCHEMA"
    print(f"\n{schema_var} = [")
    
    for _, row in theme_fields.iterrows():
        # Convert ref_column to path list
        path = [int(p) if p.isdigit() else p for p in row['ref_column'].split('.')]
        
        # Infer default value (you can make this smarter)
        default = "''" if 'id' in row['column_name'] or 'name' in row['column_name'] else "0"
        
        print(f"    ('{row['column_name']}', {path}, {default}),")
    
    print("]\n")

print("\n# Add to ESPN_FILES in config.py:")
print('ESPN_FILES = {')
for theme in sorted(included['suggested_themes'].unique()):
    theme_key = theme.lower().replace('/', '_').replace('-', '_')
    schema_var = f"ESPN_GAMES_{theme.upper().replace('/', '_').replace('-', '_')}_SCHEMA"
    print(f'    "games_{theme_key}": {{')
    print(f'        "filename": "espn_games.json",')
    print(f'        "path": RAW_DATA_PATH,')
    print(f'        "table_name": "espn_games_{theme_key}",')
    print(f'        "schema": "{schema_var}"')
    print(f'    }},')
print('}')

COPY THIS TO config.py

ESPN_GAMES_CORE_SCHEMA = [
    ('id', ['id'], ''),
    ('attendance', ['competitions', 0, 'attendance'], 0),
    ('type_abbreviation', ['competitions', 0, 'type', 'abbreviation'], 0),
    ('neutralSite', ['competitions', 0, 'neutralSite'], 0),
    ('team_isActive_home', ['competitions', 0, 'competitors', 0, 'team', 'isActive'], 0),
    ('team_isActive_away', ['competitions', 0, 'competitors', 1, 'team', 'isActive'], 0),
    ('format_regulation_periods', ['competitions', 0, 'format', 'regulation', 'periods'], 0),
]


ESPN_GAMES_PUBLICITY_SCHEMA = [
    ('notes_headline', ['competitions', 0, 'notes', 0, 'headline'], 0),
    ('broadcasts_market_market1', ['competitions', 0, 'broadcasts', 0, 'market'], 0),
    ('broadcasts_names_market1_network1', ['competitions', 0, 'broadcasts', 0, 'names', 0], ''),
    ('broadcasts_names_market1_network2', ['competitions', 0, 'broadcasts', 0, 'names', 1], ''),
    ('broadcast', ['competitions', 0, 'broadcast'], 0),
    ('geoBroad