In [1]:
import json
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import re
import os
from typing import Dict, List, Set, Tuple

## 1. Load Data

In [2]:
# Load ratings cache
with open("data/ratings_cache.json", "r", encoding="utf-8") as f:
    ratings_cache = json.load(f)

# Load game data from individual files in games_slim directory
all_games = []
games_dir = "data/games_slim"

print("Loading games from individual JSON files...")
for filename in os.listdir(games_dir):
    if filename.endswith(".json"):
        filepath = os.path.join(games_dir, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            game = json.load(f)
            all_games.append(game)

print(f"Loaded {len(all_games)} games from {games_dir}")

# Filter valid ratings
valid_ratings = {k: v for k, v in ratings_cache.items() if v is not None}

print(f"Total games: {len(all_games)}")
print(f"Total ratings cache entries: {len(ratings_cache)}")
print(f"Valid ratings (PEGI/ESRB): {len(valid_ratings)}")
print(f"Games without ratings: {len(ratings_cache) - len(valid_ratings)}")

Loading games from individual JSON files...
Loaded 145767 games from data/games_slim
Total games: 145767
Total ratings cache entries: 123567
Valid ratings (PEGI/ESRB): 14713
Games without ratings: 108854


## 2. Analyze Rating Sources and Distribution

In [3]:
# Analyze sources
source_counts = Counter(r['source'] for r in valid_ratings.values())
tier_counts = Counter(r['tier'] for r in valid_ratings.values())

print("Rating Sources:")
for source, count in source_counts.items():
    print(f"  {source}: {count} ({count/len(valid_ratings)*100:.1f}%)")

print("\nTier Distribution:")
tier_names = {0: "Everyone", 1: "Teen", 2: "Mature", 3: "Adults Only"}
for tier in sorted(tier_counts.keys()):
    count = tier_counts[tier]
    print(f"  Tier {tier} ({tier_names[tier]}): {count} ({count/len(valid_ratings)*100:.1f}%)")

Rating Sources:
  pegi: 11259 (76.5%)
  esrb: 3454 (23.5%)

Tier Distribution:
  Tier 0 (Everyone): 6847 (46.5%)
  Tier 1 (Teen): 4219 (28.7%)
  Tier 2 (Mature): 2563 (17.4%)
  Tier 3 (Adults Only): 1084 (7.4%)


## 3. Analyze and Unify Descriptors

PEGI has clean descriptors while ESRB has messy ones. We'll map ESRB descriptors to PEGI categories.

In [4]:
# Collect all unique descriptors by source
pegi_descriptors = set()
esrb_descriptors = set()

for rating in valid_ratings.values():
    descriptors = rating.get('descriptors', [])
    if rating['source'] == 'pegi':
        pegi_descriptors.update(descriptors)
    elif rating['source'] == 'esrb':
        esrb_descriptors.update(descriptors)

print("PEGI Descriptors (clean):")
for desc in sorted(pegi_descriptors):
    count = sum(1 for r in valid_ratings.values() 
                if r['source'] == 'pegi' and desc in r.get('descriptors', []))
    print(f"  {desc}: {count}")

print(f"\nESRB Descriptors (messy): {len(esrb_descriptors)} unique")
print("Sample ESRB descriptors:")
for desc in sorted(esrb_descriptors)[:30]:
    print(f"  {desc}")

PEGI Descriptors (clean):
  bad_language: 3168
  discrimination: 3
  drugs: 237
  fear: 719
  gambling: 150
  in-game-purchase: 955
  sex: 550
  violence: 6958

ESRB Descriptors (messy): 349 unique
Sample ESRB descriptors:
  "goo-like" environment
  ' 'big brother' surveillance
  a contract miner battling to find an energy source on a frozen planet
  a man shipwrecked in a mysterious country
  a mysterious fog corrupting the land
  a wrongly convicted man attempting to expose corruption
  alcohol
  alcohol reference
  alcohol reference.
  alcohol reference. also includes users interact
  alcohol reference. this is a narrative puzzle game in which players follow the story of a woman's life
  alcohol reference. this is a puzzle game in which players arrange multicolored barrels into matching groups of three
  alcohol reference. this is a strategy game in which players manage a series of small restaurants
  aliens in post-apocalyptic settings
  and a memory game
  and adventure
  and alco

## 4. Create ESRB to PEGI Descriptor Mapping

Map ESRB's messy descriptors to PEGI's clean categories:
- violence
- bad_language  
- fear
- gambling
- sex
- drugs
- discrimination
- in-game-purchase

In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# PEGI categories with expanded descriptions for better matching
PEGI_CATEGORIES = {
    'violence': 'violence, blood, gore, combat, fighting, killing, weapons, shooting, battles',
    'bad_language': 'profanity, swearing, offensive language, crude humor, vulgar jokes, inappropriate words',
    'fear': 'horror, scary, frightening, terror, disturbing, psychological horror, jump scares',
    'gambling': 'gambling, betting, casino games, poker, slots, lottery, wagering',
    'sex': 'sexual content, nudity, sexual themes, suggestive, erotic, adult themes, intimate scenes',
    'drugs': 'drugs, alcohol, tobacco, smoking, drinking, substance abuse, narcotics, intoxication',
    'discrimination': 'discrimination, racism, hate speech, prejudice, stereotypes, offensive content',
    'in-game-purchase': 'in-game purchases, microtransactions, loot boxes, paid content, DLC purchases'
}

print("Loading sentence transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast and efficient model

# Encode PEGI category descriptions
print("Encoding PEGI categories...")
pegi_names = list(PEGI_CATEGORIES.keys())
pegi_descriptions = list(PEGI_CATEGORIES.values())
pegi_embeddings = model.encode(pegi_descriptions, show_progress_bar=False)

# Get all unique ESRB descriptors
print("Collecting ESRB descriptors...")
all_esrb_descriptors = list(esrb_descriptors)

# Filter out very long descriptions (likely game descriptions, not content ratings)
short_esrb = [d for d in all_esrb_descriptors if len(d) <= 150]
print(f"Filtering ESRB descriptors: {len(all_esrb_descriptors)} -> {len(short_esrb)} (removed long descriptions)")

# Encode ESRB descriptors
print("Encoding ESRB descriptors...")
esrb_embeddings = model.encode(short_esrb, show_progress_bar=True)

# Calculate similarity matrix
print("Calculating similarity scores...")
similarity_matrix = cosine_similarity(esrb_embeddings, pegi_embeddings)

# Create automatic mapping
SIMILARITY_THRESHOLD = 0.3  # Minimum similarity to consider a match
esrb_to_pegi_similarity = {}

for i, esrb_desc in enumerate(short_esrb):
    # Get best matching PEGI category
    best_idx = np.argmax(similarity_matrix[i])
    best_score = similarity_matrix[i][best_idx]
    
    if best_score >= SIMILARITY_THRESHOLD:
        esrb_to_pegi_similarity[esrb_desc.lower()] = pegi_names[best_idx]
    else:
        esrb_to_pegi_similarity[esrb_desc.lower()] = None

# Show mapping statistics
mapped_count = sum(1 for v in esrb_to_pegi_similarity.values() if v is not None)
print(f"\nSimilarity-based mapping created:")
print(f"  Total ESRB descriptors: {len(short_esrb)}")
print(f"  Mapped: {mapped_count} ({mapped_count/len(short_esrb)*100:.1f}%)")
print(f"  Unmapped: {len(short_esrb) - mapped_count} ({(len(short_esrb) - mapped_count)/len(short_esrb)*100:.1f}%)")

# Show category distribution
category_dist = Counter(v for v in esrb_to_pegi_similarity.values() if v is not None)
print("\nMapped to categories:")
for cat, count in category_dist.most_common():
    print(f"  {cat}: {count}")

# Show some examples
print("\nSample mappings:")
sample_mappings = [(desc, esrb_to_pegi_similarity[desc.lower()]) 
                   for desc in sorted(short_esrb)[:20] 
                   if esrb_to_pegi_similarity[desc.lower()] is not None]
for desc, cat in sample_mappings[:15]:
    print(f"  '{desc[:60]}' -> {cat}")

Loading sentence transformer model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding PEGI categories...
Collecting ESRB descriptors...
Filtering ESRB descriptors: 349 -> 338 (removed long descriptions)
Encoding ESRB descriptors...


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating similarity scores...

Similarity-based mapping created:
  Total ESRB descriptors: 338
  Mapped: 227 (67.2%)
  Unmapped: 111 (32.8%)

Mapped to categories:
  violence: 125
  sex: 27
  drugs: 25
  bad_language: 23
  in-game-purchase: 14
  gambling: 10
  fear: 3

Sample mappings:
  'alcohol' -> drugs
  'alcohol reference' -> drugs
  'alcohol reference.' -> drugs
  'alcohol reference. also includes users interact' -> drugs
  'and a memory game' -> in-game-purchase
  'and adventure' -> violence
  'and alcohol reference' -> drugs
  'and alcohol reference.' -> drugs
  'and allied forces during world war ii' -> violence
  'and animated blood' -> violence


## 3.5. Build Similarity-Based ESRB→PEGI Mapping

Use sentence embeddings to automatically map ESRB descriptors to PEGI categories based on semantic similarity.

In [8]:
def normalize_esrb_descriptor(desc: str) -> str:
    """Normalize an ESRB descriptor to PEGI category using similarity mapping with pattern fallback."""
    desc_lower = desc.lower().strip()
    
    # Skip very long descriptions (likely game descriptions)
    if len(desc_lower) > 100:
        return None
    
    # Skip common filler phrases
    skip_phrases = [
        'this is a', 'this is an', 'players', 'player can',
        'also includes users interact', 'includes online features',
        'the game', 'this game'
    ]
    if any(phrase in desc_lower for phrase in skip_phrases):
        return None
    
    # Primary: Use similarity-based mapping if available
    if 'esrb_to_pegi_similarity' in globals() and desc_lower in esrb_to_pegi_similarity:
        result = esrb_to_pegi_similarity[desc_lower]
        if result is not None:
            return result
    
    # Fallback: Pattern-based mapping for edge cases
    # Violence patterns
    violence_words = ['violence', 'blood', 'gore', 'combat', 'fighting', 'weapon', 'shooting']
    if any(word in desc_lower for word in violence_words):
        return 'violence'
    
    # Language patterns
    language_words = ['language', 'profanity', 'humor']
    if any(word in desc_lower for word in language_words):
        return 'bad_language'
    
    # Sexual content patterns
    sex_words = ['sex', 'nudity', 'suggestive', 'sexual']
    if any(word in desc_lower for word in sex_words):
        return 'sex'
    
    # Substance patterns
    drug_words = ['drug', 'alcohol', 'tobacco', 'substance', 'smoking', 'drinking']
    if any(word in desc_lower for word in drug_words):
        return 'drugs'
    
    # Fear patterns
    fear_words = ['fear', 'horror', 'scary', 'frightening']
    if any(word in desc_lower for word in fear_words):
        return 'fear'
    
    # Gambling patterns
    if 'gambl' in desc_lower:
        return 'gambling'
    
    # Purchase patterns
    if 'purchase' in desc_lower or 'microtransaction' in desc_lower:
        return 'in-game-purchase'
    
    # Discrimination patterns
    if 'discrimination' in desc_lower or 'hate' in desc_lower:
        return 'discrimination'
    
    # Unknown - return None
    return None

# Test the hybrid mapping
print("Testing hybrid ESRB descriptor normalization:")
test_descriptors = [
    'intense violence',
    'blood and gore',
    'sexual themes',
    'use of drugs and alcohol',
    'strong language',
    'comic mischief',
    'partial nudity',
    'animated blood',
    'mild cartoon violence',
    'realistic blood and gore'
]
for desc in test_descriptors:
    normalized = normalize_esrb_descriptor(desc)
    source = 'similarity' if 'esrb_to_pegi_similarity' in globals() and desc.lower() in esrb_to_pegi_similarity and esrb_to_pegi_similarity[desc.lower()] is not None else 'pattern'
    normalized_str = str(normalized) if normalized else 'None'
    print(f"  {desc:40} -> {normalized_str:20} [{source}]")

Testing hybrid ESRB descriptor normalization:
  intense violence                         -> violence             [similarity]
  blood and gore                           -> violence             [pattern]
  sexual themes                            -> sex                  [similarity]
  use of drugs and alcohol                 -> drugs                [pattern]
  strong language                          -> bad_language         [similarity]
  comic mischief                           -> None                 [pattern]
  partial nudity                           -> sex                  [similarity]
  animated blood                           -> violence             [similarity]
  mild cartoon violence                    -> violence             [similarity]
  realistic blood and gore                 -> violence             [pattern]


In [7]:
# Apply normalization and check coverage
esrb_unmapped = set()
esrb_mapped_counts = Counter()

for rating in valid_ratings.values():
    if rating['source'] == 'esrb':
        for desc in rating.get('descriptors', []):
            normalized = normalize_esrb_descriptor(desc)
            if normalized:
                esrb_mapped_counts[normalized] += 1
            else:
                esrb_unmapped.add(desc)

print(f"ESRB descriptors mapped: {len(esrb_mapped_counts)}")
print(f"ESRB descriptors unmapped: {len(esrb_unmapped)}")

print("\nMapped ESRB descriptor counts:")
for desc, count in esrb_mapped_counts.most_common():
    print(f"  {desc}: {count}")

if esrb_unmapped:
    print("\nUnmapped ESRB descriptors (need attention):")
    for desc in sorted(esrb_unmapped)[:20]:
        print(f"  {desc}")

ESRB descriptors mapped: 7
ESRB descriptors unmapped: 202

Mapped ESRB descriptor counts:
  violence: 4061
  bad_language: 1083
  sex: 894
  drugs: 893
  gambling: 37
  in-game-purchase: 34
  fear: 2

Unmapped ESRB descriptors (need attention):
  "goo-like" environment
  ' 'big brother' surveillance
  a contract miner battling to find an energy source on a frozen planet
  a man shipwrecked in a mysterious country
  a mysterious fog corrupting the land
  a wrongly convicted man attempting to expose corruption
  alcohol reference. also includes users interact
  alcohol reference. this is a narrative puzzle game in which players follow the story of a woman's life
  alcohol reference. this is a puzzle game in which players arrange multicolored barrels into matching groups of three
  alcohol reference. this is a strategy game in which players manage a series of small restaurants
  aliens in post-apocalyptic settings
  and beat the competition
  and boars within different outdoor environment

## 5. Unified Descriptor System

Create unified descriptors for all ratings

In [9]:
def get_unified_descriptors(rating: Dict) -> List[str]:
    """Get unified PEGI-style descriptors from any rating source."""
    descriptors = rating.get('descriptors', [])
    
    if rating['source'] == 'pegi':
        # PEGI descriptors are already clean
        return descriptors
    elif rating['source'] == 'esrb':
        # Normalize ESRB descriptors
        normalized = []
        for desc in descriptors:
            unified = normalize_esrb_descriptor(desc)
            if unified and unified not in normalized:
                normalized.append(unified)
        return normalized
    
    return []

# Test unified descriptors
print("Sample unified descriptors:")
for app_id, rating in list(valid_ratings.items())[:10]:
    unified = get_unified_descriptors(rating)
    print(f"  {rating['source']}: {rating.get('descriptors', [])} -> {unified}")

Sample unified descriptors:
  pegi: ['violence'] -> ['violence']
  pegi: ['violence'] -> ['violence']
  esrb: ['blood', 'gore', 'and intense violence'] -> ['violence']
  esrb: ['blood', 'gore', 'intense violence', 'and language'] -> ['violence']
  pegi: [] -> []
  pegi: ['violence'] -> ['violence']
  esrb: ['blood', 'gore', 'drug reference', 'and intense violence'] -> ['violence', 'drugs']
  esrb: ['blood', 'gore', 'intense violence', 'and language'] -> ['violence']
  pegi: ['violence'] -> ['violence']
  esrb: ['blood', 'intense violence'] -> ['violence']


## 6. Theme Detection Based on Descriptors

Detect content themes based on descriptor combinations

In [10]:
def detect_themes(descriptors: List[str]) -> List[str]:
    """Detect content themes based on unified descriptors."""
    themes = []
    desc_set = set(descriptors)
    
    # Violence themes
    if 'violence' in desc_set:
        themes.append('violent')
    
    # Horror theme
    if 'fear' in desc_set:
        themes.append('horror')
    
    # Adult theme
    if 'sex' in desc_set or ('violence' in desc_set and 'drugs' in desc_set):
        themes.append('adult')
    
    # Substance use theme
    if 'drugs' in desc_set:
        themes.append('substance_use')
    
    # Mature language theme
    if 'bad_language' in desc_set:
        themes.append('mature_language')
    
    # Gambling theme
    if 'gambling' in desc_set:
        themes.append('gambling')
    
    # Monetization theme
    if 'in-game-purchase' in desc_set:
        themes.append('monetization')
    
    # Family-friendly (no problematic descriptors)
    problematic = {'violence', 'sex', 'drugs', 'bad_language', 'fear', 'discrimination'}
    if not desc_set.intersection(problematic):
        themes.append('family_friendly')
    
    return themes

# Test theme detection
print("Testing theme detection:")
test_cases = [
    ['violence'],
    ['violence', 'drugs', 'sex'],
    ['fear'],
    ['in-game-purchase'],
    [],
]
for descriptors in test_cases:
    themes = detect_themes(descriptors)
    print(f"  {descriptors} -> {themes}")

Testing theme detection:
  ['violence'] -> ['violent']
  ['violence', 'drugs', 'sex'] -> ['violent', 'adult', 'substance_use']
  ['fear'] -> ['horror']
  ['in-game-purchase'] -> ['monetization', 'family_friendly']
  [] -> ['family_friendly']


## 7. Generate Unified Labels with All Information

In [11]:
# Create unified labels for all games
unified_labels = {}
tier_names = {0: "Everyone", 1: "Teen", 2: "Mature", 3: "Adults Only"}

for app_id, rating in valid_ratings.items():
    # Get unified descriptors and themes
    unified_descriptors = get_unified_descriptors(rating)
    themes = detect_themes(unified_descriptors)
    
    # Create label entry
    unified_labels[app_id] = {
        "tier": rating['tier'],
        "tier_label": tier_names[rating['tier']],
        "source": rating['source'],
        "original_rating": rating.get('age_rating') if rating['source'] == 'pegi' else rating.get('rating'),
        "descriptors": unified_descriptors,
        "themes": themes,
        "found_title": rating.get('found_title'),
    }

print(f"Generated unified labels for {len(unified_labels)} games")
print(f"\nSample labels:")
for app_id, label in list(unified_labels.items())[:5]:
    print(f"\nApp ID {app_id}:")
    print(f"  Tier: {label['tier']} ({label['tier_label']})")
    print(f"  Source: {label['source']} ({label['original_rating']})")
    print(f"  Descriptors: {label['descriptors']}")
    print(f"  Themes: {label['themes']}")

Generated unified labels for 14713 games

Sample labels:

App ID 10:
  Tier: 3 (Adults Only)
  Source: pegi (18)
  Descriptors: ['violence']
  Themes: ['violent']

App ID 30:
  Tier: 1 (Teen)
  Source: pegi (12)
  Descriptors: ['violence']
  Themes: ['violent']

App ID 20:
  Tier: 2 (Mature)
  Source: esrb (M)
  Descriptors: ['violence']
  Themes: ['violent']

App ID 50:
  Tier: 2 (Mature)
  Source: esrb (M)
  Descriptors: ['violence']
  Themes: ['violent']

App ID 60:
  Tier: 0 (Everyone)
  Source: pegi (3)
  Descriptors: []
  Themes: ['family_friendly']


## 8. Statistics on Unified Labels

In [12]:
# Descriptor frequency
all_descriptors = []
all_themes = []
for label in unified_labels.values():
    all_descriptors.extend(label['descriptors'])
    all_themes.extend(label['themes'])

descriptor_counts = Counter(all_descriptors)
theme_counts = Counter(all_themes)

print("Unified Descriptor Frequency:")
for desc, count in descriptor_counts.most_common():
    print(f"  {desc}: {count} ({count/len(unified_labels)*100:.1f}%)")

print("\nTheme Frequency:")
for theme, count in theme_counts.most_common():
    print(f"  {theme}: {count} ({count/len(unified_labels)*100:.1f}%)")

print("\nTier Distribution:")
tier_dist = Counter(label['tier'] for label in unified_labels.values())
for tier in sorted(tier_dist.keys()):
    count = tier_dist[tier]
    print(f"  Tier {tier} ({tier_names[tier]}): {count} ({count/len(unified_labels)*100:.1f}%)")

Unified Descriptor Frequency:
  violence: 9449 (64.2%)
  bad_language: 4191 (28.5%)
  sex: 1338 (9.1%)
  in-game-purchase: 989 (6.7%)
  drugs: 926 (6.3%)
  fear: 721 (4.9%)
  gambling: 187 (1.3%)
  discrimination: 3 (0.0%)

Theme Frequency:
  violent: 9449 (64.2%)
  mature_language: 4191 (28.5%)
  family_friendly: 3612 (24.5%)
  adult: 1756 (11.9%)
  monetization: 989 (6.7%)
  substance_use: 926 (6.3%)
  horror: 721 (4.9%)
  gambling: 187 (1.3%)

Tier Distribution:
  Tier 0 (Everyone): 6847 (46.5%)
  Tier 1 (Teen): 4219 (28.7%)
  Tier 2 (Mature): 2563 (17.4%)
  Tier 3 (Adults Only): 1084 (7.4%)


## 9. Export Unified Labels to JSON

In [13]:
# Export unified labels
output_file = 'data/unified_maturity_labels.json'

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(unified_labels, f, indent=2, ensure_ascii=False)

print(f"Exported {len(unified_labels)} unified labels to {output_file}")

# Also create a summary statistics file
summary = {
    "total_games_labeled": len(unified_labels),
    "source_distribution": dict(Counter(l['source'] for l in unified_labels.values())),
    "tier_distribution": dict(Counter(l['tier'] for l in unified_labels.values())),
    "descriptor_frequency": dict(descriptor_counts),
    "theme_frequency": dict(theme_counts),
    "tier_names": tier_names,
    "available_descriptors": sorted(list(set(all_descriptors))),
    "available_themes": sorted(list(set(all_themes))),
}

summary_file = 'data/label_statistics.json'
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print(f"Exported summary statistics to {summary_file}")

Exported 14713 unified labels to data/unified_maturity_labels.json
Exported summary statistics to data/label_statistics.json


## 10. Create Training-Ready Dataset

Merge game data with unified labels for model training

In [14]:
# Create a mapping of app_id to game data
games_dict = {str(game['app_id']): game for game in all_games}

# Create training dataset
training_data = []

for app_id, label in unified_labels.items():
    if app_id in games_dict:
        game = games_dict[app_id]
        
        # Combine game data with label
        training_entry = {
            "app_id": int(app_id),
            "title": game.get('title'),
            "description": game.get('description'),
            "about_this_game": game.get('about_this_game'),
            "mature_content": game.get('mature_content'),
            "tags": game.get('tags', []),
            "genres": game.get('genres', []),
            "developers": game.get('developers', []),
            "publishers": game.get('publishers', []),
            "release_date": game.get('release_date'),
            "price": game.get('price'),
            "positive_reviews": game.get('positive_reviews', 0),
            "negative_reviews": game.get('negative_reviews', 0),
            # Labels
            "maturity_tier": label['tier'],
            "maturity_label": label['tier_label'],
            "rating_source": label['source'],
            "original_rating": label['original_rating'],
            "content_descriptors": label['descriptors'],
            "content_themes": label['themes'],
            "is_mature": label['tier'] >= 2,  # Binary for backwards compatibility
        }
        
        training_data.append(training_entry)

print(f"Created training dataset with {len(training_data)} games")

# Export training dataset
training_file = 'data/training_dataset_with_labels.json'
with open(training_file, 'w', encoding='utf-8') as f:
    json.dump(training_data, f, indent=2, ensure_ascii=False)

print(f"Exported training dataset to {training_file}")

# Show sample
print("\nSample training entry:")
import pprint
pprint.pprint(training_data[0], width=100, compact=False)

Created training dataset with 14713 games
Exported training dataset to data/training_dataset_with_labels.json

Sample training entry:
{'about_this_game': 'About This Game\n'
                    "\t\t\t\t\t\t\tPlay the world's number 1 online action game. Engage in an "
                    'incredibly realistic brand of terrorist warfare in this wildly popular '
                    'team-based game. Ally with teammates to complete strategic missions. Take out '
                    "enemy sites. Rescue hostages. Your role affects your team's success. Your "
                    "team's success affects your role.",
 'app_id': 10,
 'content_descriptors': ['violence'],
 'content_themes': ['violent'],
 'description': "Play the world's number 1 online action game. Engage in an incredibly realistic "
                'brand of terrorist warfare in this wildly popular team-based game. Ally with '
                'teammates to complete strategic missions. Take out enemy sites. Rescue hostages. '
 

## Summary

Created unified labeling system with:
- **Standardized tiers**: 0-3 (Everyone, Teen, Mature, Adults Only)
- **Unified descriptors**: ESRB descriptors mapped to PEGI categories
- **Themes**: Automatically detected from descriptor combinations
- **Source tracking**: Original rating source and value preserved

Output files:
- `unified_maturity_labels.json`: Labels only
- `label_statistics.json`: Summary statistics
- `training_dataset_with_labels.json`: Full dataset ready for model training