In [19]:
# Global parameters
NUM_CLUSTERS = 10  # Reduced from 25 to create more balanced clusters
MAX_GAMES = 200000

# Set to True to force retraining even if saved models exist
FORCE_RETRAIN = False

In [2]:
import glob
import json
import os

index = 0

all_games = []
for file_path in glob.glob('data/games_slim/*.json'):
    if len(all_games) >= MAX_GAMES:
        break

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    index += 1
    if index % 100 == 0:
        print(f"Loaded {index} files...", end='\r')

    all_games.append(data)

print(f"Loaded {len(all_games)} games (limited to {MAX_GAMES})")

Loaded 145767 games (limited to 200000)


In [3]:
unique_games = {}

for game in all_games:
    title = (game.get("title") or "").strip().lower()
    devs = tuple(sorted((game.get("developer") or [])))
    pubs = tuple(sorted((game.get("publisher") or [])))
    key = (title, devs, pubs)

    current_best = unique_games.get(key)
    if current_best is None:
        unique_games[key] = game
    else:
        # Keep the one with the highest review_count
        if (game.get("review_count") or 0) > (current_best.get("review_count") or 0):
            unique_games[key] = game

deduped_games = list(unique_games.values())

print(f"After deduplication: {len(deduped_games)} unique games (from {len(all_games)} total)")
all_games = deduped_games

After deduplication: 123567 unique games (from 145767 total)


In [4]:
prices = []
for game in all_games:
    price_str = game.get('price', '')
    if price_str:
        if 'free' in price_str.lower():
            prices.append(0.0)
        else:
            price_clean = price_str.replace('zł', '').replace(',', '.').strip()
            try:
                price = float(price_clean)
                prices.append(price)
            except ValueError:
                prices.append(None)
    else:
        prices.append(None)

print(f"Extracted prices for {len([p for p in prices if p is not None])} games")

Extracted prices for 89960 games


### Clustering

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

texts = []
titles = []
for game in all_games:
    text = (game.get('about_this_game', '') + ' ' + 
            game.get('description', ''))
    texts.append(text)
    titles.append(game.get('title', 'Unknown'))

print(f"Prepared {len(texts)} text documents")

# Vectorize text
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(texts)
print(f"Vectorized to shape: {X.shape}")

# Cluster
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X)
print("Clustering completed")

# Calculate silhouette score
# Commented since it's slow to compute
# sil_score = silhouette_score(X, clusters)
# print(f"Silhouette Score: {sil_score:.3f}")

# Filter top 1000 games by review_count for visualization only
top_indices = sorted(range(len(all_games)), key=lambda i: all_games[i].get('review_count') or 0, reverse=True)[:1000]

pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X[top_indices].toarray())

df = pd.DataFrame({
    'x': X_pca[:, 0],
    'y': X_pca[:, 1],
    'z': X_pca[:, 2],
    'cluster': clusters[top_indices],
    'title': [titles[i] for i in top_indices]
})

fig = px.scatter_3d(df, x='x', y='y', z='z', color='cluster', 
                    hover_data=['title'], 
                    title='Game Clusters Based on Text Content (3D PCA) - Top 1000 by Review Count',
                    labels={'x': 'PCA Component 1', 'y': 'PCA Component 2', 'z': 'PCA Component 3'},
                    size_max=3, opacity=0.6)
from IPython.display import display
display(fig)

Prepared 123567 text documents
Vectorized to shape: (123567, 1000)
Clustering completed


In [6]:
feature_names = vectorizer.get_feature_names_out()

cluster_topics = {}
for cluster_id in range(NUM_CLUSTERS):
    cluster_indices = [i for i, c in enumerate(clusters) if c == cluster_id]
    cluster_tfidf = X[cluster_indices]
    avg_tfidf = cluster_tfidf.mean(axis=0).A1  # average TF-IDF per word
    top_indices = avg_tfidf.argsort()[-10:][::-1]  # top 10 words
    top_words = [feature_names[i] for i in top_indices]
    cluster_topics[cluster_id] = top_words

    # Get cluster games and sort by review_count
    cluster_games = [game for game, c in zip(all_games, clusters) if c == cluster_id]
    sorted_games = sorted(cluster_games, key=lambda g: g.get('review_count') or 0, reverse=True)

    # Take up to 10 unique titles
    sample = []
    seen_titles = set()
    for g in sorted_games:
        title = g['title']
        if title not in seen_titles:
            sample.append(g)
            seen_titles.add(title)
        if len(sample) >= 10:
            break

    print(f"Cluster {cluster_id}")
    print(f"({', '.join(top_words[:5])})")  # Show top 5 words as topics
    print(f"{len(cluster_games)} games")

    for i, g in enumerate(sample, 1):
        rating = g.get('review_score', 'N/A')
        votes = g.get('review_count') or 0
        print(f"{i}. {g['title']} (rating: {rating}, votes: {votes})")
    print()

Cluster 0
(vr, virtual, game, experience, reality)
3889 games
1. VRChat (rating: 69.0, votes: 166100)
2. Half-Life: Alyx (rating: 97.0, votes: 58910)
3. Pavlov (rating: 53.0, votes: 30799)
4. BONEWORKS (rating: 93.0, votes: 29194)
5. Hot Dogs, Horseshoes & Hand Grenades (rating: 96.0, votes: 18766)
6. VTOL VR (rating: 97.0, votes: 15880)
7. High On Life (rating: 85.0, votes: 11884)
8. FIVE NIGHTS AT FREDDY'S: HELP WANTED (rating: 94.0, votes: 7330)
9. SEGA Mega Drive and Genesis Classics (rating: 56.0, votes: 7099)
10. The Elder Scrolls V: Skyrim VR (rating: 77.0, votes: 7086)

Cluster 1
(cards, card, deck, game, play)
2562 games
1. Balatro (rating: 97.0, votes: 90659)
2. Inscryption (rating: 96.0, votes: 72307)
3. Slay the Spire (rating: 96.0, votes: 68394)
4. Yu-Gi-Oh! Master Duel (rating: 49.0, votes: 45546)
5. TCG Card Shop Simulator (rating: 94.0, votes: 29331)
6. UNO (rating: 37.0, votes: 25495)
7. Minion Masters (rating: 75.0, votes: 24495)
8. FragPunk (rating: 77.0, votes: 2145

### Classifier

In [None]:
import json

with open("mature_labels.json", "r", encoding="utf-8") as f:
    mature_labels = json.load(f)

In [None]:
def prepare_data_for_cluster(all_games, clusters, cluster_id):
    texts = []
    labels = []

    for game, c in zip(all_games, clusters):
        if c != cluster_id:
            continue

        appid = str(game["app_id"])
        if appid not in mature_labels:
            continue

        label = mature_labels[appid]

        text = game.get("about_this_game") or ""
        if not text.strip():
            continue

        texts.append(text)
        labels.append(int(bool(label)))

    return texts, labels

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle

def train_classifier(texts, labels, cluster_id):
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    model = Pipeline([
        ("tfidf", TfidfVectorizer(
            max_features=10000,
            ngram_range=(1, 2),
            stop_words="english"
        )),
        ("clf", LogisticRegression(max_iter=2000))
    ])

    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    print(classification_report(y_test, pred))

    with open(f"classificators/mature_classifier_cluster_{cluster_id}.pkl", "wb") as f:
        pickle.dump(model, f)
    
    return model

In [None]:
def predict_game_mature(game, model):
    text = game.get("about_this_game", "") or ""
    return model.predict([text])[0]

def classifier_stats(model, cluster_id):
    errors = 0
    total = 0

    for game, c in zip(all_games, clusters):
        if c != cluster_id:
            continue

        appid = str(game["app_id"])
        if appid not in mature_labels:
            continue

        label = mature_labels[appid]
        total += 1
        pred_label = predict_game_mature(game, model)
        if pred_label != label:
            errors += 1
            # print(f"AppID: {appid}, Title: {game['title']}")
            # print(f"True Label: {label}, Predicted: {pred_label}")
            # print()    

    print(f"Total games checked: {total}, Errors: {errors}, Error Rate: {errors/total:.2%}")

In [None]:
# for cluster_id in range(NUM_CLUSTERS):
#     print(f"Training classifier for cluster {cluster_id}...")
#     texts, labels = prepare_data_for_cluster(all_games, clusters, cluster_id)
#     if len(texts) < 10:
#         print(f"Not enough data for cluster {cluster_id}, skipping...")
#         continue

#     model = train_classifier(texts, labels, cluster_id)
#     classifier_stats(model, cluster_id)
#     print()

## Transformer Analysis with Unified PEGI/ESRB Labels

Using the unified maturity labels from official PEGI/ESRB ratings.
Training on 14,713 games with verified ratings (76.5% PEGI, 23.5% ESRB).

Classification approach:
- **Multi-class Tier Prediction**: Predict exact age rating tier (0-3)
  - Tier 0: Everyone
  - Tier 1: Teen  
  - Tier 2: Mature
  - Tier 3: Adults Only
- **Data**: Games with official PEGI/ESRB ratings only

Comparing two approaches:
1. **Global Model**: Direct tier classification from text
2. **Cluster-Aware Model**: Cluster assignment followed by per-cluster tier classification

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np
from tqdm import tqdm

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [8]:
# Load unified maturity labels (PEGI/ESRB)
with open("data/unified_maturity_labels.json", "r", encoding="utf-8") as f:
    unified_labels = json.load(f)

print(f"Loaded {len(unified_labels)} unified maturity labels from PEGI/ESRB ratings")

# Prepare dataset for transformer training
def prepare_transformer_data(all_games, unified_labels, use_description=True):
    """Prepare texts and labels for transformer training using unified PEGI/ESRB data."""
    texts = []
    labels = []
    app_ids = []
    tiers = []
    sources = []
    
    for game in all_games:
        appid = str(game["app_id"])
        if appid not in unified_labels:
            continue
        
        label_data = unified_labels[appid]
        tier = label_data['tier']
        
        about = game.get("about_this_game") or ""
        desc = game.get("description") or ""
        
        if use_description:
            text = f"{about} {desc}".strip()
        else:
            text = about.strip()
        
        if not text:
            continue
        
        texts.append(text)
        # Multi-class classification: Predict tier directly (0-3)
        # Tier 0 (Everyone), Tier 1 (Teen), Tier 2 (Mature), Tier 3 (Adults Only)
        labels.append(tier)
        app_ids.append(appid)
        tiers.append(tier)
        sources.append(label_data['source'])
    
    return texts, labels, app_ids, tiers, sources

# Prepare the full dataset
all_texts, all_labels, all_app_ids, all_tiers, all_sources = prepare_transformer_data(all_games, unified_labels)

print(f"\nTotal samples for transformer: {len(all_texts)}")
print(f"Task: 4-class tier prediction (0=Everyone, 1=Teen, 2=Mature, 3=Adults Only)")

# Show tier distribution
from collections import Counter
tier_counts = Counter(all_tiers)
tier_names = {0: "Everyone", 1: "Teen", 2: "Mature", 3: "Adults Only"}
print("\nTier distribution:")
for tier in sorted(tier_counts.keys()):
    count = tier_counts[tier]
    print(f"  Tier {tier} ({tier_names[tier]}): {count} ({100*count/len(all_tiers):.1f}%)")

# Show source distribution
source_counts = Counter(all_sources)
print("\nSource distribution:")
for source, count in source_counts.items():
    print(f"  {source.upper()}: {count} ({100*count/len(all_sources):.1f}%)")

Loaded 14713 unified maturity labels from PEGI/ESRB ratings

Total samples for transformer: 13519
Task: 4-class tier prediction (0=Everyone, 1=Teen, 2=Mature, 3=Adults Only)

Tier distribution:
  Tier 0 (Everyone): 6360 (47.0%)
  Tier 1 (Teen): 3847 (28.5%)
  Tier 2 (Mature): 2318 (17.1%)
  Tier 3 (Adults Only): 994 (7.4%)

Source distribution:
  PEGI: 10340 (76.5%)
  ESRB: 3179 (23.5%)


In [9]:
# Custom Dataset for game text classification
class GameTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Simple Transformer Classifier using DistilBERT
import logging
# Suppress the expected warning about unused weights
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

class TransformerClassifier(nn.Module):
    def __init__(self, num_classes=4, dropout=0.5):  # Default to 4 classes for tier prediction
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        # Add additional dropout layer
        self.dropout2 = nn.Dropout(dropout * 0.8)  # Secondary dropout
        # Add intermediate layer for better regularization
        hidden_size = self.bert.config.hidden_size
        self.intermediate = nn.Linear(hidden_size, hidden_size // 2)
        self.classifier = nn.Linear(hidden_size // 2, num_classes)
        self.relu = nn.ReLU()
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0, :]  # CLS token
        pooled = self.dropout(pooled)
        # Add intermediate layer with activation and dropout
        hidden = self.intermediate(pooled)
        hidden = self.relu(hidden)
        hidden = self.dropout2(hidden)
        logits = self.classifier(hidden)
        return logits

print("Tokenizer loaded")

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


Tokenizer loaded


In [15]:
def train_transformer(model, train_loader, val_loader, epochs=3, lr=2e-5, class_weights=None):
    """Train a transformer model and return training history with early stopping."""
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    
    # Use class weights if provided (for imbalanced data)
    if class_weights is not None:
        class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
        criterion = nn.CrossEntropyLoss(weight=class_weights)
    else:
        criterion = nn.CrossEntropyLoss()
    
    # Learning rate scheduler - reduce LR when validation loss plateaus
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2, min_lr=1e-7
    )
    
    history = {'train_loss': [], 'val_loss': [], 'val_acc': [], 'val_f1': []}
    
    # Early stopping parameters
    best_val_loss = float('inf')
    patience_counter = 0
    early_stop_patience = 4  # Stop if no improvement for 4 epochs
    best_model_state = None
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_losses = []
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            
            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            
            train_losses.append(loss.item())
        
        avg_train_loss = np.mean(train_losses)
        history['train_loss'].append(avg_train_loss)
        
        # Validation
        model.eval()
        val_losses = []
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} - Validation"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)
                
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                val_losses.append(loss.item())
                
                preds = torch.argmax(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        avg_val_loss = np.mean(val_losses)
        val_acc = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='macro')  # Use macro for multi-class
        
        history['val_loss'].append(avg_val_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)
        
        # Step the scheduler
        scheduler.step(avg_val_loss)
        
        # Early stopping check
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            # Save best model state
            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            print(f"Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}, Val Acc={val_acc:.4f}, Val F1={val_f1:.4f} ✓ (best)")
        else:
            patience_counter += 1
            print(f"Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}, Val Acc={val_acc:.4f}, Val F1={val_f1:.4f} (no improvement {patience_counter}/{early_stop_patience})")
            
            if patience_counter >= early_stop_patience:
                print(f"\nEarly stopping triggered after epoch {epoch+1}")
                print(f"Best validation loss: {best_val_loss:.4f}")
                break
    
    # Restore best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        model = model.to(device)
        print("Restored best model from validation")
    
    return model, history

def evaluate_model(model, test_loader):
    """Evaluate model and return predictions and true labels."""
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    return all_preds, all_labels


### Model 1: Global Transformer

In [16]:
# Split data for global model with stratified splits to preserve tier distribution
X_train_global, X_test_global, y_train_global, y_test_global = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42, stratify=all_labels
)

X_train_global, X_val_global, y_train_global, y_val_global = train_test_split(
    X_train_global, y_train_global, test_size=0.1, random_state=42, stratify=y_train_global
)

print(f"Global Model - Train: {len(X_train_global)}, Val: {len(X_val_global)}, Test: {len(X_test_global)}")

# Show tier distribution in each split
from collections import Counter
tier_names = {0: "Everyone", 1: "Teen", 2: "Mature", 3: "Adults Only"}
for split_name, split_labels in [("Train", y_train_global), ("Val", y_val_global), ("Test", y_test_global)]:
    tier_counts = Counter(split_labels)
    print(f"\n{split_name} distribution:")
    for tier in sorted(tier_counts.keys()):
        count = tier_counts[tier]
        print(f"  Tier {tier} ({tier_names[tier]}): {count} ({100*count/len(split_labels):.1f}%)")

# Create datasets and dataloaders
BATCH_SIZE = 16
MAX_LENGTH = 512  # DistilBERT maximum - captures 85.7% of descriptions fully (14.3% truncated)

train_dataset_global = GameTextDataset(X_train_global, y_train_global, tokenizer, MAX_LENGTH)
val_dataset_global = GameTextDataset(X_val_global, y_val_global, tokenizer, MAX_LENGTH)
test_dataset_global = GameTextDataset(X_test_global, y_test_global, tokenizer, MAX_LENGTH)

train_loader_global = DataLoader(train_dataset_global, batch_size=BATCH_SIZE, shuffle=True)
val_loader_global = DataLoader(val_dataset_global, batch_size=BATCH_SIZE)
test_loader_global = DataLoader(test_dataset_global, batch_size=BATCH_SIZE)

print(f"Dataloaders created - Train batches: {len(train_loader_global)}, Val batches: {len(val_loader_global)}")
print(f"Using MAX_LENGTH={MAX_LENGTH} tokens (DistilBERT maximum)")


Global Model - Train: 9733, Val: 1082, Test: 2704

Train distribution:
  Tier 0 (Everyone): 4579 (47.0%)
  Tier 1 (Teen): 2770 (28.5%)
  Tier 2 (Mature): 1669 (17.1%)
  Tier 3 (Adults Only): 715 (7.3%)

Val distribution:
  Tier 0 (Everyone): 509 (47.0%)
  Tier 1 (Teen): 308 (28.5%)
  Tier 2 (Mature): 185 (17.1%)
  Tier 3 (Adults Only): 80 (7.4%)

Test distribution:
  Tier 0 (Everyone): 1272 (47.0%)
  Tier 1 (Teen): 769 (28.4%)
  Tier 2 (Mature): 464 (17.2%)
  Tier 3 (Adults Only): 199 (7.4%)
Dataloaders created - Train batches: 609, Val batches: 68
Using MAX_LENGTH=512 tokens (DistilBERT maximum)


In [12]:
# Check token lengths in the dataset
print("Analyzing token lengths in game descriptions...")
text_lengths = []
for text in all_texts[:1000]:  # Sample first 1000
    tokens = tokenizer(text, truncation=False)['input_ids']
    text_lengths.append(len(tokens))

import numpy as np
print(f"\nToken length statistics (n={len(text_lengths)}):")
print(f"  Min: {min(text_lengths)}")
print(f"  Max: {max(text_lengths)}")
print(f"  Mean: {np.mean(text_lengths):.1f}")
print(f"  Median: {np.median(text_lengths):.1f}")
print(f"  75th percentile: {np.percentile(text_lengths, 75):.1f}")
print(f"  90th percentile: {np.percentile(text_lengths, 90):.1f}")
print(f"  95th percentile: {np.percentile(text_lengths, 95):.1f}")
print(f"  99th percentile: {np.percentile(text_lengths, 99):.1f}")

print(f"\n% of texts that would be truncated at different lengths:")
for limit in [256, 512, 768, 1024]:
    pct = sum(1 for l in text_lengths if l > limit) / len(text_lengths) * 100
    avg_lost = np.mean([max(0, l - limit) for l in text_lengths if l > limit]) if any(l > limit for l in text_lengths) else 0
    print(f"  {limit} tokens: {pct:.1f}% truncated (avg {avg_lost:.0f} tokens lost)")


Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


Analyzing token lengths in game descriptions...

Token length statistics (n=1000):
  Min: 11
  Max: 8055
  Mean: 350.7
  Median: 296.0
  75th percentile: 418.0
  90th percentile: 584.2
  95th percentile: 683.4
  99th percentile: 1111.1

% of texts that would be truncated at different lengths:
  256 tokens: 61.2% truncated (avg 200 tokens lost)
  512 tokens: 15.2% truncated (avg 240 tokens lost)
  768 tokens: 3.0% truncated (avg 559 tokens lost)
  1024 tokens: 1.3% truncated (avg 905 tokens lost)


In [20]:
# Train or load global transformer model
global_model_path = "classificators/transformer_global_model_v3_tier.pt"  # New version for tier prediction

if not FORCE_RETRAIN and os.path.exists(global_model_path):
    print("Loading existing Global Transformer Model...")
    global_model = TransformerClassifier(num_classes=4)  # Changed to 4 for tier prediction
    global_model.load_state_dict(torch.load(global_model_path, map_location=device))
    global_model = global_model.to(device)
    # Create dummy history for compatibility
    global_history = {'train_loss': [], 'val_loss': [], 'val_acc': [], 'val_f1': []}
    print("Global model loaded!")
else:
    print("Training Global Transformer Model for tier prediction...")
    print("Changes: Dropout 0.5, Early stopping, Gradient clipping, Intermediate layer")
    print("Task: 4-class classification (Tier 0-3)")
    
    # Calculate class weights for imbalanced data (lighter weights to reduce overfitting)
    n_samples = len(y_train_global)
    tier_counts = Counter(y_train_global)
    # Reduce weight impact by using sqrt
    class_weights = []
    for tier in range(4):
        if tier in tier_counts:
            weight = (n_samples / (4 * tier_counts[tier])) ** 0.5
        else:
            weight = 1.0
        class_weights.append(weight)
    print(f"Class weights (reduced):")
    for tier, weight in enumerate(class_weights):
        print(f"  Tier {tier} ({tier_names[tier]}): {weight:.2f}")
    
    global_model = TransformerClassifier(num_classes=4, dropout=0.5)
    global_model, global_history = train_transformer(
        global_model, 
        train_loader_global, 
        val_loader_global, 
        epochs=1,  # Max epochs, but early stopping will kick in
        lr=5e-6,  # Reduced from 2e-5 for more stable training
        class_weights=class_weights
    )
    # Save the global model
    torch.save(global_model.state_dict(), global_model_path)
    print("Global model saved!")

Loading existing Global Transformer Model...


The following layers were not sharded: transformer.layer.*.output_layer_norm.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.ffn.lin*.bias, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.ffn.lin*.weight, embeddings.LayerNorm.weight, transformer.layer.*.attention.q_lin.bias, embeddings.position_embeddings.weight, embeddings.word_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.attention.out_lin.bias, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.sa_layer_norm.bias, embeddings.LayerNorm.bias, transformer.layer.*.attention.k_lin.weight, transformer.layer.*.attention.v_lin.bias


Global model loaded!


In [21]:
# Evaluate global model on test set
global_preds, global_true = evaluate_model(global_model, test_loader_global)
print("\n=== Global Transformer Model Results (Tier Prediction) ===")
tier_names_list = ['Everyone (T0)', 'Teen (T1)', 'Mature (T2)', 'Adults Only (T3)']
print(classification_report(global_true, global_preds, target_names=tier_names_list))

global_accuracy = accuracy_score(global_true, global_preds)
global_f1 = f1_score(global_true, global_preds, average='macro')  # Use macro for multi-class
print(f"Test Accuracy: {global_accuracy:.4f}")
print(f"Test Macro F1 Score: {global_f1:.4f}")

# Baseline: predict majority class
majority_class_global = Counter(global_true).most_common(1)[0][0]
baseline_preds_global = [majority_class_global] * len(global_true)
baseline_acc_global = accuracy_score(global_true, baseline_preds_global)
baseline_f1_global = f1_score(global_true, baseline_preds_global, average='macro')

print(f"\n--- Baseline (Majority Class: {tier_names[majority_class_global]}) ---")
print(f"Baseline Accuracy: {baseline_acc_global:.4f}")
print(f"Baseline Macro F1 Score: {baseline_f1_global:.4f}")
print(f"Improvement over baseline: Acc={global_accuracy - baseline_acc_global:+.4f}, F1={global_f1 - baseline_f1_global:+.4f}")

Evaluating: 100%|██████████| 169/169 [00:38<00:00,  4.44it/s]


=== Global Transformer Model Results (Tier Prediction) ===
                  precision    recall  f1-score   support

   Everyone (T0)       0.70      0.78      0.74      1272
       Teen (T1)       0.47      0.53      0.50       769
     Mature (T2)       0.41      0.32      0.36       464
Adults Only (T3)       0.50      0.16      0.24       199

        accuracy                           0.59      2704
       macro avg       0.52      0.45      0.46      2704
    weighted avg       0.57      0.59      0.57      2704

Test Accuracy: 0.5854
Test Macro F1 Score: 0.4603

--- Baseline (Majority Class: Everyone) ---
Baseline Accuracy: 0.4704
Baseline Macro F1 Score: 0.1600
Improvement over baseline: Acc=+0.1150, F1=+0.3004





### Model 2: Cluster-Aware Transformer

In [None]:
# Create a mapping from app_id to cluster
appid_to_cluster = {}
appid_to_game = {}

for game, cluster_id in zip(all_games, clusters):
    appid = str(game["app_id"])
    appid_to_cluster[appid] = cluster_id
    appid_to_game[appid] = game

# Prepare data organized by clusters
def prepare_cluster_data(all_texts, all_labels, all_app_ids, appid_to_cluster):
    """Organize data by cluster."""
    cluster_data = {}
    
    for text, label, appid in zip(all_texts, all_labels, all_app_ids):
        if appid not in appid_to_cluster:
            continue
        
        cluster_id = appid_to_cluster[appid]
        
        if cluster_id not in cluster_data:
            cluster_data[cluster_id] = {'texts': [], 'labels': [], 'app_ids': []}
        
        cluster_data[cluster_id]['texts'].append(text)
        cluster_data[cluster_id]['labels'].append(label)
        cluster_data[cluster_id]['app_ids'].append(appid)
    
    return cluster_data

cluster_data = prepare_cluster_data(all_texts, all_labels, all_app_ids, appid_to_cluster)

print("Cluster data distribution:")
for cluster_id in sorted(cluster_data.keys()):
    n_samples = len(cluster_data[cluster_id]['texts'])
    n_mature = sum(cluster_data[cluster_id]['labels'])
    print(f"  Cluster {cluster_id}: {n_samples} samples ({n_mature} mature, {n_samples-n_mature} non-mature)")

Cluster data distribution:
  Cluster 0: 2920 samples (1881 mature, 1039 non-mature)
  Cluster 1: 174 samples (135 mature, 39 non-mature)
  Cluster 2: 2896 samples (3075 mature, -179 non-mature)
  Cluster 3: 855 samples (1072 mature, -217 non-mature)
  Cluster 4: 765 samples (428 mature, 337 non-mature)
  Cluster 5: 1474 samples (1157 mature, 317 non-mature)
  Cluster 6: 2084 samples (1539 mature, 545 non-mature)
  Cluster 7: 31 samples (29 mature, 2 non-mature)
  Cluster 8: 3161 samples (2984 mature, 177 non-mature)
  Cluster 9: 353 samples (297 mature, 56 non-mature)


In [None]:
# Train or load transformer models for each cluster
MIN_SAMPLES_PER_CLUSTER = 50  # Minimum samples needed to train a cluster model
CLUSTER_EPOCHS = 1  # Fewer epochs per cluster since we have less data

cluster_models = {}
cluster_histories = {}

for cluster_id in sorted(cluster_data.keys()):
    texts = cluster_data[cluster_id]['texts']
    labels = cluster_data[cluster_id]['labels']
    
    # Skip clusters with too few samples
    if len(texts) < MIN_SAMPLES_PER_CLUSTER:
        print(f"\nCluster {cluster_id}: Skipping (only {len(texts)} samples)")
        continue
    
    # Check if we have multiple classes
    unique_labels = set(labels)
    if len(unique_labels) < 2:
        print(f"\nCluster {cluster_id}: Skipping (only one class present)")
        continue
    
    model_path = f"classificators/transformer_cluster_{cluster_id}_model_tier.pt"
    
    # Split data (needed for both training and evaluation)
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, test_size=0.2, random_state=42, stratify=labels
        )
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
        )
            
    except ValueError as e:
        print(f"Cluster {cluster_id}: Skipping due to stratification error")
        continue
    
    # Check if model already exists
    if not FORCE_RETRAIN and os.path.exists(model_path):
        print(f"\nCluster {cluster_id}: Loading existing model ({len(texts)} samples)")
        cluster_models[cluster_id] = {
            'model_path': model_path,
            'test_data': {'texts': X_test, 'labels': y_test}
        }
        cluster_histories[cluster_id] = {'train_loss': [], 'val_loss': [], 'val_acc': [], 'val_f1': []}
        continue
    
    print(f"\n{'='*50}")
    print(f"Training Cluster {cluster_id} Model ({len(texts)} samples)")
    print(f"{'='*50}")
    
    # Calculate class weights for this cluster
    n_samples_cluster = len(y_train)
    tier_counts_cluster = Counter(y_train)
    cluster_class_weights = []
    for tier in range(4):
        if tier in tier_counts_cluster:
            weight = (n_samples_cluster / (4 * tier_counts_cluster[tier])) ** 0.5
        else:
            weight = 1.0
        cluster_class_weights.append(weight)
    
    # Create datasets
    train_dataset = GameTextDataset(X_train, y_train, tokenizer, MAX_LENGTH)
    val_dataset = GameTextDataset(X_val, y_val, tokenizer, MAX_LENGTH)
    test_dataset = GameTextDataset(X_test, y_test, tokenizer, MAX_LENGTH)
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    # Train model with class weights for tier prediction
    model = TransformerClassifier(num_classes=4)
    model, history = train_transformer(model, train_loader, val_loader, epochs=CLUSTER_EPOCHS, lr=2e-5, class_weights=cluster_class_weights)
    
    # Move model to CPU and save immediately to free GPU memory
    model = model.cpu()
    torch.save(model.state_dict(), model_path)
    
    cluster_models[cluster_id] = {
        'model_path': model_path,
        'test_data': {'texts': X_test, 'labels': y_test}
    }
    cluster_histories[cluster_id] = history
    
    # Clear GPU memory
    del model, train_dataset, val_dataset, test_dataset
    del train_loader, val_loader, test_loader
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    import gc
    gc.collect()

print(f"\n\nTotal cluster models available: {len(cluster_models)}")


Training Cluster 0 Model (2920 samples)


The following layers were not sharded: transformer.layer.*.attention.q_lin.weight, embeddings.LayerNorm.bias, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.ffn.lin*.bias, transformer.layer.*.attention.k_lin.weight, transformer.layer.*.output_layer_norm.weight, transformer.layer.*.sa_layer_norm.weight, embeddings.position_embeddings.weight, embeddings.word_embeddings.weight, transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.attention.v_lin.bias

The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.


The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.

Epoch 1/15 - Training: 100%|██████████████████████████

Epoch 1: Train Loss=1.3029, Val Loss=1.2818, Val Acc=0.5897, Val F1=0.1855 ✓ (best)


Epoch 2/15 - Training: 100%|██████████████████████████████████████████████████████████| 132/132 [00:18<00:00,  7.05it/s]
Epoch 2/15 - Training: 100%|██████████████████████████████████████████████████████████| 132/132 [00:18<00:00,  7.05it/s]
Epoch 2/15 - Validation: 100%|██████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 14.41it/s]



Epoch 2: Train Loss=1.2632, Val Loss=1.2475, Val Acc=0.5855, Val F1=0.2703 ✓ (best)


Epoch 3/15 - Training: 100%|██████████████████████████████████████████████████████████| 132/132 [00:18<00:00,  6.99it/s]
Epoch 3/15 - Training: 100%|██████████████████████████████████████████████████████████| 132/132 [00:18<00:00,  6.99it/s]
Epoch 3/15 - Validation: 100%|██████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 15.32it/s]
Epoch 3/15 - Validation: 100%|██████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 15.32it/s]


Epoch 3: Train Loss=1.1685, Val Loss=1.2564, Val Acc=0.5470, Val F1=0.2804 (no improvement 1/4)


Epoch 4/15 - Training: 100%|██████████████████████████████████████████████████████████| 132/132 [00:20<00:00,  6.33it/s]
Epoch 4/15 - Training: 100%|██████████████████████████████████████████████████████████| 132/132 [00:20<00:00,  6.33it/s]
Epoch 4/15 - Validation: 100%|██████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 15.28it/s]
Epoch 4/15 - Validation: 100%|██████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 15.28it/s]


Epoch 4: Train Loss=1.0411, Val Loss=1.3652, Val Acc=0.5598, Val F1=0.3025 (no improvement 2/4)


Epoch 5/15 - Training: 100%|██████████████████████████████████████████████████████████| 132/132 [00:18<00:00,  6.98it/s]
Epoch 5/15 - Training: 100%|██████████████████████████████████████████████████████████| 132/132 [00:18<00:00,  6.98it/s]
Epoch 5/15 - Validation: 100%|██████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 14.71it/s]
Epoch 5/15 - Validation: 100%|██████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 14.71it/s]


Epoch 5: Train Loss=0.8496, Val Loss=1.4328, Val Acc=0.5256, Val F1=0.3289 (no improvement 3/4)


Epoch 6/15 - Training: 100%|██████████████████████████████████████████████████████████| 132/132 [00:20<00:00,  6.50it/s]
Epoch 6/15 - Training: 100%|██████████████████████████████████████████████████████████| 132/132 [00:20<00:00,  6.50it/s]
Epoch 6/15 - Validation: 100%|██████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 14.31it/s]



Epoch 6: Train Loss=0.6340, Val Loss=1.7022, Val Acc=0.5427, Val F1=0.2843 (no improvement 4/4)

Early stopping triggered after epoch 6
Best validation loss: 1.2475
Restored best model from validation

Training Cluster 1 Model (174 samples)

Training Cluster 1 Model (174 samples)


The following layers were not sharded: transformer.layer.*.attention.q_lin.weight, embeddings.LayerNorm.bias, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.ffn.lin*.bias, transformer.layer.*.attention.k_lin.weight, transformer.layer.*.output_layer_norm.weight, transformer.layer.*.sa_layer_norm.weight, embeddings.position_embeddings.weight, embeddings.word_embeddings.weight, transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.attention.v_lin.bias

The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.


The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.

Epoch 1/15 - Training: 100%|██████████████████████████

Epoch 1: Train Loss=1.3966, Val Loss=1.3196, Val Acc=0.5000, Val F1=0.1667 ✓ (best)


Epoch 2/15 - Training: 100%|██████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.64it/s]
Epoch 2/15 - Validation:   0%|                                                                    | 0/1 [00:00<?, ?it/s]
Epoch 2/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.29it/s]



Epoch 2: Train Loss=1.2913, Val Loss=1.2811, Val Acc=0.5000, Val F1=0.1667 ✓ (best)


Epoch 3/15 - Training: 100%|██████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.70it/s]
Epoch 3/15 - Validation:   0%|                                                                    | 0/1 [00:00<?, ?it/s]
Epoch 3/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.61it/s]



Epoch 3: Train Loss=1.3149, Val Loss=1.2718, Val Acc=0.5000, Val F1=0.1667 ✓ (best)


Epoch 4/15 - Training: 100%|██████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.74it/s]
Epoch 4/15 - Validation:   0%|                                                                    | 0/1 [00:00<?, ?it/s]
Epoch 4/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.20it/s]



Epoch 4: Train Loss=1.2972, Val Loss=1.2687, Val Acc=0.5000, Val F1=0.1667 ✓ (best)


Epoch 5/15 - Training: 100%|██████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.70it/s]
Epoch 5/15 - Validation:   0%|                                                                    | 0/1 [00:00<?, ?it/s]
Epoch 5/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.67it/s]



Epoch 5: Train Loss=1.3275, Val Loss=1.2674, Val Acc=0.5000, Val F1=0.1667 ✓ (best)


Epoch 6/15 - Training: 100%|██████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.69it/s]
Epoch 6/15 - Validation:   0%|                                                                    | 0/1 [00:00<?, ?it/s]
Epoch 6/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.34it/s]
Epoch 6/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.34it/s]


Epoch 6: Train Loss=1.2941, Val Loss=1.2763, Val Acc=0.5000, Val F1=0.1667 (no improvement 1/4)


Epoch 7/15 - Training: 100%|██████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.71it/s]
Epoch 7/15 - Validation:   0%|                                                                    | 0/1 [00:00<?, ?it/s]
Epoch 7/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.46it/s]
Epoch 7/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.46it/s]


Epoch 7: Train Loss=1.2943, Val Loss=1.2750, Val Acc=0.5000, Val F1=0.1667 (no improvement 2/4)


Epoch 8/15 - Training: 100%|██████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.71it/s]
Epoch 8/15 - Validation:   0%|                                                                    | 0/1 [00:00<?, ?it/s]
Epoch 8/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.63it/s]
Epoch 8/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.63it/s]


Epoch 8: Train Loss=1.2637, Val Loss=1.2715, Val Acc=0.5000, Val F1=0.1667 (no improvement 3/4)


Epoch 9/15 - Training: 100%|██████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.66it/s]
Epoch 9/15 - Validation:   0%|                                                                    | 0/1 [00:00<?, ?it/s]
Epoch 9/15 - Validation: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.54it/s]



Epoch 9: Train Loss=1.2447, Val Loss=1.2664, Val Acc=0.5000, Val F1=0.1667 ✓ (best)


Epoch 10/15 - Training: 100%|█████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.74it/s]
Epoch 10/15 - Validation:   0%|                                                                   | 0/1 [00:00<?, ?it/s]
Epoch 10/15 - Validation: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.39it/s]



Epoch 10: Train Loss=1.2401, Val Loss=1.2634, Val Acc=0.5000, Val F1=0.1667 ✓ (best)


Epoch 11/15 - Training: 100%|█████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.67it/s]
Epoch 11/15 - Validation:   0%|                                                                   | 0/1 [00:00<?, ?it/s]
Epoch 11/15 - Validation: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.21it/s]



Epoch 11: Train Loss=1.1930, Val Loss=1.2614, Val Acc=0.5000, Val F1=0.1667 ✓ (best)


Epoch 12/15 - Training: 100%|█████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.46it/s]
Epoch 12/15 - Validation:   0%|                                                                   | 0/1 [00:00<?, ?it/s]
Epoch 12/15 - Validation: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.17it/s]



Epoch 12: Train Loss=1.1486, Val Loss=1.2568, Val Acc=0.5000, Val F1=0.1667 ✓ (best)


Epoch 13/15 - Training: 100%|████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, -14.18it/s]
Epoch 13/15 - Training: 100%|████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, -14.18it/s]
Epoch 13/15 - Validation: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.83it/s]
Epoch 13/15 - Validation: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.83it/s]


Epoch 13: Train Loss=1.1799, Val Loss=1.2640, Val Acc=0.5000, Val F1=0.1750 (no improvement 1/4)


Epoch 14/15 - Training: 100%|█████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.43it/s]
Epoch 14/15 - Validation:   0%|                                                                   | 0/1 [00:00<?, ?it/s]
Epoch 14/15 - Validation: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.81it/s]
Epoch 14/15 - Validation: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.81it/s]


Epoch 14: Train Loss=1.1029, Val Loss=1.2713, Val Acc=0.5714, Val F1=0.2556 (no improvement 2/4)


Epoch 15/15 - Training: 100%|█████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.52it/s]
Epoch 15/15 - Validation:   0%|                                                                   | 0/1 [00:00<?, ?it/s]
Epoch 15/15 - Validation: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.75it/s]



Epoch 15: Train Loss=1.0654, Val Loss=1.2734, Val Acc=0.5000, Val F1=0.1750 (no improvement 3/4)
Restored best model from validation

Training Cluster 2 Model (2896 samples)

Training Cluster 2 Model (2896 samples)


The following layers were not sharded: transformer.layer.*.attention.q_lin.weight, embeddings.LayerNorm.bias, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.ffn.lin*.bias, transformer.layer.*.attention.k_lin.weight, transformer.layer.*.output_layer_norm.weight, transformer.layer.*.sa_layer_norm.weight, embeddings.position_embeddings.weight, embeddings.word_embeddings.weight, transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.attention.v_lin.bias

The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.


The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.

Epoch 1/15 - Training: 100%|██████████████████████████

Epoch 1: Train Loss=1.3490, Val Loss=1.2927, Val Acc=0.4569, Val F1=0.3583 ✓ (best)


Epoch 2/15 - Training: 100%|██████████████████████████████████████████████████████████| 131/131 [00:20<00:00,  6.40it/s]
Epoch 2/15 - Training: 100%|██████████████████████████████████████████████████████████| 131/131 [00:20<00:00,  6.40it/s]
Epoch 2/15 - Validation: 100%|██████████████████████████████████████████████████████████| 15/15 [00:01<00:00, 12.02it/s]



Epoch 2: Train Loss=1.2585, Val Loss=1.2923, Val Acc=0.3966, Val F1=0.3126 ✓ (best)


Epoch 3/15 - Training:  60%|███████████████████████████████████▌                       | 79/131 [00:12<00:07,  6.52it/s]



KeyboardInterrupt: 

In [13]:
# Evaluate all cluster models and aggregate results
print("\n=== Cluster-Aware Transformer Model Results ===\n")

# Calculate baseline (majority class) metrics
baseline_preds_cluster = []
all_cluster_preds = []
all_cluster_true = []
cluster_results = {}

for cluster_id, data in cluster_models.items():
    # Load model from disk to evaluate
    model = TransformerClassifier(num_classes=4)
    model.load_state_dict(torch.load(data['model_path'], map_location=device))
    model = model.to(device)
    
    # Create test loader
    test_texts = data['test_data']['texts']
    test_labels = data['test_data']['labels']
    test_dataset = GameTextDataset(test_texts, test_labels, tokenizer, MAX_LENGTH)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    preds, true_labels = evaluate_model(model, test_loader)
    all_cluster_preds.extend(preds)
    all_cluster_true.extend(true_labels)
    
    # Baseline: predict majority class for this cluster
    majority_class = Counter(test_labels).most_common(1)[0][0]
    baseline_preds_cluster.extend([majority_class] * len(true_labels))
    
    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average='macro')  # Use macro for multi-class
    
    cluster_results[cluster_id] = {
        'accuracy': acc,
        'f1': f1,
        'n_samples': len(true_labels)
    }
    
    print(f"Cluster {cluster_id}: Accuracy={acc:.4f}, F1={f1:.4f}, Samples={len(true_labels)}")
    
    # Free memory
    del model, test_dataset, test_loader
    torch.cuda.empty_cache() if torch.cuda.is_available() else None

# Overall cluster-aware performance
if all_cluster_preds:
    cluster_accuracy = accuracy_score(all_cluster_true, all_cluster_preds)
    cluster_f1 = f1_score(all_cluster_true, all_cluster_preds, average='macro')  # Use macro for multi-class
    
    # Baseline metrics
    baseline_acc_cluster = accuracy_score(all_cluster_true, baseline_preds_cluster)
    baseline_f1_cluster = f1_score(all_cluster_true, baseline_preds_cluster, average='macro')
    
    print("\n" + "="*50)
    print("AGGREGATED CLUSTER-AWARE RESULTS (Tier Prediction)")
    print("="*50)
    print(classification_report(all_cluster_true, all_cluster_preds, target_names=tier_names_list))
    print(f"Overall Accuracy: {cluster_accuracy:.4f}")
    print(f"Overall F1 Score: {cluster_f1:.4f}")
    print(f"\n--- Baseline (Majority Class) ---")
    print(f"Baseline Accuracy: {baseline_acc_cluster:.4f}")
    print(f"Baseline F1 Score: {baseline_f1_cluster:.4f}")
    print(f"Improvement over baseline: Acc={cluster_accuracy - baseline_acc_cluster:+.4f}, F1={cluster_f1 - baseline_f1_cluster:+.4f}")


=== Cluster-Aware Transformer Model Results ===



The following layers were not sharded: transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.attention.k_lin.weight, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.output_layer_norm.weight, embeddings.position_embeddings.weight, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.ffn.lin*.bias, embeddings.word_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.out_lin.bias
Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 37/37 [00:02<00:00, 12.61it/s]



Cluster 0: Accuracy=0.5736, F1=0.2833, Samples=584


The following layers were not sharded: transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.attention.k_lin.weight, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.output_layer_norm.weight, embeddings.position_embeddings.weight, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.ffn.lin*.bias, embeddings.word_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.out_lin.bias
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 16.72it/s]



Cluster 1: Accuracy=0.5714, F1=0.2830, Samples=35


The following layers were not sharded: transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.attention.k_lin.weight, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.output_layer_norm.weight, embeddings.position_embeddings.weight, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.ffn.lin*.bias, embeddings.word_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.out_lin.bias
Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 37/37 [00:03<00:00, 11.96it/s]
Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 37/37 [00:03<00:00, 11.96it/s

Cluster 2: Accuracy=0.4328, F1=0.3383, Samples=580


The following layers were not sharded: transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.attention.k_lin.weight, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.output_layer_norm.weight, embeddings.position_embeddings.weight, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.ffn.lin*.bias, embeddings.word_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.out_lin.bias
Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 12.74it/s]



Cluster 3: Accuracy=0.3977, F1=0.2939, Samples=171


The following layers were not sharded: transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.attention.k_lin.weight, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.output_layer_norm.weight, embeddings.position_embeddings.weight, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.ffn.lin*.bias, embeddings.word_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.out_lin.bias
Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 14.16it/s]



Cluster 4: Accuracy=0.6275, F1=0.1928, Samples=153


The following layers were not sharded: transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.attention.k_lin.weight, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.output_layer_norm.weight, embeddings.position_embeddings.weight, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.ffn.lin*.bias, embeddings.word_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.out_lin.bias
Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 19/19 [00:01<00:00, 11.96it/s]



Cluster 5: Accuracy=0.5220, F1=0.2838, Samples=295


The following layers were not sharded: transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.attention.k_lin.weight, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.output_layer_norm.weight, embeddings.position_embeddings.weight, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.ffn.lin*.bias, embeddings.word_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.out_lin.bias
Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 27/27 [00:02<00:00, 11.97it/s]



Cluster 6: Accuracy=0.6163, F1=0.3929, Samples=417


The following layers were not sharded: transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.attention.k_lin.weight, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.output_layer_norm.weight, embeddings.position_embeddings.weight, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.ffn.lin*.bias, embeddings.word_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.out_lin.bias
Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 40/40 [00:03<00:00, 12.54it/s]



Cluster 8: Accuracy=0.4771, F1=0.3567, Samples=633


The following layers were not sharded: transformer.layer.*.ffn.lin*.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.k_lin.bias, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.attention.k_lin.weight, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.output_layer_norm.weight, embeddings.position_embeddings.weight, transformer.layer.*.attention.q_lin.weight, transformer.layer.*.ffn.lin*.bias, embeddings.word_embeddings.weight, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.out_lin.bias
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 13.93it/s]
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 13.93it/s

Cluster 9: Accuracy=0.4930, F1=0.2242, Samples=71

AGGREGATED CLUSTER-AWARE RESULTS (Tier Prediction)


NameError: name 'tier_names_list' is not defined

### Results Comparison

In [14]:
# Create comparison summary with baseline
comparison_data = {
    'Model': ['Baseline (Majority)', 'Global Transformer', 'Cluster-Aware Transformer'],
    'Accuracy': [baseline_acc_global, global_accuracy, cluster_accuracy if all_cluster_preds else 0],
    'F1 Score (Macro)': [baseline_f1_global, global_f1, cluster_f1 if all_cluster_preds else 0],
    'Test Samples': [len(global_true), len(global_true), len(all_cluster_true)]
}

comparison_df = pd.DataFrame(comparison_data)
print("\n" + "="*60)
print("TRANSFORMER MODEL COMPARISON (Tier Prediction 0-3)")
print("="*60)
print(comparison_df.to_string(index=False))
print()
print("Note: Baseline predicts majority class (most common tier) in test set.")
print()

# Determine winner
if all_cluster_preds:
    acc_diff = cluster_accuracy - global_accuracy
    f1_diff = cluster_f1 - global_f1
    
    print(f"Global vs Baseline: Acc={global_accuracy - 0.5:+.4f}, F1={global_f1 - 0.5:+.4f}")
    print(f"Cluster vs Baseline: Acc={cluster_accuracy - 0.5:+.4f}, F1={cluster_f1 - 0.5:+.4f}")
    print(f"\nAccuracy Difference (Cluster - Global): {acc_diff:+.4f} ({'Cluster-Aware better' if acc_diff > 0 else 'Global better' if acc_diff < 0 else 'Tie'})")
    print(f"F1 Score Difference (Cluster - Global): {f1_diff:+.4f} ({'Cluster-Aware better' if f1_diff > 0 else 'Global better' if f1_diff < 0 else 'Tie'})")

NameError: name 'baseline_acc_global' is not defined

In [15]:
# Visualization: Bar chart comparison
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2, subplot_titles=('Accuracy Comparison', 'F1 Score Comparison'))

# Accuracy bars
fig.add_trace(
    go.Bar(name='Global', x=['Accuracy'], y=[global_accuracy], marker_color='steelblue'),
    row=1, col=1
)
fig.add_trace(
    go.Bar(name='Cluster-Aware', x=['Accuracy'], y=[cluster_accuracy if all_cluster_preds else 0], marker_color='coral'),
    row=1, col=1
)

# F1 Score bars  
fig.add_trace(
    go.Bar(name='Global', x=['F1 Score'], y=[global_f1], marker_color='steelblue', showlegend=False),
    row=1, col=2
)
fig.add_trace(
    go.Bar(name='Cluster-Aware', x=['F1 Score'], y=[cluster_f1 if all_cluster_preds else 0], marker_color='coral', showlegend=False),
    row=1, col=2
)

fig.update_layout(
    title='Transformer Model Performance Comparison',
    barmode='group',
    height=400
)
fig.update_yaxes(range=[0, 1])

display(fig)

NameError: name 'global_accuracy' is not defined

In [51]:
# Training loss curves comparison
fig = make_subplots(rows=1, cols=2, subplot_titles=('Training Loss', 'Validation Loss'))

# Global model training curve
epochs_global = list(range(1, len(global_history['train_loss']) + 1))
fig.add_trace(
    go.Scatter(x=epochs_global, y=global_history['train_loss'], name='Global Train', line=dict(color='steelblue')),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=epochs_global, y=global_history['val_loss'], name='Global Val', line=dict(color='steelblue', dash='dash')),
    row=1, col=2
)

# Average cluster model training curves
if cluster_histories:
    avg_train_loss = []
    avg_val_loss = []
    max_epochs = max(len(h['train_loss']) for h in cluster_histories.values())
    
    for epoch_idx in range(max_epochs):
        train_losses = [h['train_loss'][epoch_idx] for h in cluster_histories.values() if epoch_idx < len(h['train_loss'])]
        val_losses = [h['val_loss'][epoch_idx] for h in cluster_histories.values() if epoch_idx < len(h['val_loss'])]
        avg_train_loss.append(np.mean(train_losses))
        avg_val_loss.append(np.mean(val_losses))
    
    epochs_cluster = list(range(1, len(avg_train_loss) + 1))
    fig.add_trace(
        go.Scatter(x=epochs_cluster, y=avg_train_loss, name='Cluster Avg Train', line=dict(color='coral')),
        row=1, col=1
    )
    fig.add_trace(
        go.Scatter(x=epochs_cluster, y=avg_val_loss, name='Cluster Avg Val', line=dict(color='coral', dash='dash')),
        row=1, col=2
    )

fig.update_layout(title='Training Curves Comparison', height=400)
fig.update_xaxes(title_text='Epoch')
fig.update_yaxes(title_text='Loss')

display(fig)

In [52]:
# Per-cluster performance visualization
if cluster_results:
    cluster_ids = list(cluster_results.keys())
    accuracies = [cluster_results[c]['accuracy'] for c in cluster_ids]
    f1_scores = [cluster_results[c]['f1'] for c in cluster_ids]
    n_samples = [cluster_results[c]['n_samples'] for c in cluster_ids]
    
    fig = make_subplots(rows=2, cols=1, subplot_titles=('Per-Cluster Accuracy', 'Per-Cluster F1 Score'))
    
    # Accuracy per cluster
    fig.add_trace(
        go.Bar(
            x=[f"Cluster {c}" for c in cluster_ids], 
            y=accuracies, 
            text=[f"{a:.2f}" for a in accuracies],
            textposition='outside',
            marker_color='steelblue'
        ),
        row=1, col=1
    )
    
    # Add global accuracy line
    fig.add_hline(y=global_accuracy, line_dash="dash", line_color="red", 
                  annotation_text=f"Global: {global_accuracy:.2f}", row=1, col=1)
    
    # F1 Score per cluster
    fig.add_trace(
        go.Bar(
            x=[f"Cluster {c}" for c in cluster_ids], 
            y=f1_scores, 
            text=[f"{f:.2f}" for f in f1_scores],
            textposition='outside',
            marker_color='coral'
        ),
        row=2, col=1
    )
    
    # Add global F1 line
    fig.add_hline(y=global_f1, line_dash="dash", line_color="red", 
                  annotation_text=f"Global: {global_f1:.2f}", row=2, col=1)
    
    fig.update_layout(
        title='Cluster-Specific Transformer Performance vs Global Model',
        height=700,
        showlegend=False
    )
    fig.update_yaxes(range=[0, 1.1])
    
    display(fig)

In [16]:
# Summary
print("="*50)
print("SUMMARY - TIER PREDICTION (0-3)")
print("="*50)

print(f"\nDataset: {len(all_texts)} games with tier labels")
tier_distribution = Counter(all_labels)
for tier in sorted(tier_distribution.keys()):
    count = tier_distribution[tier]
    print(f"  Tier {tier} ({tier_names[tier]}): {count} ({100*count/len(all_labels):.1f}%)")
print(f"\nClusters: {NUM_CLUSTERS} total, {len(cluster_models)} with models")

print(f"\nGlobal Model:        Acc={global_accuracy:.4f}, F1={global_f1:.4f}")

if all_cluster_preds:
    print(f"Cluster-Aware Model: Acc={cluster_accuracy:.4f}, F1={cluster_f1:.4f}")
    
    acc_diff = cluster_accuracy - global_accuracy
    f1_diff = cluster_f1 - global_f1
    
    print(f"\nDifference: Acc={acc_diff:+.4f}, F1={f1_diff:+.4f}")
    winner = "Cluster-Aware" if (acc_diff + f1_diff) > 0 else "Global"
    print(f"Better model: {winner}")

print("="*50)

SUMMARY - TIER PREDICTION (0-3)

Dataset: 14713 games with tier labels
  Tier 0 (Everyone): 6847 (46.5%)
  Tier 1 (Teen): 4219 (28.7%)
  Tier 2 (Mature): 2563 (17.4%)
  Tier 3 (Adults Only): 1084 (7.4%)

Clusters: 10 total, 9 with models


NameError: name 'global_accuracy' is not defined

### Evaluation on Top 100 Games by Review Count

In [25]:
# Evaluate both classifiers on top 100 games by review count
print("="*60)
print("EVALUATION ON TOP 100 GAMES BY REVIEW COUNT")
print("="*60)

# Get top 100 games by review count that have unified labels
top_100_games = sorted(
    [g for g in all_games if str(g["app_id"]) in unified_labels],
    key=lambda g: g.get('review_count') or 0,
    reverse=True
)[:100]

print(f"\nEvaluating on {len(top_100_games)} top games\n")

# Prepare data for evaluation
top_100_results = []

for game in top_100_games:
    appid = str(game["app_id"])
    label_data = unified_labels[appid]
    tier = label_data['tier']
    true_label = tier  # Use tier directly (0-3)
    
    about = game.get("about_this_game") or ""
    desc = game.get("description") or ""
    text = f"{about} {desc}".strip()
    
    if not text:
        continue
    
    # Global model prediction
    global_model.eval()
    encoding = tokenizer(text, truncation=True, padding='max_length', max_length=MAX_LENGTH, return_tensors='pt')
    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        output = global_model(input_ids, attention_mask)
        global_pred = torch.argmax(output, dim=1).item()
    
    # Cluster-aware model prediction
    # cluster_id = appid_to_cluster.get(appid)
    # if cluster_id is not None and cluster_id in cluster_models:
    #     # Load cluster model
    #     cluster_model = TransformerClassifier(num_classes=4)
    #     cluster_model.load_state_dict(torch.load(cluster_models[cluster_id]['model_path'], map_location=device))
    #     cluster_model = cluster_model.to(device)
    #     cluster_model.eval()
        
    #     with torch.no_grad():
    #         output = cluster_model(input_ids, attention_mask)
    #         cluster_pred = torch.argmax(output, dim=1).item()
        
    #     del cluster_model
    #     torch.cuda.empty_cache() if torch.cuda.is_available() else None
    # else:
    #     cluster_pred = None
    
    top_100_results.append({
        'title': game.get('title', 'Unknown'),
        'app_id': appid,
        'review_count': game.get('review_count') or 0,
        'tier': tier,
        'tier_label': label_data['tier_label'],
        'source': label_data['source'],
        'true_label': true_label,
        'global_pred': global_pred,
        # 'cluster_pred': cluster_pred,
        # 'cluster_id': cluster_id
    })

# Create results DataFrame
results_df = pd.DataFrame(top_100_results)
tier_names = {0: "Everyone", 1: "Teen", 2: "Mature", 3: "Adults Only"}
results_df['true_label_str'] = results_df['true_label'].map(tier_names)
results_df['global_pred_str'] = results_df['global_pred'].map(tier_names)
# results_df['cluster_pred_str'] = results_df['cluster_pred'].map({**tier_names, None: 'N/A'})
results_df['global_correct'] = results_df['true_label'] == results_df['global_pred']
# results_df['cluster_correct'] = results_df.apply(
#     lambda r: r['true_label'] == r['cluster_pred'] if r['cluster_pred'] is not None else None, axis=1
# )
# Summary statistics - show tier distribution
n_games = len(results_df)
tier_dist = results_df['true_label'].value_counts().sort_index()
print(f"\nTop 100 Tier Distribution:")
for tier in sorted(tier_dist.index):
    count = tier_dist[tier]
    print(f"  Tier {tier} ({tier_names[tier]}): {count} ({100*count/n_games:.1f}%)")

# Show source distribution in top 100
source_counts_top100 = results_df['source'].value_counts()
print("\nTop 100 Source Distribution:")
for source, count in source_counts_top100.items():
    print(f"  {source.upper()}: {count} ({100*count/n_games:.1f}%)")

# Baseline: predict majority class
majority_class_top100 = Counter(results_df['true_label']).most_common(1)[0][0]
baseline_preds_top100 = [majority_class_top100] * n_games
baseline_correct_top100 = sum(1 for true_label in results_df['true_label'] if true_label == majority_class_top100)
print(f"\n=== Baseline (Majority Class: {tier_names[majority_class_top100]}) ===")
print(f"Correct: {baseline_correct_top100}/{n_games} ({100*baseline_correct_top100/n_games:.1f}%)")

print("\n=== Global Model on Top 100 ===")
global_correct = results_df['global_correct'].sum()
print(f"Correct: {global_correct}/{n_games} ({100*global_correct/n_games:.1f}%)")
print(f"Improvement over baseline: {global_correct - baseline_correct_top100:+d} ({100*(global_correct - baseline_correct_top100)/n_games:+.1f}%)")

# cluster_evaluated = results_df['cluster_pred'].notna().sum()
# cluster_correct = results_df['cluster_correct'].sum()
# print(f"\n=== Cluster-Aware Model on Top 100 ===")
# print(f"Games with cluster model: {cluster_evaluated}/{n_games}")
# if cluster_evaluated > 0:
#     print(f"Correct: {cluster_correct}/{cluster_evaluated} ({100*cluster_correct/cluster_evaluated:.1f}%)")

# Show misclassified games
print("\n" + "="*60)
print("MISCLASSIFIED GAMES (Global Model)")
print("="*60)
misclassified = results_df[~results_df['global_correct']].head(20)
for _, row in misclassified.iterrows():
    tier_info = f"T{row['tier']}({row['tier_label'][:3]})"
    print(f"• {row['title'][:45]:<45} | {tier_info:<8} | True: {row['true_label_str']:<12} | Pred: {row['global_pred_str']}")

# Show sample of correct predictions
print("\n" + "="*60)
print("CORRECTLY CLASSIFIED TOP GAMES (Sample)")
print("="*60)
correct = results_df[results_df['global_correct']].head(10)
for _, row in correct.iterrows():
    tier_info = f"T{row['tier']}({row['tier_label'][:3]})"
    print(f"✓ {row['title'][:45]:<45} | {tier_info:<8} | {row['true_label_str']}")

# Display full table
print("\n" + "="*60)
print("FULL RESULTS TABLE")
print("="*60)
display_cols = ['title', 'review_count', 'tier', 'tier_label', 'source', 'true_label_str', 'global_pred_str', 'global_correct']
print(results_df[display_cols].to_string(index=False))

EVALUATION ON TOP 100 GAMES BY REVIEW COUNT

Evaluating on 100 top games


Top 100 Tier Distribution:
  Tier 0 (Everyone): 19 (19.0%)
  Tier 1 (Teen): 23 (23.0%)
  Tier 2 (Mature): 32 (32.0%)
  Tier 3 (Adults Only): 26 (26.0%)

Top 100 Source Distribution:
  PEGI: 83 (83.0%)
  ESRB: 17 (17.0%)

=== Baseline (Majority Class: Mature) ===
Correct: 32/100 (32.0%)

=== Global Model on Top 100 ===
Correct: 55/100 (55.0%)
Improvement over baseline: +23 (+23.0%)

MISCLASSIFIED GAMES (Global Model)
• Team Fortress 2                               | T2(Mat)  | True: Mature       | Pred: Teen
• Rust                                          | T3(Adu)  | True: Adults Only  | Pred: Teen
• Baldur's Gate 3                               | T3(Adu)  | True: Adults Only  | Pred: Mature
• Destiny 2                                     | T2(Mat)  | True: Mature       | Pred: Teen
• Stardew Valley                                | T1(Tee)  | True: Teen         | Pred: Everyone
• Cyberpunk 2077                  