In [None]:
!pip install --upgrade gdown

import gdown

file_id = '12MIVNeZ8Dek4iGUyzOeES6CQa-3f8aU3'
url = f'https://drive.google.com/uc?id={file_id}'

output = 'data.zip'
gdown.download(url, output, quiet=False)

!unzip -q data.zip -d /content/data/

print("Pobieranie i rozpakowywanie zakończone!")



Downloading...
From (original): https://drive.google.com/uc?id=12MIVNeZ8Dek4iGUyzOeES6CQa-3f8aU3
From (redirected): https://drive.google.com/uc?id=12MIVNeZ8Dek4iGUyzOeES6CQa-3f8aU3&confirm=t&uuid=984ff92a-2917-4876-82c7-789a5f314e9d
To: /content/data.zip
100%|██████████| 263M/263M [00:12<00:00, 20.5MB/s]


Pobieranie i rozpakowywanie zakończone!


In [None]:
import glob
import json
import os

DATA_PATH = 'data/games_slim/**/*.json'

MAX_GAMES = 200000

index = 0
all_games = []

print("Rozpoczynam wczytywanie danych z lokalnego dysku Colab...")

for file_path in glob.glob(DATA_PATH, recursive=True):
    if len(all_games) >= MAX_GAMES:
        break

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            all_games.append(data)
    except Exception as e:
        continue

    index += 1
    if index % 500 == 0:
        print(f"Loaded {index} files...", end='\r')

print(f"\nSukces! Wczytano {len(all_games)} gier.")

Rozpoczynam wczytywanie danych z lokalnego dysku Colab...
Loaded 145500 files...
Sukces! Wczytano 145767 gier.


In [None]:
unique_games = {}

for game in all_games:
    title = (game.get("title") or "").strip().lower()
    devs = tuple(sorted((game.get("developer") or [])))
    pubs = tuple(sorted((game.get("publisher") or [])))
    key = (title, devs, pubs)

    current_best = unique_games.get(key)
    if current_best is None:
        unique_games[key] = game
    else:
        # Keep the one with the highest review_count
        if (game.get("review_count") or 0) > (current_best.get("review_count") or 0):
            unique_games[key] = game

deduped_games = list(unique_games.values())

print(f"After deduplication: {len(deduped_games)} unique games (from {len(all_games)} total)")
all_games = deduped_games

After deduplication: 123567 unique games (from 145767 total)


In [None]:
prices = []
for game in all_games:
    price_str = game.get('price', '')
    if price_str:
        if 'free' in price_str.lower():
            prices.append(0.0)
        else:
            price_clean = price_str.replace('zł', '').replace(',', '.').strip()
            try:
                price = float(price_clean)
                prices.append(price)
            except ValueError:
                prices.append(None)
    else:
        prices.append(None)

print(f"Extracted prices for {len([p for p in prices if p is not None])} games")

Extracted prices for 89960 games


### Clustering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

texts = []
titles = []
for game in all_games:
    text = (game.get('about_this_game', '') + ' ' +
            game.get('description', ''))
    texts.append(text)
    titles.append(game.get('title', 'Unknown'))

print(f"Prepared {len(texts)} text documents")

# Vectorize text
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(texts)
print(f"Vectorized to shape: {X.shape}")

# Cluster
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X)
print("Clustering completed")

# Calculate silhouette score
# Commented since it's slow to compute
# sil_score = silhouette_score(X, clusters)
# print(f"Silhouette Score: {sil_score:.3f}")

# Filter top 1000 games by review_count for visualization only
top_indices = sorted(range(len(all_games)), key=lambda i: all_games[i].get('review_count') or 0, reverse=True)[:1000]

pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X[top_indices].toarray())

df = pd.DataFrame({
    'x': X_pca[:, 0],
    'y': X_pca[:, 1],
    'z': X_pca[:, 2],
    'cluster': clusters[top_indices],
    'title': [titles[i] for i in top_indices]
})

fig = px.scatter_3d(df, x='x', y='y', z='z', color='cluster',
                    hover_data=['title'],
                    title='Game Clusters Based on Text Content (3D PCA) - Top 1000 by Review Count',
                    labels={'x': 'PCA Component 1', 'y': 'PCA Component 2', 'z': 'PCA Component 3'},
                    size_max=3, opacity=0.6)
from IPython.display import display
display(fig)

Prepared 123567 text documents
Vectorized to shape: (123567, 1000)
Clustering completed


In [None]:
feature_names = vectorizer.get_feature_names_out()

cluster_topics = {}
for cluster_id in range(NUM_CLUSTERS):
    cluster_indices = [i for i, c in enumerate(clusters) if c == cluster_id]
    cluster_tfidf = X[cluster_indices]
    avg_tfidf = cluster_tfidf.mean(axis=0).A1  # average TF-IDF per word
    top_indices = avg_tfidf.argsort()[-10:][::-1]  # top 10 words
    top_words = [feature_names[i] for i in top_indices]
    cluster_topics[cluster_id] = top_words

    # Get cluster games and sort by review_count
    cluster_games = [game for game, c in zip(all_games, clusters) if c == cluster_id]
    sorted_games = sorted(cluster_games, key=lambda g: g.get('review_count') or 0, reverse=True)

    # Take up to 10 unique titles
    sample = []
    seen_titles = set()
    for g in sorted_games:
        title = g['title']
        if title not in seen_titles:
            sample.append(g)
            seen_titles.add(title)
        if len(sample) >= 10:
            break

    print(f"Cluster {cluster_id}")
    print(f"({', '.join(top_words[:5])})")  # Show top 5 words as topics
    print(f"{len(cluster_games)} games")

    for i, g in enumerate(sample, 1):
        rating = g.get('review_score', 'N/A')
        votes = g.get('review_count') or 0
        print(f"{i}. {g['title']} (rating: {rating}, votes: {votes})")
    print()

Cluster 0
(game, games, story, rpg, features)
1527 games
1. Sanfu (rating: 84.0, votes: 16720)
2. 古龙风云录 (rating: 62.0, votes: 8357)
3. 魔女的夜宴 (rating: 95.0, votes: 8089)
4. Kalpa of Sword (rating: 35.0, votes: 6551)
5. 古剑奇谭(GuJian) (rating: 86.0, votes: 4979)
6. 枝江往事 (rating: 96.0, votes: 4871)
7. Swaying Girl (rating: 89.0, votes: 4851)
8. 再刷一把 PlayAgain (rating: 66.0, votes: 4419)
9. 我的小鲨鱼 (rating: 100.0, votes: 4235)
10. 妄想破绽 Broken Delusion (rating: 60.0, votes: 3906)

Cluster 1
(puzzle, puzzles, game, solve, levels)
7694 games
1. OneShot (rating: 97.0, votes: 29533)
2. Unpacking (rating: 92.0, votes: 21070)
3. The Talos Principle (rating: 93.0, votes: 16710)
4. Portal: Revolution (rating: 97.0, votes: 15903)
5. Portal Reloaded (rating: 93.0, votes: 13929)
6. Superliminal (rating: 93.0, votes: 13364)
7. The Looker (rating: 97.0, votes: 11902)
8. Baba Is You (rating: 96.0, votes: 11471)
9. A Little to the Left (rating: 89.0, votes: 8556)
10. The Witness (rating: 75.0, votes: 8379)

C

### Classifier - ONLY BEST ATTEMPT

In [None]:
import json
import os

possible_paths = [
    "unified_maturity_labels.json"
]

mature_labels = None

for path in possible_paths:
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            mature_labels = json.load(f)
        print(f"✅ Sukces! Załadowano etykiety z: {path}")
        print(f"Liczba załadowanych etykiet: {len(mature_labels)}")
        break

✅ Sukces! Załadowano etykiety z: unified_maturity_labels.json
Liczba załadowanych etykiet: 14713


In [None]:
from collections import defaultdict

# Słownik do przechowywania unikalnych słów (zbiory zapewniają brak duplikatów)
tier_keywords = {
    0: {"descriptors": set(), "themes": set()},
    1: {"descriptors": set(), "themes": set()},
    2: {"descriptors": set(), "themes": set()},
    3: {"descriptors": set(), "themes": set()}
}

tier_names = {0: "Everyone", 1: "Teen", 2: "Mature", 3: "Adults Only"}

# Iteracja przez załadowane etykiety
for app_id, data in mature_labels.items():
    tier = data.get('tier')

    # Pomijamy, jeśli tier nie mieści się w zakresie 0-3
    if tier in tier_keywords:
        # Pobieramy deskryptory i tematy (używamy .get, by uniknąć błędów przy braku klucza)
        descriptors = data.get('descriptors', [])
        themes = data.get('themes', [])

        # Dodajemy do odpowiednich zbiorów
        tier_keywords[tier]["descriptors"].update(descriptors)
        tier_keywords[tier]["themes"].update(themes)

# Wyświetlanie wyników
print("🔍 ANALIZA SŁÓW KLUCZOWYCH DLA POSZCZEGÓLNYCH TIERÓW")
print("="*60)

for tier in range(4):
    name = tier_names[tier]
    desc_list = sorted(list(tier_keywords[tier]["descriptors"]))
    theme_list = sorted(list(tier_keywords[tier]["themes"]))

    print(f"TIER {tier} ({name.upper()})")
    print(f"  🔹 Deskryptory ({len(desc_list)}):")
    print(f"     {', '.join(desc_list) if desc_list else 'brak'}")

    print(f"  🔸 Tematy ({len(theme_list)}):")
    print(f"     {', '.join(theme_list) if theme_list else 'brak'}")
    print("-" * 60)

🔍 ANALIZA SŁÓW KLUCZOWYCH DLA POSZCZEGÓLNYCH TIERÓW
TIER 0 (EVERYONE)
  🔹 Deskryptory (7):
     bad_language, drugs, fear, gambling, in-game-purchase, sex, violence
  🔸 Tematy (8):
     adult, family_friendly, gambling, horror, mature_language, monetization, substance_use, violent
------------------------------------------------------------
TIER 1 (TEEN)
  🔹 Deskryptory (7):
     bad_language, drugs, fear, gambling, in-game-purchase, sex, violence
  🔸 Tematy (8):
     adult, family_friendly, gambling, horror, mature_language, monetization, substance_use, violent
------------------------------------------------------------
TIER 2 (MATURE)
  🔹 Deskryptory (7):
     bad_language, drugs, fear, gambling, in-game-purchase, sex, violence
  🔸 Tematy (8):
     adult, family_friendly, gambling, horror, mature_language, monetization, substance_use, violent
------------------------------------------------------------
TIER 3 (ADULTS ONLY)
  🔹 Deskryptory (7):
     bad_language, discrimination, drug

In [None]:
from collections import Counter
import re

# Definicja keywordów do zliczenia (pogrupowane tierami dla przejrzystości)
keywords_to_track = {
    "Tier 3 (Adults Only)": [
        "sexual violence", "non-consensual", "rape", "explicit sexual",
        "adults only", "pornography", "hardcore", "sexual intercourse",
        "sexual assault", "bondage", "sadism", "sex", "gore"
    ],
    "Tier 2 (Mature)": [
        "violence", "blood", "nudity", "sexual content", "sexual themes",
        "drugs", "alcohol", "strong language", "profanity", "intense",
        "realistic", "slaughter", "suicide", "self-harm", "mutilation", "torture",
        "gambling", "erotica", "horror", "death", "cannibalism"
    ],
    "Tier 1 (Teen)": [
        "mild", "fantasy violence", "suggestive", "crude humor", "minimal",
        "tobacco", "alcohol reference", "cartoon violence", "fear", "slight",
        "low level", "infrequent"
    ],
    "Tier 0 (Everyone)": [
        "all ages", "everyone", "family friendly", "no mature content",
        "suitable for all"
    ]
}

# Inicjalizacja liczników
keyword_counts = Counter()
games_with_mature_desc = 0

print("🔍 Analizuję pole 'mature_content' w bazie gier...")

for game in all_games:
    content = game.get("mature_content")

    # Sprawdzamy tylko gry, które mają wypełniony opis dojrzałości
    if content and isinstance(content, str) and len(content.strip()) > 0:
        games_with_mature_desc += 1
        content_lower = content.lower()

        # Sprawdzamy każdy keyword
        for category, keywords in keywords_to_track.items():
            for kw in keywords:
                if kw in content_lower:
                    keyword_counts[kw] += 1

# --- GENEROWANIE RAPORTU ---
print("\n" + "="*60)
print(f"📊 STATYSTYKI SŁÓW KLUCZOWYCH (Na podstawie {games_with_mature_desc} opisów)")
print("="*60)

for category, keywords in keywords_to_track.items():
    print(f"\n▶ {category}:")
    # Sortujemy keywordy wewnątrz kategorii od najczęstszych
    sorted_kws = sorted([(kw, keyword_counts[kw]) for kw in keywords], key=lambda x: x[1], reverse=True)

    found_any = False
    for kw, count in sorted_kws:
        if count > 0:
            percentage = (count / games_with_mature_desc) * 100
            print(f"   - {kw:<20} : {count:>5} wystąpień ({percentage:>5.1f}%)")
            found_any = True

    if not found_any:
        print("   (brak wystąpień)")

print("\n" + "="*60)
print(f"SUMA GIER Z OPISEM MATURE: {games_with_mature_desc}")
print("="*60)

🔍 Analizuję pole 'mature_content' w bazie gier...

📊 STATYSTYKI SŁÓW KLUCZOWYCH (Na podstawie 25385 opisów)

▶ Tier 3 (Adults Only):
   - gore                 :  6215 wystąpień ( 24.5%)
   - sex                  :  3615 wystąpień ( 14.2%)
   - sexual assault       :   199 wystąpień (  0.8%)
   - non-consensual       :   138 wystąpień (  0.5%)
   - explicit sexual      :   104 wystąpień (  0.4%)
   - sexual violence      :    77 wystąpień (  0.3%)
   - rape                 :    49 wystąpień (  0.2%)
   - pornography          :    24 wystąpień (  0.1%)
   - sexual intercourse   :    13 wystąpień (  0.1%)
   - sadism               :     8 wystąpień (  0.0%)
   - adults only          :     7 wystąpień (  0.0%)
   - bondage              :     6 wystąpień (  0.0%)
   - hardcore             :     3 wystąpień (  0.0%)

▶ Tier 2 (Mature):
   - violence             : 10728 wystąpień ( 42.3%)
   - blood                :  5949 wystąpień ( 23.4%)
   - nudity               :  2525 wystąpień (  9.9%)

In [None]:
import json
import os
from collections import Counter
import re

def assign_tier_from_steam_text(text):
    if not text or not isinstance(text, str):
        return None

    text = text.lower()

    # TIER 3: Adults Only (Treści ekstremalne, pornografia, hazard na pieniądze)
    t3_keywords = [
        "sexual violence", "non-consensual", "rape", "explicit sexual",
        "adults only", "pornography", "hardcore", "sexual intercourse",
        "sexual assault", "bondage", "sadism", "real gambling", "fetish",
        "extreme violence", "sex", "sexual deviance", "gore", "suicide"
    ]

    # TIER 2: Mature (Silna przemoc, krew, nagość, używki, horror)
    t2_keywords = [
        "violence", "blood", "nudity", "sexual content", "sexual themes",
        "drugs", "alcohol", "strong language", "profanity", "intense",
        "realistic", "slaughter", "self-harm", "mutilation", "torture",
        "gambling", "erotica", "horror", "death", "cannibalism", "strong violence",
        "dismemberment", "execution", "substance abuse", "mature humor", "drug use",
        "sexualized", "f-word", "vulgar"
    ]

    # TIER 1: Teen (Lekka przemoc, nastolatki, łagodny humor, sugestywność)
    t1_keywords = [
        "mild", "fantasy violence", "suggestive", "crude humor", "minimal",
        "tobacco", "alcohol reference", "cartoon violence", "fear", "slight",
        "low level", "infrequent", "comic mischief", "mild lyrics", "mild blood",
        "mild language", "some violence", "minor", "pegi 12", "esrb t", "teen",
        "simulated gambling", "brief nudity", "mild suggestive themes", "adolescent",
        "slapstick", "animated blood", "scantily clad", "high school life",
        "recreational drugs", "dating", "romance", "partial nudity"
    ]

    # TIER 0: Everyone (Dla wszystkich, dzieci, brak negatywnych treści)
    t0_keywords = [
        "all ages", "everyone", "family friendly", "no mature content",
        "suitable for all", "suitable for children", "kids", "child-friendly",
        "educational", "no blood", "no violence", "non-violent", "harmless",
        "pegi 3", "pegi 7", "e-rated", "e for everyone", "preschool",
        "kindergarten", "kid friendly", "whimsical", "colorful", "learning",
        "alphabet", "nursery", "brain teaser", "peaceful", "calm", "relaxing",
        "no profanity", "clean language", "suitable for all audiences",
        "unrated", "general audience", "preschoolers", "fun for kids"
    ]

    # Logika sprawdzania - od najcięższych do najlżejszych (Hierarchia ważności)
    if any(k in text for k in t3_keywords): return 3
    if any(k in text for k in t2_keywords): return 2
    if any(k in text for k in t1_keywords): return 1
    if any(k in text for k in t0_keywords): return 0

    return None # Jeśli nie pasuje do żadnego - skip

# --- INICJALIZACJA STATYSTYK ---
stats = {
    "total_processed": 0,
    "already_in_unified": 0,
    "skipped_null_steam": 0,
    "skipped_no_keywords": 0,
    "newly_added": 0,
    "tier_counts": Counter()
}

# Mapowanie nazw dla raportu
tier_names = {0: "Everyone", 1: "Teen", 2: "Mature", 3: "Adults Only"}

# Tworzymy kopię bazowego zbioru (pamiętaj, że mature_labels musi być wcześniej załadowane)
extended_labels = mature_labels.copy()
total_to_check = len(all_games)

print(f"🚀 Rozpoczynam rozszerzanie zbioru ({total_to_check} gier) z pełną listą keywordów...")

for i, game in enumerate(all_games):
    stats["total_processed"] += 1
    appid = str(game.get("app_id"))

    # Zachowaj oryginalne ratingi PEGI/ESRB jeśli istnieją
    if appid in extended_labels:
        stats["already_in_unified"] += 1
    else:
        steam_mc = game.get("mature_content")

        # Analizujemy tylko jeśli opis Steam nie jest pusty
        if steam_mc and isinstance(steam_mc, str) and len(steam_mc.strip()) > 5:
            tier = assign_tier_from_steam_text(steam_mc)

            if tier is not None:
                extended_labels[appid] = {
                    "tier": tier,
                    "tier_label": tier_names[tier],
                    "source": "steam_self_declaration",
                    "original_rating": "Steam Mature Description",
                    "found_title": game.get("title", "Unknown")
                }
                stats["newly_added"] += 1
                stats["tier_counts"][tier] += 1
            else:
                stats["skipped_no_keywords"] += 1
        else:
            stats["skipped_null_steam"] += 1

    # Print progress co 5000 plików
    if (i + 1) % 5000 == 0 or (i + 1) == total_to_check:
        print(f"  [Postęp: {i+1}/{total_to_check}] Nowe: {stats['newly_added']} | Skip (brak keyword): {stats['skipped_no_keywords']} ", end='\r')

# --- RAPORT KOŃCOWY ---
print("\n\n" + "="*50)
print("📊 RAPORT KOŃCOWY: ROZSZERZANIE ZBIORU")
print("="*50)
print(f"1. Gry w katalogu:            {stats['total_processed']}")
print(f"2. Zachowano z PEGI/ESRB:     {stats['already_in_unified']}")
print(f"3. Dodano nowo ze Steam:      {stats['newly_added']} ✨")
print(f"4. Pominięto (brak keyword):  {stats['skipped_no_keywords']}")
print("-" * 50)
print("PODZIAŁ NOWYCH ETYKIET ZE STEAM:")
for t in range(4):
    count = stats["tier_counts"][t]
    percentage = (count / stats["newly_added"] * 100) if stats["newly_added"] > 0 else 0
    print(f"  - Tier {t} ({tier_names[t]:<11}): {count} gier ({percentage:.1f}%)")
print("-" * 50)
print(f"Finalna liczba etykiet:      {len(extended_labels)}")
print("="*50)

# Zapis do pliku
output_file = "unified_mature_content_pegi_esrb_steam.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(extended_labels, f, indent=2, ensure_ascii=False)
print(f"💾 Plik zapisany jako: {output_file}")

🚀 Rozpoczynam rozszerzanie zbioru (123567 gier) z pełną listą keywordów...
  [Postęp: 123567/123567] Nowe: 16954 | Skip (brak keyword): 5786 

📊 RAPORT KOŃCOWY: ROZSZERZANIE ZBIORU
1. Gry w katalogu:            123567
2. Zachowano z PEGI/ESRB:     13516
3. Dodano nowo ze Steam:      16954 ✨
4. Pominięto (brak keyword):  5786
--------------------------------------------------
PODZIAŁ NOWYCH ETYKIET ZE STEAM:
  - Tier 0 (Everyone   ): 1098 gier (6.5%)
  - Tier 1 (Teen       ): 242 gier (1.4%)
  - Tier 2 (Mature     ): 7062 gier (41.7%)
  - Tier 3 (Adults Only): 8552 gier (50.4%)
--------------------------------------------------
Finalna liczba etykiet:      31667
💾 Plik zapisany jako: unified_mature_content_pegi_esrb_steam.json


In [None]:
import json
from collections import Counter
import random

# 1. Wczytanie rozszerzonego pliku
file_path = "unified_mature_content_pegi_esrb_steam.json"

try:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(f"✅ Pomyślnie wczytano {len(data)} etykiet.\n")
except FileNotFoundError:
    print(f"❌ Nie znaleziono pliku {file_path}. Upewnij się, że skrypt rozszerzający został uruchomiony.")
    data = {}

if data:
    tier_names = {0: "Everyone", 1: "Teen", 2: "Mature", 3: "Adults Only"}

    # Organizujemy dane do statystyk i przykładów
    organized_data = {0: [], 1: [], 2: [], 3: []}

    for appid, info in data.items():
        tier = info.get('tier')
        if tier in organized_data:
            organized_data[tier].append({
                'title': info.get('found_title', 'Unknown'),
                'source': info.get('source', 'unknown'),
                'appid': appid
            })

    print("="*70)
    print(f"{'TIER':<15} | {'LICZBA GIER':<12} | {'ŹRÓDŁA (PEGI/ESRB/STEAM)':<30}")
    print("-"*70)

    for tier in range(4):
        games = organized_data[tier]
        count = len(games)

        # Podliczamy źródła wewnątrz danego Tieru
        sources = Counter([g['source'] for g in games])
        source_str = f"P:{sources['pegi']} / E:{sources['esrb']} / S:{sources['steam_self_declaration']}"

        print(f"{tier_names[tier]:<15} | {count:<12} | {source_str}")

    print("="*70)
    print("\nPRZYKŁADOWE GRY (po 30 na Tier):")

    for tier in range(4):
        games = organized_data[tier]
        print(f"\n>>> TIER {tier} ({tier_names[tier].upper()}) " + "-"*40)

        # Losujemy 30 przykładów (lub bierzemy wszystkie, jeśli jest ich mniej)
        sample_size = min(len(games), 30)
        sample = random.sample(games, sample_size)

        # Wyświetlamy w dwóch kolumnach dla oszczędności miejsca
        for i in range(0, len(sample), 2):
            pair = sample[i:i+2]
            line = ""
            for item in pair:
                title_short = (item['title'][:25] + '..') if len(item['title']) > 25 else item['title']
                src_short = "STM" if item['source'] == "steam_self_declaration" else item['source'].upper()
                line += f"• {title_short:<28} [{src_short:<4}]   "
            print(line)

✅ Pomyślnie wczytano 31667 etykiet.

TIER            | LICZBA GIER  | ŹRÓDŁA (PEGI/ESRB/STEAM)      
----------------------------------------------------------------------
Everyone        | 7945         | P:4979 / E:1868 / S:1098
Teen            | 4461         | P:3163 / E:1056 / S:242
Mature          | 9625         | P:2035 / E:528 / S:7062
Adults Only     | 9636         | P:1082 / E:2 / S:8552

PRZYKŁADOWE GRY (po 30 na Tier):

>>> TIER 0 (EVERYONE) ----------------------------------------
• SpongeBob Squarepants: Th..  [PEGI]   • Rym 9000                     [PEGI]   
• FARM TOGETHER DELUXE EDIT..  [PEGI]   • F-22 Lightning 3             [ESRB]   
• Lords of Magic Special Ed..  [ESRB]   • TOSS N GO                    [ESRB]   
• Melon Dusk                   [STM ]   • Iridio: Prologue             [STM ]   
• Dogurai                      [PEGI]   • Xtreme Boarders: Precisio..  [ESRB]   
• Silent Still                 [STM ]   • BlasterBall 3                [ESRB]   
• Winter Games Co

In [None]:
def prepare_data_for_cluster(all_games, clusters, cluster_id):
    texts = []
    labels = []

    for game, c in zip(all_games, clusters):
        if c != cluster_id:
            continue

        appid = str(game["app_id"])
        if appid not in mature_labels:
            continue

        label = mature_labels[appid]

        text = game.get("about_this_game") or ""
        if not text.strip():
            continue

        texts.append(text)
        labels.append(int(bool(label)))

    return texts, labels

In [None]:
# 1. Ładujemy NOWY, rozszerzony plik
unified_labels_path = "unified_mature_content_pegi_esrb_steam.json"
with open(unified_labels_path, "r", encoding="utf-8") as f:
    unified_labels = json.load(f)

print(f"Loaded {len(unified_labels)} labels (PEGI + ESRB + STEAM)")

def prepare_transformer_data(all_games, unified_labels):
    """Przygotowuje teksty (Tagi + Opis) oraz etykiety."""
    texts = []
    labels = []
    app_ids = []
    sources = []

    for game in all_games:
        appid = str(game["app_id"])
        if appid not in unified_labels:
            continue

        label_data = unified_labels[appid]

        # Ekstrakcja tagów - to kluczowy element dla skuteczności modelu
        tags = game.get("tags", [])
        tags_str = ", ".join(tags)

        about = game.get("about_this_game") or ""
        desc = game.get("description") or ""
        clean_desc = f"{about} {desc}".strip()

        # Tworzymy "Super-tekst": Tagi jako kontekst + Opis
        # Transformer nauczy się, że słowa po 'TAGS:' są bardzo ważne
        full_text = f"TAGS: {tags_str} | DESCRIPTION: {clean_desc}"

        if len(clean_desc) < 20: # Omijamy gry bez opisów
            continue

        texts.append(full_text)
        labels.append(label_data['tier'])
        app_ids.append(appid)
        sources.append(label_data['source'])

    return texts, labels, app_ids, sources

# Generujemy dane
all_texts, all_labels, all_app_ids, all_sources = prepare_transformer_data(all_games, unified_labels)

Loaded 31667 labels (PEGI + ESRB + STEAM)


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np
from tqdm import tqdm

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
class TransformerClassifier(nn.Module):
    def __init__(self, num_classes=4, dropout=0.3): # Zmniejszamy dropout do 0.3 (0.5 bywa zbyt agresywne)
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # Zamrażamy pierwsze 3 warstwy BERTa (opcjonalnie), aby nie psuć pre-trained wag na starcie
        # for param in list(self.bert.parameters())[:20]:
        #     param.requires_grad = False

        self.dropout = nn.Dropout(dropout)
        hidden_size = self.bert.config.hidden_size

        # Warstwa pośrednia pomaga "przetrawić" wektor z BERTa przed klasyfikacją
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0, :]  # CLS token

        pooled = self.dropout(pooled)
        hidden = self.relu(self.pre_classifier(pooled))
        hidden = self.dropout(hidden)
        logits = self.classifier(hidden)
        return logits

In [None]:
# Obliczanie wag klas (zbalansowane pod nowy rozkład)
from sklearn.utils.class_weight import compute_class_weight

# Obliczamy wagi na podstawie zbioru treningowego
weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(all_labels),
    y=all_labels
)
class_weights = weights.tolist()

print(f"Nowe wagi klas: {class_weights}")

Nowe wagi klas: [1.0246838848533764, 1.8592872833780816, 0.8114081806561568, 0.7975604648727882]


In [None]:
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

# Definicja macierzy podobieństwa jako stałej globalnej lub przekazywanej do funkcji
W_MATRIX = torch.tensor([
    [1.0, 0.5, 0.0, 0.0],
    [0.5, 1.0, 0.5, 0.0],
    [0.0, 0.5, 1.0, 0.5],
    [0.0, 0.0, 0.5, 1.0],
], dtype=torch.float)

def train_transformer(model, train_loader, val_loader, epochs=3, lr=2e-5, class_weights=None):
    model = model.to(device)
    W = W_MATRIX.to(device) # Przenosimy macierz na GPU

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    if class_weights is not None:
        class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
        criterion = nn.CrossEntropyLoss(weight=class_weights)
    else:
        criterion = nn.CrossEntropyLoss()

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2, min_lr=1e-7
    )

    history = {'train_loss': [], 'val_loss': [], 'val_acc': [], 'val_similarity': []}

    best_val_loss = float('inf')
    early_stop_patience = 4
    patience_counter = 0
    best_model_state = None

    for epoch in range(epochs):
        # --- TRENING ---
        model.train()
        train_losses = []
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            train_losses.append(loss.item())

        avg_train_loss = np.mean(train_losses)

        # --- WALIDACJA ---
        model.eval()
        val_losses = []
        all_preds = []
        all_labels = []
        all_similarity_scores = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                val_losses.append(loss.item())

                # Obliczanie Similarity Score (Twoja nowa logika)
                probs = torch.softmax(outputs, dim=1)
                # Wyciągamy wiersze z W odpowiadające etykietom w batchu i mnożymy przez prawdopodobieństwa
                batch_scores = torch.sum(probs * W[labels], dim=1)
                all_similarity_scores.extend(batch_scores.cpu().numpy())

                preds = torch.argmax(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = np.mean(val_losses)
        avg_sim_score = np.mean(all_similarity_scores)
        val_acc = accuracy_score(all_labels, all_preds)

        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(avg_val_loss)
        history['val_acc'].append(val_acc)
        history['val_similarity'].append(avg_sim_score)

        scheduler.step(avg_val_loss)

        # Zapisywanie najlepszego modelu
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            status = "✓ (best)"
        else:
            patience_counter += 1
            status = f"(patience {patience_counter}/{early_stop_patience})"

        print(f"Epoch {epoch+1}: Loss {avg_val_loss:.4f} | Acc: {val_acc:.4f} | SimScore: {avg_sim_score:.4f} {status}")

        if patience_counter >= early_stop_patience:
            break

    if best_model_state:
        model.load_state_dict(best_model_state)

    return model.to(device), history

def evaluate_model(model, test_loader):
    model.eval()
    W = W_MATRIX.to(device)

    all_preds, all_labels, all_scores = [], [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)

            probs = torch.softmax(outputs, dim=1)
            batch_score = torch.sum(probs * W[labels], dim=1)

            all_scores.extend(batch_score.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return all_preds, all_labels, all_scores

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer
from sklearn.model_selection import train_test_split

# 1. Inicjalizacja tokenizera
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# 2. Podział danych (80% trening + walidacja, 20% test)
X_train_full, X_test, y_train_full, y_test = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42, stratify=all_labels
)

# Podział treningowego na właściwy trening i walidację (90/10)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.1, random_state=42, stratify=y_train_full
)

# 3. Klasa Dataset (jeśli jej jeszcze nie zdefiniowałeś w tej sesji)
class GameTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=384):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# 4. Tworzenie DataLoaderów
BATCH_SIZE = 16 # Zwiększ do 32, jeśli masz mocne GPU
MAX_LEN = 384    # Optymalna długość dla opisów z tagami

train_loader = DataLoader(GameTextDataset(X_train, y_train, tokenizer, MAX_LEN), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(GameTextDataset(X_val, y_val, tokenizer, MAX_LEN), batch_size=BATCH_SIZE)
test_loader = DataLoader(GameTextDataset(X_test, y_test, tokenizer, MAX_LEN), batch_size=BATCH_SIZE)

print(f"Gotowe! Trening: {len(X_train)} gier, Walidacja: {len(X_val)}, Test: {len(X_test)}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Gotowe! Trening: 21938 gier, Walidacja: 2438, Test: 6094


In [None]:
from transformers import get_linear_schedule_with_warmup, DistilBertTokenizer
from sklearn.utils.class_weight import compute_class_weight

# --- 1. PRZYGOTOWANIE "SUPER-TEKSTU" (TAGI + OPIS) ---
def prepare_transformer_data(all_games, unified_labels):
    texts, labels, app_ids = [], [], []
    for game in all_games:
        appid = str(game["app_id"])
        if appid not in unified_labels: continue

        tags = ", ".join(game.get("tags", []))
        about = game.get("about_this_game") or ""
        desc = game.get("description") or ""
        # Łączymy tagi i opis - to daje modelowi potężny kontekst
        full_text = f"TAGS: {tags} | DESCRIPTION: {about} {desc}".strip()

        if len(full_text) < 50: continue

        texts.append(full_text)
        labels.append(unified_labels[appid]['tier'])
        app_ids.append(appid)
    return texts, labels, app_ids

all_texts, all_labels, all_app_ids = prepare_transformer_data(all_games, unified_labels)

# --- 2. OBLICZANIE WAG KLAS ---
weights = compute_class_weight('balanced', classes=np.unique(all_labels), y=all_labels)
class_weights = torch.tensor(weights, dtype=torch.float).to(device)

# --- 3. PODZIAŁ I DATALOADERY ---
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
X_train, X_val, y_train, y_val = train_test_split(all_texts, all_labels, test_size=0.1, stratify=all_labels, random_state=42)

MAX_LEN = 384 # Optymalna długość
train_loader = DataLoader(GameTextDataset(X_train, y_train, tokenizer, MAX_LEN), batch_size=16, shuffle=True)
val_loader = DataLoader(GameTextDataset(X_val, y_val, tokenizer, MAX_LEN), batch_size=16)

In [None]:
def train_transformer(model, train_loader, val_loader, epochs=10, lr=1e-5):
    model = model.to(device)
    W = W_MATRIX.to(device)

    # Optimizer i Scheduler z Warmupem
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    total_steps = len(train_loader) * epochs
    warmup_steps = int(0.1 * total_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

    criterion = nn.CrossEntropyLoss(weight=class_weights)

    history = {'train_loss': [], 'val_loss': [], 'val_acc': [], 'val_similarity': []}
    best_sim_score = 0
    patience_counter = 0
    early_stop_patience = 3

    for epoch in range(epochs):
        # TRENING
        model.train()
        train_losses = []
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]"):
            input_ids, mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, mask)
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step() # Krok schudulera po każdym batchu!
            train_losses.append(loss.item())

        # WALIDACJA
        model.eval()
        val_losses, all_preds, all_labels, all_sims = [], [], [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids, mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)
                outputs = model(input_ids, mask)
                val_losses.append(criterion(outputs, labels).item())

                probs = torch.softmax(outputs, dim=1)
                sim_scores = torch.sum(probs * W[labels], dim=1)
                all_sims.extend(sim_scores.cpu().numpy())
                all_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_sim = np.mean(all_sims)
        avg_acc = accuracy_score(all_labels, all_preds)

        history['train_loss'].append(np.mean(train_losses))
        history['val_similarity'].append(avg_sim)
        history['val_acc'].append(avg_acc)

        # KLUCZOWE: Early Stopping patrzy na SimScore, a nie na Loss!
        if avg_sim > best_sim_score:
            best_sim_score = avg_sim
            patience_counter = 0
            torch.save(model.state_dict(), "best_maturity_model.pt")
            status = "✓ (Nowy rekord SimScore)"
        else:
            patience_counter += 1
            status = f"(brak poprawy {patience_counter}/{early_stop_patience})"

        print(f"Epoch {epoch+1}: TrainLoss: {history['train_loss'][-1]:.4f} | Acc: {avg_acc:.4f} | SimScore: {avg_sim:.4f} {status}")

        if patience_counter >= early_stop_patience:
            print("Early Stopping!")
            break

    model.load_state_dict(torch.load("best_maturity_model.pt"))
    return model, history

In [None]:
# Inicjalizacja architektury
model = TransformerClassifier(num_classes=4, dropout=0.3)

# Start
trained_model, history = train_transformer(
    model,
    train_loader,
    val_loader,
    epochs=10,
    lr=1e-5 # Bardzo bezpieczne tempo
)

Epoch 1/10 [Train]: 100%|██████████| 1714/1714 [19:09<00:00,  1.49it/s]


Epoch 1: TrainLoss: 1.2578 | Acc: 0.5038 | SimScore: 0.5855 ✓ (Nowy rekord SimScore)


Epoch 2/10 [Train]: 100%|██████████| 1714/1714 [18:36<00:00,  1.53it/s]


Epoch 2: TrainLoss: 1.1262 | Acc: 0.4969 | SimScore: 0.5966 ✓ (Nowy rekord SimScore)


Epoch 3/10 [Train]: 100%|██████████| 1714/1714 [18:37<00:00,  1.53it/s]


Epoch 3: TrainLoss: 1.0764 | Acc: 0.5041 | SimScore: 0.6084 ✓ (Nowy rekord SimScore)


Epoch 4/10 [Train]: 100%|██████████| 1714/1714 [18:37<00:00,  1.53it/s]


Epoch 4: TrainLoss: 1.0129 | Acc: 0.4972 | SimScore: 0.6104 ✓ (Nowy rekord SimScore)


Epoch 5/10 [Train]: 100%|██████████| 1714/1714 [18:36<00:00,  1.54it/s]


Epoch 5: TrainLoss: 0.9373 | Acc: 0.4989 | SimScore: 0.6222 ✓ (Nowy rekord SimScore)


Epoch 6/10 [Train]: 100%|██████████| 1714/1714 [18:37<00:00,  1.53it/s]


Epoch 6: TrainLoss: 0.8639 | Acc: 0.4818 | SimScore: 0.6218 (brak poprawy 1/3)


Epoch 7/10 [Train]: 100%|██████████| 1714/1714 [18:35<00:00,  1.54it/s]


Epoch 7: TrainLoss: 0.7910 | Acc: 0.4966 | SimScore: 0.6388 ✓ (Nowy rekord SimScore)


Epoch 8/10 [Train]: 100%|██████████| 1714/1714 [18:43<00:00,  1.53it/s]


Epoch 8: TrainLoss: 0.7286 | Acc: 0.5021 | SimScore: 0.6423 ✓ (Nowy rekord SimScore)


Epoch 9/10 [Train]: 100%|██████████| 1714/1714 [19:09<00:00,  1.49it/s]


Epoch 9: TrainLoss: 0.6850 | Acc: 0.4959 | SimScore: 0.6409 (brak poprawy 1/3)


Epoch 10/10 [Train]: 100%|██████████| 1714/1714 [19:09<00:00,  1.49it/s]


Epoch 10: TrainLoss: 0.6499 | Acc: 0.4979 | SimScore: 0.6449 ✓ (Nowy rekord SimScore)


In [None]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# 1. Pobranie przewidywań (używamy funkcji evaluate_model zdefiniowanej wcześniej)
# Jeśli restartowałeś środowisko, upewnij się, że funkcja evaluate_model jest wczytana
preds, true_labels, similarity_scores = evaluate_model(trained_model, test_loader)

# 2. Obliczanie statystyk
final_acc = accuracy_score(true_labels, preds)
final_sim = np.mean(similarity_scores)

print("\n" + "="*60)
print("📊 WYNIKI MODELU NA ZBIORZE TESTOWYM")
print("="*60)
print(f"Accuracy (Idealne trafienia):    {final_acc:.4f}")
print(f"Similarity Score (Wagi W):      {final_sim:.4f}")
print("-" * 60)

# 3. Wyświetlenie raportu klasyfikacji
tier_names_list = ["Everyone (T0)", "Teen (T1)", "Mature (T2)", "Adults Only (T3)"]
print("\nSzczegółowy raport per klasa:")
print(classification_report(true_labels, preds, target_names=tier_names_list))

# 4. Szybka wizualizacja porównawcza
import plotly.express as px
res_df = pd.DataFrame({'Metryka': ['Accuracy', 'Similarity Score'], 'Wynik': [final_acc, final_sim]})
fig_res = px.bar(res_df, x='Metryka', y='Wynik', text_auto='.4f',
                 color='Metryka', range_y=[0, 1], title="Porównanie Celności i Podobieństwa")
fig_res.show()

Evaluating: 100%|██████████| 381/381 [01:46<00:00,  3.57it/s]


📊 WYNIKI MODELU NA ZBIORZE TESTOWYM
Accuracy (Idealne trafienia):    0.6336
Similarity Score (Wagi W):      0.7161
------------------------------------------------------------

Szczegółowy raport per klasa:
                  precision    recall  f1-score   support

   Everyone (T0)       0.76      0.68      0.72      1487
       Teen (T1)       0.55      0.62      0.58       819
     Mature (T2)       0.57      0.59      0.58      1878
Adults Only (T3)       0.65      0.64      0.65      1910

        accuracy                           0.63      6094
       macro avg       0.63      0.63      0.63      6094
    weighted avg       0.64      0.63      0.64      6094






In [None]:
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff

# 1. Przygotowanie danych do macierzy
cm = confusion_matrix(true_labels, preds)
tier_names = ["Everyone (T0)", "Teen (T1)", "Mature (T2)", "Adults Only (T3)"]

# Konwersja do tekstu dla etykiet wewnątrz kafelków
cm_text = [[str(y) for y in x] for x in cm]

# 2. Tworzenie interaktywnej Heatmapy
fig_cm = ff.create_annotated_heatmap(
    z=cm,
    x=tier_names,
    y=tier_names,
    annotation_text=cm_text,
    colorscale='Viridis'
)

# 3. Konfiguracja layoutu
fig_cm.update_layout(
    title='Macierz Pomyłek (Confusion Matrix)',
    xaxis=dict(title='Przewidziana klasa (Predicted)'),
    yaxis=dict(title='Prawdziwa klasa (Actual)', autorange='reversed'),
    height=600,
    width=700,
    margin=dict(l=150, r=50, b=100, t=100)
)

# Dodanie opisów osi (dla pewności, że będą widoczne)
fig_cm.add_annotation(dict(x=0.5, y=-0.12, showarrow=False, text="Przewidziana klasa", xref="paper", yref="paper"))
fig_cm.add_annotation(dict(x=-0.2, y=0.5, showarrow=False, text="Prawdziwa klasa", textangle=-90, xref="paper", yref="paper"))

fig_cm.show()