# Steam Review Summarization - Optimized

**Model:** `sshleifer/distilbart-cnn-12-6` (download it from a terminal before running the notebook)

```bash
# Mac/Linux
pip install -U "huggingface_hub[cli]" hf_transfer
export HF_HUB_ENABLE_HF_TRANSFER=1
huggingface-cli download sshleifer/distilbart-cnn-12-6 --local-dir ~/hf_cache/distilbart
```

In [None]:
# Configuration - EDIT THESE SETTINGS
import pandas as pd
import os

# START FROM SPECIFIC APP_ID (set to None to start from beginning)
START_APP_ID = None  # Example: 1000 to start from app_id 1000

# Paths
DATA_DIR = '../data'
PROCESSED_DIR = os.path.join(DATA_DIR, 'processed')
# Resolved against the current user's home directory so it matches the
# `--local-dir ~/hf_cache/distilbart` download location on any machine.
# Override this if the model was downloaded somewhere else.
MODEL_PATH = os.path.join(os.path.expanduser('~'), 'hf_cache', 'distilbart')

# Processing settings
NUM_WORKERS = 7  # Adjust based on CPU cores
CHECKPOINT_EVERY = 100  # Write a checkpoint CSV after this many finished games

# Files to use/create
COMBINED_REVIEWS_FILE = os.path.join(PROCESSED_DIR, 'combined_reviews_cache.pkl')
FINAL_OUTPUT_FILE = os.path.join(PROCESSED_DIR, 'review_summaries_COMPLETE.csv')

print("✓ Configuration loaded")
# Compare against None (not truthiness) so the message agrees with the
# `START_APP_ID is not None` filter applied before processing.
print(f"  Start from app_id: {START_APP_ID if START_APP_ID is not None else 'beginning'}")
print(f"  Model path: {MODEL_PATH}")
print(f"  Workers: {NUM_WORKERS}")

# Load or create combined reviews (efficient - uses cache)
import pickle
import glob
import re
import string
import warnings
warnings.filterwarnings('ignore')

os.makedirs(PROCESSED_DIR, exist_ok=True)

# Try to load from cache first
# NOTE: read_pickle unpickles arbitrary objects; only point
# COMBINED_REVIEWS_FILE at a cache file written by this notebook.
if os.path.exists(COMBINED_REVIEWS_FILE):
    print("Loading combined reviews from cache...")
    combined_df = pd.read_pickle(COMBINED_REVIEWS_FILE)
    print(f"✓ Loaded {len(combined_df)} games from cache")
else:
    print("Creating combined reviews (first time)...")
    
    # Load raw CSV files
    raw_pattern = os.path.join(DATA_DIR, 'raw', 'app_reviews_*.csv')
    csv_files = sorted(glob.glob(raw_pattern))
    print(f"Found {len(csv_files)} CSV files")
    
    # Fail with an actionable message instead of letting pd.concat raise
    # "No objects to concatenate" on an empty list.
    if not csv_files:
        raise FileNotFoundError(f"No raw review CSV files match: {raw_pattern}")
    
    dfs = [pd.read_csv(f) for f in csv_files]
    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"Combined shape: {combined_df.shape}")
    
    # Clean and combine reviews
    def clean_text(text):
        """Normalize one raw review to plain ASCII text.

        Removes HTML-style tags and URLs, keeps only ASCII letters, digits,
        spaces and the punctuation ``. , ! ? ' -``, and collapses whitespace.

        Args:
            text: Raw review value; anything that is not a non-blank string
                (e.g. NaN from pandas) is treated as missing.

        Returns:
            The cleaned string, or None when nothing usable remains.
        """
        if not isinstance(text, str) or not text.strip():
            return None
        # Tags and URLs go first, while whitespace still delimits the URL.
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'http[s]?://\S+|www\.\S+', '', text)
        # Convert every whitespace run (including non-ASCII whitespace such
        # as a non-breaking space) to one space BEFORE filtering; otherwise
        # the filter below deletes it and fuses the neighbouring words.
        text = re.sub(r'\s+', ' ', text)
        # Single pass that drops everything outside the allowed set; this
        # also removes emoji, so no separate emoji pattern is needed.
        text = re.sub(r"[^A-Za-z0-9 .,!?'-]", '', text)
        # Filtering can leave double spaces behind; collapse and trim them.
        text = ' '.join(text.split())
        return text or None

    def combine_reviews(row, review_cols):
        """Join a row's cleaned reviews into one ' [SEP] '-delimited string.

        Args:
            row: Mapping-like record supporting ``.get(column)``.
            review_cols: Column names to read, in output order.

        Returns:
            The distinct cleaned reviews in first-seen order joined by
            ' [SEP] ', or '' when no review survives cleaning.
        """
        # A dict doubles as an insertion-ordered set for de-duplication.
        distinct = {}
        for column in review_cols:
            raw = row.get(column)
            if raw is None or raw == '':
                continue
            text = clean_text(raw)
            if text:
                distinct.setdefault(text, None)
        # Joining an empty mapping yields '', covering the no-review case.
        return ' [SEP] '.join(distinct)

    review_columns = [f'review_{i}' for i in range(1, 101)]
    combined_df['combined_reviews'] = combined_df.apply(
        lambda row: combine_reviews(row, review_columns), axis=1
    )
    
    # Save to cache
    combined_df.to_pickle(COMBINED_REVIEWS_FILE)
    print(f"✓ Created and cached {len(combined_df)} combined reviews")

print(f"Non-empty reviews: {(combined_df['combined_reviews'] != '').sum()}")
print(f"Avg length: {combined_df['combined_reviews'].str.len().mean():.0f} chars")

In [None]:
# Load summarization model from local cache
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

print(f"Loading model from: {MODEL_PATH}")

# local_files_only=True makes both loaders read from MODEL_PATH on disk.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    local_files_only=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_PATH,
    local_files_only=True,
)

# NOTE(review): device=-1 presumably selects CPU inference — confirm
# against the installed transformers version.
summarizer = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    device=-1,
)

print("✓ Model loaded successfully")

In [None]:
# Warm-up model (first inference is slow)
import time

print("Testing model (warm-up, takes 2-5 min first time)...")

# One short sentence is enough to run a single inference through the model.
test_text = (
    "This game features engaging gameplay, "
    "beautiful graphics, and challenging difficulty."
)

# Time a single deterministic (non-sampling) summarization call.
start = time.time()
test_result = summarizer(test_text, do_sample=False, min_length=10, max_length=50)
elapsed = time.time() - start

print(
    f"✓ Test complete in {elapsed:.1f}s",
    f"Output: {test_result[0]['summary_text']}",
    sep="\n",
)

In [None]:
# Summarization functions with chunking and guidance

# Instruction text that summarize_reviews prepends (followed by a blank line)
# to every input it passes to the summarizer: the raw reviews, each chunk, and
# the joined chunk summaries in the final pass.
# NOTE(review): the model is loaded as a plain "summarization" pipeline, so it
# presumably treats this text as part of the content to summarize rather than
# as an instruction — confirm that it improves the output.
GUIDANCE_PROMPT = """Summarize the following user reviews into a concise, neutral third-person description of the game. Focus on: genre, gameplay mechanics, core features or modes, campaign or multiplayer aspects, and overall atmosphere or difficulty. Avoid opinions, memes, and repetition.

Reviews:"""

def split_into_chunks(text, max_chunk_size=3500, overlap=200):
    """Split text into overlapping chunks of at most max_chunk_size characters.

    Each chunk is cut after the last occurrence of the first separator type
    found in the window (tried in the order '. ', '! ', '? ', ' [SEP] '),
    provided that cut moves the window forward. Otherwise the chunk is cut
    hard at max_chunk_size. Consecutive chunks share `overlap` characters.

    Args:
        text: Text to split; falsy input yields an empty list.
        max_chunk_size: Maximum chunk length in characters; must be positive.
        overlap: Characters repeated at the start of the next chunk. Values
            outside [0, max_chunk_size - 1] are clamped into that range.

    Returns:
        List of non-empty, stripped chunks ([text] unchanged when it already
        fits in one chunk).

    Raises:
        ValueError: If max_chunk_size is not positive.
    """
    if not text:
        return []
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    if len(text) <= max_chunk_size:
        return [text]

    # An overlap as large as the chunk itself could never advance the window.
    overlap = max(0, min(overlap, max_chunk_size - 1))
    separators = ('. ', '! ', '? ', ' [SEP] ')
    text_len = len(text)

    chunks = []
    start = 0
    while start < text_len:
        end = start + max_chunk_size
        if end < text_len:
            window = text[start:end]
            for sep in separators:
                cut = window.rfind(sep)
                if cut == -1:
                    continue
                cut += len(sep)
                # Accept the boundary only if the next start (end - overlap)
                # lands strictly after the current start. A separator inside
                # the first `overlap` characters would stall the loop or
                # move it backwards.
                if cut > overlap:
                    end = start + cut
                    break
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end - overlap if end < text_len else end
    return chunks

def summarize_reviews(text, summarizer):
    """Summarize combined review text, chunking it when it is long.

    Text of up to 1000 characters is summarized in one call. Longer text is
    split with split_into_chunks, each chunk of at least 50 characters is
    summarized, and the joined chunk summaries are summarized once more when
    they exceed 200 characters.

    Args:
        text: Combined review text.
        summarizer: Callable that accepts a prompt plus generation keyword
            arguments and returns a list whose first item has 'summary_text'.

    Returns:
        The summary string, or "" for blank input, when no chunk qualifies,
        or when any exception is raised (the error is printed).
    """
    if not text or not text.strip():
        return ""

    def run_model(body, longest, shortest):
        # Every call uses the same decoding options; only the length limits
        # differ between the direct, per-chunk and final passes.
        output = summarizer(
            f"{GUIDANCE_PROMPT}\n\n{body}",
            max_length=longest,
            min_length=shortest,
            num_beams=2,
            no_repeat_ngram_size=3,
            do_sample=False,
            truncation=True,
        )
        return output[0]['summary_text']

    try:
        # Short text - direct summarization
        if len(text) <= 1000:
            return run_model(text, 130, 30)

        # Long text - summarize each sufficiently large chunk
        partials = [
            run_model(piece, 80, 30)
            for piece in split_into_chunks(text)
            if len(piece) >= 50
        ]
        if not partials:
            return ""

        # Final pass - condense the chunk summaries only when they are long
        merged = ' '.join(partials)
        if len(merged) <= 200:
            return merged
        return run_model(merged, 130, 40)
    except Exception as e:
        print(f"Error: {e}")
        return ""

print("✓ Summarization functions ready")

In [None]:
# Parallel processing with START_APP_ID support
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Order the games by app_id and renumber the rows from 0
combined_df_sorted = combined_df.sort_values(by='app_id').reset_index(drop=True)

# Optional resume point: keep only games at or beyond START_APP_ID
if START_APP_ID is not None:
    combined_df_sorted = combined_df_sorted.loc[
        combined_df_sorted['app_id'] >= START_APP_ID
    ].reset_index(drop=True)
    print(f"Starting from app_id {START_APP_ID}")

n_rows = len(combined_df_sorted)
print(
    f"Processing {n_rows} games",
    f"Checkpoint every {CHECKPOINT_EVERY} games",
    f"Workers: {NUM_WORKERS}",
    "-" * 60,
    sep="\n",
)

# Make sure the output column exists before any result is written to it
if 'reviews_summary' not in combined_df_sorted:
    combined_df_sorted['reviews_summary'] = ''

# Worker function
def process_game(idx):
    """Summarize the reviews of the game at position idx in combined_df_sorted.

    Args:
        idx: Zero-based row position in combined_df_sorted.

    Returns:
        Tuple (idx, app_id, summary, error). error is None on success,
        'empty' when the game has no review text, and otherwise a short
        message describing the failure (summary is '' in both latter cases).
    """
    # Captured before any failure so the except path never has to touch the
    # DataFrame again (a second lookup there could itself raise).
    app_id = None
    try:
        row_idx = combined_df_sorted.index[idx]
        app_id = combined_df_sorted.loc[row_idx, 'app_id']
        text = combined_df_sorted.loc[row_idx, 'combined_reviews']

        if not text or not text.strip():
            return (idx, app_id, '', 'empty')

        summary = summarize_reviews(text, summarizer)
        # summarize_reviews prints and swallows its own exceptions, returning
        # ''. Report that as a failure so it is not counted as a success.
        if not summary:
            return (idx, app_id, '', 'summarizer returned no text')
        return (idx, app_id, summary, None)
    except Exception as e:
        return (idx, app_id, '', str(e)[:100])

# Process in parallel
successful = 0
failed = 0
skipped = 0
completed = 0  # results consumed so far, in completion order
checkpoint_counter = 0
checkpoint_start_id = None
checkpoint_end_id = None

print("Starting...\n")

with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
    futures = {executor.submit(process_game, idx): idx for idx in range(n_rows)}
    
    with tqdm(total=n_rows, desc="Summarizing", unit="game") as pbar:
        for future in as_completed(futures):
            idx, app_id, summary, error = future.result()
            row_idx = combined_df_sorted.index[idx]
            
            # as_completed yields results in finish order, not submission
            # order, so track the smallest and largest app_id of the batch
            # to keep the checkpoint file name a valid "from_to" range.
            if checkpoint_counter == 0:
                checkpoint_start_id = checkpoint_end_id = app_id
            else:
                checkpoint_start_id = min(checkpoint_start_id, app_id)
                checkpoint_end_id = max(checkpoint_end_id, app_id)
            
            if error == 'empty':
                skipped += 1
            elif error:
                failed += 1
            else:
                combined_df_sorted.loc[row_idx, 'reviews_summary'] = summary
                successful += 1
            
            pbar.update(1)
            checkpoint_counter += 1
            completed += 1
            
            # Checkpoint every CHECKPOINT_EVERY results and once more after
            # the very last result. The last result is detected by count:
            # the row with the highest idx is not necessarily the last one
            # to finish.
            if checkpoint_counter >= CHECKPOINT_EVERY or completed == n_rows:
                checkpoint_file = os.path.join(
                    PROCESSED_DIR,
                    f'checkpoint_summaries_appid_{checkpoint_start_id:06d}_to_{checkpoint_end_id:06d}.csv'
                )
                combined_df_sorted[['app_id', 'reviews_summary']].to_csv(checkpoint_file, index=False)
                pbar.write(f"💾 Checkpoint: {os.path.basename(checkpoint_file)}")
                checkpoint_counter = 0

# Guard the ratio: every row may have been skipped, or there may be no rows.
attempted = n_rows - skipped
success_rate = successful / attempted * 100 if attempted else 0.0

print(f"\n{'='*60}")
print(f"✓ Complete!")
print(f"  Successful: {successful}")
print(f"  Failed: {failed}")
print(f"  Skipped: {skipped}")
print(f"  Success rate: {success_rate:.1f}%")
print(f"{'='*60}")

# Save final
combined_df_sorted[['app_id', 'combined_reviews', 'reviews_summary']].to_csv(FINAL_OUTPUT_FILE, index=False)
print(f"\n✓ Final saved: {FINAL_OUTPUT_FILE}")

In [None]:
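# OPTIONAL: Process all remaining rows (uncomment to run)
# NOTE: the parallel-processing cell stores summaries in
# combined_df_sorted['reviews_summary']; this snippet still refers to
# combined_df, so switch it to combined_df_sorted before uncommenting.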

# print("Processing all remaining rows...")
# 
# successful = 0
# failed = 0
# 
# # Process rows that don't have summaries yet
# rows_to_process = combined_df[combined_df['reviews_summary'] == ''].index
# print(f"Remaining rows to process: {len(rows_to_process)}")
# 
# for row_idx in tqdm(rows_to_process, desc="Summarizing remaining reviews"):
#     combined_text = combined_df.loc[row_idx, 'combined_reviews']
#     
#     if not combined_text or len(combined_text.strip()) == 0:
#         failed += 1
#         continue
#     
#     try:
#         summary = summarize_reviews(combined_text, summarizer)
#         combined_df.loc[row_idx, 'reviews_summary'] = summary
#         successful += 1
#     except Exception as e:
#         print(f"\nError at index {row_idx}: {e}")
#         failed += 1
#         continue
# 
# print("\n" + "=" * 60)
# print(f"✓ Full processing complete!")
# print(f"  Additional successful: {successful}")
# print(f"  Additional failed: {failed}")
# print(f"  Total with summaries: {(combined_df['reviews_summary'] != '').sum()}")