In [None]:
# ==============================================================================
# CELL 1: ENVIRONMENT SETUP & DEPENDENCIES
# ==============================================================================

import os
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
import spacy
from dotenv import load_dotenv
from sqlalchemy import create_engine

# 1. LOAD ENVIRONMENT VARIABLES
load_success = load_dotenv()
if load_success:
    print("‚úÖ Environment variables loaded from .env")
else:
    print("‚ö†Ô∏è Warning: .env file not found or empty.")

# 2. CONSTRUCT DATABASE CONNECTION
# Fetch individual parts from your specific .env structure
db_user = os.getenv("DB_USER")
db_pass = os.getenv("DB_PASS")
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")

# Verify we have everything (except password, which might be empty for local dev sometimes)
if not all([db_user, db_host, db_name]):
    print("‚ùå Error: Missing DB_USER, DB_HOST, or DB_NAME in .env file.")
else:
    # Build the SQLAlchemy connection string: postgresql://<user>[:<pass>]@<host>[:<port>]/<db>
    # User and password are percent-encoded so characters such as '@', ':' or '/'
    # cannot be mistaken for URL delimiters.
    credentials = quote_plus(db_user)
    if db_pass:
        # Only add the password part when one is configured; interpolating an
        # unset value would send the literal text "None" as the password.
        credentials = f"{credentials}:{quote_plus(db_pass)}"
    # Same for the port: leave it out when DB_PORT is unset so the driver default applies.
    location = f"{db_host}:{db_port}" if db_port else db_host
    DB_STRING = f"postgresql://{credentials}@{location}/{db_name}"

    try:
        # Create Engine
        engine = create_engine(DB_STRING)

        # Test Connection (opening and closing one connection is the whole check)
        with engine.connect() as conn:
            print(f"‚úÖ Database connection established to {db_host}")

    except Exception as e:
        # Best-effort setup cell: report the failure and let the notebook continue.
        print("‚ùå Database connection failed. Check your password in .env.")
        print(f"   Error details: {e}")

# 3. SPACY MODEL CHECK
# spacy.load raises OSError when the model package is not installed.
try:
    nlp = spacy.load("en_core_web_sm")
    print("‚úÖ spaCy model 'en_core_web_sm' loaded.")
except OSError:
    print("‚ùå spaCy model not found. Run 'python -m spacy download en_core_web_sm' in terminal.")

# 4. CONFIGURATION
# Show full text columns instead of truncating them in DataFrame output.
pd.set_option('display.max_colwidth', None)
print("üöÄ Setup Complete.")

In [None]:
# ==============================================================================
# CELL 2: BUILD FTS QUERY FROM LOTR ENTITIES JSON
# ==============================================================================
# Parse lotr_entities.json to construct a comprehensive tsquery string
# This will be used to search the entire corpus via the content_tsv GIN index

import json

# Load the entity file
entity_file = "../lotr_entities.json"

# Read explicitly as UTF-8 (the JSON interchange encoding) so names with
# diacritics decode the same way on every platform instead of depending on
# the locale's default encoding.
with open(entity_file, 'r', encoding='utf-8') as f:
    lotr_data = json.load(f)

# Accumulator for all searchable terms (canonical names + aliases);
# filled in place by extract_terms().
all_terms = []

def extract_terms(node):
    """Collect canonical names and aliases into the module-level ``all_terms``.

    Dicts are walked value by value. In a list, every dict entry that has a
    "canonical" key contributes that name followed by its "aliases" (if any);
    other list entries are skipped. Any other node type is ignored.
    """
    if isinstance(node, dict):
        for child in node.values():
            extract_terms(child)
        return

    if not isinstance(node, list):
        return

    for entry in node:
        if not (isinstance(entry, dict) and "canonical" in entry):
            continue
        all_terms.append(entry["canonical"])
        all_terms.extend(entry.get("aliases", []))

extract_terms(lotr_data)

print(f"‚úÖ Extracted {len(all_terms):,} searchable terms from {entity_file}")

# Clean and deduplicate: drop single characters, remove duplicates, and sort.
# Sorting matters because plain set iteration order for strings changes between
# kernel sessions, which would make the sample and the generated query differ
# from run to run.
all_terms = sorted({t for t in all_terms if len(t) >= 2})
print(f"   After deduplication: {len(all_terms):,} unique terms")

# Build tsquery string
# Escape single quotes for SQL, wrap multi-word phrases appropriately
def term_to_tsquery(term):
    """Convert a term to a quoted tsquery fragment.

    Args:
        term: Entity name or alias; may contain several whitespace-separated words.

    Returns:
        A single quoted lexeme (``'Gandalf'``) or, for multi-word terms, the
        quoted words chained with the adjacency operator
        (``'Minas' <-> 'Tirith'``).
    """
    # Inside a quoted lexeme both backslashes and single quotes must be doubled.
    # Backslashes go first so the doubling does not touch anything added later.
    escaped = term.replace("\\", "\\\\").replace("'", "''")
    # Multi-word terms need the adjacency operator between the words.
    words = escaped.split()
    if len(words) > 1:
        return " <-> ".join(f"'{w}'" for w in words)
    return f"'{escaped}'"

# One tsquery fragment per term, OR-ed together into a single query string.
tsquery_parts = list(map(term_to_tsquery, all_terms))
lotr_tsquery = " | ".join(tsquery_parts)

# Summary: clause count, a sample of the terms, and the head of the query.
print(
    f"   Built tsquery with {len(tsquery_parts):,} OR clauses",
    f"\nüìã Sample terms: {all_terms[:20]}",
    f"\nüîç Query preview (first 500 chars):\n{lotr_tsquery[:500]}...",
    sep="\n",
)

In [None]:
# ==============================================================================
# CELL 2a: FILTER OUT LOW-PRECISION TERMS
# ==============================================================================
# Remove terms that are too generic or overlap heavily with non-LOTR contexts

# Exact-match (case-sensitive) blocklist of terms judged too generic.
generic_terms = {
    # Single common words
    'Mark', 'Warg', 'Wargs', 'Orcs', 'Goblins', 'Uruks', 'Eagles', 'Dwarves', 
    'Elves', 'Men', 'Ring', 'Rings', 'Tower', 'King', 'Shadow', 'Fire', 'Light',
    'Dark', 'White', 'Black', 'Grey', 'Gray', 'Gold', 'Silver', 'Iron', 'Stone',
    'Dragon', 'Dragons', 'Troll', 'Trolls', 'Spider', 'Spiders',
    
    # Generic phrases
    'The War', 'The Mouth', 'The Eye', 'The Ring', 'The Tower', 'The King',
    'The Shadow', 'The Dark', 'The White', 'The Black', 'The Grey',
    'The Fellowship', 'The Company',
    
    # Too short / ambiguous
    'Sam', 'Tom', 'Bill', 'Ted', 'Bert', 'Fatty',
}

before_filter = len(all_terms)
all_terms = [t for t in all_terms if t not in generic_terms]
print(f"üßπ Filtered out {before_filter - len(all_terms)} generic/ambiguous terms")

# The tsquery was assembled before this filter ran, so it still contains the
# terms just removed. Rebuild it so the filter actually affects the query.
tsquery_parts = [term_to_tsquery(t) for t in all_terms]
lotr_tsquery = " | ".join(tsquery_parts)
print(f"   Rebuilt tsquery with {len(tsquery_parts):,} OR clauses")

In [None]:
# ==============================================================================
# CELL 3: CORPUS-WIDE LOTR COMMENT PULL
# ==============================================================================

print("üîç PULLING LOTR MENTIONS CORPUS-WIDE...")
print("=" * 50)

# Hand-written tsquery used for the corpus-wide pulls (comments here, posts in
# the next cell).
lotr_fts_query = """
    'Lord' <-> 'of' <-> 'the' <-> 'Rings' | Gandalf | Frodo | Aragorn | Mordor | 
    Tolkien | Silmarillion | Hobbiton | Rivendell | Gondor | Rohan | LOTR | LotR | 
    Sauron | Gollum | Legolas | Gimli | Boromir | Saruman | Hobbit | Bilbo | Moria | Isengard
"""

# The tsquery text is passed as a bound parameter (psycopg2 "pyformat" style)
# instead of being spliced into the SQL, so its content can never terminate
# the literal or alter the statement.
comments_sql = """
    SELECT 
        rc.id as comment_id,
        rc.post_id,
        rc.subreddit,
        rc.author,
        rc.content,
        rc.score,
        rc.created_at,
        uac.authenticity_score,
        uac.subreddit_count as author_subreddit_count,
        uac.active_days as author_active_days
    FROM reddit_comments rc
    LEFT JOIN user_authenticity_cache uac ON rc.author = uac.author
    WHERE rc.content_tsv @@ to_tsquery('english', %(tsq)s)
      AND rc.is_deleted = FALSE
"""

df_lotr_comments = pd.read_sql(comments_sql, engine, params={"tsq": lotr_fts_query})

print(f"‚úÖ Loaded {len(df_lotr_comments):,} comments")
print(f"   Unique subreddits: {df_lotr_comments['subreddit'].nunique():,}")
print(f"   Unique authors: {df_lotr_comments['author'].nunique():,}")
print(f"   Date range: {df_lotr_comments['created_at'].min()} to {df_lotr_comments['created_at'].max()}")
print("\nüìä Authenticity distribution:")
# dropna=False keeps authors with no row in the authenticity cache visible.
print(df_lotr_comments['authenticity_score'].value_counts(dropna=False))

In [None]:
# ==============================================================================
# CELL 4: CORPUS-WIDE LOTR POST PULL
# ==============================================================================

print("üîç PULLING LOTR POSTS CORPUS-WIDE...")
print("=" * 50)

# Same full-text filter as the comment pull; lotr_fts_query is defined in the
# previous cell. It is bound as a parameter (psycopg2 "pyformat" style) rather
# than spliced into the SQL text.
posts_sql = """
    SELECT 
        rp.post_id,
        rp.subreddit,
        rp.author,
        rp.title,
        rp.content,
        rp.url,
        rp.score,
        rp.num_comments,
        rp.created_at,
        uac.authenticity_score,
        uac.subreddit_count as author_subreddit_count,
        uac.active_days as author_active_days
    FROM reddit_posts rp
    LEFT JOIN user_authenticity_cache uac ON rp.author = uac.author
    WHERE rp.content_tsv @@ to_tsquery('english', %(tsq)s)
"""

df_lotr_posts = pd.read_sql(posts_sql, engine, params={"tsq": lotr_fts_query})

print(f"‚úÖ Loaded {len(df_lotr_posts):,} posts")
print(f"   Unique subreddits: {df_lotr_posts['subreddit'].nunique():,}")
print(f"   Unique authors: {df_lotr_posts['author'].nunique():,}")
print(f"   Date range: {df_lotr_posts['created_at'].min()} to {df_lotr_posts['created_at'].max()}")
print("\nüìä Authenticity distribution:")
# dropna=False keeps authors with no row in the authenticity cache visible.
print(df_lotr_posts['authenticity_score'].value_counts(dropna=False))

In [None]:
# ==============================================================================
# CELL 5: MERGE POST CONTEXT ONTO COMMENTS
# ==============================================================================

print("üîó MERGING POST CONTEXT ONTO COMMENTS...")
print("=" * 50)

# Create a lookup of post_id -> title with exactly one row per post_id, so the
# merge below can never multiply comment rows.
post_titles = (
    df_lotr_posts[['post_id', 'title']]
    .drop_duplicates(subset='post_id')
    .rename(columns={'title': 'post_title'})
)

# Merge onto comments. Any post_title column left over from a previous run of
# this cell is dropped first; otherwise a re-run would produce post_title_x /
# post_title_y and the lookups below would raise KeyError.
df_lotr_comments = (
    df_lotr_comments
    .drop(columns=['post_title'], errors='ignore')
    .merge(post_titles, on='post_id', how='left', validate='many_to_one')
)

# Check coverage
total_comments = len(df_lotr_comments)
has_title = df_lotr_comments['post_title'].notna().sum()
missing_title = df_lotr_comments['post_title'].isna().sum()
# Guard the percentages so an empty pull prints 0.0% instead of nan.
has_pct = has_title / total_comments * 100 if total_comments else 0.0
missing_pct = missing_title / total_comments * 100 if total_comments else 0.0

print(f"‚úÖ Comments with post title: {has_title:,} ({has_pct:.1f}%)")
print(f"   Comments missing post title: {missing_title:,} ({missing_pct:.1f}%)")

In [None]:
# ==============================================================================
# CELL 5b: FETCH MISSING POST TITLES
# ==============================================================================

# Get post_ids that are missing titles (null ids cannot be looked up, so drop them)
missing_post_ids = (
    df_lotr_comments.loc[df_lotr_comments['post_title'].isna(), 'post_id']
    .dropna()
    .unique()
    .tolist()
)
print(f"üîç Fetching titles for {len(missing_post_ids):,} posts...")

# Batch fetch from reddit_posts. The id list is sent as one bound array
# parameter (psycopg2 adapts a Python list to a PostgreSQL array) instead of
# being joined into a quoted IN (...) literal, so ids are never interpreted as
# SQL and the statement text stays small.
missing_titles_sql = """
    SELECT post_id, title
    FROM reddit_posts
    WHERE post_id = ANY(%(post_ids)s)
"""

df_missing_titles = pd.read_sql(
    missing_titles_sql, engine, params={"post_ids": missing_post_ids}
)
print(f"   Found {len(df_missing_titles):,} titles")

# Update the main dataframe: fill only the rows that still have no title.
title_lookup = dict(zip(df_missing_titles['post_id'], df_missing_titles['title']))
mask = df_lotr_comments['post_title'].isna()
df_lotr_comments.loc[mask, 'post_title'] = df_lotr_comments.loc[mask, 'post_id'].map(title_lookup)

# Final coverage
has_title = df_lotr_comments['post_title'].notna().sum()
print(f"\n‚úÖ Final coverage: {has_title:,} / {len(df_lotr_comments):,} ({has_title/len(df_lotr_comments)*100:.1f}%)")

In [None]:
# ==============================================================================
# CELL 6: SUBREDDIT DISTRIBUTION
# ==============================================================================

print("üìä LOTR MENTIONS BY SUBREDDIT")
print("=" * 50)

# Comments by subreddit: volume, distinct authors, mean score, and the share
# of comments whose author has authenticity_score == 'HIGH'.
sub_comments = df_lotr_comments.groupby('subreddit').agg(
    comment_count=('comment_id', 'count'),
    unique_authors=('author', 'nunique'),
    avg_score=('score', 'mean'),
    high_auth_pct=('authenticity_score', lambda x: (x == 'HIGH').sum() / len(x) * 100)
).round(1)

sub_comments = sub_comments.sort_values('comment_count', ascending=False)

print(f"\nTop 30 subreddits by LOTR comment volume:\n")
print(sub_comments.head(30).to_string())

# How much is concentrated in dedicated LOTR subs?
# Includes 'TalesofTheShiregamers' so this list matches the one the
# external-communities cell uses; otherwise the two cells disagree on what
# counts as "other communities".
lotr_dedicated = ['lotr', 'tolkienfans', 'LOTR_on_Prime', 'lordoftherings', 
                  'Rings_Of_Power', 'lotro', 'lotrlcg', 'lotrmemes', 
                  'TheWarOfTheRohirrim', 'Silmarillionmemes', 'TalesofTheShiregamers']

dedicated_count = df_lotr_comments[df_lotr_comments['subreddit'].isin(lotr_dedicated)]['comment_id'].count()
other_count = len(df_lotr_comments) - dedicated_count

print(f"\nüìç CONCENTRATION:")
print(f"   Dedicated LOTR subs: {dedicated_count:,} ({dedicated_count/len(df_lotr_comments)*100:.1f}%)")
print(f"   Other communities: {other_count:,} ({other_count/len(df_lotr_comments)*100:.1f}%)")

In [None]:
# ==============================================================================
# CELL 7: EXTERNAL COMMUNITIES - LOTR AS CULTURAL REFERENCE
# ==============================================================================

print("üåç LOTR MENTIONS IN NON-DEDICATED COMMUNITIES")
print("=" * 50)

# Subreddits treated as dedicated LOTR communities (updated list).
lotr_dedicated = [
    'lotr', 'tolkienfans', 'LOTR_on_Prime', 'lordoftherings',
    'Rings_Of_Power', 'lotro', 'lotrlcg', 'lotrmemes',
    'TheWarOfTheRohirrim', 'Silmarillionmemes', 'TalesofTheShiregamers',
]

# Everything outside those subs counts as "external".
in_dedicated_sub = df_lotr_comments['subreddit'].isin(lotr_dedicated)
df_lotr_external = df_lotr_comments[~in_dedicated_sub]

print(f"Total external comments: {len(df_lotr_external):,}")
print(f"Unique subreddits: {df_lotr_external['subreddit'].nunique()}")
print(f"Unique authors: {df_lotr_external['author'].nunique():,}")

# Per-subreddit volume, author count and mean score, largest first.
external_subs = df_lotr_external.groupby('subreddit').agg(
    comment_count=('comment_id', 'count'),
    unique_authors=('author', 'nunique'),
    avg_score=('score', 'mean'),
)
external_subs = external_subs.round(1)
external_subs = external_subs.sort_values('comment_count', ascending=False)

print(f"\nüìã ALL {len(external_subs)} EXTERNAL SUBREDDITS:\n")
print(external_subs.to_string())

In [None]:
# ==============================================================================
# CELL 8: SAMPLE COMMENTS FROM TOP EXTERNAL COMMUNITIES
# ==============================================================================

# The 15 largest external subreddits (external_subs is already sorted by volume).
top_external = external_subs.index[:15].tolist()

banner = '=' * 60
for sub in top_external:
    print(f"\n{banner}")
    print(f"r/{sub} ({external_subs.loc[sub, 'comment_count']} comments)")
    print(banner)

    # Three highest-scoring comments from this subreddit.
    in_this_sub = df_lotr_external['subreddit'] == sub
    samples = df_lotr_external[in_this_sub].nlargest(3, 'score')

    for _, comment in samples.iterrows():
        print(f"\n[score: {comment['score']}]")
        print(f"{comment['content'][:400]}...")

In [None]:
# ==============================================================================
# CELL 9: TALES OF THE SHIRE DEEP DIVE
# ==============================================================================

# LOTR-matching comments posted in r/TalesofTheShiregamers.
is_tots_comment = df_lotr_comments['subreddit'] == 'TalesofTheShiregamers'
df_tales = df_lotr_comments[is_tots_comment]

print("üìä r/TalesofTheShiregamers")
print("=" * 50)
print(f"Total comments: {len(df_tales):,}")
print(f"Unique authors: {df_tales['author'].nunique()}")
first_day = df_tales['created_at'].min().date()
last_day = df_tales['created_at'].max().date()
print(f"Date range: {first_day} to {last_day}")
print(f"Avg score: {df_tales['score'].mean():.1f}")

print("\nüìà Score distribution:")
print(df_tales['score'].describe())

print("\nüìù Top 20 comments by score:\n")
top_scored = df_tales.nlargest(20, 'score')
for _, comment in top_scored.iterrows():
    # The title can be missing when the parent post was not found.
    if pd.notna(comment['post_title']):
        title_preview = comment['post_title'][:80]
    else:
        title_preview = 'N/A'
    print(f"[{comment['score']}] u/{comment['author']} | {comment['created_at'].date()}")
    print(f"Post: {title_preview}...")
    print(f"{comment['content'][:300]}...")
    print()

In [None]:
# ==============================================================================
# CELL 10: TALES OF THE SHIRE - FULL COMMENT CORPUS
# ==============================================================================

print("üè° PULLING ALL r/TalesofTheShiregamers COMMENTS WITH POST CONTEXT...")
print("=" * 50)

# All non-deleted comments in the subreddit (no full-text filter), each joined
# to its parent post. LEFT JOIN: post_* columns are NULL when the post is not
# present in reddit_posts.
tales_full_sql = """
    SELECT 
        rc.id as comment_id,
        rc.post_id,
        rc.author,
        rc.content,
        rc.score,
        rc.created_at,
        rp.title as post_title,
        rp.content as post_content,
        rp.score as post_score,
        rp.num_comments as post_num_comments
    FROM reddit_comments rc
    LEFT JOIN reddit_posts rp ON rc.post_id = rp.post_id
    WHERE rc.subreddit = 'TalesofTheShiregamers'
      AND rc.is_deleted = FALSE
"""

df_tales_full = pd.read_sql(tales_full_sql, engine)

print(f"‚úÖ Loaded {len(df_tales_full):,} comments")
print(f"   Unique posts: {df_tales_full['post_id'].nunique()}")
print(f"   Unique authors: {df_tales_full['author'].nunique()}")
print(f"   Date range: {df_tales_full['created_at'].min().date()} to {df_tales_full['created_at'].max().date()}")

print(f"\nüìä Top 15 posts by comment count:\n")
# dropna=False keeps posts whose title is NULL (post missing from
# reddit_posts); the default would silently drop all their comments from the
# ranking.
top_posts = (
    df_tales_full
    .groupby(['post_id', 'post_title'], dropna=False)
    .agg(
        comment_count=('comment_id', 'count'),
        post_score=('post_score', 'first'),
    )
    .sort_values('comment_count', ascending=False)
    .head(15)
)

for (post_id, title), row in top_posts.iterrows():
    title_preview = title[:70] if pd.notna(title) else 'N/A'
    # iterrows() may upcast the count to float when post_score holds NaN.
    print(f"[{int(row['comment_count'])} comments | score {row['post_score']}] {title_preview}...")

In [None]:
# ==============================================================================
# CELL 11: TALES OF THE SHIRE AUTHOR CROSS-PARTICIPATION
# ==============================================================================

print("üîç WHERE ELSE DO TotS CONTRIBUTORS PARTICIPATE?")
print("=" * 50)

# Get all TotS authors (rows with a NULL author cannot be matched, so drop them)
# NOTE(review): if the data contains placeholder or bot accounts (e.g.
# '[deleted]'), they would inflate the overlap below; confirm and exclude.
tots_authors = df_tales_full['author'].dropna().unique().tolist()
print(f"TotS unique authors: {len(tots_authors):,}")

# Find all their comments across the corpus. The author list is sent as one
# bound array parameter (psycopg2 adapts a Python list to a PostgreSQL array)
# rather than hand-escaped and joined into the SQL text.
cross_participation_sql = """
    SELECT 
        subreddit,
        COUNT(*) as comment_count,
        COUNT(DISTINCT author) as author_count
    FROM reddit_comments
    WHERE author = ANY(%(authors)s)
      AND is_deleted = FALSE
    GROUP BY subreddit
    ORDER BY author_count DESC
"""

df_cross = pd.read_sql(cross_participation_sql, engine, params={"authors": tots_authors})

# Add percentage of TotS authors who participate in each sub
df_cross['pct_of_tots_authors'] = (df_cross['author_count'] / len(tots_authors) * 100).round(1)

print(f"\n‚úÖ TotS authors appear in {len(df_cross):,} subreddits")
print(f"\nüìä Top 40 subreddits by TotS author overlap:\n")
print(df_cross.head(40).to_string(index=False))

In [None]:
# ==============================================================================
# CELL 12: TOTS AUTHORS IN r/CozyGamers
# ==============================================================================

print("üéÆ WHAT DO TotS AUTHORS SAY IN r/CozyGamers?")
print("=" * 50)

# Get TotS authors' comments in CozyGamers. Only real string names are sent;
# the list goes to the database as one bound array parameter (psycopg2 adapts
# a Python list to a PostgreSQL array) instead of being spliced into the SQL.
cozy_authors = [a for a in tots_authors if isinstance(a, str)]

cozygamers_sql = """
    SELECT 
        rc.id as comment_id,
        rc.post_id,
        rc.author,
        rc.content,
        rc.score,
        rc.created_at,
        rp.title as post_title
    FROM reddit_comments rc
    LEFT JOIN reddit_posts rp ON rc.post_id = rp.post_id
    WHERE rc.author = ANY(%(authors)s)
      AND rc.subreddit = 'CozyGamers'
      AND rc.is_deleted = FALSE
    ORDER BY rc.score DESC
"""

df_tots_cozy = pd.read_sql(cozygamers_sql, engine, params={"authors": cozy_authors})

print(f"‚úÖ {len(df_tots_cozy):,} comments from {df_tots_cozy['author'].nunique()} TotS authors")
# min()/max() of an empty column is NaT, which has no usable .date(); skip the
# range line instead of crashing when nothing matched.
if not df_tots_cozy.empty:
    print(f"   Date range: {df_tots_cozy['created_at'].min().date()} to {df_tots_cozy['created_at'].max().date()}")

print(f"\nüìù Top 25 comments by score:\n")
# Rows arrive sorted by score (ORDER BY in the query), so head(25) is the top 25.
for _, row in df_tots_cozy.head(25).iterrows():
    print(f"[{row['score']}] u/{row['author']} | {row['created_at'].date()}")
    print(f"Post: {row['post_title'][:70] if pd.notna(row['post_title']) else 'N/A'}...")
    print(f"{row['content'][:250]}...")
    print()

In [None]:
# ==============================================================================
# CELL 13: GAME MENTIONS BY TOTS AUTHORS IN r/CozyGamers
# ==============================================================================

print("üéÆ GAMES MENTIONED BY TotS AUTHORS IN r/CozyGamers")
print("=" * 50)

# Define games to search for (common cozy games)
# Some entries contain others ('Disney Dreamlight Valley' / 'Dreamlight Valley',
# 'My Time at Portia' / 'Portia', 'My Time at Sandrock' / 'Sandrock'), so one
# comment can be counted under both names.
cozy_games = [
    'Animal Crossing', 'Stardew Valley', 'Disney Dreamlight Valley', 'Dreamlight Valley',
    'Coral Island', 'Fae Farm', 'Rune Factory', 'Story of Seasons', 'Harvest Moon',
    'Wylde Flowers', 'Cat Cafe Manager', 'Luma Island', 'Spiritfarer', 'Cozy Grove',
    'Sun Haven', 'Ooblets', 'Slime Rancher', 'Unpacking', 'A Short Hike',
    'Moonstone Island', 'Palia', 'Roots of Pacha', 'Dinkum', 'Portia', 'Sandrock',
    'Fantasy Life', 'Littlewood', 'Garden Story', 'Witchbrook', 'Haunted Chocolatier',
    'Tales of the Shire', 'Tales from the Shire', 'TotS', 'Fields of Mistria',
    'Calico', 'Bear and Breakfast', 'Snacko', 'Mineko', 'Yokai Inn', 'Echoes of the Plum Grove',
    'Sims', 'My Time at Portia', 'My Time at Sandrock', 'Everholm', 'Travellers Rest'
]

# Count mentions: number of comments containing the name (case-insensitive).
# regex=False makes every name a literal substring, so a title containing
# regex metacharacters cannot change the match or raise a pattern error.
game_counts = {}
for game in cozy_games:
    count = df_tots_cozy['content'].str.contains(game, case=False, na=False, regex=False).sum()
    if count > 0:
        game_counts[game] = count

# Sort by frequency
game_counts_sorted = dict(sorted(game_counts.items(), key=lambda x: x[1], reverse=True))

print(f"\nüìä Game mentions (in {len(df_tots_cozy)} comments):\n")
for game, count in game_counts_sorted.items():
    pct = count / len(df_tots_cozy) * 100
    print(f"{game:30} {count:4} ({pct:.1f}%)")

In [None]:
# ==============================================================================
# CELL 14: CHECK NLP LIBRARIES
# ==============================================================================

print("üîç CHECKING NLP LIBRARIES...")
print("=" * 50)

# Library name -> version string; None until the import succeeds.
libraries = dict.fromkeys(['transformers', 'torch', 'textstat', 'scipy', 'sklearn'])

for lib in libraries:
    try:
        module = __import__(lib)
        # Fall back to the word 'installed' when the module has no __version__.
        version = getattr(module, '__version__', 'installed')
        libraries[lib] = version
        print(f"‚úÖ {lib}: {version}")
    except ImportError:
        print(f"‚ùå {lib}: NOT INSTALLED")

print("\nüìã Summary:")
# Anything still mapped to None failed to import.
missing = [name for name in libraries if libraries[name] is None]
if not missing:
    print("   All libraries available")
else:
    print(f"   Missing: {', '.join(missing)}")
    print(f"   Install with: pip install {' '.join(missing)}")

In [None]:
# ==============================================================================
# CELL 14b: CHECK NLP MODELS
# ==============================================================================

# Smoke-test the models and libraries used later in the notebook. Each check
# is wrapped in try/except so one failure does not stop the others; a failed
# load leaves the corresponding name unbound.
print("üîç CHECKING NLP MODELS...")
print("=" * 50)

from transformers import pipeline
import warnings
# Silences every warning for the rest of the kernel session, not just this cell.
warnings.filterwarnings('ignore')

# Test GoEmotions (Reddit-trained emotion detection)
# Binds `emotion_classifier`; top_k=5 asks for the five highest-scoring labels.
print("\n1. GoEmotions (Reddit-trained, 27 emotions)...")
try:
    emotion_classifier = pipeline("text-classification", 
                                  model="SamLowe/roberta-base-go_emotions", 
                                  top_k=5)
    test = emotion_classifier("This game rocks! I love it!")
    print(f"   ‚úÖ Loaded successfully")
    # test[0] is the label list for the single input; show its first three entries.
    print(f"   Test output: {test[0][:3]}")
except Exception as e:
    print(f"   ‚ùå Failed: {e}")

# Test sentiment (social media trained)
# Binds `sentiment_classifier`.
print("\n2. Twitter-RoBERTa Sentiment...")
try:
    sentiment_classifier = pipeline("sentiment-analysis",
                                    model="cardiffnlp/twitter-roberta-base-sentiment-latest")
    test = sentiment_classifier("This game rocks! I love it!")
    print(f"   ‚úÖ Loaded successfully")
    print(f"   Test output: {test}")
except Exception as e:
    print(f"   ‚ùå Failed: {e}")

# Test textstat
# The import sits inside the try so a missing package is reported like any
# other failure.
print("\n3. Textstat (reading level)...")
try:
    import textstat
    test_text = "The hobbits of the Shire enjoy a peaceful life of farming and good food."
    fk = textstat.flesch_kincaid_grade(test_text)
    print(f"   ‚úÖ Loaded successfully")
    print(f"   Test Flesch-Kincaid grade: {fk}")
except Exception as e:
    print(f"   ‚ùå Failed: {e}")

print("\nüìã Ready to process!")

In [None]:
# ==============================================================================
# CELL 14c: TRY ALTERNATIVE SENTIMENT MODELS
# ==============================================================================

# Try three alternative sentiment models. Each attempt is independent: a
# failure is printed and the corresponding name is left unbound (or, for
# `sentiment_classifier`, left at whatever an earlier cell bound it to).
print("üîç TRYING ALTERNATIVE SENTIMENT MODELS...")
print("=" * 50)

from transformers import pipeline
import warnings
# Silences every warning for the rest of the kernel session, not just this cell.
warnings.filterwarnings('ignore')

# Option 1: distilbert sentiment (very reliable)
# Rebinds `sentiment_classifier`, replacing the Twitter-RoBERTa pipeline from
# the model-check cell if that one loaded.
print("\n1. DistilBERT Sentiment...")
try:
    sentiment_classifier = pipeline("sentiment-analysis",
                                    model="distilbert-base-uncased-finetuned-sst-2-english")
    test = sentiment_classifier("This game rocks! I love it!")
    print(f"   ‚úÖ Loaded successfully")
    print(f"   Test output: {test}")
except Exception as e:
    print(f"   ‚ùå Failed: {e}")

# Option 2: nlptown 5-star sentiment (more granular)
# Binds `sentiment_5star`, the model the TotS NLP analysis cell calls.
print("\n2. NLPTown 5-star Sentiment...")
try:
    sentiment_5star = pipeline("sentiment-analysis",
                               model="nlptown/bert-base-multilingual-uncased-sentiment")
    test = sentiment_5star("This game rocks! I love it!")
    print(f"   ‚úÖ Loaded successfully")
    print(f"   Test output: {test}")
except Exception as e:
    print(f"   ‚ùå Failed: {e}")

# Option 3: cardiffnlp older version
# Binds `sentiment_twitter`.
print("\n3. Cardiff Twitter Sentiment (older)...")
try:
    sentiment_twitter = pipeline("sentiment-analysis",
                                 model="cardiffnlp/twitter-roberta-base-sentiment")
    test = sentiment_twitter("This game rocks! I love it!")
    print(f"   ‚úÖ Loaded successfully")
    print(f"   Test output: {test}")
except Exception as e:
    print(f"   ‚ùå Failed: {e}")

In [None]:
# ==============================================================================
# CELL 15: NLP ANALYSIS ON TALES OF THE SHIRE
# ==============================================================================

import textstat
from tqdm import tqdm

print("üß† RUNNING NLP ANALYSIS ON TotS COMMENTS...")
print(f"   Processing {len(df_tales_full):,} comments")
print("=" * 50)

# Combine post_title + content for context
df_tales_full['full_text'] = df_tales_full.apply(
    lambda row: f"Post: {row['post_title']} Comment: {row['content']}" 
    if pd.notna(row['post_title']) else row['content'], 
    axis=1
)

# Truncate to the first 512 characters. This is a character cut, not a token
# cut; it is used here as a cheap way to stay under the models' input limit.
df_tales_full['full_text_truncated'] = df_tales_full['full_text'].str[:512]

# Initialize results (one entry per row, in row order)
sentiments = []
emotions = []
reading_levels = []

# Process row by row
print("\n‚è≥ Processing (this may take a few minutes)...")

for _, row in tqdm(df_tales_full.iterrows(), total=len(df_tales_full)):
    text = row['full_text_truncated']

    # Each step is best-effort per row: a failure records None and moves on.
    # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
    # working during this long loop.

    # Sentiment (5-star)
    try:
        sent = sentiment_5star(text)[0]
        sentiments.append({'label': sent['label'], 'score': sent['score']})
    except Exception:
        sentiments.append({'label': None, 'score': None})

    # Emotion (top 3)
    try:
        emo = emotion_classifier(text)[0][:3]
        emotions.append(emo)
    except Exception:
        emotions.append(None)

    # Reading level (on comment only, not combined)
    try:
        rl = textstat.flesch_kincaid_grade(row['content'])
        reading_levels.append(rl)
    except Exception:
        reading_levels.append(None)

# Add to dataframe
df_tales_full['sentiment_label'] = [s['label'] for s in sentiments]
df_tales_full['sentiment_score'] = [s['score'] for s in sentiments]
df_tales_full['emotions'] = emotions
df_tales_full['reading_level'] = reading_levels

# Surface swallowed failures so an unloaded model (every row failing) is not
# mistaken for a clean run.
failed_sentiment = sum(s['label'] is None for s in sentiments)
failed_emotion = sum(e is None for e in emotions)
failed_reading = sum(r is None for r in reading_levels)
if failed_sentiment or failed_emotion or failed_reading:
    print(f"\n   Rows without a result - sentiment: {failed_sentiment:,}, "
          f"emotion: {failed_emotion:,}, reading level: {failed_reading:,}")

print("\n‚úÖ NLP analysis complete!")

In [None]:
# ==============================================================================
# CELL 16: TALES OF THE SHIRE NLP SUMMARY
# ==============================================================================

print("üìä TALES OF THE SHIRE NLP ANALYSIS RESULTS")
print("=" * 50)

# How many comments received each star label.
print("\nüé≠ SENTIMENT (5-star scale):")
print(df_tales_full['sentiment_label'].value_counts())

# Mean of the classifier's confidence in its chosen label.
mean_confidence = df_tales_full['sentiment_score'].mean()
print(f"\nAverage confidence: {mean_confidence:.2f}")

# Flesch-Kincaid grade statistics.
reading = df_tales_full['reading_level']
print("\nüìñ READING LEVEL (Flesch-Kincaid Grade):")
print(f"   Mean: {reading.mean():.1f}")
print(f"   Median: {reading.median():.1f}")
print(f"   Std: {reading.std():.1f}")

# Tally every label stored per comment (up to three each), so the percentages
# below can add up to more than 100.
print("\nüí´ TOP EMOTIONS (aggregated):")
emotion_counts = {}
all_labels = (
    entry['label']
    for emo_list in df_tales_full['emotions'].dropna()
    for entry in emo_list
)
for label in all_labels:
    emotion_counts[label] = emotion_counts.get(label, 0) + 1

# Most frequent first; ties keep first-seen order.
emotion_sorted = sorted(emotion_counts.items(), key=lambda kv: -kv[1])
total_comments = len(df_tales_full)
for emotion, count in emotion_sorted[:15]:
    pct = count / total_comments * 100
    print(f"   {emotion:20} {count:5} ({pct:.1f}%)")

In [None]:
# ==============================================================================
# CELL 17: WHAT'S DRIVING NEGATIVE EMOTIONS?
# ==============================================================================

print("üîç COMMENTS WITH HIGH CONFUSION/DISAPPOINTMENT")
print("=" * 50)

# Find comments where confusion or disappointment was top emotion
def get_top_emotion(emo_list):
    """Return the 'label' of the first entry in emo_list, or None if it is empty or None."""
    if not emo_list:
        return None
    return emo_list[0]['label']

# Label of the first stored emotion per comment (None when no result exists).
df_tales_full['top_emotion'] = df_tales_full['emotions'].apply(get_top_emotion)

# Confusion drivers
print("\nüòï TOP CONFUSION COMMENTS:\n")
confused = df_tales_full[df_tales_full['top_emotion'] == 'confusion'].nlargest(10, 'score')
for _, row in confused.iterrows():
    # post_title comes from a LEFT JOIN and may be missing; slicing NaN would
    # raise TypeError, so fall back to 'N/A' as the other display cells do.
    title_preview = row['post_title'][:50] if pd.notna(row['post_title']) else 'N/A'
    print(f"[score {row['score']}] Post: {title_preview}...")
    print(f"   {row['content'][:200]}...")
    print()

# Disappointment drivers
print("\nüòû TOP DISAPPOINTMENT COMMENTS:\n")
disappointed = df_tales_full[df_tales_full['top_emotion'] == 'disappointment'].nlargest(10, 'score')
for _, row in disappointed.iterrows():
    title_preview = row['post_title'][:50] if pd.notna(row['post_title']) else 'N/A'
    print(f"[score {row['score']}] Post: {title_preview}...")
    print(f"   {row['content'][:200]}...")
    print()

In [None]:
# ==============================================================================
# CELL 18: TOTS FEATURE REQUESTS / WISHLIST
# ==============================================================================

print("üìù TALES OF THE SHIRE WISHLIST ANALYSIS")
print("=" * 50)

# Titles containing any of these substrings (case-insensitive) are treated as
# potential wishlist posts.
wishlist_keywords = ['feature', 'want', 'wish', 'add', 'should', 'could', 'expand', 'missing', 'need', 'hope']
pattern = '|'.join(wishlist_keywords)

title_matches = df_tales_full['post_title'].str.contains(pattern, case=False, na=False)
wishlist_posts = df_tales_full.loc[title_matches, 'post_title'].unique()

print(f"Found {len(wishlist_posts)} potential wishlist posts:\n")
# Comments per title, computed once instead of re-filtering the frame per title.
comments_per_title = df_tales_full['post_title'].value_counts()
for title in wishlist_posts[:30]:
    count = comments_per_title[title]
    print(f"[{count} comments] {title[:70]}...")

In [None]:
# ==============================================================================
# CELL 19: EXTRACT SPECIFIC FEATURE REQUESTS
# ==============================================================================

print("üìù DETAILED WISHLIST EXTRACTION")
print("=" * 50)

# Exact titles of the wishlist posts to look at in detail.
wishlist_titles = [
    "What features would you like to see added?",
    "Add-ons you think would improve the game even more!",
    "Loving it!!! But I'm missing a few things",
    "I wish you could sell meals...",
    "I wish I could rearrange!!",
    "Wish there was seasonal events",
    "Have they said anything about adding multiplayer?",
    "What could be expanded?",
]

on_wishlist_post = df_tales_full['post_title'].isin(wishlist_titles)
df_wishlist = df_tales_full[on_wishlist_post]
print(f"Wishlist comments: {len(df_wishlist)}")

print(f"\n{'=' * 60}")
for title in wishlist_titles:
    post_comments = df_tales_full[df_tales_full['post_title'] == title]
    # Titles with no comments in the corpus are skipped silently.
    if len(post_comments) == 0:
        continue

    print(f"\nüìå {title}")
    print(f"   ({len(post_comments)} comments)")
    print("-" * 50)
    # Five highest-scoring comments on this post.
    for _, comment in post_comments.nlargest(5, 'score').iterrows():
        print(f"\n   [{comment['score']}] {comment['content'][:250]}...")

In [None]:
# ==============================================================================
# CELL 20: THEME VALIDATION BEYOND WISHLIST POSTS
# ==============================================================================

print("üîç VALIDATING THEMES ACROSS FULL CORPUS (excluding wishlist posts)")
print("=" * 60)

# Wishlist posts we already analyzed
wishlist_titles = [
    'What features would you like to see added?',
    'Add-ons you think would improve the game even more!',
    'Loving it!!! But I\'m missing a few things',
    'I wish you could sell meals...',
    'I wish I could rearrange!!',
    'Wish there was seasonal events',
    'Have they said anything about adding multiplayer?',
    'What could be expanded?'
]

# Exclude wishlist posts, require 100+ char comments.
# Rows with a missing post_title pass the ~isin() test and stay in the corpus.
df_non_wishlist = df_tales_full[
    (~df_tales_full['post_title'].isin(wishlist_titles)) &
    (df_tales_full['content'].str.len() >= 100)
]

print(f"Corpus: {len(df_non_wishlist):,} comments (100+ chars, excluding wishlist posts)")

# Theme keywords (whole-word, case-insensitive regexes)
themes = {
    'furniture_sitting': r'\b(sit|sitting|chair|bench|couch|furniture)\b',
    'animals': r'\b(pet|petting|chicken|chickens|animal|animals|horse|duck)\b',
    'social_npc': r'\b(invite|invited|dinner|lonely|npc|dialogue|gossip)\b',
    'birthday_events': r'\b(birthday|party|festival|event|celebration)\b',
}

print("\n" + "="*60)

for theme, pattern in themes.items():
    matches = df_non_wishlist[
        df_non_wishlist['content'].str.contains(pattern, case=False, na=False, regex=True)
    ]
    
    print(f"\nüìå {theme.upper()}: {len(matches)} mentions outside wishlist posts")
    print("-" * 50)
    
    for _, row in matches.nlargest(5, 'score').iterrows():
        # Slicing a missing (NaN) title would raise TypeError, so fall back to
        # 'N/A' as the other display cells do.
        title_preview = row['post_title'][:60] if pd.notna(row['post_title']) else 'N/A'
        print(f"\n[score {row['score']}] Post: {title_preview}...")
        print(f"   {row['content'][:300]}...")

In [None]:
# ==============================================================================
# CELL 21: WHAT'S WORKING (PRAISE EXTRACTION)
# ==============================================================================

print("üíö WHAT PLAYERS PRAISE (positive emotion comments)")
print("=" * 60)

# Filter to comments where top emotion is positive.
# Relies on the 'top_emotion' column added by the negative-emotions cell.
positive_emotions = ['love', 'admiration', 'joy', 'gratitude']

df_praise = df_tales_full[
    (df_tales_full['top_emotion'].isin(positive_emotions)) &
    (df_tales_full['content'].str.len() >= 100)
]

print(f"Comments with positive top emotion (100+ chars): {len(df_praise):,}")
print(f"\nBreakdown:")
print(df_praise['top_emotion'].value_counts())

print("\n" + "="*60)

for emotion in positive_emotions:
    matches = df_praise[df_praise['top_emotion'] == emotion]
    if len(matches) == 0:
        continue
        
    print(f"\nüí´ {emotion.upper()}: {len(matches)} comments")
    print("-" * 50)
    
    for _, row in matches.nlargest(5, 'score').iterrows():
        # post_title comes from a LEFT JOIN and may be missing; slicing NaN
        # would raise TypeError, so fall back to 'N/A'.
        title_preview = row['post_title'][:60] if pd.notna(row['post_title']) else 'N/A'
        print(f"\n[score {row['score']}] Post: {title_preview}...")
        print(f"   {row['content'][:300]}...")

In [None]:
# ==============================================================================
# CELL 22: TEMPORAL ANALYSIS
# ==============================================================================

print("üìÖ TEMPORAL ANALYSIS: ARE THEMES PERSISTENT OR FADING?")
print("=" * 60)

import matplotlib.pyplot as plt

# Add week column: start timestamp of the weekly period each comment falls in
# (vectorised accessor instead of a per-row Python lambda).
df_tales_full['week'] = df_tales_full['created_at'].dt.to_period('W').dt.start_time

# Overall volume by week
weekly_volume = df_tales_full.groupby('week').size()

print(f"Date range: {df_tales_full['week'].min()} to {df_tales_full['week'].max()}")
print(f"Total weeks: {len(weekly_volume)}")

# Theme keywords (same as Cell 20)
themes = {
    'furniture_sitting': r'\b(sit|sitting|chair|bench|couch|furniture)\b',
    'animals': r'\b(pet|petting|chicken|chickens|animal|animals|horse|duck)\b',
    'social_npc': r'\b(invite|invited|dinner|lonely|npc|dialogue|gossip)\b',
    'birthday_events': r'\b(birthday|party|festival|event|celebration)\b',
}

# Count theme mentions by week
theme_weekly = {}
for theme, pattern in themes.items():
    matches = df_tales_full[
        df_tales_full['content'].str.contains(pattern, case=False, na=False, regex=True)
    ]
    theme_weekly[theme] = matches.groupby('week').size()

# Create dataframe for plotting; weeks without any mention of a theme become 0.
df_temporal = pd.DataFrame({
    'total_volume': weekly_volume,
    **theme_weekly
}).fillna(0)

# Calculate percentage of total (controls for volume fluctuation)
for theme in themes.keys():
    df_temporal[f'{theme}_pct'] = df_temporal[theme] / df_temporal['total_volume'] * 100

print("\nüìä WEEKLY THEME MENTIONS (as % of total comments):\n")
print(df_temporal[[f'{t}_pct' for t in themes.keys()]].round(1).tail(15).to_string())

# Summary stats: mean weekly share in the first vs second half of the weeks.
print("\n" + "="*60)
print("üìà THEME PERSISTENCE (avg % of weekly comments):\n")
midpoint = len(df_temporal) // 2
for theme in themes.keys():
    col = f'{theme}_pct'
    first_half = df_temporal[col].iloc[:midpoint].mean()
    second_half = df_temporal[col].iloc[midpoint:].mean()
    # A relative change is undefined when the baseline is 0 (or there is no
    # first half); report "n/a" rather than a misleading "+0%".
    if first_half > 0:
        change = (second_half - first_half) / first_half * 100
        change_text = f"{change:+.0f}%"
    else:
        change = float('nan')
        change_text = "n/a"
    print(f"   {theme:20} First half: {first_half:.1f}%  |  Second half: {second_half:.1f}%  |  Change: {change_text}")

In [None]:
# Show all DataFrame variables in memory
import pandas as pd

# globals() is copied into a list before filtering so that names bound while
# this cell runs cannot change the dict during iteration.
frame_shapes = {
    var_name: value.shape
    for var_name, value in list(globals().items())
    if isinstance(value, pd.DataFrame)
}

for var_name, (n_rows, n_cols) in frame_shapes.items():
    print(f"{var_name}: {n_rows:,} rows, {n_cols} cols")

In [None]:
# Export the main analysis frames as CSV files under ../output
# (the directory is created if it does not exist yet).
import os
os.makedirs('../output', exist_ok=True)

# These frames are written without their index (index=False).
df_lotr_comments.to_csv('../output/lotr_comments_69k.csv', index=False)
df_lotr_posts.to_csv('../output/lotr_posts_5k.csv', index=False)
df_wishlist.to_csv('../output/wishlist_signals.csv', index=False)
df_praise.to_csv('../output/praise_signals.csv', index=False)
df_lotr_external.to_csv('../output/external_sub_comments.csv', index=False)
df_tales_full.to_csv('../output/tales_full.csv', index=False)

# df_temporal is indexed by week (it is assembled from per-week groupby
# counts), so the index IS the date column. Writing it with index=False would
# produce a file with no way to tell which week each row belongs to.
df_temporal.to_csv('../output/temporal_analysis.csv', index=True, index_label='week')

print("Done. Files saved to LOTR/output/")