In [1]:
# ==============================================================================
# CELL 1: ENVIRONMENT SETUP & DEPENDENCIES
# ==============================================================================

import os
from dotenv import load_dotenv
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
import spacy

# 1. LOAD ENVIRONMENT VARIABLES
load_success = load_dotenv()
if load_success:
    print(f"✅ Environment variables loaded from .env")
else:
    print(f"⚠️ Warning: .env file not found or empty.")

# 2. CONSTRUCT DATABASE CONNECTION
# Fetch individual parts from your specific .env structure
db_user = os.getenv("DB_USER")
db_pass = os.getenv("DB_PASS")
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")

# Verify we have everything (except password, which might be empty for local dev sometimes)
if not all([db_user, db_host, db_name]):
    print("❌ Error: Missing DB_USER, DB_HOST, or DB_NAME in .env file.")
else:
    # Build the SQLAlchemy connection string: postgresql://<user>:<pass>@<host>:<port>/<db>
    # We use f-strings to assemble it dynamically
    DB_STRING = f"postgresql://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}"
    
    try:
        # Create Engine
        engine = create_engine(DB_STRING)
        
        # Test Connection
        with engine.connect() as conn:
            print(f"✅ Database connection established to {db_host}")
            
    except Exception as e:
        print(f"❌ Database connection failed. Check your password in .env.")
        print(f"   Error details: {e}")

# 3. SPACY MODEL CHECK
try:
    nlp = spacy.load("en_core_web_sm")
    print("✅ spaCy model 'en_core_web_sm' loaded.")
except OSError:
    print("❌ spaCy model not found. Run 'python -m spacy download en_core_web_sm' in terminal.")

# 4. CONFIGURATION
pd.set_option('display.max_colwidth', None)
print("🚀 Setup Complete.")

✅ Environment variables loaded from .env
✅ Database connection established to db.nfrqwoqhwncixlqkpvwl.supabase.co
✅ spaCy model 'en_core_web_sm' loaded.
🚀 Setup Complete.


In [8]:
# ==============================================================================
# CELL 2: BUILD FTS QUERY FROM LOTR ENTITIES JSON
# ==============================================================================
# Parse lotr_entities.json to construct a comprehensive tsquery string
# This will be used to search the entire corpus via the content_tsv GIN index

import json

# Load the entity file
entity_file = "../lotr_entities.json"

with open(entity_file, 'r') as f:
    lotr_data = json.load(f)

# Extract all searchable terms (canonical names + aliases)
all_terms = []

def extract_terms(node):
    """Recursively extract canonical names and aliases from nested structure."""
    if isinstance(node, list):
        for item in node:
            if isinstance(item, dict) and "canonical" in item:
                all_terms.append(item["canonical"])
                all_terms.extend(item.get("aliases", []))
    elif isinstance(node, dict):
        for val in node.values():
            extract_terms(val)

extract_terms(lotr_data)

print(f"✅ Extracted {len(all_terms):,} searchable terms from {entity_file}")

# Clean and deduplicate
all_terms = list(set(all_terms))  # Remove duplicates
all_terms = [t for t in all_terms if len(t) >= 2]  # Drop single characters
print(f"   After deduplication: {len(all_terms):,} unique terms")

# Build tsquery string
# Escape single quotes for SQL, wrap multi-word phrases appropriately
def term_to_tsquery(term):
    """Convert a term to tsquery format."""
    # Escape single quotes
    escaped = term.replace("'", "''")
    # Multi-word terms need adjacency operator
    words = escaped.split()
    if len(words) > 1:
        return " <-> ".join(f"'{w}'" for w in words)
    else:
        return f"'{escaped}'"

tsquery_parts = [term_to_tsquery(t) for t in all_terms]
lotr_tsquery = " | ".join(tsquery_parts)

print(f"   Built tsquery with {len(tsquery_parts):,} OR clauses")
print(f"\n📋 Sample terms: {all_terms[:20]}")
print(f"\n🔍 Query preview (first 500 chars):\n{lotr_tsquery[:500]}...")

✅ Extracted 563 searchable terms from ../lotr_entities.json
   After deduplication: 561 unique terms
   Built tsquery with 561 OR clauses

📋 Sample terms: ['Durin the Deathless', 'Gaffer Gamgee', 'Durin I', 'Bombadil', 'Luthien Tinuviel', 'Barad Dur', 'The Mouth', 'Lord of the Nazgûl', 'Hobbit movies', 'Arwen Undomiel', 'Numenor', 'Elvenking', 'The War', 'Mark', 'Dwalin', 'Nazgul', 'Goblins', 'Lord of the Nazgul', 'Thranduil', 'Goldberry']

🔍 Query preview (first 500 chars):
'Durin' <-> 'the' <-> 'Deathless' | 'Gaffer' <-> 'Gamgee' | 'Durin' <-> 'I' | 'Bombadil' | 'Luthien' <-> 'Tinuviel' | 'Barad' <-> 'Dur' | 'The' <-> 'Mouth' | 'Lord' <-> 'of' <-> 'the' <-> 'Nazgûl' | 'Hobbit' <-> 'movies' | 'Arwen' <-> 'Undomiel' | 'Numenor' | 'Elvenking' | 'The' <-> 'War' | 'Mark' | 'Dwalin' | 'Nazgul' | 'Goblins' | 'Lord' <-> 'of' <-> 'the' <-> 'Nazgul' | 'Thranduil' | 'Goldberry' | 'Orcs' | 'Grima' | 'Siege' <-> 'of' <-> 'Gondor' | 'Uruks' | 'Smeagol' | 'Ringbearer' | 'Jackson...


In [9]:
# ==============================================================================
# CELL 2a: FILTER OUT LOW-PRECISION TERMS
# ==============================================================================
# Remove terms that are too generic or overlap heavily with non-LOTR contexts

generic_terms = {
    # Single common words
    'Mark', 'Warg', 'Wargs', 'Orcs', 'Goblins', 'Uruks', 'Eagles', 'Dwarves', 
    'Elves', 'Men', 'Ring', 'Rings', 'Tower', 'King', 'Shadow', 'Fire', 'Light',
    'Dark', 'White', 'Black', 'Grey', 'Gray', 'Gold', 'Silver', 'Iron', 'Stone',
    'Dragon', 'Dragons', 'Troll', 'Trolls', 'Spider', 'Spiders',
    
    # Generic phrases
    'The War', 'The Mouth', 'The Eye', 'The Ring', 'The Tower', 'The King',
    'The Shadow', 'The Dark', 'The White', 'The Black', 'The Grey',
    'The Fellowship', 'The Company',
    
    # Too short / ambiguous
    'Sam', 'Tom', 'Bill', 'Ted', 'Bert', 'Fatty',
}

before_filter = len(all_terms)
all_terms = [t for t in all_terms if t not in generic_terms]
print(f"🧹 Filtered out {before_filter - len(all_terms)} generic/ambiguous terms")

🧹 Filtered out 17 generic/ambiguous terms


In [11]:
# ==============================================================================
# CELL 3: CORPUS-WIDE LOTR COMMENT PULL
# ==============================================================================

print("🔍 PULLING LOTR MENTIONS CORPUS-WIDE...")
print("=" * 50)

lotr_fts_query = """
    'Lord' <-> 'of' <-> 'the' <-> 'Rings' | Gandalf | Frodo | Aragorn | Mordor | 
    Tolkien | Silmarillion | Hobbiton | Rivendell | Gondor | Rohan | LOTR | LotR | 
    Sauron | Gollum | Legolas | Gimli | Boromir | Saruman | Hobbit | Bilbo | Moria | Isengard
"""

comments_sql = f"""
    SELECT 
        rc.id as comment_id,
        rc.post_id,
        rc.subreddit,
        rc.author,
        rc.content,
        rc.score,
        rc.created_at,
        uac.authenticity_score,
        uac.subreddit_count as author_subreddit_count,
        uac.active_days as author_active_days
    FROM reddit_comments rc
    LEFT JOIN user_authenticity_cache uac ON rc.author = uac.author
    WHERE rc.content_tsv @@ to_tsquery('english', $${lotr_fts_query}$$)
      AND rc.is_deleted = FALSE
"""

df_lotr_comments = pd.read_sql(comments_sql, engine)

print(f"✅ Loaded {len(df_lotr_comments):,} comments")
print(f"   Unique subreddits: {df_lotr_comments['subreddit'].nunique():,}")
print(f"   Unique authors: {df_lotr_comments['author'].nunique():,}")
print(f"   Date range: {df_lotr_comments['created_at'].min()} to {df_lotr_comments['created_at'].max()}")
print(f"\n📊 Authenticity distribution:")
print(df_lotr_comments['authenticity_score'].value_counts(dropna=False))

🔍 PULLING LOTR MENTIONS CORPUS-WIDE...
✅ Loaded 69,037 comments
   Unique subreddits: 168
   Unique authors: 23,000
   Date range: 2020-03-05 17:21:42+00:00 to 2026-01-27 17:27:18+00:00

📊 Authenticity distribution:
authenticity_score
LOW       48096
HIGH      13636
MEDIUM     4865
None       2440
Name: count, dtype: int64


In [12]:
# ==============================================================================
# CELL 4: CORPUS-WIDE LOTR POST PULL
# ==============================================================================

print("🔍 PULLING LOTR POSTS CORPUS-WIDE...")
print("=" * 50)

posts_sql = f"""
    SELECT 
        rp.post_id,
        rp.subreddit,
        rp.author,
        rp.title,
        rp.content,
        rp.url,
        rp.score,
        rp.num_comments,
        rp.created_at,
        uac.authenticity_score,
        uac.subreddit_count as author_subreddit_count,
        uac.active_days as author_active_days
    FROM reddit_posts rp
    LEFT JOIN user_authenticity_cache uac ON rp.author = uac.author
    WHERE rp.content_tsv @@ to_tsquery('english', $${lotr_fts_query}$$)
"""

df_lotr_posts = pd.read_sql(posts_sql, engine)

print(f"✅ Loaded {len(df_lotr_posts):,} posts")
print(f"   Unique subreddits: {df_lotr_posts['subreddit'].nunique():,}")
print(f"   Unique authors: {df_lotr_posts['author'].nunique():,}")
print(f"   Date range: {df_lotr_posts['created_at'].min()} to {df_lotr_posts['created_at'].max()}")
print(f"\n📊 Authenticity distribution:")
print(df_lotr_posts['authenticity_score'].value_counts(dropna=False))

🔍 PULLING LOTR POSTS CORPUS-WIDE...
✅ Loaded 5,308 posts
   Unique subreddits: 98
   Unique authors: 3,648
   Date range: 2020-07-12 02:36:17+00:00 to 2026-01-27 08:27:42+00:00

📊 Authenticity distribution:
authenticity_score
LOW       3711
None      1038
HIGH       311
MEDIUM     248
Name: count, dtype: int64


In [13]:
# ==============================================================================
# CELL 5: MERGE POST CONTEXT ONTO COMMENTS
# ==============================================================================

print("🔗 MERGING POST CONTEXT ONTO COMMENTS...")
print("=" * 50)

# Create a lookup of post_id -> title
post_titles = df_lotr_posts[['post_id', 'title']].drop_duplicates()
post_titles = post_titles.rename(columns={'title': 'post_title'})

# Merge onto comments
df_lotr_comments = df_lotr_comments.merge(post_titles, on='post_id', how='left')

# Check coverage
has_title = df_lotr_comments['post_title'].notna().sum()
missing_title = df_lotr_comments['post_title'].isna().sum()

print(f"✅ Comments with post title: {has_title:,} ({has_title/len(df_lotr_comments)*100:.1f}%)")
print(f"   Comments missing post title: {missing_title:,} ({missing_title/len(df_lotr_comments)*100:.1f}%)")

🔗 MERGING POST CONTEXT ONTO COMMENTS...
✅ Comments with post title: 47,641 (69.0%)
   Comments missing post title: 21,396 (31.0%)


In [14]:
# ==============================================================================
# CELL 5b: FETCH MISSING POST TITLES
# ==============================================================================

# Get post_ids that are missing titles
missing_post_ids = df_lotr_comments[df_lotr_comments['post_title'].isna()]['post_id'].unique()
print(f"🔍 Fetching titles for {len(missing_post_ids):,} posts...")

# Batch fetch from reddit_posts
missing_ids_str = "','".join(missing_post_ids)

missing_titles_sql = f"""
    SELECT post_id, title
    FROM reddit_posts
    WHERE post_id IN ('{missing_ids_str}')
"""

df_missing_titles = pd.read_sql(missing_titles_sql, engine)
print(f"   Found {len(df_missing_titles):,} titles")

# Update the main dataframe
title_lookup = dict(zip(df_missing_titles['post_id'], df_missing_titles['title']))
mask = df_lotr_comments['post_title'].isna()
df_lotr_comments.loc[mask, 'post_title'] = df_lotr_comments.loc[mask, 'post_id'].map(title_lookup)

# Final coverage
has_title = df_lotr_comments['post_title'].notna().sum()
print(f"\n✅ Final coverage: {has_title:,} / {len(df_lotr_comments):,} ({has_title/len(df_lotr_comments)*100:.1f}%)")

🔍 Fetching titles for 4,978 posts...
   Found 4,928 titles

✅ Final coverage: 68,838 / 69,037 (99.7%)


In [15]:
# ==============================================================================
# CELL 6: SUBREDDIT DISTRIBUTION
# ==============================================================================

print("📊 LOTR MENTIONS BY SUBREDDIT")
print("=" * 50)

# Comments by subreddit
sub_comments = df_lotr_comments.groupby('subreddit').agg(
    comment_count=('comment_id', 'count'),
    unique_authors=('author', 'nunique'),
    avg_score=('score', 'mean'),
    high_auth_pct=('authenticity_score', lambda x: (x == 'HIGH').sum() / len(x) * 100)
).round(1)

sub_comments = sub_comments.sort_values('comment_count', ascending=False)

print(f"\nTop 30 subreddits by LOTR comment volume:\n")
print(sub_comments.head(30).to_string())

# How much is concentrated in dedicated LOTR subs?
lotr_dedicated = ['lotr', 'tolkienfans', 'LOTR_on_Prime', 'lordoftherings', 
                  'Rings_Of_Power', 'lotro', 'lotrlcg', 'lotrmemes', 
                  'TheWarOfTheRohirrim', 'Silmarillionmemes']

dedicated_count = df_lotr_comments[df_lotr_comments['subreddit'].isin(lotr_dedicated)]['comment_id'].count()
other_count = len(df_lotr_comments) - dedicated_count

print(f"\n📍 CONCENTRATION:")
print(f"   Dedicated LOTR subs: {dedicated_count:,} ({dedicated_count/len(df_lotr_comments)*100:.1f}%)")
print(f"   Other communities: {other_count:,} ({other_count/len(df_lotr_comments)*100:.1f}%)")

📊 LOTR MENTIONS BY SUBREDDIT

Top 30 subreddits by LOTR comment volume:

                       comment_count  unique_authors  avg_score  high_auth_pct
subreddit                                                                     
lotr                           21670            9664       15.6           18.6
tolkienfans                    19837            4281        9.7           28.7
LOTR_on_Prime                   9105            2135        7.1           20.4
lordoftherings                  5616            3132       10.1           13.5
Rings_Of_Power                  4268            1661        4.9           17.5
lotrlcg                         1400             430        3.8            1.7
lotro                           1301             621        6.5            4.5
Fantasy                         1007             753        9.0            4.6
TalesofTheShiregamers            867             465        7.0            4.4
TheWarOfTheRohirrim              701             336      

In [16]:
# ==============================================================================
# CELL 7: EXTERNAL COMMUNITIES - LOTR AS CULTURAL REFERENCE
# ==============================================================================

print("🌍 LOTR MENTIONS IN NON-DEDICATED COMMUNITIES")
print("=" * 50)

# Updated dedicated list
lotr_dedicated = ['lotr', 'tolkienfans', 'LOTR_on_Prime', 'lordoftherings', 
                  'Rings_Of_Power', 'lotro', 'lotrlcg', 'lotrmemes', 
                  'TheWarOfTheRohirrim', 'Silmarillionmemes', 'TalesofTheShiregamers']

# Filter to external only
df_lotr_external = df_lotr_comments[~df_lotr_comments['subreddit'].isin(lotr_dedicated)]

print(f"Total external comments: {len(df_lotr_external):,}")
print(f"Unique subreddits: {df_lotr_external['subreddit'].nunique()}")
print(f"Unique authors: {df_lotr_external['author'].nunique():,}")

# Full list by volume
external_subs = df_lotr_external.groupby('subreddit').agg(
    comment_count=('comment_id', 'count'),
    unique_authors=('author', 'nunique'),
    avg_score=('score', 'mean')
).round(1).sort_values('comment_count', ascending=False)

print(f"\n📋 ALL {len(external_subs)} EXTERNAL SUBREDDITS:\n")
print(external_subs.to_string())

🌍 LOTR MENTIONS IN NON-DEDICATED COMMUNITIES
Total external comments: 4,269
Unique subreddits: 158
Unique authors: 3,597

📋 ALL 158 EXTERNAL SUBREDDITS:

                       comment_count  unique_authors  avg_score
subreddit                                                      
Fantasy                         1007             753        9.0
movies                           381             354       21.9
writing                          187             165        5.0
warcraftlore                     175             107        5.7
witcher                          134             108       15.1
gaming                           114             102        6.5
collapse                          88              76       17.7
FanFiction                        87              71        7.2
ledzeppelin                       78              72        7.5
AO3                               77              72       23.2
trees                             77              75        7.9
atheism       

In [17]:
# ==============================================================================
# CELL 8: SAMPLE COMMENTS FROM TOP EXTERNAL COMMUNITIES
# ==============================================================================

# Top 15 external subreddits by volume
top_external = external_subs.head(15).index.tolist()

for sub in top_external:
    print(f"\n{'='*60}")
    print(f"r/{sub} ({external_subs.loc[sub, 'comment_count']} comments)")
    print('='*60)
    
    samples = df_lotr_external[df_lotr_external['subreddit'] == sub].nlargest(3, 'score')
    
    for _, row in samples.iterrows():
        print(f"\n[score: {row['score']}]")
        print(f"{row['content'][:400]}...")


r/Fantasy (1007 comments)

[score: 480]
The answer is Memory, Sorrow, and Thorn by Tad Williams. 

This is the series that inspired GRRM to write Game of Thrones. It is an almost exact midpoint between Lord of the Rings and ASOIAF.

There is also a sequel series that just wrapped up that feels more modern like Game of Thrones and was incredible as well....

[score: 340]
Is there an incredibly long-lived race, like Tolkien's elves? That can have a significant effect on the pace of change in a setting. 5000 years is a long time, but 5 generations kind of isn't, or at least wasn't until fairly recently....

[score: 268]
Hey, I'm gonna try to give a comprehensive answer since a lot of people are ignoring what you're asking for (like, Gilgamesh is great but its only 30,000 words long- not "sprawling" by any means, haha) This is by no means complete but should give you a good place to start.

**PRE-GREEK STUFF** (Not too long, all by anonymous authors, still interesting. All of this has onl

In [18]:
# ==============================================================================
# CELL 9: TALES OF THE SHIRE DEEP DIVE
# ==============================================================================

df_tales = df_lotr_comments[df_lotr_comments['subreddit'] == 'TalesofTheShiregamers']

print(f"📊 r/TalesofTheShiregamers")
print(f"=" * 50)
print(f"Total comments: {len(df_tales):,}")
print(f"Unique authors: {df_tales['author'].nunique()}")
print(f"Date range: {df_tales['created_at'].min().date()} to {df_tales['created_at'].max().date()}")
print(f"Avg score: {df_tales['score'].mean():.1f}")

print(f"\n📈 Score distribution:")
print(df_tales['score'].describe())

print(f"\n📝 Top 20 comments by score:\n")
for _, row in df_tales.nlargest(20, 'score').iterrows():
    print(f"[{row['score']}] u/{row['author']} | {row['created_at'].date()}")
    print(f"Post: {row['post_title'][:80] if pd.notna(row['post_title']) else 'N/A'}...")
    print(f"{row['content'][:300]}...")
    print()

📊 r/TalesofTheShiregamers
Total comments: 867
Unique authors: 465
Date range: 2025-07-29 to 2026-01-26
Avg score: 7.0

📈 Score distribution:
count    867.000000
mean       6.957324
std       10.859560
min       -8.000000
25%        2.000000
50%        3.000000
75%        8.000000
max      123.000000
Name: score, dtype: float64

📝 Top 20 comments by score:

[123] u/DarksideofHyrule | 2025-08-08
Post: Is Tales of the Shire really THAT bad?...
This game rocks! I hope the devs know much it does too. 

I don’t think the average person understands hobbits enough to get how wonderful this game captures being one!...

[119] u/PumpkinObjective4108 | 2025-08-13
Post: I guess it’s goodbye!...
PS 2.: I didn’t play half of it half as well as I should like, and I like less than half of it half as well as it deserves or whatever Bilbo said....

[91] u/Icy-Paleontologist97 | 2025-08-18
Post: "Canon" Rant...
First, I’m super into Tolkien canon and I don’t think this game violates. Second, even if it di

In [19]:
# ==============================================================================
# CELL 10: TALES OF THE SHIRE - FULL COMMENT CORPUS
# ==============================================================================

print("🏡 PULLING ALL r/TalesofTheShiregamers COMMENTS WITH POST CONTEXT...")
print("=" * 50)

tales_full_sql = """
    SELECT 
        rc.id as comment_id,
        rc.post_id,
        rc.author,
        rc.content,
        rc.score,
        rc.created_at,
        rp.title as post_title,
        rp.content as post_content,
        rp.score as post_score,
        rp.num_comments as post_num_comments
    FROM reddit_comments rc
    LEFT JOIN reddit_posts rp ON rc.post_id = rp.post_id
    WHERE rc.subreddit = 'TalesofTheShiregamers'
      AND rc.is_deleted = FALSE
"""

df_tales_full = pd.read_sql(tales_full_sql, engine)

print(f"✅ Loaded {len(df_tales_full):,} comments")
print(f"   Unique posts: {df_tales_full['post_id'].nunique()}")
print(f"   Unique authors: {df_tales_full['author'].nunique()}")
print(f"   Date range: {df_tales_full['created_at'].min().date()} to {df_tales_full['created_at'].max().date()}")

print(f"\n📊 Top 15 posts by comment count:\n")
top_posts = df_tales_full.groupby(['post_id', 'post_title']).agg(
    comment_count=('comment_id', 'count'),
    post_score=('post_score', 'first')
).sort_values('comment_count', ascending=False).head(15)

for (post_id, title), row in top_posts.iterrows():
    print(f"[{row['comment_count']} comments | score {row['post_score']}] {title[:70]}...")

🏡 PULLING ALL r/TalesofTheShiregamers COMMENTS WITH POST CONTEXT...
✅ Loaded 10,751 comments
   Unique posts: 960
   Unique authors: 2160
   Date range: 2025-07-28 to 2026-01-26

📊 Top 15 posts by comment count:

[194 comments | score 236] Guides! Get your guides!...
[181 comments | score 31] Hobbit names...
[125 comments | score 122] Why is every big company talking trash about TotS?...
[95 comments | score 254] An honest review: everyone's too critical....
[94 comments | score 466] Main Suite and Library...
[85 comments | score 66] Anyone else decide last minute not to buy it immediately?...
[83 comments | score 64] Is Tales of the Shire really THAT bad?...
[70 comments | score 144] Lost my files (100+ hrs). I’m devastated....
[67 comments | score 178] Here's a few tips and tricks from what I've learned....
[62 comments | score 31] What features would you like to see added?...
[57 comments | score 30] What do you like and dislike about Tales of the Shire?...
[56 comments | score 243]

In [20]:
# ==============================================================================
# CELL 11: TALES OF THE SHIRE AUTHOR CROSS-PARTICIPATION
# ==============================================================================

print("🔍 WHERE ELSE DO TotS CONTRIBUTORS PARTICIPATE?")
print("=" * 50)

# Get all TotS authors
tots_authors = df_tales_full['author'].unique().tolist()
print(f"TotS unique authors: {len(tots_authors):,}")

# Find all their comments across the corpus
authors_str = "','".join([a.replace("'", "''") for a in tots_authors])

cross_participation_sql = f"""
    SELECT 
        subreddit,
        COUNT(*) as comment_count,
        COUNT(DISTINCT author) as author_count
    FROM reddit_comments
    WHERE author IN ('{authors_str}')
      AND is_deleted = FALSE
    GROUP BY subreddit
    ORDER BY author_count DESC
"""

df_cross = pd.read_sql(cross_participation_sql, engine)

# Add percentage of TotS authors who participate in each sub
df_cross['pct_of_tots_authors'] = (df_cross['author_count'] / len(tots_authors) * 100).round(1)

print(f"\n✅ TotS authors appear in {len(df_cross):,} subreddits")
print(f"\n📊 Top 40 subreddits by TotS author overlap:\n")
print(df_cross.head(40).to_string(index=False))

🔍 WHERE ELSE DO TotS CONTRIBUTORS PARTICIPATE?
TotS unique authors: 2,160

✅ TotS authors appear in 210 subreddits

📊 Top 40 subreddits by TotS author overlap:

            subreddit  comment_count  author_count  pct_of_tots_authors
TalesofTheShiregamers          10751          2160                100.0
           CozyGamers            661           117                  5.4
                 lotr            796            70                  3.2
        LOTR_on_Prime           2417            31                  1.4
       lordoftherings            452            27                  1.2
          tolkienfans           1320            25                  1.2
                lotro            294            23                  1.1
               gaming            351            18                  0.8
      Anticonsumption            932            16                  0.7
               movies           1048            16                  0.7
             politics           1379           

In [21]:
# ==============================================================================
# CELL 12: TOTS AUTHORS IN r/CozyGamers
# ==============================================================================

print("🎮 WHAT DO TotS AUTHORS SAY IN r/CozyGamers?")
print("=" * 50)

# Get TotS authors' comments in CozyGamers
tots_authors_str = "','".join([a.replace("'", "''") for a in tots_authors])

cozygamers_sql = f"""
    SELECT 
        rc.id as comment_id,
        rc.post_id,
        rc.author,
        rc.content,
        rc.score,
        rc.created_at,
        rp.title as post_title
    FROM reddit_comments rc
    LEFT JOIN reddit_posts rp ON rc.post_id = rp.post_id
    WHERE rc.author IN ('{tots_authors_str}')
      AND rc.subreddit = 'CozyGamers'
      AND rc.is_deleted = FALSE
    ORDER BY rc.score DESC
"""

df_tots_cozy = pd.read_sql(cozygamers_sql, engine)

print(f"✅ {len(df_tots_cozy):,} comments from {df_tots_cozy['author'].nunique()} TotS authors")
print(f"   Date range: {df_tots_cozy['created_at'].min().date()} to {df_tots_cozy['created_at'].max().date()}")

print(f"\n📝 Top 25 comments by score:\n")
for _, row in df_tots_cozy.head(25).iterrows():
    print(f"[{row['score']}] u/{row['author']} | {row['created_at'].date()}")
    print(f"Post: {row['post_title'][:70] if pd.notna(row['post_title']) else 'N/A'}...")
    print(f"{row['content'][:250]}...")
    print()

🎮 WHAT DO TotS AUTHORS SAY IN r/CozyGamers?
✅ 661 comments from 117 TotS authors
   Date range: 2025-11-26 to 2026-01-27

📝 Top 25 comments by score:

[440] u/[deleted] | 2025-12-08
Post: I miss this game "The Simpsons: Tapped Out" It was very cozy, had seas...
[removed]...

[173] u/hanakoflower | 2025-12-07
Post: Best Cozy games of 2025?...
Story of Seasons Grand Bazaar

Rune Factory Guardians of Azuma...

[149] u/enolafaye | 2026-01-14
Post: 3.0 Animal Crossing update out now!...
Oof I need to gather the courage to face my villagers. They will be so mad I've been gone....

[147] u/[deleted] | 2025-12-18
Post: 2025 Steam Winter Sale Megathread December 18 - January 5...
[removed]...

[113] u/Friendly-Ocelot | 2025-12-13
Post: Are there any farm sims in non Western settings?...
Coral island is inspired by Indonesia even though they added all 4 seasons in the game....

[86] u/[deleted] | 2025-12-18
Post: What games are you buying in the Steam winter sale?...
[deleted]...

[84] u/a2brute

In [22]:
# ==============================================================================
# CELL 13: GAME MENTIONS BY TOTS AUTHORS IN r/CozyGamers
# ==============================================================================

print("🎮 GAMES MENTIONED BY TotS AUTHORS IN r/CozyGamers")
print("=" * 50)

# Define games to search for (common cozy games)
cozy_games = [
    'Animal Crossing', 'Stardew Valley', 'Disney Dreamlight Valley', 'Dreamlight Valley',
    'Coral Island', 'Fae Farm', 'Rune Factory', 'Story of Seasons', 'Harvest Moon',
    'Wylde Flowers', 'Cat Cafe Manager', 'Luma Island', 'Spiritfarer', 'Cozy Grove',
    'Sun Haven', 'Ooblets', 'Slime Rancher', 'Unpacking', 'A Short Hike',
    'Moonstone Island', 'Palia', 'Roots of Pacha', 'Dinkum', 'Portia', 'Sandrock',
    'Fantasy Life', 'Littlewood', 'Garden Story', 'Witchbrook', 'Haunted Chocolatier',
    'Tales of the Shire', 'Tales from the Shire', 'TotS', 'Fields of Mistria',
    'Calico', 'Bear and Breakfast', 'Snacko', 'Mineko', 'Yokai Inn', 'Echoes of the Plum Grove',
    'Sims', 'My Time at Portia', 'My Time at Sandrock', 'Everholm', 'Travellers Rest'
]

# Count mentions
game_counts = {}
for game in cozy_games:
    count = df_tots_cozy['content'].str.contains(game, case=False, na=False).sum()
    if count > 0:
        game_counts[game] = count

# Sort by frequency
game_counts_sorted = dict(sorted(game_counts.items(), key=lambda x: x[1], reverse=True))

print(f"\n📊 Game mentions (in {len(df_tots_cozy)} comments):\n")
for game, count in game_counts_sorted.items():
    pct = count / len(df_tots_cozy) * 100
    print(f"{game:30} {count:4} ({pct:.1f}%)")

🎮 GAMES MENTIONED BY TotS AUTHORS IN r/CozyGamers

📊 Game mentions (in 661 comments):

Tales of the Shire               18 (2.7%)
Sandrock                         15 (2.3%)
Animal Crossing                  14 (2.1%)
Dinkum                            9 (1.4%)
Wylde Flowers                     8 (1.2%)
Story of Seasons                  7 (1.1%)
Fantasy Life                      7 (1.1%)
My Time at Sandrock               7 (1.1%)
Dreamlight Valley                 6 (0.9%)
Coral Island                      6 (0.9%)
Fae Farm                          6 (0.9%)
Cozy Grove                        6 (0.9%)
Palia                             6 (0.9%)
Stardew Valley                    5 (0.8%)
Rune Factory                      5 (0.8%)
Luma Island                       5 (0.8%)
Portia                            5 (0.8%)
Disney Dreamlight Valley          4 (0.6%)
Ooblets                           4 (0.6%)
Fields of Mistria                 4 (0.6%)
Sims                              4 (0.6%)
Harvest Mo

In [23]:
# ==============================================================================
# CELL 14: CHECK NLP LIBRARIES
# ==============================================================================

print("🔍 CHECKING NLP LIBRARIES...")
print("=" * 50)

libraries = {
    'transformers': None,
    'torch': None,
    'textstat': None,
    'scipy': None,
    'sklearn': None
}

for lib in libraries:
    try:
        module = __import__(lib)
        version = getattr(module, '__version__', 'installed')
        libraries[lib] = version
        print(f"✅ {lib}: {version}")
    except ImportError:
        print(f"❌ {lib}: NOT INSTALLED")

print("\n📋 Summary:")
missing = [lib for lib, ver in libraries.items() if ver is None]
if missing:
    print(f"   Missing: {', '.join(missing)}")
    print(f"   Install with: pip install {' '.join(missing)}")
else:
    print("   All libraries available")

🔍 CHECKING NLP LIBRARIES...
✅ transformers: 4.53.2
✅ torch: 2.2.2
✅ textstat: (0, 7, 2)
✅ scipy: 1.13.1
✅ sklearn: 1.5.0

📋 Summary:
   All libraries available


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [24]:
# ==============================================================================
# CELL 14b: CHECK NLP MODELS
# ==============================================================================

print("🔍 CHECKING NLP MODELS...")
print("=" * 50)

from transformers import pipeline
import warnings
warnings.filterwarnings('ignore')

# Test GoEmotions (Reddit-trained emotion detection)
print("\n1. GoEmotions (Reddit-trained, 27 emotions)...")
try:
    emotion_classifier = pipeline("text-classification", 
                                  model="SamLowe/roberta-base-go_emotions", 
                                  top_k=5)
    test = emotion_classifier("This game rocks! I love it!")
    print(f"   ✅ Loaded successfully")
    print(f"   Test output: {test[0][:3]}")
except Exception as e:
    print(f"   ❌ Failed: {e}")

# Test sentiment (social media trained)
print("\n2. Twitter-RoBERTa Sentiment...")
try:
    sentiment_classifier = pipeline("sentiment-analysis",
                                    model="cardiffnlp/twitter-roberta-base-sentiment-latest")
    test = sentiment_classifier("This game rocks! I love it!")
    print(f"   ✅ Loaded successfully")
    print(f"   Test output: {test}")
except Exception as e:
    print(f"   ❌ Failed: {e}")

# Test textstat
print("\n3. Textstat (reading level)...")
try:
    import textstat
    test_text = "The hobbits of the Shire enjoy a peaceful life of farming and good food."
    fk = textstat.flesch_kincaid_grade(test_text)
    print(f"   ✅ Loaded successfully")
    print(f"   Test Flesch-Kincaid grade: {fk}")
except Exception as e:
    print(f"   ❌ Failed: {e}")

print("\n📋 Ready to process!")

🔍 CHECKING NLP MODELS...

1. GoEmotions (Reddit-trained, 27 emotions)...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cpu


   ✅ Loaded successfully
   Test output: [{'label': 'love', 'score': 0.9328274726867676}, {'label': 'admiration', 'score': 0.18399330973625183}, {'label': 'joy', 'score': 0.02394336275756359}]

2. Twitter-RoBERTa Sentiment...
   ❌ Failed: Could not load model cardiffnlp/twitter-roberta-base-sentiment-latest with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForSequenceClassification'>, <class 'transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification'>). See the original errors:

while loading with AutoModelForSequenceClassification, an error is thrown:
Traceback (most recent call last):
  File "/Users/jamesroot/anaconda3/envs/core311/lib/python3.11/site-packages/transformers/pipelines/base.py", line 292, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jamesroot/anaconda3/envs/core311/lib/python3.11/site-packages/tra

In [25]:
# ==============================================================================
# CELL 14c: TRY ALTERNATIVE SENTIMENT MODELS
# ==============================================================================

print("🔍 TRYING ALTERNATIVE SENTIMENT MODELS...")
print("=" * 50)

from transformers import pipeline
import warnings
warnings.filterwarnings('ignore')

# Option 1: distilbert sentiment (very reliable)
print("\n1. DistilBERT Sentiment...")
try:
    sentiment_classifier = pipeline("sentiment-analysis",
                                    model="distilbert-base-uncased-finetuned-sst-2-english")
    test = sentiment_classifier("This game rocks! I love it!")
    print(f"   ✅ Loaded successfully")
    print(f"   Test output: {test}")
except Exception as e:
    print(f"   ❌ Failed: {e}")

# Option 2: nlptown 5-star sentiment (more granular)
print("\n2. NLPTown 5-star Sentiment...")
try:
    sentiment_5star = pipeline("sentiment-analysis",
                               model="nlptown/bert-base-multilingual-uncased-sentiment")
    test = sentiment_5star("This game rocks! I love it!")
    print(f"   ✅ Loaded successfully")
    print(f"   Test output: {test}")
except Exception as e:
    print(f"   ❌ Failed: {e}")

# Option 3: cardiffnlp older version
print("\n3. Cardiff Twitter Sentiment (older)...")
try:
    sentiment_twitter = pipeline("sentiment-analysis",
                                 model="cardiffnlp/twitter-roberta-base-sentiment")
    test = sentiment_twitter("This game rocks! I love it!")
    print(f"   ✅ Loaded successfully")
    print(f"   Test output: {test}")
except Exception as e:
    print(f"   ❌ Failed: {e}")

🔍 TRYING ALTERNATIVE SENTIMENT MODELS...

1. DistilBERT Sentiment...


Device set to use cpu


   ✅ Loaded successfully
   Test output: [{'label': 'POSITIVE', 'score': 0.9998838901519775}]

2. NLPTown 5-star Sentiment...


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


   ✅ Loaded successfully
   Test output: [{'label': '5 stars', 'score': 0.9401008486747742}]

3. Cardiff Twitter Sentiment (older)...


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

   ❌ Failed: Could not load model cardiffnlp/twitter-roberta-base-sentiment with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForSequenceClassification'>, <class 'transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification'>). See the original errors:

while loading with AutoModelForSequenceClassification, an error is thrown:
Traceback (most recent call last):
  File "/Users/jamesroot/anaconda3/envs/core311/lib/python3.11/site-packages/transformers/pipelines/base.py", line 292, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jamesroot/anaconda3/envs/core311/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 600, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jamesroot/anaconda3/envs/core311/lib/python3.11/site-packages/transfo

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [26]:
# ==============================================================================
# CELL 15: NLP ANALYSIS ON TALES OF THE SHIRE
# ==============================================================================

import textstat
from tqdm import tqdm

print("🧠 RUNNING NLP ANALYSIS ON TotS COMMENTS...")
print(f"   Processing {len(df_tales_full):,} comments")
print("=" * 50)

# Combine post_title + content for context
df_tales_full['full_text'] = df_tales_full.apply(
    lambda row: f"Post: {row['post_title']} Comment: {row['content']}" 
    if pd.notna(row['post_title']) else row['content'], 
    axis=1
)

# Truncate to 512 tokens (model limit)
df_tales_full['full_text_truncated'] = df_tales_full['full_text'].str[:512]

# Initialize results
sentiments = []
emotions = []
reading_levels = []

# Process in batches
print("\n⏳ Processing (this may take a few minutes)...")

for idx, row in tqdm(df_tales_full.iterrows(), total=len(df_tales_full)):
    text = row['full_text_truncated']
    
    # Sentiment (5-star)
    try:
        sent = sentiment_5star(text)[0]
        sentiments.append({'label': sent['label'], 'score': sent['score']})
    except:
        sentiments.append({'label': None, 'score': None})
    
    # Emotion (top 3)
    try:
        emo = emotion_classifier(text)[0][:3]
        emotions.append(emo)
    except:
        emotions.append(None)
    
    # Reading level (on comment only, not combined)
    try:
        rl = textstat.flesch_kincaid_grade(row['content'])
        reading_levels.append(rl)
    except:
        reading_levels.append(None)

# Add to dataframe
df_tales_full['sentiment_label'] = [s['label'] for s in sentiments]
df_tales_full['sentiment_score'] = [s['score'] for s in sentiments]
df_tales_full['emotions'] = emotions
df_tales_full['reading_level'] = reading_levels

print("\n✅ NLP analysis complete!")

🧠 RUNNING NLP ANALYSIS ON TotS COMMENTS...
   Processing 10,751 comments

⏳ Processing (this may take a few minutes)...


100%|████████████████████████████████████████████████████████████████████████████████████████████| 10751/10751 [1:03:29<00:00,  2.82it/s]



✅ NLP analysis complete!


In [27]:
# ==============================================================================
# CELL 16: TALES OF THE SHIRE NLP SUMMARY
# ==============================================================================

print("📊 TALES OF THE SHIRE NLP ANALYSIS RESULTS")
print("=" * 50)

# Sentiment distribution
print("\n🎭 SENTIMENT (5-star scale):")
print(df_tales_full['sentiment_label'].value_counts())

# Average sentiment score
print(f"\nAverage confidence: {df_tales_full['sentiment_score'].mean():.2f}")

# Reading level
print(f"\n📖 READING LEVEL (Flesch-Kincaid Grade):")
print(f"   Mean: {df_tales_full['reading_level'].mean():.1f}")
print(f"   Median: {df_tales_full['reading_level'].median():.1f}")
print(f"   Std: {df_tales_full['reading_level'].std():.1f}")

# Top emotions
print("\n💫 TOP EMOTIONS (aggregated):")
emotion_counts = {}
for emo_list in df_tales_full['emotions'].dropna():
    for e in emo_list:
        label = e['label']
        emotion_counts[label] = emotion_counts.get(label, 0) + 1

emotion_sorted = sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True)
for emotion, count in emotion_sorted[:15]:
    pct = count / len(df_tales_full) * 100
    print(f"   {emotion:20} {count:5} ({pct:.1f}%)")

📊 TALES OF THE SHIRE NLP ANALYSIS RESULTS

🎭 SENTIMENT (5-star scale):
sentiment_label
1 star     4226
3 stars    2510
5 stars    1646
4 stars    1190
2 stars    1179
Name: count, dtype: int64

Average confidence: 0.43

📖 READING LEVEL (Flesch-Kincaid Grade):
   Mean: 5.4
   Median: 4.4
   Std: 31.0

💫 TOP EMOTIONS (aggregated):
   neutral               7911 (73.6%)
   approval              4673 (43.5%)
   curiosity             3454 (32.1%)
   confusion             2675 (24.9%)
   admiration            1451 (13.5%)
   realization           1438 (13.4%)
   love                  1216 (11.3%)
   annoyance             1133 (10.5%)
   joy                   1126 (10.5%)
   disappointment        1018 (9.5%)
   amusement              914 (8.5%)
   disapproval            882 (8.2%)
   gratitude              879 (8.2%)
   optimism               854 (7.9%)
   desire                 604 (5.6%)


In [28]:
# ==============================================================================
# CELL 17: WHAT'S DRIVING NEGATIVE EMOTIONS?
# ==============================================================================

print("🔍 COMMENTS WITH HIGH CONFUSION/DISAPPOINTMENT")
print("=" * 50)

# Find comments where confusion or disappointment was top emotion
def get_top_emotion(emo_list):
    if emo_list and len(emo_list) > 0:
        return emo_list[0]['label']
    return None

df_tales_full['top_emotion'] = df_tales_full['emotions'].apply(get_top_emotion)

# Confusion drivers
print("\n😕 TOP CONFUSION COMMENTS:\n")
confused = df_tales_full[df_tales_full['top_emotion'] == 'confusion'].nlargest(10, 'score')
for _, row in confused.iterrows():
    print(f"[score {row['score']}] Post: {row['post_title'][:50]}...")
    print(f"   {row['content'][:200]}...")
    print()

# Disappointment drivers
print("\n😞 TOP DISAPPOINTMENT COMMENTS:\n")
disappointed = df_tales_full[df_tales_full['top_emotion'] == 'disappointment'].nlargest(10, 'score')
for _, row in disappointed.iterrows():
    print(f"[score {row['score']}] Post: {row['post_title'][:50]}...")
    print(f"   {row['content'][:200]}...")
    print()

🔍 COMMENTS WITH HIGH CONFUSION/DISAPPOINTMENT

😕 TOP CONFUSION COMMENTS:

[score 34] Post: Alright, I bought it....
   I have not got to play yet, because work, but I feel like the critiques are not fitting. “The quests are just fetching things” yes, its a cozy game. “You can’t leave the village” yes, its a cozy game....

[score 32] Post: What ingredient am I missing?...
   Wait is that how it works? I thought it was dependent on how full your pantry was at the given moment....

[score 31] Post: What ingredient am I missing?...
   Each ingredient has a predetermined place to be visually represented in the pantry. For example, if you have even one tomato, the entire basket of tomatoes will appear in the pantry. I've been slowly ...

[score 31] Post: What did I do wrong?...
   I made this mistake a few times, it means you have to use 3 star mint specifically. The only time I’ve seen it mean the star level of the meal is when it’s asking for “5 star bitter pie” or “4 star sw...

[score 22

In [29]:
# ==============================================================================
# CELL 18: TOTS FEATURE REQUESTS / WISHLIST
# ==============================================================================

print("📝 TALES OF THE SHIRE WISHLIST ANALYSIS")
print("=" * 50)

# Find wishlist-type posts
wishlist_keywords = ['feature', 'want', 'wish', 'add', 'should', 'could', 'expand', 'missing', 'need', 'hope']
pattern = '|'.join(wishlist_keywords)

wishlist_posts = df_tales_full[df_tales_full['post_title'].str.contains(pattern, case=False, na=False)]['post_title'].unique()

print(f"Found {len(wishlist_posts)} potential wishlist posts:\n")
for title in wishlist_posts[:30]:
    count = len(df_tales_full[df_tales_full['post_title'] == title])
    print(f"[{count} comments] {title[:70]}...")

📝 TALES OF THE SHIRE WISHLIST ANALYSIS
Found 51 potential wishlist posts:

[35 comments] New player - what do you wish you’d known?...
[7 comments] Should You Play Tales Of The Shire In 2025?...
[6 comments] So anyone want to tell me where nefi tunnel is...
[4 comments] Thought I should share this here as well!...
[62 comments] What features would you like to see added?...
[44 comments] Add-ons you think would improve the game even more!...
[22 comments] I wish you could sell meals......
[16 comments] Just wanted to share my hobbit...
[7 comments] I wish I could rearrange!!...
[1 comments] For those who want to report bugs, forget Reddit, follow the link belo...
[17 comments] Save Data Missing...
[3 comments] I got a bird on my shoulder...
[4 comments] I'm working on which up-trades are the most profitable - help needed...
[8 comments] So I just cooked 3 - 2* meals and used white button. Am I missing some...
[3 comments] Switch 1 users ! Honest review big LOTR fan wanting thoughts on g

In [30]:
# ==============================================================================
# CELL 19: EXTRACT SPECIFIC FEATURE REQUESTS
# ==============================================================================

print("📝 DETAILED WISHLIST EXTRACTION")
print("=" * 50)

# Key wishlist posts
wishlist_titles = [
    'What features would you like to see added?',
    'Add-ons you think would improve the game even more!',
    'Loving it!!! But I\'m missing a few things',
    'I wish you could sell meals...',
    'I wish I could rearrange!!',
    'Wish there was seasonal events',
    'Have they said anything about adding multiplayer?',
    'What could be expanded?'
]

df_wishlist = df_tales_full[df_tales_full['post_title'].isin(wishlist_titles)]
print(f"Wishlist comments: {len(df_wishlist)}")

print("\n" + "="*60)
for title in wishlist_titles:
    post_comments = df_tales_full[df_tales_full['post_title'] == title]
    if len(post_comments) > 0:
        print(f"\n📌 {title}")
        print(f"   ({len(post_comments)} comments)")
        print("-" * 50)
        for _, row in post_comments.nlargest(5, 'score').iterrows():
            print(f"\n   [{row['score']}] {row['content'][:250]}...")

📝 DETAILED WISHLIST EXTRACTION
Wishlist comments: 234


📌 What features would you like to see added?
   (62 comments)
--------------------------------------------------

   [31] i want them to be able to go into other hobbit’s houses and have more casual dialogue options with the NPC hobbits...

   [23] Pipe smoking emote and more dialogue when just talking to npcs like village gossip etc would be neat! And special travelling npcs who visit for a limited time you can invite for meals and get rewards for slowly building them up over the seasons! Espe...

   [18] I want to be able to own farm animals, and I know they said no romance but I feel like romance would work in this game. And festivals would be great ...

   [17] Farm animals

Talk to people walking around

Use your furniture

Have people invite you for dinner- why am I the only one doing it?? Makes no sense.

Jump

Annual events that raise friendship, or other things together. 

The game feels kind of lonely...

   [17] The mai

In [31]:
# ==============================================================================
# CELL 20: THEME VALIDATION BEYOND WISHLIST POSTS
# ==============================================================================

print("🔍 VALIDATING THEMES ACROSS FULL CORPUS (excluding wishlist posts)")
print("=" * 60)

# Wishlist posts we already analyzed
wishlist_titles = [
    'What features would you like to see added?',
    'Add-ons you think would improve the game even more!',
    'Loving it!!! But I\'m missing a few things',
    'I wish you could sell meals...',
    'I wish I could rearrange!!',
    'Wish there was seasonal events',
    'Have they said anything about adding multiplayer?',
    'What could be expanded?'
]

# Exclude wishlist posts, require 100+ char comments
df_non_wishlist = df_tales_full[
    (~df_tales_full['post_title'].isin(wishlist_titles)) &
    (df_tales_full['content'].str.len() >= 100)
]

print(f"Corpus: {len(df_non_wishlist):,} comments (100+ chars, excluding wishlist posts)")

# Theme keywords
themes = {
    'furniture_sitting': r'\b(sit|sitting|chair|bench|couch|furniture)\b',
    'animals': r'\b(pet|petting|chicken|chickens|animal|animals|horse|duck)\b',
    'social_npc': r'\b(invite|invited|dinner|lonely|npc|dialogue|gossip)\b',
    'birthday_events': r'\b(birthday|party|festival|event|celebration)\b',
}

print("\n" + "="*60)

for theme, pattern in themes.items():
    matches = df_non_wishlist[
        df_non_wishlist['content'].str.contains(pattern, case=False, na=False, regex=True)
    ]
    
    print(f"\n📌 {theme.upper()}: {len(matches)} mentions outside wishlist posts")
    print("-" * 50)
    
    for _, row in matches.nlargest(5, 'score').iterrows():
        print(f"\n[score {row['score']}] Post: {row['post_title'][:60]}...")
        print(f"   {row['content'][:300]}...")

🔍 VALIDATING THEMES ACROSS FULL CORPUS (excluding wishlist posts)
Corpus: 5,315 comments (100+ chars, excluding wishlist posts)


📌 FURNITURE_SITTING: 199 mentions outside wishlist posts
--------------------------------------------------

[score 76] Post: NEW Patch Information...
   That's the way to write patch notes and keep people in a game!!!!  Who wrote that, Hornblower?  One of the Cotton's?  It's not written like a non-player, sitting at a desk bored with like developer.  No, it's all in on the lore and environment.  Love it!...

[score 43] Post: Difficulty level of the game...
   Short answer: No.


Hobbits' lives aren't difficult. That's the whole point of their society as Tolkien described it. The game is meant to evoke calm, friendship, light humor, and an appreciation for a slow life. 


Long answer: So I have 90 hours played atm. There is a slight bump in difficulty in ...

[score 39] Post: How do I wash clothes/take a bath?...
   I have a whole bathroom set up with a bath

In [32]:
# ==============================================================================
# CELL 21: WHAT'S WORKING (PRAISE EXTRACTION)
# ==============================================================================

print("💚 WHAT PLAYERS PRAISE (positive emotion comments)")
print("=" * 60)

# Filter to comments where top emotion is positive
positive_emotions = ['love', 'admiration', 'joy', 'gratitude']

df_praise = df_tales_full[
    (df_tales_full['top_emotion'].isin(positive_emotions)) &
    (df_tales_full['content'].str.len() >= 100)
]

print(f"Comments with positive top emotion (100+ chars): {len(df_praise):,}")
print(f"\nBreakdown:")
print(df_praise['top_emotion'].value_counts())

print("\n" + "="*60)

for emotion in positive_emotions:
    matches = df_praise[df_praise['top_emotion'] == emotion]
    if len(matches) == 0:
        continue
        
    print(f"\n💫 {emotion.upper()}: {len(matches)} comments")
    print("-" * 50)
    
    for _, row in matches.nlargest(5, 'score').iterrows():
        print(f"\n[score {row['score']}] Post: {row['post_title'][:60]}...")
        print(f"   {row['content'][:300]}...")

💚 WHAT PLAYERS PRAISE (positive emotion comments)
Comments with positive top emotion (100+ chars): 1,155

Breakdown:
top_emotion
love          451
joy           244
gratitude     243
admiration    217
Name: count, dtype: int64


💫 LOVE: 451 comments
--------------------------------------------------

[score 121] Post: Some of these hobbits are fast...
   I love when I get a gift from Nelfi while we're still in my dining room and she ends the dialogue with "Why are you still here?" 

Nelfi, I live here....

[score 77] Post: I didn’t realize...
   It was really lovely as a long time LOTR fan!! I've read the trilogy countless times, and absolutely loved all the history tidbits sprinkled through Tales of the Shire, and seeing the characters we read about like Noaks or Rosie!...

[score 76] Post: NEW Patch Information...
   That's the way to write patch notes and keep people in a game!!!!  Who wrote that, Hornblower?  One of the Cotton's?  It's not written like a non-player, sitting at a de

In [33]:
# ==============================================================================
# CELL 22: TEMPORAL ANALYSIS
# ==============================================================================

print("📅 TEMPORAL ANALYSIS: ARE THEMES PERSISTENT OR FADING?")
print("=" * 60)

import matplotlib.pyplot as plt

# Add week column
df_tales_full['week'] = df_tales_full['created_at'].dt.to_period('W').apply(lambda x: x.start_time)

# Overall volume by week
weekly_volume = df_tales_full.groupby('week').size()

print(f"Date range: {df_tales_full['week'].min()} to {df_tales_full['week'].max()}")
print(f"Total weeks: {len(weekly_volume)}")

# Theme keywords (same as Cell 20)
themes = {
    'furniture_sitting': r'\b(sit|sitting|chair|bench|couch|furniture)\b',
    'animals': r'\b(pet|petting|chicken|chickens|animal|animals|horse|duck)\b',
    'social_npc': r'\b(invite|invited|dinner|lonely|npc|dialogue|gossip)\b',
    'birthday_events': r'\b(birthday|party|festival|event|celebration)\b',
}

# Count theme mentions by week
theme_weekly = {}
for theme, pattern in themes.items():
    matches = df_tales_full[
        df_tales_full['content'].str.contains(pattern, case=False, na=False, regex=True)
    ]
    theme_weekly[theme] = matches.groupby('week').size()

# Create dataframe for plotting
df_temporal = pd.DataFrame({
    'total_volume': weekly_volume,
    **theme_weekly
}).fillna(0)

# Calculate percentage of total (controls for volume fluctuation)
for theme in themes.keys():
    df_temporal[f'{theme}_pct'] = df_temporal[theme] / df_temporal['total_volume'] * 100

print("\n📊 WEEKLY THEME MENTIONS (as % of total comments):\n")
print(df_temporal[[f'{t}_pct' for t in themes.keys()]].round(1).tail(15).to_string())

# Summary stats
print("\n" + "="*60)
print("📈 THEME PERSISTENCE (avg % of weekly comments):\n")
for theme in themes.keys():
    col = f'{theme}_pct'
    first_half = df_temporal[col].iloc[:len(df_temporal)//2].mean()
    second_half = df_temporal[col].iloc[len(df_temporal)//2:].mean()
    change = ((second_half - first_half) / first_half * 100) if first_half > 0 else 0
    print(f"   {theme:20} First half: {first_half:.1f}%  |  Second half: {second_half:.1f}%  |  Change: {change:+.0f}%")

📅 TEMPORAL ANALYSIS: ARE THEMES PERSISTENT OR FADING?
Date range: 2025-07-28 00:00:00 to 2026-01-26 00:00:00
Total weeks: 27

📊 WEEKLY THEME MENTIONS (as % of total comments):

            furniture_sitting_pct  animals_pct  social_npc_pct  birthday_events_pct
week                                                                               
2025-10-20                    0.0          2.2             0.0                  0.0
2025-10-27                    2.7          2.7             0.0                  0.0
2025-11-03                    5.1          0.0             0.0                  0.0
2025-11-10                    0.0          0.0             0.0                  0.0
2025-11-17                    1.1          0.0             0.0                  0.0
2025-11-24                    6.0          0.0             2.0                  0.0
2025-12-01                    2.9         13.2             2.9                  0.0
2025-12-08                    1.3          5.3             2.6     

In [35]:
# Show all DataFrame variables in memory
import pandas as pd
for name, obj in list(globals().items()):
    if isinstance(obj, pd.DataFrame):
        print(f"{name}: {obj.shape[0]:,} rows, {obj.shape[1]} cols")

df_lotr_comments: 69,037 rows, 11 cols
df_lotr_posts: 5,308 rows, 12 cols
post_titles: 5,308 rows, 2 cols
df_missing_titles: 4,928 rows, 2 cols
sub_comments: 168 rows, 4 cols
df_lotr_external: 4,269 rows, 11 cols
external_subs: 158 rows, 3 cols
samples: 3 rows, 11 cols
df_tales: 867 rows, 11 cols
df_tales_full: 10,751 rows, 18 cols
top_posts: 15 rows, 2 cols
df_cross: 210 rows, 4 cols
df_tots_cozy: 661 rows, 7 cols
confused: 10 rows, 17 cols
disappointed: 10 rows, 17 cols
df_wishlist: 234 rows, 17 cols
post_comments: 53 rows, 17 cols
df_non_wishlist: 5,315 rows, 17 cols
matches: 91 rows, 18 cols
df_praise: 1,155 rows, 17 cols
df_temporal: 27 rows, 9 cols


In [36]:
import os
os.makedirs('../output', exist_ok=True)

df_lotr_comments.to_csv('../output/lotr_comments_69k.csv', index=False)
df_lotr_posts.to_csv('../output/lotr_posts_5k.csv', index=False)
df_wishlist.to_csv('../output/wishlist_signals.csv', index=False)
df_praise.to_csv('../output/praise_signals.csv', index=False)
df_lotr_external.to_csv('../output/external_sub_comments.csv', index=False)
df_tales_full.to_csv('../output/tales_full.csv', index=False)
df_temporal.to_csv('../output/temporal_analysis.csv', index=False)

print("Done. Files saved to LOTR/output/")

Done. Files saved to LOTR/output/
