In [None]:
# Colab-only setup: mount Google Drive at /content/drive so files stored
# under that path can be read by the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from collections import Counter

# Read the clustered-posts CSV into the working frame.
csv_path = '/content/drive/MyDrive/data/search_1_clustered.csv'
df = pd.read_csv(csv_path)

# Parse the timestamps once; values that cannot be parsed become NaT
# (errors='coerce') instead of raising. The year is derived from the
# parsed column.
created = pd.to_datetime(df['creation_time'], errors='coerce')
df['creation_time'] = created
df['year'] = created.dt.year

# Quick overview: row count and the distinct, non-missing cluster labels.
n_posts = len(df)
cluster_ids = sorted(df['cluster'].dropna().unique())
print(f"Total posts: {n_posts:,}")
print(f"Clusters: {cluster_ids}")

In [None]:
# Distinct cluster labels in ascending order; rows without a label are skipped.
clusters = sorted(df['cluster'].dropna().unique())
# Denominator for each cluster's share: every row in df, labelled or not.
total_posts = len(df)

for label in clusters:
    members = df.loc[df['cluster'] == label]
    n_members = len(members)
    share_pct = n_members / total_posts * 100

    # value_counts() orders authors by descending post count; keep the first five.
    author_counts = members['post_owner.name'].value_counts()
    leaders = author_counts.iloc[:5]

    print(f"\n=== Cluster {label} ===")
    print(f"Size: {n_members:,} posts ({share_pct:.1f}%)")
    print("Top authors:")
    for name, n_authored in leaders.items():
        print(f"  {name}: {n_authored} posts")

In [None]:
def extract_keywords(text):
    """Tag a piece of text with the tax-policy topics it mentions.

    Matching is a case-insensitive substring search: a topic is tagged as
    soon as any one of its trigger phrases occurs anywhere in the text.

    Args:
        text: The text to scan. Non-string values are converted with ``str()``.

    Returns:
        list[str]: The matched topic labels, each at most once, in the fixed
        order BEPS, BEPS2.0, TCJA, Tariffs. Empty when ``text`` is missing
        (``pd.isna``) or when nothing matches.
    """
    if pd.isna(text):
        return []

    # (label, trigger phrases) pairs; their order fixes the order of the result.
    topic_triggers = (
        ('BEPS', ('beps', 'base erosion', 'profit shifting', 'oecd')),
        ('BEPS2.0', ('pillar two', 'pillar 2', 'global minimum', 'g7 tax', 'g20 tax')),
        ('TCJA', ('tcja', 'tax cuts and jobs act', 'trump tax')),
        ('Tariffs', ('tariff', 'trade war', 'import tax')),
    )

    haystack = str(text).lower()
    return [
        label
        for label, phrases in topic_triggers
        if any(phrase in haystack for phrase in phrases)
    ]

# For each cluster, count how many of its posts mention each tracked topic.
# extract_keywords() yields each topic at most once per text, so a topic's
# count equals the number of posts in the cluster that mention it.
for cluster_num in clusters:
    cluster_data = df[df['cluster'] == cluster_num]
    total = len(cluster_data)

    # Read the two text columns directly instead of using iterrows(), which
    # builds a Series for every row. A column that is absent from the frame
    # contributes empty strings.
    blank = [''] * total
    text_col = cluster_data['text'] if 'text' in cluster_data.columns else blank
    full_text_col = (
        cluster_data['full_text'] if 'full_text' in cluster_data.columns else blank
    )

    keyword_counts = Counter(
        keyword
        for text, full_text in zip(text_col, full_text_col)
        for keyword in extract_keywords(f"{text} {full_text}")
    )

    print(f"\n=== Cluster {cluster_num} ===")
    # most_common() lists topics from most to least frequent, so the output
    # order is consistent and comparable across clusters (ties keep
    # first-seen order).
    for keyword, count in keyword_counts.most_common():
        pct = (count / total) * 100 if total > 0 else 0
        print(f"{keyword}: {count} posts ({pct:.1f}%)")