In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import os

# --- 1. LOAD ALL YOUR DATA ---
# Reddit export: set the CSV location (relative to the working directory)
# and read it into a DataFrame.
reddit_path = '../data/raw/reddit_titles_filtered.csv'
df_reddit = pd.read_csv(filepath_or_buffer=reddit_path)

# X (Twitter) export: same pattern as above.
twitter_path = '../data/raw/twitter_data.csv'
df_twitter = pd.read_csv(filepath_or_buffer=twitter_path)

# Only reached when both reads succeed; pd.read_csv raises otherwise.
print("Successfully loaded data from Reddit and X.")

# --- 2. FIND SHARED KEYWORDS (THE SIGNAL) ---
# Goal: learn one small vocabulary from both sources combined, then report
# which of those terms occur at least once in each source and in both.

# Drop missing titles so the vectorizer never receives NaN.
reddit_titles = df_reddit['title'].dropna()
twitter_titles = df_twitter['title'].dropna()

try:
    # We use a single vectorizer to learn the vocabulary from ALL titles.
    # max_features=50 caps the vocabulary at the 50 most frequent terms
    # (after English stop-word removal) across the combined corpus.
    vectorizer = CountVectorizer(stop_words='english', max_features=50)
    vectorizer.fit(pd.concat([reddit_titles, twitter_titles]))

    # Feature names are ordered by column index, so position i here lines
    # up with column i of every matrix returned by transform().
    vocabulary = vectorizer.get_feature_names_out()

    # Vectorize each source exactly ONCE and collapse to per-term totals.
    # (Doing this inside the comprehension's condition would re-transform
    # the whole corpus for every vocabulary word.)
    reddit_counts = vectorizer.transform(reddit_titles).toarray().sum(axis=0)
    twitter_counts = vectorizer.transform(twitter_titles).toarray().sum(axis=0)

    # Find the keywords present in each source (total count > 0).
    reddit_keywords = {word for word, total in zip(vocabulary, reddit_counts) if total > 0}
    twitter_keywords = {word for word, total in zip(vocabulary, twitter_counts) if total > 0}

    # Find the keywords that appear in BOTH lists
    shared_keywords = reddit_keywords.intersection(twitter_keywords)

except ValueError as e:
    # CountVectorizer.fit raises ValueError when no usable terms remain,
    # e.g. every title is empty or consists only of stop words.
    print(f"\n--- Analysis Error --- \n{e}")

else:
    # Sets have no defined order; sort so reruns print comparable output.
    print("\n--- Analysis Results ---")
    print(f"Top Reddit Keywords: {sorted(reddit_keywords)}")
    print(f"Top X Keywords:   {sorted(twitter_keywords)}")
    print("\n Shared Signal Keywords:", sorted(shared_keywords) if shared_keywords else "None found.")

Successfully loaded data from Reddit and X.

--- Analysis Results ---
Top Reddit Keywords: ['centers', 'industry', 'crisis', '2025', 'energy', 'coal', 'help', 'solar', 'electric', '100', 'gas', 'uk', 'power', 'trump', 'electricity', 'prices', 'new', 'utility', 'grid', 'data']
Top X Keywords:   ['newsom', 'https', 'centers', 'cost', 'industry', 'rt', 'crisis', 'backup', 'demand', 'low', '2025', 'available', 'families', 'income', 'energy', 'coal', 'help', 'reliability', 'solar', 'electric', 'incentive', 'install', 'future', 'october', '100', 'mayorofla', 'gas', 'uk', 'capacity', 'power', 'self', 'trump', 'electricity', 'city', 'generation', 'prices', 'allowing', 'austin', 'new', 'texas', 'utility', 'lower', 'control', 'cover', 'program', 'grid', 'amp', 'need', 'bills', 'data']

 Shared Signal Keywords: ['centers', 'industry', 'crisis', '2025', 'energy', 'coal', 'help', 'solar', 'electric', '100', 'gas', 'uk', 'power', 'trump', 'electricity', 'prices', 'new', 'utility', 'grid', 'data']
