In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import os

# --- 1. LOAD YOUR SOCIAL DATA ---
social_path = '../data/raw/reddit_titles.csv'

if not os.path.exists(social_path):
    print("Error: The 'reddit_titles.csv' file was not found.")
    print(f"- Please make sure you have run the '02_reddit_scraper.ipynb' notebook successfully to create: {social_path}")
else:
    df_social = pd.read_csv(social_path)
    print("Successfully loaded Reddit social data.")
    display(df_social.head())

    # --- 2. FIND TOP KEYWORDS (THE SIGNAL) ---
    social_titles = df_social['title'].dropna()

    # (Optional) Print your titles to see what the script is analyzing
    print("\n--- Reddit Titles for Analysis ---")
    print(social_titles.to_list())
    
    try:
        # We use CountVectorizer to count word frequencies, ignoring common English stopwords
        vectorizer = CountVectorizer(stop_words='english', max_features=25)
        
        # Learn the vocabulary and count the words from social titles
        social_counts = vectorizer.fit_transform(social_titles)
        social_keywords = vectorizer.get_feature_names_out()

        print("\n--- Analysis Results ---")
        print(f"Top Social Keywords:   {list(social_keywords)}")
        print("\nThis analysis is currently based only on social media data.")
        print("Once your 'official' data source is working, we can compare keywords.")

    except ValueError as e:
        print("\n--- Analysis Error ---")
        print(f"Could not process keywords from social data. This often means the text is too short after removing common words.")
        print(f"Original Error: {e}")
        

Successfully loaded Reddit social data.


Unnamed: 0,source,title,url
0,Reddit r/energy,DOE props up dying coal with $625M days after ...,https://electrek.co/2025/09/29/doe-props-up-dy...
1,Reddit r/energy,Trump Opens 13 Million Acres for Coal Mines to...,https://www.bloomberg.com/news/articles/2025-0...
2,Reddit r/energy,The Trump administration’s war on wind,https://www.hcn.org/articles/the-trump-adminis...
3,Reddit r/energy,Frequent Fox appearances are central to Trump’...,https://www.mediamatters.org/fox-news/frequent...
4,Reddit r/energy,Solar was the EU's largest single source of el...,https://ec.europa.eu/eurostat/web/products-eur...



--- Reddit Titles for Analysis ---
['DOE props up dying coal with $625M days after Energy Secretary Chris Wright mocks clean energy subsidies', 'Trump Opens 13 Million Acres for Coal Mines to Aid Ailing Sector', 'The Trump administration’s war on wind', 'Frequent Fox appearances are central to Trump’s climate and energy disinformation strategy. Fox News and Fox Business have not simply covered Trump’s destructive energy agenda — they have served as the administration’s preferred vehicles for selling it to the public.', "Solar was the EU's largest single source of electricity in June, at 22%, nearly x2 as much as natural gas", 'Are Renewables Raising Electricity Prices? Probably Not.', "Gordon cheers Trump's coal revival plan while environmentalists warn of disaster", 'See How EV Road Trips Went From Impossible to Easy. Routes that once required careful planning now have abundant fast chargers. The vast majority routes between major cities now have a fast charger at least every 100 mil