In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import os

# --- 1. LOAD YOUR SOCIAL DATA ---
social_path = '../data/raw/reddit_titles_filtered.csv'

if not os.path.exists(social_path):
    print("Error: The 'reddit_titles.csv' file was not found.")
    print(f"- Please make sure you have run the '02_reddit_scraper.ipynb' notebook successfully to create: {social_path}")
else:
    df_social = pd.read_csv(social_path)
    print("Successfully loaded Reddit social data.")
    display(df_social.head())

    # --- 2. FIND TOP KEYWORDS (THE SIGNAL) ---
    social_titles = df_social['title'].dropna()

    # (Optional) Print your titles to see what the script is analyzing
    print("\n--- Reddit Titles for Analysis ---")
    print(social_titles.to_list())
    
    try:
        # We use CountVectorizer to count word frequencies, ignoring common English stopwords
        vectorizer = CountVectorizer(stop_words='english', max_features=25)
        
        # Learn the vocabulary and count the words from social titles
        social_counts = vectorizer.fit_transform(social_titles)
        social_keywords = vectorizer.get_feature_names_out()

        print("\n--- Analysis Results ---")
        print(f"Top Social Keywords:   {list(social_keywords)}")
        print("\nThis analysis is currently based only on social media data.")
        print("Once your 'official' data source is working, we can compare keywords.")

    except ValueError as e:
        print("\n--- Analysis Error ---")
        print(f"Could not process keywords from social data. This often means the text is too short after removing common words.")
        print(f"Original Error: {e}")
        

Successfully loaded Reddit social data.


Unnamed: 0,source,title,url
0,Reddit r/energy,As more renewable energy sources come onto the...,https://www.lpm.org/news/2025-10-03/wind-turbi...
1,Reddit r/energy,Seems all a state needs is the will to push fo...,https://constructionreviewonline.com/news/reps...



--- Reddit Titles for Analysis ---
['As more renewable energy sources come onto the grid, Kentucky is trying to find its role in this emerging economy.', "Seems all a state needs is the will to push for renewables. High-quality solar irradiance, a largely deregulated grid also play a big part in Texas' growing renewables portfolio. The Lone Star state continues to increase its capacity after breaking ground a 595 MW solar project in Leon County."]

--- Analysis Results ---
Top Social Keywords:   ['595', 'big', 'breaking', 'capacity', 'come', 'continues', 'grid', 'kentucky', 'largely', 'leon', 'lone', 'mw', 'needs', 'play', 'portfolio', 'project', 'push', 'quality', 'renewable', 'renewables', 'role', 'solar', 'sources', 'star', 'state']

This analysis is currently based only on social media data.
Once your 'official' data source is working, we can compare keywords.
