In [None]:
"""
Reddit Post Sentiment Analysis - Economy-Related Posts from Midwest States
===========================================================================

This script scrapes Reddit posts about the economy from five Midwest states,
performs sentiment analysis on the post titles, and exports the results to CSV.

Author: [Your Name]
Date: December 2024
Python Version: 3.12+

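Dependencies:
- praw (Reddit API wrapper)
- pandas (data manipulation)
- nltk (sentiment analysis)
- matplotlib and jinja2 (needed only by the optional styled summary table in Cell 5)

Setup Instructions:
1. Install required packages: pip install praw pandas nltk matplotlib jinja2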
2. Create a Reddit app at https://www.reddit.com/prefs/apps
3. Replace the client_id and client_secret with your credentials
4. Run the notebook cells in order
"""

In [None]:
# CELL 1: Install PRAW (Python Reddit API Wrapper)
# Uncomment and run this if you need to install praw
!pip install praw

In [None]:
# CELL 2: Import Required Libraries and Scrape Reddit Posts

import os
import time
from datetime import datetime, timezone

import pandas as pd
import praw

# Initialize Reddit API client
# Credentials are read from the REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET and
# REDDIT_USER_AGENT environment variables so that real secrets never have to be
# saved inside the notebook. When a variable is not set, the placeholder below is
# used instead; replace it with your own value from https://www.reddit.com/prefs/apps
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "YOUR_CLIENT_ID_HERE"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "YOUR_CLIENT_SECRET_HERE"),
    user_agent=os.environ.get(
        "REDDIT_USER_AGENT",
        "economy_sentiment_scraper by u/YOUR_USERNAME",  # Replace with your username
    ),
)

def scrape_economy_posts_by_state(limit=500, requests_per_minute=50, states=None):
    """
    Scrape Reddit posts about the economy for a set of states.

    For every state, r/all is searched for "economy <state>". Submissions whose
    creation time lies between 2015-01-01 00:00:00 UTC and 2025-12-31 23:59:59 UTC
    are collected into a DataFrame. Each DataFrame is stored as a module-level
    global named after the state (e.g. ``Ohio``) and is also returned.

    Parameters:
    -----------
    limit : int, default=500
        Maximum number of search results requested per state (passed to the
        search call; results outside the date range are discarded afterwards).
    requests_per_minute : int, default=50
        Number of processed submissions after which the loop pauses until
        60 seconds have passed since the previous pause. Note that the counter
        advances once per submission yielded by the search, not once per HTTP
        request.
    states : list of str, optional
        State names to search for. Defaults to Ohio, Wisconsin, Indiana,
        Illinois and Michigan.

    Returns:
    --------
    dict
        Mapping of state name to its DataFrame. Every DataFrame has the columns
        state, title, score, url, created_utc, date, num_comments and selftext,
        even when no post was collected for that state.
    """

    # Default set of states to analyze
    if states is None:
        states = ["Ohio", "Wisconsin", "Indiana", "Illinois", "Michigan"]

    # Fixed schema so that an empty result still exposes every column
    columns = [
        "state", "title", "score", "url",
        "created_utc", "date", "num_comments", "selftext",
    ]

    # Date range 2015-01-01 .. 2025-12-31 as Unix timestamps.
    # The bounds are built as UTC datetimes because they are compared against
    # submission.created_utc; a naive datetime would be interpreted in the
    # machine's local time zone and shift the window.
    start_range = int(datetime(2015, 1, 1, tzinfo=timezone.utc).timestamp())
    end_range = int(datetime(2025, 12, 31, 23, 59, 59, tzinfo=timezone.utc).timestamp())

    frames = {}

    for state in states:
        print(f"Scraping posts for {state}...")
        posts = []
        request_count = 0
        start_time = time.time()

        # Search query: "economy" + state name
        search_query = f"economy {state}"

        # Search all of Reddit for posts matching the query
        for submission in reddit.subreddit("all").search(search_query, limit=limit):

            # Only keep posts created inside the date range
            if start_range <= submission.created_utc <= end_range:
                # Timezone-aware replacement for the deprecated utcfromtimestamp
                created = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
                posts.append({
                    "state": state,
                    "title": submission.title,
                    "score": submission.score,  # Reddit upvote score
                    "url": submission.url,
                    "created_utc": submission.created_utc,  # Unix timestamp
                    "date": created.strftime('%Y-%m-%d %H:%M:%S'),
                    "num_comments": submission.num_comments,
                    "selftext": submission.selftext  # Body text of post (if any)
                })

            # Throttle: after `requests_per_minute` submissions, wait out the
            # remainder of the current 60-second window before continuing.
            request_count += 1
            if request_count >= requests_per_minute:
                elapsed = time.time() - start_time
                if elapsed < 60:
                    sleep_time = 60 - elapsed
                    print(f"Rate limit reached. Sleeping {sleep_time:.2f} seconds...")
                    time.sleep(sleep_time)
                request_count = 0
                start_time = time.time()

        # Publish the DataFrame as a global named after the state and keep it
        # for the return value
        state_df = pd.DataFrame(posts, columns=columns)
        globals()[state] = state_df
        frames[state] = state_df
        print(f"Collected {len(posts)} posts for {state}.\n")

    return frames

# Entry point: scrape every state, then preview the Ohio results
if __name__ == "__main__":
    scrape_economy_posts_by_state(limit=500)

    # Show the first rows of the Ohio frame (title, date and score only)
    print("Ohio sample:", Ohio[['title', 'date', 'score']].head(), sep="\n")

In [None]:
# CELL 3: Perform Sentiment Analysis on Post Titles

from nltk import download
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download VADER lexicon (only needed once)
# VADER = Valence Aware Dictionary and sEntiment Reasoner
# Specifically tuned for social media text
download('vader_lexicon')

# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# List of state DataFrames to process
states = ["Ohio", "Wisconsin", "Indiana", "Illinois", "Michigan"]

# Columns produced by polarity_scores:
# - 'neg': negative sentiment (0-1)
# - 'neu': neutral sentiment (0-1)
# - 'pos': positive sentiment (0-1)
# - 'compound': overall sentiment (-1 to +1)
SENTIMENT_COLUMNS = ['neg', 'neu', 'pos', 'compound']

# Loop through each state and add sentiment scores
for state in states:
    print(f"Adding sentiment scores to {state}...")

    # Work on a copy with a clean 0..n-1 index so the scores line up row by row
    state_df = globals()[state].reset_index(drop=True)

    # A frame built from zero posts may have no columns at all; give it an
    # empty title column so the selection below (and later cells) still work
    if 'title' not in state_df.columns:
        state_df['title'] = pd.Series(dtype='object')

    # Score every title; the explicit column list keeps the schema stable
    # even when there are no titles to score
    sentiment_df = pd.DataFrame(
        [sia.polarity_scores(title) for title in state_df['title']],
        columns=SENTIMENT_COLUMNS,
    )

    # Drop sentiment columns left over from a previous run of this cell so
    # re-running it does not create duplicate columns, then attach fresh ones
    globals()[state] = pd.concat(
        [state_df.drop(columns=SENTIMENT_COLUMNS, errors='ignore'), sentiment_df],
        axis=1
    )

# Display sample with sentiment scores
print("Ohio sample with sentiment:")
print(Ohio[['title', 'neg', 'neu', 'pos', 'compound']].head())

In [None]:
# CELL 4: Export Data to CSV Files

# Each state's DataFrame paired with the CSV file it is written to
# (the row index is not written)
csv_exports = [
    (Ohio, "Ohio_2015-2025.csv"),
    (Wisconsin, "Wisconsin_2015-2025.csv"),
    (Indiana, "Indiana_2015-2025.csv"),
    (Michigan, "Michigan_2015-2025.csv"),
    (Illinois, "Illinois_2015-2025.csv"),
]
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)

print("All data exported successfully!")


In [None]:
# CELL 5: Create Summary Statistics Table (Optional)

# Builds a per-year table of mean sentiment scores from one exported CSV
# (Illinois serves as the example) and renders it as a styled table.

# Load the exported Illinois data
df = pd.read_csv("Illinois_2015-2025.csv")

# Parse the date strings and derive each post's calendar year
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year

# Sentiment columns to summarize
sentiment_cols = ['compound', 'neg', 'neu', 'pos']

# Mean of every sentiment column per year, rounded to four decimals
yearly_groups = df.groupby('year')[sentiment_cols]
summary_by_year = yearly_groups.agg(['mean']).round(4)

# Collapse the (column, statistic) pairs into flat names such as "compound_mean"
summary_by_year.columns = ["_".join(pair) for pair in summary_by_year.columns]

# CSS rules for the caption, header cells and data cells
caption_css = {
    "selector": "caption",
    "props": [
        ("font-size", "16px"),
        ("font-weight", "bold"),
        ("text-align", "center"),
        ("padding", "10px"),
    ],
}
header_css = {
    "selector": "th",
    "props": [
        ("background-color", "#e8e8e8"),
        ("font-weight", "bold"),
        ("border", "1px solid #bfbfbf"),
    ],
}
cell_css = {
    "selector": "td",
    "props": [
        ("border", "1px solid #d9d9d9"),
        ("padding", "6px"),
    ],
}

# Assemble the styled table: caption, CSS, column-wise gradient, then
# column-wise highlighting of the largest and smallest value
styled_table = summary_by_year.style.set_caption("Sentiment Summary Statistics by Year")
styled_table = styled_table.set_table_styles([caption_css, header_css, cell_css])
styled_table = styled_table.background_gradient(cmap="Blues", axis=0)
styled_table = styled_table.highlight_max(axis=0, color="#a3d0ff")
styled_table = styled_table.highlight_min(axis=0, color="#ffd6cc")

# Display the styled table
styled_table

In [None]:
"""
OUTPUT FILES:
-------------
- Ohio_2015-2025.csv
- Wisconsin_2015-2025.csv
- Indiana_2015-2025.csv
- Illinois_2015-2025.csv
- Michigan_2015-2025.csv

COLUMNS IN OUTPUT:
------------------
- state: State name
- title: Reddit post title
- score: Reddit upvote score
- url: URL the post links to (for text posts, the URL of the post itself)
- created_utc: Unix timestamp
- date: Human-readable creation time in UTC (YYYY-MM-DD HH:MM:SS)
- num_comments: Number of comments
- selftext: Post body text
- neg: Negative sentiment score (0-1)
- neu: Neutral sentiment score (0-1)
- pos: Positive sentiment score (0-1)
- compound: Overall sentiment score (-1 to +1)

INTERPRETING SENTIMENT SCORES:
-------------------------------
Compound Score:
  - Positive sentiment: compound >= 0.05
  - Neutral sentiment: -0.05 < compound < 0.05
  - Negative sentiment: compound <= -0.05
  
The neg, neu, and pos scores sum to approximately 1.0 (each value is rounded,
so the total can be off by a small amount) and represent the proportion of
text that falls into each category.

NOTES:
------
- Reddit API has rate limits (typically 60 requests/minute)
- The script includes automatic rate limiting and pauses
- Sentiment analysis is performed on titles only (not post bodies)
- VADER is optimized for social media text and handles emojis, 
  capitalization, and punctuation
"""