In [None]:
"""
Midwest Sentiment Analysis - Data Preparation Script
====================================================

This script combines Reddit sentiment data with economic indicators for 
Midwest states (Illinois, Indiana, Ohio, Michigan, Wisconsin) to prepare
data for modeling grocery price sentiment from 2016-2025.

Required CSV files:
- Illinois 2015-2025.csv
- Indiana 2015-2025.csv  
- Ohio 2015-2025.csv
- Michigan 2015-2025.csv
- Wisconsin 2015-2025.csv
- Master_df.csv (economic indicators)

Outputs:
- MWSentiment.csv (combined, labeled Reddit sentiment data)
- ModelData.csv (final merged dataset for modeling)
"""

import pandas as pd

# STEP 1: Read the per-state Reddit sentiment exports

# One CSV per state subreddit, holding Reddit posts about the economy.
# Documented columns: state, title, score, url, created_utc, date,
#                     num_comments, selftext, compound (sentiment score)
# The pos/neg/neu components removed in STEP 3 must also be present in the
# data; otherwise the drop below raises KeyError.

IL, IN, OH, MI, WI = [
    pd.read_csv(csv_name)
    for csv_name in (
        "Illinois 2015-2025.csv",
        "Indiana 2015-2025.csv",
        "Ohio 2015-2025.csv",
        "Michigan 2015-2025.csv",
        "Wisconsin 2015-2025.csv",
    )
]

# STEP 2: Stack the five state frames into a single frame

state_dfs = [OH, WI, IN, IL, MI]
# ignore_index=True discards the per-file row labels and renumbers from 0
all_states_df = pd.concat(state_dfs, ignore_index=True)

print(all_states_df.head())
print(f"Total rows: {len(all_states_df)}")

# STEP 3: Keep only the compound sentiment score

# The individual pos/neg/neu components are discarded. Per the original
# notes, compound ranges from -1 (most negative) to +1 (most positive).
all_states_df = all_states_df.drop(["pos", "neg", "neu"], axis=1)

# STEP 4: Add binary sentiment classification
def add_sentiment_label(df):
    """
    Add a binary ``sentiment`` label column derived from ``compound``.

    - positive: compound >= 0
    - negative: compound < 0
    - missing: compound is NaN, so no label is assigned

    The column is written onto ``df`` itself (the caller's frame is
    modified) and that same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``compound`` column.

    Returns
    -------
    pandas.DataFrame
        ``df``, with the ``sentiment`` column added or overwritten.

    Raises
    ------
    KeyError
        If ``df`` has no ``compound`` column.
    """
    compound = df["compound"]
    # Vectorized comparison instead of a per-row Python lambda.
    labels = compound.ge(0).map({True: "positive", False: "negative"})
    # NaN >= 0 evaluates to False, which would file unscored posts under
    # "negative"; blank those rows out so they stay missing instead.
    df["sentiment"] = labels.where(compound.notna())
    return df

# Label every post; the helper returns the frame it was given, so the
# rebinding keeps all_states_df pointing at the labeled data.
all_states_df = all_states_df.pipe(add_sentiment_label)

# STEP 5: Persist the combined, labeled sentiment data
# index=False keeps the row index out of the file
all_states_df.to_csv("MWSentiment.csv", index=False)
print("\nSaved: MWSentiment.csv")





In [None]:
# STEP 7: Load and prepare economic indicators

# NOTE(review): the step numbering jumps from 5 to 7, and the frames
# `monthly_sentiment` and `state_month_sentiment` merged in STEP 8 are not
# defined in any code visible here. Confirm that the cell building them
# exists and runs before this one; otherwise a fresh-kernel "Run All"
# stops at STEP 8 with a NameError.

# Master_df.csv holds the economic data, with separate Year and Month columns
Midwest = pd.read_csv('Master_df.csv')

# Build the "YYYY Mon" key used to join with the sentiment data. pop()
# removes Year and Month from the frame as they are consumed, so no
# separate drop of the original columns is needed.
Midwest['Year_Month'] = Midwest.pop('Year').astype(str) + ' ' + Midwest.pop('Month')

print("\nEconomic indicators preview:")
print(Midwest.head())

# STEP 8: Merge sentiment data with economic indicators

# Two left joins on Year_Month, so every economic row is kept:
#   1) overall monthly sentiment
#   2) state-level monthly sentiment
Midwest = (
    Midwest
    .merge(monthly_sentiment, on="Year_Month", how="left")
    .merge(state_month_sentiment, on="Year_Month", how="left")
)

print("\nMerged data preview:")
print(Midwest.head())

In [None]:
# STEP 9: Restrict to the analysis window (2016-2025)

# Parse the "YYYY Mon" labels into timestamps so they can be range-filtered.
# The parsed column is stored on Midwest itself, so it is carried into the
# subset and written to ModelData.csv along with the other columns.
Midwest['YM_dt'] = pd.to_datetime(Midwest['Year_Month'], format="%Y %b")

# Bounds of the analysis window (both ends are kept)
start = "2016-01-01"
end = "2025-09-01"

# between() includes both endpoints by default; .copy() detaches the
# subset from Midwest.
Midwest_subset = Midwest.loc[Midwest['YM_dt'].between(start, end)].copy()

print(f"\nFiltered to {len(Midwest_subset)} rows between {start} and {end}")

# STEP 10: Write the final modeling dataset

Midwest_subset.to_csv("ModelData.csv", index=False)
print("\nFinal dataset saved: ModelData.csv")
print(f"Shape: {Midwest_subset.shape}")
print(f"Columns: {list(Midwest_subset.columns)}")