In [None]:
"""
Random Forest Classification for Sentiment Prediction
======================================================
This notebook implements Random Forest models to predict overall and state-level 
sentiment based on economic indicators for Midwest states.

Author: [Your Name]
Date: [Date]
Requirements: pandas, scikit-learn, matplotlib, seaborn
"""

# ============================================================================
# PART 1: INITIAL SETUP AND DATA LOADING
# ============================================================================

import pandas as pd

# Load the dataset containing economic indicators and sentiment data
# File should be in the same directory as this notebook
Midwest = pd.read_csv("ModelData.csv")

# PART 2: DATA EXPLORATION AND PREPROCESSING

# Display the first few rows to understand the data structure.
# display() is used explicitly because a notebook only auto-renders the
# final expression of a cell, and this statement is not the last one.
display(Midwest.head())

# Create binary sentiment labels based on compound scores.
# Both the overall and the state-level labels use the same rule:
# positive if compound >= 0, negative otherwise.
def _label_sentiment(df, score_column, label_column):
    """
    Add a binary sentiment label column derived from a compound score column.

    Scores >= 0 are labeled "positive"; every other score is labeled
    "negative". A missing (NaN) score fails the `>= 0` comparison and is
    therefore labeled "negative" as well.

    Parameters:
    df (DataFrame): Input dataframe; modified in place
    score_column (str): Name of the column holding the compound scores
    label_column (str): Name of the label column to create or overwrite

    Returns:
    DataFrame: The same dataframe object with `label_column` added
    """
    # Vectorized comparison instead of a per-row Python lambda.
    is_non_negative = df[score_column] >= 0
    df[label_column] = is_non_negative.map(
        {True: "positive", False: "negative"}
    )
    return df


def overall_sentiment(df):
    """
    Convert overall compound sentiment scores to binary labels.

    Parameters:
    df (DataFrame): Input dataframe with 'overall compound' column

    Returns:
    DataFrame: Original dataframe (modified in place) with added
        'overall sentiment' column ("positive" if score >= 0, else "negative")
    """
    return _label_sentiment(df, "overall compound", "overall sentiment")


def state_sentiment(df):
    """
    Convert state-level compound sentiment scores to binary labels.

    Parameters:
    df (DataFrame): Input dataframe with 'state compound' column

    Returns:
    DataFrame: Original dataframe (modified in place) with added
        'state sentiment' column ("positive" if score >= 0, else "negative")
    """
    return _label_sentiment(df, "state compound", "state sentiment")

# Label both targets in one pass: overall sentiment first, then state
# sentiment (each helper hands back the dataframe it was given).
Midwest = state_sentiment(overall_sentiment(Midwest))

# Render the labeled dataframe as the cell's output
Midwest

In [None]:
# PART 3: MODEL 1 - OVERALL SENTIMENT PREDICTION

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

# Feature matrix: the economic indicators used as predictors.
# NOTE: 'mw_diary_CPI' is spelled this way on purpose here; the string has to
# match the column name in the loaded data exactly.
X = Midwest[[
    'mw_PCE',               # Personal Consumption Expenditures
    'Unemployment',         # Unemployment rate
    'mw_RealDispIncome',    # Real Disposable Income
    'mw_wholesale_PPI',     # Wholesale Producer Price Index
    'mw_supermarket_PPI',   # Supermarket Producer Price Index
    'mw_produce_CPI',       # Produce Consumer Price Index
    'mw_meat_CPI',          # Meat Consumer Price Index
    'mw_diary_CPI'          # Dairy Consumer Price Index
]]

# Target vector: binary overall sentiment label
y = Midwest['overall sentiment']

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)

# Random Forest of 100 trees split on Gini impurity, seeded for
# reproducibility.
rforest1 = RandomForestClassifier(
    n_estimators=100, criterion='gini', random_state=100
)

# Fit on the training rows, then predict labels for the held-out rows
rforest1.fit(X_train, y_train)
y_rforest1_pred = rforest1.predict(X_test)

In [None]:
# PART 3 (CONTINUED): SANITY CHECKS ON THE SPLIT AND PREDICTIONS
#
# The model is defined, trained and used for prediction in the preceding
# cell. Repeating that work here would only re-create identical objects
# (same data, same seeds), so this cell validates the results instead.

# Every row must land in exactly one of the two partitions
assert len(X_train) + len(X_test) == len(X), \
    "train/test split lost or duplicated rows"
assert len(X_train) == len(y_train) and len(X_test) == len(y_test), \
    "features and labels are misaligned after the split"

# One prediction per held-out row, drawn only from the two known labels
assert len(y_rforest1_pred) == len(y_test), \
    "expected exactly one prediction per test row"
assert set(y_rforest1_pred) <= {"negative", "positive"}, \
    "model predicted an unexpected label"

# Class balance of the training labels (context for the metrics that follow)
y_train.value_counts()

In [None]:
# PART 4: MODEL 1 EVALUATION - CONFUSION MATRIX AND METRICS

import matplotlib.pyplot as plt
import seaborn as sns

# Fixed class order used for the confusion matrix and the per-class metrics
class_labels = ["negative", "positive"]

# Create confusion matrix.
# `labels` is passed explicitly so the matrix is always 2x2 in
# negative/positive order, even if one class is absent from the test split
# and the predictions; without it the hard-coded 2x2 row/column names below
# would not match the matrix shape.
cm = confusion_matrix(y_test, y_rforest1_pred, labels=class_labels)

# Convert to DataFrame for better visualization
confusion_df = pd.DataFrame(
    cm,
    index=["Actual Negative", "Actual Positive"],
    columns=["Predicted Negative", "Predicted Positive"]
)

# Plot confusion matrix as heatmap
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(confusion_df, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix - Overall Sentiment")
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
plt.show()

# Calculate per-class performance metrics, treating each class in turn as
# the "positive" label of the binary scorer
metrics_table = pd.DataFrame({
    "Class": ["Negative", "Positive"],
    "Precision": [
        precision_score(y_test, y_rforest1_pred, pos_label=label)
        for label in class_labels
    ],
    "Recall": [
        recall_score(y_test, y_rforest1_pred, pos_label=label)
        for label in class_labels
    ],
    "F1 Score": [
        f1_score(y_test, y_rforest1_pred, pos_label=label)
        for label in class_labels
    ],
    # Number of test observations that truly belong to each class
    "Support (# Obs)": [
        (y_test == label).sum()
        for label in class_labels
    ]
})

# Style the metrics table for better readability
metrics_table_styled = (
    metrics_table.style
        .set_properties(**{
            'font-size': '14px',
            'padding': '8px',
            'text-align': 'center'
        })
        .set_table_styles([
            # Header cells: bold white text on dark blue
            {'selector': 'th',
             'props': [
                 ('font-size', '16px'),
                 ('font-weight', 'bold'),
                 ('text-align', 'center'),
                 ('background-color', '#1f4e79'),
                 ('color', 'white'),
                 ('padding', '10px')
             ]},
            # Alternating row backgrounds
            {'selector': 'tbody tr:nth-child(even)',
             'props': [('background-color', '#f2f6fc')]},
            {'selector': 'tbody tr:nth-child(odd)',
             'props': [('background-color', 'white')]},
            {'selector': 'table',
             'props': [
                 ('border-collapse', 'collapse'),
                 ('border', '1px solid #ccc'),
                 ('border-radius', '8px'),
                 ('overflow', 'hidden')
             ]},
        ])
        # Shade the score columns and show them with three decimals
        .background_gradient(cmap="Blues", subset=["Precision", "Recall", "F1 Score"])
        .format("{:.3f}", subset=["Precision", "Recall", "F1 Score"])
)

metrics_table_styled

In [None]:
# PART 5: MODEL SUMMARY AND FEATURE IMPORTANCE ANALYSIS

# Extract feature names
features = X.columns

# Count distribution of sentiment classes over the full target column
# (train and test rows together)
positive_count = (y == "positive").sum()
negative_count = (y == "negative").sum()

# Create overall model performance summary (macro = unweighted mean of the
# per-class scores)
results_table = pd.DataFrame({
    "Metric": [
        "Accuracy",
        "Precision (Macro)",
        "Recall (Macro)",
        "F1 (Macro)",
        "Positive Count",
        "Negative Count"
    ],
    "Value": [
        accuracy_score(y_test, y_rforest1_pred),
        precision_score(y_test, y_rforest1_pred, average='macro'),
        recall_score(y_test, y_rforest1_pred, average='macro'),
        f1_score(y_test, y_rforest1_pred, average='macro'),
        positive_count,
        negative_count
    ]
})

# Show the summary explicitly: a notebook only auto-renders the last
# expression of a cell, which here is the feature importance table.
display(results_table)

# Create feature importance table
# Shows which economic indicators are most influential in predictions
feature_importance_table = (
    pd.DataFrame({
        "Feature": features,
        "Importance": rforest1.feature_importances_
    })
    .sort_values("Importance", ascending=False)
    .reset_index(drop=True)
)

# Style feature importance table
feature_importance_pretty = (
    feature_importance_table
    .style.hide(axis='index')
    .set_properties(**{
        'background-color': '#eef7ff',
        'border': '1px solid #bbb',
        'padding': '6px'
    })
    .format({"Importance": "{:.4f}"})
)

feature_importance_pretty

In [None]:
# PART 6: MODEL 2 - STATE-LEVEL SENTIMENT PREDICTION

# Switch the target to the state-level sentiment label; the feature matrix X
# is reused unchanged.
y = Midwest['state sentiment']

# Same 70/30 split and seed as before, now paired with the new target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)

# Re-fit the existing estimator object on the state-level target and predict
# the held-out rows. NOTE: `rforest1` and `y_rforest1_pred` are overwritten
# here, so from this point on they describe the state sentiment model.
y_rforest1_pred = rforest1.fit(X_train, y_train).predict(X_test)

In [None]:
# PART 7: MODEL 2 EVALUATION

# Fixed class order used for the confusion matrix and the per-class metrics
class_labels = ["negative", "positive"]

# Confusion matrix for the state sentiment model.
# `labels` is passed explicitly so the matrix is always 2x2 in
# negative/positive order, even if one class is absent from the test split
# and the predictions; without it the hard-coded 2x2 row/column names below
# would not match the matrix shape.
cm = confusion_matrix(y_test, y_rforest1_pred, labels=class_labels)

confusion_df = pd.DataFrame(
    cm,
    index=["Actual Negative", "Actual Positive"],
    columns=["Predicted Negative", "Predicted Positive"]
)

# Plot confusion matrix
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(confusion_df, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix - State Sentiment")
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
plt.show()

# Calculate per-class metrics for the state sentiment model, treating each
# class in turn as the "positive" label of the binary scorer
metrics_table = pd.DataFrame({
    "Class": ["Negative", "Positive"],
    "Precision": [
        precision_score(y_test, y_rforest1_pred, pos_label=label)
        for label in class_labels
    ],
    "Recall": [
        recall_score(y_test, y_rforest1_pred, pos_label=label)
        for label in class_labels
    ],
    "F1 Score": [
        f1_score(y_test, y_rforest1_pred, pos_label=label)
        for label in class_labels
    ],
    # Number of test observations that truly belong to each class
    "Support (# Obs)": [
        (y_test == label).sum()
        for label in class_labels
    ]
})

# Style the metrics table for better readability
metrics_table_styled = (
    metrics_table.style
        .set_properties(**{
            'font-size': '14px',
            'padding': '8px',
            'text-align': 'center'
        })
        .set_table_styles([
            # Header cells: bold white text on dark blue
            {'selector': 'th',
             'props': [
                 ('font-size', '16px'),
                 ('font-weight', 'bold'),
                 ('text-align', 'center'),
                 ('background-color', '#1f4e79'),
                 ('color', 'white'),
                 ('padding', '10px')
             ]},
            # Alternating row backgrounds
            {'selector': 'tbody tr:nth-child(even)',
             'props': [('background-color', '#f2f6fc')]},
            {'selector': 'tbody tr:nth-child(odd)',
             'props': [('background-color', 'white')]},
            {'selector': 'table',
             'props': [
                 ('border-collapse', 'collapse'),
                 ('border', '1px solid #ccc'),
                 ('border-radius', '8px'),
                 ('overflow', 'hidden')
             ]},
        ])
        # Shade the score columns and show them with three decimals
        .background_gradient(cmap="Blues", subset=["Precision", "Recall", "F1 Score"])
        .format("{:.3f}", subset=["Precision", "Recall", "F1 Score"])
)

metrics_table_styled

In [None]:
# PART 8: MODEL SUMMARY AND FEATURE IMPORTANCE FOR STATE MODEL

# Feature names and class distribution of the state sentiment target
# (counts cover the full target column, train and test rows together)
features = X.columns
positive_count = (y == "positive").sum()
negative_count = (y == "negative").sum()

# Overall performance summary for the state sentiment model
# (macro = unweighted mean of the per-class scores)
results_table = pd.DataFrame({
    "Metric": [
        "Accuracy",
        "Precision (Macro)",
        "Recall (Macro)",
        "F1 (Macro)",
        "Positive Count",
        "Negative Count"
    ],
    "Value": [
        accuracy_score(y_test, y_rforest1_pred),
        precision_score(y_test, y_rforest1_pred, average='macro'),
        recall_score(y_test, y_rforest1_pred, average='macro'),
        f1_score(y_test, y_rforest1_pred, average='macro'),
        positive_count,
        negative_count
    ]
})

# Show the summary explicitly: a notebook only auto-renders the last
# expression of a cell, which here is the feature importance table.
display(results_table)

# Feature importances of the currently fitted estimator, most influential
# indicator first
feature_importance_table = (
    pd.DataFrame({
        "Feature": features,
        "Importance": rforest1.feature_importances_
    })
    .sort_values("Importance", ascending=False)
    .reset_index(drop=True)
)

# Style feature importance table
feature_importance_pretty = (
    feature_importance_table
    .style.hide(axis='index')
    .set_properties(**{
        'background-color': '#eef7ff',
        'border': '1px solid #bbb',
        'padding': '6px'
    })
    .format({"Importance": "{:.4f}"})
)

feature_importance_pretty

In [None]:
# PART 9: VISUALIZING TRENDS - REAL DISPOSABLE INCOME OVER TIME

# Convert Date column to datetime format for proper time series analysis
Midwest['Date'] = pd.to_datetime(Midwest['Date'])

# Extract year for grouping
Midwest['Year'] = Midwest['Date'].dt.year

# Collapse to one value per year before plotting. Passing the raw rows to
# plt.bar would draw one bar per row, all stacked on top of each other at
# the same x position whenever a year has more than one observation.
# NOTE(review): the yearly mean is used as the summary statistic — confirm
# this is the intended aggregation for this indicator.
income_by_year = Midwest.groupby('Year')['mw_RealDispIncome'].mean()

# Create bar chart showing Real Disposable Income trends
plt.figure(figsize=(8,5))
plt.bar(
    income_by_year.index,
    income_by_year.values,
    color='skyblue',
    edgecolor='black'
)

# Add labels and formatting
plt.xlabel("Year", fontsize=12, fontweight='bold')
plt.ylabel("Real Disposable Income (Yearly Mean)", fontsize=12, fontweight='bold')
plt.title("Real Disposable Income Over Time", fontsize=14, fontweight='bold')
# Pin one tick to each plotted year so the axis never shows fractional years
plt.xticks(income_by_year.index, rotation=45)
plt.grid(axis='y', linestyle='-', alpha=0.7)
plt.tight_layout()
plt.show()