# Modern Dataset Comparison: Post-2020 Financial Social Media Datasets

This notebook compares the three modern primary datasets used in this project:
- **Twitter Financial News Sentiment** (Zeroshot, 2023)
- **Financial Tweets Sentiment** (TimKoornstra, 2023)
- **TweetFinSent** (JP Morgan, 2022)

**Research Question**: How do these modern social media datasets differ in terms of size, label distribution, text characteristics, and noise patterns? This analysis characterizes each dataset as groundwork for label quality evaluation.

---

## Overview of Modern Datasets

See `data/DATASET_RECOMMENDATIONS.md` for detailed information about each dataset, including:
- Source platform and year
- Label format and mapping
- How well each dataset fits the proposal
- Pros and cons
- License and download links


In [None]:
# Setup
import sys
import os

# Resolve the project root and make <root>/src importable.
# When the kernel starts inside a directory named "notebooks", the root is its
# parent and the working directory is moved there. Otherwise the root falls
# back to the grandparent of the current working directory and no chdir occurs.
# NOTE(review): the grandparent fallback looks wrong when the kernel is started
# from the project root itself — confirm the intended directory layout.
if os.path.basename(os.getcwd()) == 'notebooks':
    PROJECT_ROOT = os.path.dirname(os.getcwd())
    os.chdir(PROJECT_ROOT)
else:
    PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath("")))

# Put <root>/src at the front of the import path, once.
src_path = os.path.join(PROJECT_ROOT, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter

from dataset_loader import load_dataset
from preprocess import preprocess_batch

%matplotlib inline
# Global plot appearance: apply the matplotlib style sheet first, then the
# seaborn palette, so the palette is set after the style sheet is applied.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation banner: two lines, the second showing the resolved project root.
print("✓ Setup complete", f"Project root: {PROJECT_ROOT}", sep="\n")


## 1. Load Datasets

**Note**: Update the paths below to point to your downloaded dataset files.


In [None]:
# Load datasets
# Update these paths to your actual dataset files
TWITTER_FINANCIAL_PATH = 'data/twitter_financial_train.csv'
FINANCIAL_TWEETS_PATH = 'data/financial_tweets_2023.csv'
TWEETFINSENT_PATH = 'data/tweetfinsent.csv'

# One row per dataset:
# (key passed to load_dataset and used in `datasets`, display name for messages, file path)
DATASET_SOURCES = [
    ('twitter_financial', 'Twitter Financial', TWITTER_FINANCIAL_PATH),
    ('financial_tweets_2023', 'Financial Tweets 2023', FINANCIAL_TWEETS_PATH),
    ('tweetfinsent', 'TweetFinSent', TWEETFINSENT_PATH),
]

# Maps dataset key -> loaded DataFrame, or None when loading failed.
datasets = {}

for key, display_name, path in DATASET_SOURCES:
    try:
        datasets[key] = load_dataset(key, path)
        print(f"✓ Loaded {display_name}: {len(datasets[key])} samples")
    except FileNotFoundError:
        print(f"⚠ {display_name} file not found: {path}")
        datasets[key] = None
    except Exception as e:
        # Deliberately broad: a single unreadable dataset should not stop the
        # notebook; the remaining datasets are still compared.
        print(f"⚠ Error loading {display_name}: {e}")
        datasets[key] = None

# Preprocess all datasets
for name, df in datasets.items():
    if df is None:
        continue
    df['cleaned_text'] = preprocess_batch(df['text'])
    # Keep only rows whose cleaned text is non-empty. The explicit .copy()
    # detaches the filtered frame from its parent so that later cells can add
    # columns to it without triggering pandas' SettingWithCopyWarning.
    # Rebinding the value of an existing key while iterating is safe.
    datasets[name] = df[df['cleaned_text'].str.len() > 0].copy()

print(f"\n✓ Loaded {sum(1 for df in datasets.values() if df is not None)} datasets")


## 2. Dataset Size Comparison


In [None]:
# Dataset sizes: one row per successfully loaded dataset.
size_data = [
    {'Dataset': name.replace('_', ' ').title(), 'Size': len(df)}
    for name, df in datasets.items()
    if df is not None
]

if size_data:
    size_df = pd.DataFrame(size_data)
    print("Dataset Sizes:")
    print("=" * 60)
    print(size_df.to_string(index=False))

    # Visualization
    # Create the axes explicitly and hand them to pandas. Without `ax=`,
    # DataFrame.plot opens its own figure, which left an empty extra figure
    # behind and ignored the requested figsize.
    fig, ax = plt.subplots(figsize=(10, 6))
    size_df.plot(x='Dataset', y='Size', kind='bar', color='#3498db', edgecolor='black', ax=ax)
    ax.set_title('Dataset Size Comparison', fontweight='bold', fontsize=14)
    ax.set_xlabel('Dataset', fontweight='bold')
    ax.set_ylabel('Number of Samples', fontweight='bold')
    ax.set_xticklabels(size_df['Dataset'], rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()

    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)
    fig.savefig('results/dataset_size_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n✓ Visualization saved to results/dataset_size_comparison.png")


## 3. Sentiment Distribution Comparison


In [None]:
# Label distribution comparison
# Per-dataset counts of each value in the 'label' column (loaded datasets only).
label_data = {
    name: df['label'].value_counts()
    for name, df in datasets.items()
    if df is not None
}

if label_data:
    # Rows = datasets, columns = labels; labels absent from a dataset become 0.
    comparison_df = pd.DataFrame(label_data).fillna(0).astype(int).T

    print("Label Distribution:")
    print("=" * 60)
    print(comparison_df)
    print("\nLabel Proportions (%):")
    print("=" * 60)
    for name, df in datasets.items():
        if df is None:
            continue
        props = (df['label'].value_counts(normalize=True) * 100).round(2)
        print(f"\n{name.replace('_', ' ').title()}:")
        for label, pct in props.items():
            print(f"  {label}: {pct}%")

    # Visualization: one bar chart per dataset, side by side.
    # squeeze=False always yields a 2-D axes array, so the single-dataset case
    # needs no special handling.
    fig, axes = plt.subplots(
        1, len(label_data), figsize=(6 * len(label_data), 5), squeeze=False
    )

    # Fixed colours for the three expected sentiment labels; any other label
    # falls back to blue.
    colors = {'positive': '#2ecc71', 'neutral': '#95a5a6', 'negative': '#e74c3c'}

    for ax, (name, counts) in zip(axes[0], label_data.items()):
        labels = counts.index
        values = counts.values
        bars = ax.bar(labels, values, color=[colors.get(l, '#3498db') for l in labels])
        ax.set_title(name.replace('_', ' ').title(), fontweight='bold')
        ax.set_xlabel('Label')
        ax.set_ylabel('Count')
        ax.grid(axis='y', alpha=0.3)

        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2., height,
                    f'{int(height)}',
                    ha='center', va='bottom')

    plt.tight_layout()
    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/dataset_label_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n✓ Visualization saved to results/dataset_label_distribution.png")


## 4. Text Length Comparison (Characters and Words)


In [None]:
# Text length analysis
# A single pass over the datasets collects both the summary statistics and the
# raw per-row values needed for the boxplots.
length_data = []
char_data, char_labels = [], []
word_data, word_labels = [], []
for name, df in datasets.items():
    if df is None:
        continue
    # Character count of the raw text; whitespace-token count of the cleaned text.
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['cleaned_text'].str.split().str.len()

    display_name = name.replace('_', ' ').title()
    length_data.append({
        'Dataset': display_name,
        'Mean Length (chars)': df['text_length'].mean(),
        'Median Length (chars)': df['text_length'].median(),
        'Mean Words': df['word_count'].mean(),
        'Median Words': df['word_count'].median()
    })

    # Drop missing values: a NaN in the data would make matplotlib draw an
    # empty box for the whole dataset.
    char_data.append(df['text_length'].dropna().values)
    char_labels.append(display_name)
    word_data.append(df['word_count'].dropna().values)
    word_labels.append(display_name)

if length_data:
    length_df = pd.DataFrame(length_data)
    print("Text Length Statistics:")
    print("=" * 60)
    print(length_df.to_string(index=False))

    # Visualization - Boxplot
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    panels = [
        (axes[0], char_data, char_labels, 'Text Length Distribution (Characters)', 'Characters'),
        (axes[1], word_data, word_labels, 'Word Count Distribution', 'Words'),
    ]
    for ax, values, tick_names, title, ylabel in panels:
        ax.boxplot(values)
        # Label the ticks separately instead of via boxplot(labels=...): that
        # keyword is deprecated since Matplotlib 3.9 (renamed tick_labels),
        # whereas set_xticklabels works on old and new versions alike.
        ax.set_xticklabels(tick_names)
        ax.set_title(title, fontweight='bold')
        ax.set_ylabel(ylabel)
        ax.grid(axis='y', alpha=0.3)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/dataset_length_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n✓ Visualization saved to results/dataset_length_comparison.png")


## 5. Noise Indicators Analysis

Social media text contains various noise indicators that affect label quality:
- Cashtags (e.g., $TSLA)
- Hashtags (#investing)
- Mentions (@username)
- Emojis


In [None]:
# Noise indicators
def count_cashtags(text):
    """Return how many cashtag-like tokens ``text`` contains.

    A token is a ``$`` immediately followed by one or more ASCII letters, so
    ``$TSLA`` and ``$aapl`` match while amounts such as ``$100`` do not.
    """
    return sum(1 for _ in re.finditer(r'\$[A-Za-z]+', text))

def count_hashtags(text):
    """Return how many hashtags ``text`` contains.

    A hashtag is a ``#`` immediately followed by one or more word characters.
    """
    return sum(1 for _ in re.finditer(r'#\w+', text))

def count_mentions(text):
    """Return how many @-mentions ``text`` contains.

    A mention is an ``@`` immediately followed by one or more word characters;
    note that the ``@`` inside an e-mail address matches as well.
    """
    return sum(1 for _ in re.finditer(r'@\w+', text))

# Compiled once at definition time rather than on every call.
# The character class lists the main emoji blocks explicitly. The previous
# catch-all range U+24C2..U+1F251 also covered CJK ideographs, Hangul,
# variation selectors and many other non-emoji characters, so ordinary
# non-Latin text was counted as emojis.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U000024C2"             # circled M
    "]"
)


def count_emojis(text):
    """Return the number of emoji code points in ``text``.

    Each matching code point is counted individually, so three consecutive
    rockets count as 3 (a trailing ``+`` in the earlier pattern collapsed such
    a run into a single match). This is still an approximation: multi-code-point
    emojis (flags, ZWJ sequences) contribute one count per matching code point.
    """
    return len(_EMOJI_PATTERN.findall(text))

# Per-dataset noise statistics computed from the raw (uncleaned) text.
noise_data = []
for name, df in datasets.items():
    if df is None:
        continue
    # Per-row counts of each noise indicator.
    df['cashtags'] = df['text'].apply(count_cashtags)
    df['hashtags'] = df['text'].apply(count_hashtags)
    df['mentions'] = df['text'].apply(count_mentions)
    df['emojis'] = df['text'].apply(count_emojis)

    noise_data.append({
        'Dataset': name.replace('_', ' ').title(),
        'Avg Cashtags': df['cashtags'].mean(),
        'Avg Hashtags': df['hashtags'].mean(),
        'Avg Mentions': df['mentions'].mean(),
        'Avg Emojis': df['emojis'].mean(),
        # Share of rows containing at least one cashtag / hashtag.
        '% with Cashtags': (df['cashtags'] > 0).mean() * 100,
        '% with Hashtags': (df['hashtags'] > 0).mean() * 100
    })

if noise_data:
    noise_df = pd.DataFrame(noise_data)
    print("Noise Indicators:")
    print("=" * 60)
    print(noise_df.to_string(index=False))

    # Visualization: 2x2 grid, one panel per average-count metric.
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    metrics = ['Avg Cashtags', 'Avg Hashtags', 'Avg Mentions', 'Avg Emojis']
    # axes.flat walks the grid row by row, matching the metric order.
    for ax, metric in zip(axes.flat, metrics):
        ax.bar(noise_df['Dataset'], noise_df[metric], color='#e67e22', edgecolor='black')
        ax.set_title(metric, fontweight='bold')
        ax.set_ylabel('Average Count')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/dataset_noise_indicators.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n✓ Visualization saved to results/dataset_noise_indicators.png")


## 6. Word Clouds by Sentiment

Visualize the most common words for each sentiment class across datasets.


In [None]:
# Word clouds (requires wordcloud library)
# Only the import is guarded. Previously the whole plotting loop sat inside the
# try block, so any ImportError raised while plotting would have been reported
# as "wordcloud library not installed".
try:
    from wordcloud import WordCloud
except ImportError:
    print("⚠ wordcloud library not installed. Skipping word clouds.")
    print("  Install with: pip install wordcloud")
else:
    # savefig does not create missing directories.
    os.makedirs('results', exist_ok=True)

    for name, df in datasets.items():
        if df is None:
            continue

        # One figure per dataset with one panel per sentiment label.
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        fig.suptitle(f'Word Clouds: {name.replace("_", " ").title()}', fontsize=16, fontweight='bold')

        for ax, label in zip(axes, ['positive', 'neutral', 'negative']):
            label_texts = df[df['label'] == label]['cleaned_text']
            if len(label_texts) > 0:
                cloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(label_texts))
                ax.imshow(cloud, interpolation='bilinear')
            # Title and hide the axis frame even when the label has no rows,
            # so an empty panel does not show a bare, untitled set of ticks.
            ax.set_title(f'{label.title()} Sentiment', fontweight='bold')
            ax.axis('off')

        plt.tight_layout()
        plt.savefig(f'results/wordcloud_{name}.png', dpi=300, bbox_inches='tight')
        plt.show()

        print(f"✓ Word cloud saved to results/wordcloud_{name}.png")


## 7. Sample Texts (Anonymized)

Show up to 5 random samples per sentiment label from each dataset to understand text characteristics.


In [None]:
# Sample texts (anonymized)
for name, df in datasets.items():
    if df is None:
        continue

    print("=" * 60)
    print(f"Sample Texts: {name.replace('_', ' ').title()}")
    print("=" * 60)

    # Show up to 5 random samples per label (fixed seed for reproducibility)
    for label in ['positive', 'neutral', 'negative']:
        label_df = df[df['label'] == label]
        if label_df.empty:
            continue

        samples = label_df.sample(min(5, len(label_df)), random_state=42)
        print(f"\n{label.upper()} ({len(label_df)} total):")
        # Number the samples 1..k by position. The DataFrame index label is an
        # arbitrary row id (and need not be numeric), so it is not used here.
        for position, raw_text in enumerate(samples['text'], start=1):
            # Anonymize: replace @mentions with @user
            text = re.sub(r'@\w+', '@user', raw_text)
            # Anonymize: replace every cashtag with $STOCK. A "$" followed by
            # ASCII letters is the same pattern count_cashtags uses, so
            # lowercase, one-letter and long tickers are all fully replaced.
            text = re.sub(r'\$[A-Za-z]+', '$STOCK', text)
            # Only mark the text as truncated when something was cut off.
            ellipsis = '...' if len(text) > 200 else ''
            print(f"  {position}. {text[:200]}{ellipsis}")

    print()


## 8. Summary and Insights

### Key Findings

1. **Dataset Sizes**: [Fill in after running analysis]
2. **Label Distribution**: [Fill in after running analysis]
3. **Text Length**: [Fill in after running analysis]
4. **Noise Characteristics**: [Fill in after running analysis]

### Implications for Label Quality Evaluation

- **Twitter Financial (Zeroshot)**: Clean labels → good for baseline, identify model ambiguity
- **Financial Tweets 2023**: Large dataset → more noisy labels to detect
- **TweetFinSent**: Expert annotations → identify truly ambiguous text (not annotation errors)

### Next Steps

1. Train models on each dataset
2. Compare label quality metrics across datasets
3. Analyze ambiguous cases specific to each dataset
4. Use findings to improve label quality evaluation methods
