# Exploratory Data Analysis: Modern Financial Social-Media Datasets

This notebook explores three modern (post-2020) Twitter financial sentiment datasets, one dataset per run (select it in Section 1):
- **Twitter Financial News Sentiment** (Zeroshot, 2023)
- **Financial Tweets Sentiment** (TimKoornstra, 2023)
- **TweetFinSent** (JP Morgan, 2022)

**Research Focus**: Understanding the characteristics of financial social-media text for sentiment classification and label quality evaluation.


In [None]:
# Setup
import sys
import os

# Get project root
def find_project_root(start):
    """Return the project root for a notebook whose working directory is `start`.

    Resolution order:
      1. If `start` is a directory named 'notebooks', its parent is the root.
      2. Otherwise the nearest of `start`, its parent, or its grandparent that
         contains a 'src' directory.
      3. Otherwise the grandparent of `start` (the historical default).

    Step 2 keeps the result correct when the kernel is started from the
    project root itself, or when this cell is re-run after the working
    directory has already been switched to the project root.
    """
    start = os.path.abspath(start)
    if os.path.basename(start) == 'notebooks':
        return os.path.dirname(start)

    candidate = start
    for _ in range(3):  # start, parent, grandparent
        if os.path.isdir(os.path.join(candidate, 'src')):
            return candidate
        candidate = os.path.dirname(candidate)

    return os.path.dirname(os.path.dirname(start))


_launch_dir = os.getcwd()
PROJECT_ROOT = find_project_root(_launch_dir)
if os.path.basename(_launch_dir) == 'notebooks':
    # Later cells use paths relative to the project root ('data/...', 'results/...').
    os.chdir(PROJECT_ROOT)

# Make the project's local modules importable.
src_path = os.path.join(PROJECT_ROOT, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

from dataset_loader import load_dataset
from preprocess import preprocess_batch

%matplotlib inline
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✓ Setup complete")
print(f"Project root: {PROJECT_ROOT}")


## 1. Load Dataset

**Choose one of the three modern datasets:**
- `twitter_financial`: Twitter Financial News Sentiment (Zeroshot, 2023)
- `financial_tweets_2023`: Financial Tweets Sentiment (TimKoornstra, 2023)
- `tweetfinsent`: TweetFinSent (JP Morgan, 2022)

Update the paths below to point to your dataset files.


In [None]:
# Configuration - pick the dataset and point DATA_PATH at its file
DATASET_NAME = 'twitter_financial'  # 'twitter_financial', 'financial_tweets_2023', or 'tweetfinsent'
DATA_PATH = 'data/twitter_financial_train.csv'  # Update this

# Alternative datasets (uncomment one pair to switch):
# DATASET_NAME = 'financial_tweets_2023'
# DATA_PATH = 'data/financial_tweets_2023.csv'

# DATASET_NAME = 'tweetfinsent'
# DATA_PATH = 'data/tweetfinsent.csv'

# Load the selected dataset through the project loader
df = load_dataset(DATASET_NAME, DATA_PATH)

# Short load report: size, dataset key, and the columns the loader produced
print(
    f"✓ Loaded {len(df)} samples",
    f"\nDataset: {DATASET_NAME}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

# Display first few rows
df.head()


## 2. Basic Dataset Statistics


In [None]:
# Dataset overview: size, columns, and class balance
print("Dataset Overview:")
print("=" * 60)
print(f"Total samples: {len(df):,}")
print(f"Columns: {df.columns.tolist()}")

label_column = df['label']
print(f"\nLabel distribution:")
print(label_column.value_counts())
print(f"\nLabel proportions:")
print(label_column.value_counts(normalize=True).mul(100).round(2))

# Missing values per column (reported before the derived length columns are added)
print(f"\nMissing values:")
print(df.isna().sum())

# Derived length columns: characters and whitespace-separated words per text.
# Later cells read 'text_length' and 'word_count', so this cell must run first.
text_column = df['text']
df['text_length'] = text_column.str.len()
df['word_count'] = text_column.str.split().str.len()

length_stats = df[['text_length', 'word_count']].agg(['mean', 'median'])

print(f"\nText Length Statistics:")
print(f"  Mean characters: {length_stats.loc['mean', 'text_length']:.1f}")
print(f"  Median characters: {length_stats.loc['median', 'text_length']:.1f}")
print(f"  Mean words: {length_stats.loc['mean', 'word_count']:.1f}")
print(f"  Median words: {length_stats.loc['median', 'word_count']:.1f}")


## 3. Label Distribution Visualization


In [None]:
# Label distribution visualization
# savefig does not create missing directories, so make sure the output folder exists.
os.makedirs('results', exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count each label once; the percentages are derived from the same counts so
# both panels share the same label order.
label_counts = df['label'].value_counts()
label_props = label_counts / label_counts.sum() * 100

# Fixed colours for the three expected sentiment labels; any other label
# falls back to blue.
colors = {'positive': '#2ecc71', 'neutral': '#95a5a6', 'negative': '#e74c3c'}
label_colors = [colors.get(l, '#3498db') for l in label_counts.index]

# Bar chart
axes[0].bar(label_counts.index, label_counts.values, color=label_colors)
axes[0].set_title('Label Distribution', fontweight='bold', fontsize=14)
axes[0].set_xlabel('Label')
axes[0].set_ylabel('Count')
axes[0].grid(axis='y', alpha=0.3)

# Pie chart
axes[1].pie(label_props.values, labels=label_props.index, autopct='%1.1f%%',
            colors=label_colors)
axes[1].set_title('Label Proportions', fontweight='bold', fontsize=14)

plt.tight_layout()
plt.savefig('results/label_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualization saved to results/label_distribution.png")


## 4. Text Length Analysis

Social-media text typically has variable length. Let's analyze the distribution.


In [None]:
# Text length analysis
# Uses the 'text_length' and 'word_count' columns created in the
# "Basic Dataset Statistics" cell.

# savefig does not create missing directories, so make sure the output folder exists.
os.makedirs('results', exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column, title, x-axis label, bar colour); None keeps the default colour cycle.
panels = [
    ('text_length', 'Text Length Distribution (Characters)', 'Character Count', None),
    ('word_count', 'Word Count Distribution', 'Word Count', 'orange'),
]

for ax, (column, title, xlabel, color) in zip(axes, panels):
    # Rows with missing text have NaN lengths; drop them so the histogram
    # range stays finite. The mean is unchanged because pandas skips NaN.
    values = df[column].dropna()
    mean_value = values.mean()

    ax.hist(values, bins=50, edgecolor='black', alpha=0.7, color=color)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.axvline(mean_value, color='red', linestyle='--',
               label=f'Mean: {mean_value:.1f}')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('results/text_length_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualization saved to results/text_length_distribution.png")


## 5. Social-Media Noise Indicators

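Social-media text contains platform-specific tokens (cashtags, hashtags, mentions, and URLs) that can act as noise for sentiment models. Let's count how often each one appears.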


In [None]:
# Noise indicators
def count_cashtags(text):
    """Count cashtags in `text`: a `$` followed by one or more ASCII letters (e.g. `$TSLA`).

    Dollar amounts such as `$100` are not counted. Non-string input (None, or
    the NaN pandas uses for a missing value) counts as 0 instead of raising
    TypeError inside `re.findall`.
    """
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'\$[A-Za-z]+', text))

def count_hashtags(text):
    """Count hashtags in `text`: a `#` followed by one or more word characters (e.g. `#stocks`).

    Non-string input (None, or the NaN pandas uses for a missing value) counts
    as 0 instead of raising TypeError inside `re.findall`.
    """
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'#\w+', text))

def count_mentions(text):
    """Count user mentions in `text`: an `@` followed by one or more word characters (e.g. `@user`).

    An `@` directly preceded by a word character is skipped, so the domain
    part of an e-mail address (`ir@company.com`) is not counted as a mention.
    Non-string input (None, or the NaN pandas uses for a missing value) counts
    as 0 instead of raising TypeError inside `re.findall`.
    """
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'(?<!\w)@\w+', text))

def count_urls(text):
    """Count URLs in `text`: `http://...`, `https://...` or `www....` tokens.

    The scheme must be followed by `://` and `www` by a dot, each starting at
    a word boundary, so ordinary words such as "awwww" or a bare "https" are
    not counted as links. Matching is case-insensitive. Non-string input
    (None, or the NaN pandas uses for a missing value) counts as 0 instead of
    raising TypeError inside `re.findall`.
    """
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'\bhttps?://\S+|\bwww\.\S+', text, flags=re.IGNORECASE))

# Calculate noise indicators (one count column per token type)
df['cashtags'] = df['text'].apply(count_cashtags)
df['hashtags'] = df['text'].apply(count_hashtags)
df['mentions'] = df['text'].apply(count_mentions)
df['urls'] = df['text'].apply(count_urls)

# Summary statistics: how many texts contain each token type, and the mean count per text
has_cashtags = df['cashtags'] > 0
has_hashtags = df['hashtags'] > 0
has_mentions = df['mentions'] > 0
has_urls = df['urls'] > 0

print("Social-Media Noise Indicators:")
print("=" * 60)
print(f"Texts with cashtags: {has_cashtags.sum()} ({has_cashtags.mean() * 100:.2f}%)")
print(f"Texts with hashtags: {has_hashtags.sum()} ({has_hashtags.mean() * 100:.2f}%)")
print(f"Texts with mentions: {has_mentions.sum()} ({has_mentions.mean() * 100:.2f}%)")
print(f"Texts with URLs: {has_urls.sum()} ({has_urls.mean() * 100:.2f}%)")
print(f"\nAverage per text:")
print(f"  Cashtags: {df['cashtags'].mean():.2f}")
print(f"  Hashtags: {df['hashtags'].mean():.2f}")
print(f"  Mentions: {df['mentions'].mean():.2f}")
print(f"  URLs: {df['urls'].mean():.2f}")

# Visualization
# savefig does not create missing directories, so make sure the output folder exists.
os.makedirs('results', exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
noise_cols = ['cashtags', 'hashtags', 'mentions', 'urls']
titles = ['Cashtags ($TSLA)', 'Hashtags (#stocks)', 'Mentions (@user)', 'URLs']

# axes.flat walks the 2x2 grid row by row, matching the order of noise_cols.
for ax, col, title in zip(axes.flat, noise_cols, titles):
    ax.hist(df[col], bins=20, edgecolor='black', alpha=0.7)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Count')
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('results/noise_indicators.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Visualization saved to results/noise_indicators.png")


## 6. Sample Texts by Label

Let's examine sample texts from each sentiment class to understand the data.


In [None]:
# Sample texts by label
print("Sample Texts by Label:")
print("=" * 60)

for label in ['positive', 'neutral', 'negative']:
    label_df = df[df['label'] == label]
    if label_df.empty:
        # Nothing to show for a label that does not occur in this dataset.
        continue

    print(f"\n{label.upper()} ({len(label_df)} samples):")
    print("-" * 60)
    # Fixed seed so the same examples are shown on every run.
    samples = label_df.sample(min(5, len(label_df)), random_state=42)

    # Number the samples 1..n. The DataFrame index label is an arbitrary row
    # id (and may not be numeric), so it must not be used as the counter.
    for position, raw_text in enumerate(samples['text'], start=1):
        # Anonymize mentions
        text = re.sub(r'@\w+', '@user', raw_text)
        # Only mark the text as truncated when it actually was.
        suffix = '...' if len(text) > 150 else ''
        print(f"  {position}. {text[:150]}{suffix}")


## 7. Preprocessing Preview

Let's see how preprocessing affects the text.


In [None]:
# Preprocessing preview: clean every text, then derive the same length
# measures that were computed earlier for the raw text.
df['cleaned_text'] = preprocess_batch(df['text'])
cleaned_column = df['cleaned_text']
df['cleaned_length'] = cleaned_column.str.len()
df['cleaned_word_count'] = cleaned_column.str.split().str.len()

# Show a few before/after pairs (fixed seed so the same rows appear on every run)
print("Preprocessing Examples:")
print("=" * 60)
samples = df.sample(min(5, len(df)), random_state=42)
for original, cleaned in zip(samples['text'], samples['cleaned_text']):
    print(f"\nOriginal ({len(original)} chars, {len(original.split())} words):")
    print(f"  {original[:200]}...")
    print(f"\nCleaned ({len(cleaned)} chars, {len(cleaned.split())} words):")
    print(f"  {cleaned[:200]}...")
    print("-" * 60)

# Compare mean lengths before and after cleaning
mean_lengths = df[['text_length', 'word_count', 'cleaned_length', 'cleaned_word_count']].mean()
print(f"\nLength Comparison:")
print(f"  Original - Mean: {mean_lengths['text_length']:.1f} chars, {mean_lengths['word_count']:.1f} words")
print(f"  Cleaned - Mean: {mean_lengths['cleaned_length']:.1f} chars, {mean_lengths['cleaned_word_count']:.1f} words")


## 8. Summary

### Key Findings

1. **Dataset Size**: [Fill in after running]
2. **Label Distribution**: [Fill in after running]
3. **Text Characteristics**: [Fill in after running]
4. **Noise Indicators**: [Fill in after running]

### Next Steps

1. Train baseline model using `notebooks/02_train_baseline_modern.ipynb`
2. Analyze label quality using `notebooks/03_label_quality_modern.ipynb`
3. Compare across datasets using `notebooks/03_dataset_comparison.ipynb`
