# Exploratory Data Analysis: Twitter Financial Sentiment

This notebook explores the Twitter Financial News Sentiment dataset (Zeroshot, 2023).

**Dataset**: Twitter Financial News Sentiment (Zeroshot, 2023)
- Real Twitter financial posts
- 3-class labels: positive, neutral, negative
- Social-media text with noise (hashtags, mentions, cashtags)


In [None]:
# Notebook setup: import path, libraries, and plotting defaults.
import sys
import os
# Append "<parent of the current working directory>/src" to the import path.
# NOTE(review): presumably this is where `dataset_loader` and `preprocess`
# live, which implies the notebook is run from a sub-folder of the project
# root — confirm, since later cells use paths relative to the working directory.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Project-local helpers (resolved through the sys.path entry added above).
from dataset_loader import load_dataset
from preprocess import clean_text, preprocess_batch

# Render figures inline in the notebook output.
%matplotlib inline
# NOTE(review): the 'seaborn-v0_8' style name only exists in newer matplotlib
# releases (older ones call it 'seaborn') — confirm against the pinned version.
plt.style.use('seaborn-v0_8')


## Load Dataset

Replace `data_path` with the actual path to your dataset file.


In [None]:
# Load dataset
# Identifier and location handed to the project loader.
dataset_name = 'twitter_financial'
data_path = 'data/twitter_financial_train.csv'  # Update with your path

# `load_dataset` comes from the project's `dataset_loader` module; the rest of
# the notebook indexes the result by column, so it is presumably a DataFrame
# with at least a 'label' column — verify against the loader.
df = load_dataset(dataset_name, data_path)

# Size summary.
n_samples = len(df)
print(f"Loaded {n_samples} samples")
print("\nDataset info:")
print(f"  Total samples: {n_samples}")

# Absolute class counts, most frequent first.
print("  Label distribution:")
print(df['label'].value_counts())

# The same distribution as percentages, rounded to two decimals.
print("\nLabel proportions:")
label_share_pct = df['label'].value_counts(normalize=True).mul(100).round(2)
print(label_share_pct)

# Preview the first rows as the cell's rich output.
df.head()


## Label Distribution


In [None]:
# Label distribution
# Counts per class, sorted by frequency (most common first).
label_counts = df['label'].value_counts()
print(label_counts)

# Bars are ordered by frequency, not by sentiment, so a positional colour list
# would paint whichever class is most common green. Key the colours by label
# instead; labels that are not recognised fall back to the positional palette.
palette = ['#2ecc71', '#95a5a6', '#e74c3c']
sentiment_colors = dict(zip(['positive', 'neutral', 'negative'], palette))
bar_colors = [
    sentiment_colors.get(str(label).lower(), palette[position % len(palette)])
    for position, label in enumerate(label_counts.index)
]

fig, ax = plt.subplots(figsize=(10, 6))
label_counts.plot(kind='bar', color=bar_colors, rot=0, ax=ax)
ax.set_title('Label Distribution', fontweight='bold', fontsize=14)
ax.set_xlabel('Label', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()

# Make sure the output folder exists before writing the figure.
os.makedirs('results', exist_ok=True)
fig.savefig('results/label_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Class imbalance analysis
# `label_counts` is sorted descending, so the first/last entries are the
# majority/minority classes.
n_samples = len(df)
majority_label, majority_count = label_counts.index[0], label_counts.iloc[0]
minority_label, minority_count = label_counts.index[-1], label_counts.iloc[-1]

print("\nClass Imbalance Analysis:")
print(f"  Most common class: {majority_label} ({majority_count} samples, {majority_count/n_samples*100:.1f}%)")
print(f"  Least common class: {minority_label} ({minority_count} samples, {minority_count/n_samples*100:.1f}%)")
print(f"  Imbalance ratio: {majority_count/minority_count:.2f}:1")


## Text Length Analysis


In [None]:
# Analyze text lengths
# Character count and whitespace-separated token count of the raw text.
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()

# Preprocess for cleaned length
# `preprocess_batch` is the project helper imported from `preprocess`; the same
# two measurements are taken on its output.
df['cleaned_text'] = preprocess_batch(df['text'])
df['cleaned_length'] = df['cleaned_text'].str.len()
df['cleaned_word_count'] = df['cleaned_text'].str.split().str.len()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# The three overall distributions differ only in column, title, x-label and
# colour (None keeps matplotlib's default colour), so draw them from one table.
overall_panels = [
    (axes[0, 0], 'text_length', 'Original Text Length (Characters)', 'Length (characters)', None),
    (axes[0, 1], 'cleaned_length', 'Cleaned Text Length (Characters)', 'Length (characters)', 'orange'),
    (axes[1, 0], 'word_count', 'Word Count Distribution', 'Word Count', 'green'),
]
for ax, column, title, xlabel, color in overall_panels:
    ax.hist(df[column], bins=50, edgecolor='black', alpha=0.7, color=color)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.grid(alpha=0.3)

# Text length by label
# Use one set of bin edges for every class: letting each histogram pick its own
# 30 bins gives different bin widths per label, which makes the overlaid bar
# heights incomparable.
by_label_ax = axes[1, 1]
shared_bins = np.histogram_bin_edges(df['text_length'].dropna(), bins=30)
for label in df['label'].unique():
    subset = df.loc[df['label'] == label, 'text_length'].dropna()
    by_label_ax.hist(subset, bins=shared_bins, alpha=0.5, label=label, edgecolor='black')
by_label_ax.set_title('Text Length by Label', fontweight='bold')
by_label_ax.set_xlabel('Length (characters)')
by_label_ax.set_ylabel('Frequency')
by_label_ax.legend()
by_label_ax.grid(alpha=0.3)

fig.tight_layout()
# Create the output folder here as well so this cell does not depend on an
# earlier cell having done it.
os.makedirs('results', exist_ok=True)
fig.savefig('results/text_length_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nText Length Statistics:")
print(f"  Mean original length: {df['text_length'].mean():.2f} characters")
print(f"  Median original length: {df['text_length'].median():.2f} characters")
print(f"  Mean word count: {df['word_count'].mean():.2f} words")
print(f"  Median word count: {df['word_count'].median():.2f} words")
print(f"  Mean cleaned length: {df['cleaned_length'].mean():.2f} characters")
print(f"  Mean cleaned word count: {df['cleaned_word_count'].mean():.2f} words")


## Sample Examples by Label


In [None]:
# Display sample examples for each label
separator = "=" * 80
print("Sample Examples by Label:")
print(separator)
for label in ['positive', 'neutral', 'negative']:
    # Skip classes that do not occur in this dataset.
    if label in df['label'].values:
        print(f"\n{label.upper()}:")
        print("-" * 80)
        samples = df.loc[df['label'] == label, 'text'].head(10)
        for i, text in enumerate(samples, 1):
            print(f"{i:2d}. {text}")

# Social-media noise indicators
print("\n" + separator)
print("Social-Media Noise Indicators:")
print(separator)

# (display name, flag column, arguments for Series.str.contains)
noise_checks = [
    ('Hashtags', 'has_hashtag', dict(pat='#', regex=False)),
    ('Mentions', 'has_mention', dict(pat='@', regex=False)),
    ('Cashtags', 'has_cashtag', dict(pat=r'\$[A-Z]+', regex=True)),
    ('URLs', 'has_url', dict(pat='http', regex=False, case=False)),
]
for _, column, contains_kwargs in noise_checks:
    # na=False: a missing text counts as "no match", so the flag columns stay
    # boolean and the percentages below are taken over all rows rather than
    # only the rows with non-missing text.
    df[column] = df['text'].str.contains(na=False, **contains_kwargs)

print()  # blank line before the indicator summary
for display_name, column, _ in noise_checks:
    print(f"{display_name}: {df[column].sum()} ({df[column].mean()*100:.1f}%)")

# Dataset quality notes
# Relies on the 'cleaned_text' / 'cleaned_length' columns added by the text
# length analysis cell.
is_very_short = df['cleaned_length'] < 10
is_very_long = df['cleaned_length'] > 200

print("\n" + separator)
print("Dataset Quality Notes:")
print(separator)
print(f"Total samples: {len(df)}")
print(f"Empty texts after preprocessing: {(df['cleaned_text'].str.len() == 0).sum()}")
print(f"Very short texts (< 10 chars): {is_very_short.sum()} ({is_very_short.mean()*100:.1f}%)")
print(f"Very long texts (> 200 chars): {is_very_long.sum()} ({is_very_long.mean()*100:.1f}%)")
