# Label Quality Analysis: Modern Financial Social-Media Datasets

This notebook performs comprehensive label quality analysis on modern post-2020 Twitter financial sentiment datasets.

**Focus**: Identifying ambiguous cases, noisy labels, and borderline classifications in social-media text.

**Datasets Supported:**
- Twitter Financial News Sentiment (Zeroshot, 2023)
- Financial Tweets Sentiment (TimKoornstra, 2023)
- TweetFinSent (JP Morgan, 2022)


In [None]:
# Setup
import sys
import os

# Get project root.
# Resolution order:
#   1. Launched from notebooks/  -> the parent directory is the root; chdir
#      there so the relative paths used later (results/, data/) resolve.
#   2. cwd already contains src/ -> cwd is the root. This is the state left
#      behind by case 1, so re-running this cell keeps the same root instead
#      of drifting two levels above the repository.
#   3. Otherwise fall back to the grandparent of cwd.
_cwd = os.getcwd()
if os.path.basename(_cwd) == 'notebooks':
    PROJECT_ROOT = os.path.dirname(_cwd)
    os.chdir(PROJECT_ROOT)
elif os.path.isdir(os.path.join(_cwd, 'src')):
    PROJECT_ROOT = _cwd
else:
    PROJECT_ROOT = os.path.dirname(os.path.dirname(_cwd))

# Make the project's src/ directory importable (inserted once, at the front).
src_path = os.path.join(PROJECT_ROOT, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from dataset_loader import load_dataset
from preprocess import preprocess_batch
from label_quality import (
    detect_misclassifications,
    detect_ambiguous_predictions,
    detect_noisy_labels,
    analyze_neutral_ambiguous_zone,
    analyze_borderline_cases,
    quantify_dataset_ambiguity
)

%matplotlib inline
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✓ Setup complete")
print(f"Project root: {PROJECT_ROOT}")


## 1. Load Model and Data

**Note**: Make sure you have trained a model first using `notebooks/02_train_baseline_modern.ipynb`


In [None]:
# Configuration: paths are relative to the current working directory.
MODEL_PATH = 'results/model.joblib'  # Update if needed
DATA_PATH = 'data/twitter_financial_train.csv'  # Update this
DATASET_NAME = 'twitter_financial'  # 'twitter_financial', 'financial_tweets_2023', or 'tweetfinsent'
OUTPUT_DIR = 'results'

# Deserialize the trained model saved with joblib.
print("Loading model...")
model = joblib.load(MODEL_PATH)
print(f"✓ Model loaded from {MODEL_PATH}")

# Load the dataset and add a cleaned copy of the 'text' column.
# NOTE(review): assumes preprocess_batch returns one cleaned string per input row — confirm.
print("Loading dataset...")
df = load_dataset(DATASET_NAME, DATA_PATH)
df['cleaned_text'] = preprocess_batch(df['text'])

# Keep only the rows whose cleaned text has at least one character.
has_text = df['cleaned_text'].str.len() > 0
df = df.loc[has_text]
print(f"✓ Loaded {len(df)} samples")


## 2. Core Label Quality Analysis

### 2.1 Misclassifications


In [None]:
# Run the misclassification detector on the configured model and dataset.
# NOTE(review): output_path presumably makes the helper also write the result to CSV — confirm in label_quality.
misclass_csv = os.path.join(OUTPUT_DIR, 'misclassifications.csv')

print("Detecting misclassifications...")
misclass_df = detect_misclassifications(
    model_path=MODEL_PATH,
    data_path=DATA_PATH,
    dataset_name=DATASET_NAME,
    output_path=misclass_csv,
)

n_misclassified = len(misclass_df)
print(f"\nFound {n_misclassified} misclassifications")
print("\nSample misclassifications:")
# Last expression of the cell: the notebook renders the first 10 rows.
misclass_df.head(10)


### 2.2 Ambiguous Predictions


In [None]:
# Run the ambiguous-prediction detector with a (0.45, 0.55) confidence band.
# NOTE(review): the tuple looks like (lower, upper) bounds on prediction confidence — confirm in label_quality.
ambiguous_band = (0.45, 0.55)
ambiguous_csv = os.path.join(OUTPUT_DIR, 'ambiguous_predictions.csv')

print("Detecting ambiguous predictions...")
ambiguous_df = detect_ambiguous_predictions(
    model_path=MODEL_PATH,
    data_path=DATA_PATH,
    dataset_name=DATASET_NAME,
    confidence_threshold=ambiguous_band,
    output_path=ambiguous_csv,
)

n_ambiguous = len(ambiguous_df)
print(f"\nFound {n_ambiguous} ambiguous predictions")
print("\nSample ambiguous predictions:")
# Last expression of the cell: the notebook renders the first 10 rows.
ambiguous_df.head(10)


### 2.3 Noisy Labels


In [None]:
# Run the noisy-label detector on the configured model and dataset.
noisy_kwargs = {
    'model_path': MODEL_PATH,
    'data_path': DATA_PATH,
    'dataset_name': DATASET_NAME,
    'output_path': os.path.join(OUTPUT_DIR, 'noisy_labels.csv'),
}

print("Detecting noisy labels...")
noisy_df = detect_noisy_labels(**noisy_kwargs)

n_noisy = len(noisy_df)
print(f"\nFound {n_noisy} potentially noisy labels")
print("\nSample noisy labels:")
# Last expression of the cell: the notebook renders the first 10 rows.
noisy_df.head(10)


## 3. Social-Media-Specific Analysis

### 3.1 Neutral Ambiguous Zone

Cases where the model struggles to distinguish neutral text from text that carries positive or negative sentiment (common in social-media text).


In [None]:
# Run the neutral-ambiguous-zone analysis on the configured model and dataset.
neutral_zone_csv = os.path.join(OUTPUT_DIR, 'neutral_ambiguous_zone.csv')

print("Analyzing neutral ambiguous zone...")
neutral_ambiguous_df = analyze_neutral_ambiguous_zone(
    model_path=MODEL_PATH,
    data_path=DATA_PATH,
    dataset_name=DATASET_NAME,
    output_path=neutral_zone_csv,
)

n_neutral_zone = len(neutral_ambiguous_df)
print(f"\nFound {n_neutral_zone} cases in neutral ambiguous zone")
print("\nSample cases:")
# Last expression of the cell: the notebook renders the first 10 rows.
neutral_ambiguous_df.head(10)


### 3.2 Borderline Cases

Cases that sit on the boundary between positive/negative and neutral (common in social-media text).


In [None]:
# Run the borderline-case analysis on the configured model and dataset.
borderline_kwargs = {
    'model_path': MODEL_PATH,
    'data_path': DATA_PATH,
    'dataset_name': DATASET_NAME,
    'output_path': os.path.join(OUTPUT_DIR, 'borderline_cases.csv'),
}

print("Analyzing borderline cases...")
borderline_df = analyze_borderline_cases(**borderline_kwargs)

n_borderline = len(borderline_df)
print(f"\nFound {n_borderline} borderline cases")

# Per-type breakdown, read from the result's 'borderline_type' column.
print("\nBorderline case types:")
borderline_type_counts = borderline_df['borderline_type'].value_counts()
print(borderline_type_counts)

print("\nSample borderline cases:")
# Last expression of the cell: the notebook renders the first 10 rows.
borderline_df.head(10)


### 3.3 Dataset-Inherent Ambiguity Metrics

Quantify overall ambiguity in the dataset.


In [None]:
# Compute the dataset-level ambiguity metrics for the configured model and dataset.
ambiguity_csv = os.path.join(OUTPUT_DIR, 'dataset_ambiguity_metrics.csv')

print("Quantifying dataset-inherent ambiguity...")
ambiguity_metrics = quantify_dataset_ambiguity(
    model_path=MODEL_PATH,
    data_path=DATA_PATH,
    dataset_name=DATASET_NAME,
    output_path=ambiguity_csv,
)

# Print the metrics as a plain-text table without the index column.
header_rule = "=" * 60
metrics_table = ambiguity_metrics.to_string(index=False)
print("\nDataset Ambiguity Metrics:")
print(header_rule)
print(metrics_table)


## 4. Visualizations


In [None]:
# Create comprehensive visualization: a 2x3 dashboard with one panel per
# analysis above plus a summary bar chart, saved as a PNG under OUTPUT_DIR.


def _finish_panel(ax, title, xlabel, ylabel, rotate=False, grid_axis='both'):
    """Apply the shared title / axis-label / grid styling to one panel."""
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if rotate:
        ax.tick_params(axis='x', rotation=45)
    ax.grid(axis=grid_axis, alpha=0.3)


def _mark_empty(ax, title):
    """Label a panel that has nothing to plot instead of leaving blank axes."""
    ax.set_title(title, fontweight='bold')
    ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
    ax.set_xticks([])
    ax.set_yticks([])


fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# 1. Misclassification confidence distribution
if len(misclass_df) > 0:
    axes[0, 0].hist(misclass_df['confidence'], bins=30, edgecolor='black', alpha=0.7)
    _finish_panel(axes[0, 0], 'Misclassification Confidence Distribution',
                  'Confidence', 'Frequency')
else:
    _mark_empty(axes[0, 0], 'Misclassification Confidence Distribution')

# 2. Ambiguous predictions by label
if len(ambiguous_df) > 0:
    ambiguous_df['predicted_label'].value_counts().plot(kind='bar', ax=axes[0, 1], color='orange')
    _finish_panel(axes[0, 1], 'Ambiguous Predictions by Label',
                  'Predicted Label', 'Count', grid_axis='y')
else:
    _mark_empty(axes[0, 1], 'Ambiguous Predictions by Label')

# 3. Noisy labels by heuristic
if len(noisy_df) > 0:
    noisy_df['heuristic'].value_counts().plot(kind='bar', ax=axes[0, 2], color='red')
    _finish_panel(axes[0, 2], 'Noisy Labels by Heuristic',
                  'Heuristic Type', 'Count', rotate=True, grid_axis='y')
else:
    _mark_empty(axes[0, 2], 'Noisy Labels by Heuristic')

# 4. Neutral ambiguous zone distribution
if len(neutral_ambiguous_df) > 0:
    axes[1, 0].hist(neutral_ambiguous_df['neutral_prob'], bins=30,
                    edgecolor='black', alpha=0.7, color='purple')
    _finish_panel(axes[1, 0], 'Neutral Ambiguous Zone Distribution',
                  'Neutral Probability', 'Frequency')
else:
    _mark_empty(axes[1, 0], 'Neutral Ambiguous Zone Distribution')

# 5. Borderline cases by type
if len(borderline_df) > 0:
    borderline_df['borderline_type'].value_counts().plot(kind='bar', ax=axes[1, 1], color='green')
    _finish_panel(axes[1, 1], 'Borderline Cases by Type',
                  'Borderline Type', 'Count', rotate=True, grid_axis='y')
else:
    _mark_empty(axes[1, 1], 'Borderline Cases by Type')

# 6. Summary statistics: number of rows returned by each analysis
summary_df = pd.DataFrame({
    'Metric': ['Misclassifications', 'Ambiguous', 'Noisy Labels', 'Neutral Ambiguous', 'Borderline'],
    'Count': [
        len(misclass_df),
        len(ambiguous_df),
        len(noisy_df),
        len(neutral_ambiguous_df),
        len(borderline_df),
    ],
})
summary_df.plot(x='Metric', y='Count', kind='bar', ax=axes[1, 2], color='blue', legend=False)
_finish_panel(axes[1, 2], 'Label Quality Summary', 'Metric', 'Count',
              rotate=True, grid_axis='y')

plt.tight_layout()

# Save next to the other outputs; create the directory so savefig cannot fail
# on a missing folder.
os.makedirs(OUTPUT_DIR, exist_ok=True)
figure_path = os.path.join(OUTPUT_DIR, 'label_quality_analysis.png')
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"✓ Visualization saved to {figure_path}")


## 5. Summary

### Label Quality Analysis Summary

- **Misclassifications**: [Fill in]
- **Ambiguous Predictions**: [Fill in]
- **Noisy Labels**: [Fill in]
- **Neutral Ambiguous Zone**: [Fill in]
- **Borderline Cases**: [Fill in]
- **Dataset Ambiguity Metrics**: [Fill in]

### Key Insights

1. **Social-media text characteristics**: [Fill in]
2. **Ambiguity patterns**: [Fill in]
3. **Noisy label patterns**: [Fill in]
4. **Borderline case patterns**: [Fill in]

### Next Steps

1. Compare label quality metrics across different modern datasets
2. Analyze specific ambiguous cases in detail
3. Use findings to improve model or dataset
