# Phase 2: Sentiment Scoring - VADER vs FinBERT Comparison

## Objective

Transform 52,974 raw news headlines into daily, ticker-specific sentiment time series using:
- **VADER** (Lexicon-based pproach)
- **FinBERT** (Transformer-based approach)

This enables a direct comparison of baseline vs state-of-the-art sentiment analysis for financial text.

## Pipeline Overview

```
News (53k articles)
    ↓
1. Entity Resolution (yfinance keyword matching)
    ↓
2. News Attribution (assign to tickers or MARKET_GENERAL)
    ↓
3. VADER Scoring (lexicon-based, CPU)
    ↓
4. FinBERT Scoring (transformer-based, GPU)
    ↓
5. Validation & Comparison
    ↓
6. Daily Aggregation by Ticker
    ↓
Output: sentiment_scores_60.csv + market_sentiment_general.csv
```

---

## 1. Environment & Setup

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

import yfinance as yf

import matplotlib.pyplot as plt
import seaborn as sns

import os
import re
from tqdm import tqdm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f'  GPU detected: {torch.cuda.get_device_name(0)}')
    print(f'  CUDA version: {torch.version.cuda}')
    print(f'  Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB')
    print(f'  Memory reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB')
else:
    device = torch.device('cpu')

print(f'\nDevice set to: {device}')

  GPU detected: Tesla T4
  CUDA version: 12.8
  Memory allocated: 0.00 MB
  Memory reserved: 0.00 MB

Device set to: cuda


In [4]:
project_root = '/content/drive/MyDrive/market-sentiment-impact-analysis'

data_processed = os.path.join(project_root, 'data', 'processed')
data_tickers = os.path.join(project_root, 'data', 'tickers')

os.makedirs(data_processed, exist_ok=True)

print(f"Project Root: {project_root}")
print(f"Processed Data: {data_processed}")
print(f"Tickers Data: {data_tickers}")

Project Root: /content/drive/MyDrive/market-sentiment-impact-analysis
Processed Data: /content/drive/MyDrive/market-sentiment-impact-analysis/data/processed
Tickers Data: /content/drive/MyDrive/market-sentiment-impact-analysis/data/tickers


---

## 2. Load Input Data

In [5]:
news_path = os.path.join(data_processed, 'daily_news_cleaned.csv')

print(f'Loading news data from: {news_path}')
news_df = pd.read_csv(news_path)

news_df['date'] = pd.to_datetime(news_df['date'])

print(f'\nNews data loaded:')
print(f'  Records: {len(news_df):,}')
print(f'  Date range: {news_df['date'].min().date()} to {news_df['date'].max().date()}')
print(f'  Sources: {news_df['source'].value_counts().to_dict()}')
print(f'\nColumns: {list(news_df.columns)}')
print(f'\nSample:')
print(news_df.head(3))

Loading news data from: /content/drive/MyDrive/market-sentiment-impact-analysis/data/processed/daily_news_cleaned.csv

News data loaded:
  Records: 52,974
  Date range: 2018-01-02 to 2020-07-18
  Sources: {'Reuters': 32673, 'Guardian': 17516, 'CNBC': 2785}

Columns: ['date', 'source', 'final_text']

Sample:
                       date    source                                         final_text
0 2018-01-02 00:00:00+00:00  Guardian  Former advertising executive reveals junk food...
1 2018-01-02 00:00:00+00:00  Guardian  Transport secretary ‘running scared’ as he fli...
2 2018-01-02 00:00:00+00:00  Guardian  Good for factories, bad for shoppers: a Brexit...


In [6]:
all_betas_path = os.path.join(data_tickers, 'all_betas.csv')

print(f'Loading ticker metadata from all_betas.csv...')
all_tickers_df = pd.read_csv(all_betas_path)

stock_returns_path = os.path.join(data_processed, 'stock_returns_60.csv')
stock_returns = pd.read_csv(stock_returns_path)
ticker_list = stock_returns['Ticker'].unique().tolist()

all_tickers_df = all_tickers_df[all_tickers_df['Ticker'].isin(ticker_list)].copy()

print(f'\nTickers loaded:')
print(f'  Total: {len(ticker_list)}')
print(f'  Unique tickers in metadata: {len(all_tickers_df)}')
print(f'\nSample tickers: {ticker_list[:10]}')

Loading ticker metadata from all_betas.csv...

Tickers loaded:
  Total: 60
  Unique tickers in metadata: 60

Sample tickers: ['AEP', 'AMP', 'APA', 'BA', 'C', 'CAG', 'CCL', 'CHD', 'CHRW', 'CL']


---

## 3. Entity Resolution

In [7]:
print('Fetching company names from Yahoo Finance...')

company_names = {}
failed_tickers = []

for ticker in tqdm(ticker_list):
    try:
        stock = yf.Ticker(ticker)
        info = stock.info

        name = info.get('shortName') or info.get('longName') or ticker
        company_names[ticker] = name

    except Exception as e:
        company_names[ticker] = ticker
        failed_tickers.append(ticker)

print(f'\n  Downloaded names for {len(company_names)} tickers')
if failed_tickers:
    print(f"   Failed to fetch: {failed_tickers}")

print(f'\nSample company names:')
for ticker in list(company_names.keys())[:5]:
    print(f'  {ticker}: {company_names[ticker]}')

Fetching company names from Yahoo Finance...


100%|██████████| 60/60 [00:02<00:00, 21.04it/s]


  Downloaded names for 60 tickers

Sample company names:
  AEP: American Electric Power Company
  AMP: Ameriprise Financial, Inc.
  APA: APA Corporation
  BA: Boeing Company (The)
  C: Citigroup, Inc.





In [8]:
def clean_company_name(name):
    if not isinstance(name, str):
        return str(name)

    suffixes = [
        r',?\s+Inc\.?$',
        r',?\s+Corporation$',
        r',?\s+Corp\.?$',
        r',?\s+Company$',
        r',?\s+Co\.?$',
        r',?\s+Ltd\.?$',
        r',?\s+Limited$',
        r',?\s+Plc$',
        r',?\s+PLC$',
        r',?\s+LLC$',
        r',?\s+L\.P\.$',
        r',?\s+LP$',
        r',?\s+Group$',
        r',?\s+Holdings?$',
        r'\s+\(The\)$',
        r',?\s+& Co\.?$'
    ]

    cleaned = name
    for suffix in suffixes:
        cleaned = re.sub(suffix, '', cleaned, flags=re.IGNORECASE)

    return cleaned.strip()

company_names_clean = {ticker: clean_company_name(name)
                        for ticker, name in company_names.items()}

print('Company name cleaning:')
print('\nBefore → After:')
for ticker in list(company_names.keys())[:10]:
    original = company_names[ticker]
    cleaned = company_names_clean[ticker]
    if original != cleaned:
        print(f'  {ticker}: {original} → {cleaned}')
    else:
        print(f'  {ticker}: {original} (no change)')

Company name cleaning:

Before → After:
  AEP: American Electric Power Company → American Electric Power
  AMP: Ameriprise Financial, Inc. → Ameriprise Financial
  APA: APA Corporation → APA
  BA: Boeing Company (The) → Boeing Company
  C: Citigroup, Inc. → Citigroup
  CAG: ConAgra Brands, Inc. → ConAgra Brands
  CCL: Carnival Corporation → Carnival
  CHD: Church & Dwight Company, Inc. → Church & Dwight
  CHRW: C.H. Robinson Worldwide, Inc. → C.H. Robinson Worldwide
  CL: Colgate-Palmolive Company → Colgate-Palmolive


In [11]:
def create_keyword_dict(ticker_list, company_names_clean, all_tickers_df):
    """
    multi-level keyword dictionary:
    1. Direct: Company-specific keywords
    2. Sector: Industry/sector keywords
    3. Thematic: Major events/themes
    """
    keywords = {}
    sector_keywords = {}
    thematic_keywords = {}

    for ticker in ticker_list:
        name = company_names_clean.get(ticker, ticker)

        keyword_list = [ticker, name]

        words = name.split()
        if len(words) > 1:
            first_word = words[0]
            if len(first_word) > 3:
                keyword_list.append(first_word)

        manual_mappings = {
            'C': ['Citigroup', 'Citi'],
            'PG': ['P&G', 'Procter', 'Procter & Gamble', 'The Procter & Gamble Company'],
            'WMT': ['Walmart', 'Wal-Mart'],
            'BA': ['Boeing'],
            'VZ': ['Verizon'],
            'COF': ['Capital One'],
            'COST': ['Costco'],
            'DG': ['Dollar General'],
            'AMP': ['Ameriprise'],
            'APA': ['APA Corp'],
            'CVNA': ['Carvana'],
            'CCL': ['Carnival'],
            'RCL': ['Royal Caribbean'],
            'MGM': ['MGM Resorts'],
            'HAL': ['Halliburton'],
            'OXY': ['Occidental'],
            'DVN': ['Devon'],
            'MPC': ['Marathon', 'Marathon Petroleum'],
            'WDC': ['Western Digital'],
            'PRU': ['Prudential', 'Prudential Financial'],
            'LRCX': ['Lam Research'],
            'SYF': ['Synchrony'],
            'WYNN': ['Wynn'],
            'OKE': ['Oneok'],
            'KEY': ['KeyCorp', 'KeyBank'],
            'IVZ': ['Invesco'],
            'FANG': ['Diamondback', 'Diamondback Energy'],
            'TTD': ['Trade Desk'],
            'URI': ['United Rentals'],
            'FCX': ['Freeport', 'Freeport-McMoRan'],
            'GIS': ['General Mills'],
            'HRL': ['Hormel', 'Hormel Foods'],
            'CAG': ['Conagra', 'Conagra Brands'],
            'CHD': ['Church & Dwight'],
            'KMB': ['Kimberly-Clark', 'Kimberly Clark', 'Kimberly-Clark Corp'],
            'CLX': ['Clorox', 'Clorox Co', 'The Clorox Company'],
            'KR': ['Kroger', 'Kroger Co'],
            'CPB': ['Campbell', 'Campbell\'s', 'Campbell Soup'],
            'SJM': ['Smucker', 'J.M. Smucker', 'J.M. Smucker'],
            'DPZ': ['Domino\'s', 'Dominos'],
            'KDP': ['Keurig', 'Keurig Dr Pepper'],
            'ED': ['Con Ed', 'ConEd', 'Consolidated Edison'],
            'AEP': ['American Electric', 'American Electric Power', 'AEP'],
            'CMS': ['CMS Energy'],
            'WEC': ['WEC Energy'],
            'LNT': ['Alliant'],
            'GILD': ['Gilead'],
            'MKC': ['McCormick', 'McCormick & Co'],
            'CL': ['Colgate', 'Colgate-Palmolive'],
            'GEN': ['Gen Digital'],
            'CHRW': ['C.H. Robinson'],
            'PSA': ['Public Storage'],
            'EXR': ['Extra Space', 'Extra Space Storage'],
            'NEM': ['Newmont', 'Newmont Corp'],
            'SW': ['Smurfit', 'Smurfit Westrock'],
            'ON': ['ON Semi', 'ON Semiconductor'],
            'NCLH': ['Norwegian Cruise'],
            'TPR': ['Tapestry'],
            'TRGP': ['Targa', 'Targa Resources']
        }

        if ticker in manual_mappings:
            keyword_list.extend(manual_mappings[ticker])

        keyword_list = list(set([k.strip() for k in keyword_list if k.strip()]))
        keywords[ticker] = keyword_list

    ticker_to_sector = dict(zip(all_tickers_df['Ticker'], all_tickers_df['GICS Sector']))

    sector_keyword_map = {
        'Energy': ['oil', 'crude', 'petroleum', 'drilling', 'fracking', 'shale', 'refining',
                   'opec', 'energy sector', 'oil prices', 'natural gas'],

        'Financials': ['bank', 'banking', 'financial services', 'lending', 'credit card',
                       'mortgage', 'interest rate', 'fed rate', 'basel', 'capital requirements'],

        'Consumer Discretionary': ['retail', 'consumer spending', 'shopping', 'e-commerce',
                                   'cruise', 'casino', 'gaming', 'resort', 'leisure',
                                   'tourism', 'travel', 'vacation'],

        'Consumer Staples': ['grocery', 'food', 'beverage', 'packaged goods', 'supermarket',
                            'consumer goods', 'household products'],

        'Industrials': ['manufacturing', 'industrial', 'construction', 'aerospace',
                       'defense', 'machinery', 'equipment rental'],

        'Information Technology': ['tech', 'semiconductor', 'chip', 'software', 'hardware',
                                   'digital', 'silicon', 'foundry', 'memory'],

        'Materials': ['mining', 'metals', 'copper', 'gold', 'commodities', 'materials sector',
                     'raw materials'],

        'Utilities': ['utility', 'electric', 'power', 'gas', 'water', 'energy grid',
                     'renewable energy'],

        'Real Estate': ['real estate', 'property', 'reit', 'commercial property',
                       'storage', 'warehouse'],

        'Health Care': ['healthcare', 'pharma', 'pharmaceutical', 'biotech', 'drug',
                       'medical', 'fda approval'],

        'Communication Services': ['telecom', 'wireless', 'broadband', '5g', 'internet service',
                                   'advertising', 'media']
    }

    # reverse mapping: keyword -> list of tickers in that sector
    for sector, keywords_list in sector_keyword_map.items():
        for keyword in keywords_list:
            if keyword not in sector_keywords:
                sector_keywords[keyword] = []

            tickers_in_sector = [t for t, s in ticker_to_sector.items() if s == sector]
            sector_keywords[keyword].extend(tickers_in_sector)


    thematic_mappings = {
        # COVID-19 impact
        'covid|pandemic|coronavirus|lockdown|quarantine': {
            'positive': [],
            'negative': ['CCL', 'RCL', 'NCLH', 'MGM', 'WYNN', 'BA', 'URI']
        },

        'oil price|crude price|opec': {
            'positive': ['APA', 'OXY', 'DVN', 'FANG', 'TRGP', 'HAL', 'MPC'],
            'negative': []
        },

        'interest rate|fed rate|federal reserve rate': {
            'positive': ['C', 'COF', 'KEY', 'PRU', 'SYF', 'AMP', 'IVZ'],
            'negative': ['ED', 'AEP', 'CMS', 'WEC', 'LNT', 'PSA', 'EXR']
        },

        'chip shortage|semiconductor shortage': {
            'positive': ['ON', 'LRCX', 'WDC'],
            'negative': []
        },

        'inflation|price increase|cost pressure': {
            'positive': ['NEM', 'FCX'],
            'negative': ['WMT', 'COST', 'DG', 'KR']
        },

        'consumer confidence|consumer spending|retail sales': {
            'positive': ['WMT', 'COST', 'DG', 'DPZ', 'TPR'],
            'negative': []
        },

        'supply chain|logistics|shipping': {
            'positive': ['CHRW', 'URI'],
            'negative': ['WMT', 'COST', 'DG']
        }
    }

    for pattern, impact_dict in thematic_mappings.items():
        thematic_keywords[pattern] = impact_dict

    return keywords, sector_keywords, thematic_keywords

keywords, sector_keywords, thematic_keywords = create_keyword_dict(
    ticker_list, company_names_clean, all_tickers_df
)

# samples
print(f'Multi-level keyword dictionary created:')
print(f'  Direct company keywords: {len(keywords)} tickers')
print(f'  Sector keywords: {len(sector_keywords)} sector terms')
print(f'  Thematic keywords: {len(thematic_keywords)} themes')
print(f'\nSample direct mappings:')
for ticker in list(keywords.keys())[:5]:
    print(f'  {ticker}: {keywords[ticker]}')
print(f'\nSample sector keyword:')
print(f"  'cruise' → {sector_keywords.get('cruise', [])} tickers")
print(f'\nSample thematic mapping:')
print(f'  COVID-19 → negative impact: {thematic_keywords[list(thematic_keywords.keys())[0]]['negative']}')

Multi-level keyword dictionary created:
  Direct company keywords: 60 tickers
  Sector keywords: 90 sector terms
  Thematic keywords: 7 themes

Sample direct mappings:
  AEP: ['American Electric', 'AEP', 'American Electric Power', 'American']
  AMP: ['AMP', 'Ameriprise', 'Ameriprise Financial']
  APA: ['APA Corp', 'APA']
  BA: ['Boeing', 'BA', 'Boeing Company']
  C: ['Citi', 'Citigroup', 'C']

Sample sector keyword:
  'cruise' → ['NCLH', 'CVNA', 'CCL', 'RCL', 'MGM', 'TPR', 'WYNN', 'DPZ'] tickers

Sample thematic mapping:
  COVID-19 → negative impact: ['CCL', 'RCL', 'NCLH', 'MGM', 'WYNN', 'BA', 'URI']


---

## 4. News Filtering & Attribution

In [16]:
# find ticker matches in text (multi-levels)

def find_ticker_matches(text, keywords_dict, sector_keywords_dict, thematic_keywords_dict):
    """
    Find all tickers mentioned in text using 3 levels:
    1. Direct, 2. Sector, 3. Thematic
    """
    if not isinstance(text, str):
        return {}

    text_lower = text.lower()
    matches = {}

    for ticker, keyword_list in keywords_dict.items():
        for keyword in keyword_list:
            pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
            if re.search(pattern, text_lower):
                if ticker not in matches:
                    matches[ticker] = []
                matches[ticker].append('direct')
                break

    # Sector Keywords
    for sector_keyword, ticker_list in sector_keywords_dict.items():
        pattern = r'\b' + re.escape(sector_keyword.lower()) + r'\b'
        if re.search(pattern, text_lower):
            for ticker in ticker_list:
                if ticker not in matches:
                    matches[ticker] = []

                if 'direct' not in matches[ticker]:
                    matches[ticker].append('sector')

    # Thematic Keywords
    for theme_pattern, impact_dict in thematic_keywords_dict.items():
        if re.search(theme_pattern, text_lower, re.IGNORECASE):
            if impact_dict.get('positive'):
                for ticker in impact_dict['positive']:
                    if ticker not in matches:
                        matches[ticker] = []
                    if 'direct' not in matches[ticker] and 'sector' not in matches[ticker]:
                        matches[ticker].append('thematic_positive')

            if impact_dict.get('negative'):
                for ticker in impact_dict['negative']:
                    if ticker not in matches:
                        matches[ticker] = []
                    if 'direct' not in matches[ticker] and 'sector' not in matches[ticker]:
                        matches[ticker].append('thematic_negative')

    return matches

# Test the multi-level matching
test_texts = [
    'Royal Caribbean announces new ship',
    'Cruise industry faces COVID-19 restrictions',
    'Oil prices surge on OPEC production cuts',
    'Federal Reserve raises interest rates',
    'Semiconductor shortage impacts tech sector'
]

print("Testing multi-level keyword matching:\n")
for text in test_texts:
    matches = find_ticker_matches(text, keywords, sector_keywords, thematic_keywords)
    print(f'Text: {text}')
    if matches:
        for ticker, match_types in matches.items():
            print(f"  → {ticker} ({', '.join(match_types)})")
    else:
        print(f'  → No matches (MARKET_GENERAL)')
    print()

Testing multi-level keyword matching:

Text: Royal Caribbean announces new ship
  → RCL (direct)

Text: Cruise industry faces COVID-19 restrictions
  → NCLH (sector)
  → CVNA (sector)
  → CCL (sector)
  → RCL (sector)
  → MGM (sector)
  → TPR (sector)
  → WYNN (sector)
  → DPZ (sector)
  → BA (thematic_negative)
  → URI (thematic_negative)

Text: Oil prices surge on OPEC production cuts
  → ON (direct)
  → APA (sector, sector, sector)
  → TRGP (sector, sector, sector)
  → HAL (sector, sector, sector)
  → OXY (sector, sector, sector)
  → DVN (sector, sector, sector)
  → MPC (sector, sector, sector)
  → OKE (sector, sector, sector)
  → FANG (sector, sector, sector)

Text: Federal Reserve raises interest rates
  → C (thematic_positive)
  → COF (thematic_positive)
  → KEY (thematic_positive)
  → PRU (thematic_positive)
  → SYF (thematic_positive)
  → AMP (thematic_positive)
  → IVZ (thematic_positive)
  → ED (thematic_negative)
  → AEP (thematic_negative)
  → CMS (thematic_negative)
  → WE

In [17]:
# Apply attribution to all news articles
print('Attributing news to tickers with multi-level matching...')
print('  Processing news articles...\n')

news_df['matched_tickers_dict'] = news_df['final_text'].apply(
    lambda x: find_ticker_matches(x, keywords, sector_keywords, thematic_keywords)
)

news_df['matched_tickers'] = news_df['matched_tickers_dict'].apply(
    lambda x: list(x.keys()) if x else []
)
news_df['match_count'] = news_df['matched_tickers'].apply(len)

print('Attribution results:')
print(f'  Articles with 0 matches (MARKET_GENERAL): {(news_df['match_count'] == 0).sum():,}')
print(f'  Articles with 1 match: {(news_df['match_count'] == 1).sum():,}')
print(f'  Articles with 2+ matches: {(news_df['match_count'] >= 2).sum():,}')
print(f'  Max matches in single article: {news_df['match_count'].max()}')

all_match_types = []
for match_dict in news_df['matched_tickers_dict']:
    for ticker, types in match_dict.items():
        all_match_types.extend(types)

from collections import Counter
match_type_counts = Counter(all_match_types)
print('\nMatch type distribution:')
for match_type, count in match_type_counts.most_common():
    print(f"  {match_type}: {count:,}")

print('\nMatch count distribution:')
print(news_df['match_count'].value_counts().sort_index().head(10))

Attributing news to tickers with multi-level matching...
  Processing news articles...

Attribution results:
  Articles with 0 matches (MARKET_GENERAL): 15,539
  Articles with 1 match: 12,895
  Articles with 2+ matches: 24,540
  Max matches in single article: 32

Match type distribution:
  sector: 125,543
  thematic_negative: 43,263
  direct: 39,230
  thematic_positive: 4,389

Match count distribution:
match_count
0    15539
1    12895
2     4821
3     1731
4     1954
5     1013
6     1038
7     2033
8     3894
9     3351
Name: count, dtype: int64


In [18]:
# Explode the df - create one row per ticker match
# Articles mentioning multiple companies will have multiple rows

print('Expanding articles with multiple ticker matches...')

news_with_matches = news_df[news_df['match_count'] > 0].copy()
news_no_matches = news_df[news_df['match_count'] == 0].copy()

expanded_rows = []
for idx, row in news_with_matches.iterrows():
    for ticker, match_types in row['matched_tickers_dict'].items():
        expanded_rows.append({
            'date': row['date'],
            'source': row['source'],
            'final_text': row['final_text'],
            'Ticker': ticker,
            'match_type': ','.join(match_types),
            'is_direct': 'direct' in match_types
        })

news_attributed = pd.DataFrame(expanded_rows)

# Add rows for MARKET_GENERAL
news_no_matches['Ticker'] = 'MARKET_GENERAL'
news_no_matches['match_type'] = 'none'
news_no_matches['is_direct'] = False
news_market = news_no_matches[['date', 'source', 'final_text', 'Ticker', 'match_type', 'is_direct']]

news_final = pd.concat([news_attributed, news_market], ignore_index=True)

print('\nExpansion complete:')
print(f'  Original articles: {len(news_df):,}')
print(f'  Attributed rows (after expansion): {len(news_final):,}')
print(f'  Expansion factor: {len(news_final) / len(news_df):.2f}x')
print('\nBreakdown:')
print(f'  Ticker-specific: {(news_final['Ticker'] != 'MARKET_GENERAL').sum():,}')
print(f'    Direct mentions: {news_final['is_direct'].sum():,}')
print(f'    Indirect (sector/thematic): {(~news_final['is_direct'] & (news_final['Ticker'] != 'MARKET_GENERAL')).sum():,}')
print(f'  Market general: {(news_final['Ticker'] == 'MARKET_GENERAL').sum():,}')

Expanding articles with multiple ticker matches...

Expansion complete:
  Original articles: 52,974
  Attributed rows (after expansion): 206,023
  Expansion factor: 3.89x

Breakdown:
  Ticker-specific: 190,484
    Direct mentions: 39,230
    Indirect (sector/thematic): 151,254
  Market general: 15,539


In [20]:
# ticker coverage by match type
ticker_specific = news_final[news_final['Ticker'] != 'MARKET_GENERAL'].copy()

ticker_counts = ticker_specific['Ticker'].value_counts()
print(f'Ticker-specific news coverage:')
print(f' Total tickers with news: {len(ticker_counts)}')
print(f'  Mean articles per ticker: {ticker_counts.mean():.1f}')
print(f'  Median articles per ticker: {ticker_counts.median():.1f}')
print(f'  Min articles per ticker: {ticker_counts.min()}')
print(f'  Max articles per ticker: {ticker_counts.max()}')

print('\nTop 10 most mentioned stocks (all match types):')
print(ticker_counts.head(10))

direct_mentions = ticker_specific[ticker_specific['is_direct']]['Ticker'].value_counts()
print('\nTop 10 most mentioned stocks (direct mentions only):')
print(direct_mentions.head(10))

# indirect mentions (sector/thematic)
indirect_mentions = ticker_specific[~ticker_specific['is_direct']]['Ticker'].value_counts()
print('\nTop 10 stocks most affected by sector/thematic news:')
print(indirect_mentions.head(10))

print('\nMatch type breakdown:')
print(ticker_specific['match_type'].value_counts())

print('\nBottom 10 least mentioned stocks (all types):')
print(ticker_counts.tail(10))

Ticker-specific news coverage:
 Total tickers with news: 60
  Mean articles per ticker: 3174.7
  Median articles per ticker: 2374.5
  Min articles per ticker: 388
  Max articles per ticker: 28315

Top 10 most mentioned stocks (all match types):
Ticker
ON      28315
URI      9214
BA       7756
RCL      7063
NCLH     6969
MGM      6914
WYNN     6913
CCL      6911
KEY      4307
COF      4243
Name: count, dtype: int64

Top 10 most mentioned stocks (direct mentions only):
Ticker
ON      27634
URI      2876
BA       1410
PSA       993
AEP       894
GIS       853
COST      737
KEY       621
DG        600
COF       599
Name: count, dtype: int64

Top 10 stocks most affected by sector/thematic news:
Ticker
RCL     6894
MGM     6894
CCL     6893
WYNN    6882
NCLH    6880
BA      6346
URI     6338
IVZ     3747
PRU     3746
SYF     3746
Name: count, dtype: int64

Match type breakdown:
match_type
sector                                       86303
thematic_negative                            43140
di