# Phase 2: Sentiment Scoring - VADER vs FinBERT Comparison

## Objective

Transform 52,974 raw news headlines into daily, ticker-specific sentiment time series using:
- **VADER** (Lexicon-based pproach)
- **FinBERT** (Transformer-based approach)

This enables a direct comparison of baseline vs state-of-the-art sentiment analysis for financial text.

## Pipeline Overview

```
News (53k articles)
    ↓
1. Entity Resolution (yfinance keyword matching)
    ↓
2. News Attribution (assign to tickers or MARKET_GENERAL)
    ↓
3. VADER Scoring (lexicon-based, CPU)
    ↓
4. FinBERT Scoring (transformer-based, GPU)
    ↓
5. Validation & Comparison
    ↓
6. Daily Aggregation by Ticker
    ↓
Output: sentiment_scores_60.csv + market_sentiment_general.csv
```

---

## 1. Environment & Setup

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

import yfinance as yf

import matplotlib.pyplot as plt
import seaborn as sns

import os
import re
from tqdm import tqdm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f'  GPU detected: {torch.cuda.get_device_name(0)}')
    print(f'  CUDA version: {torch.version.cuda}')
    print(f'  Memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB')
    print(f'  Memory reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB')
else:
    device = torch.device('cpu')

print(f'\nDevice set to: {device}')

  GPU detected: Tesla T4
  CUDA version: 12.8
  Memory allocated: 0.00 MB
  Memory reserved: 0.00 MB

Device set to: cuda


In [4]:
project_root = '/content/drive/MyDrive/market-sentiment-impact-analysis'

data_processed = os.path.join(project_root, 'data', 'processed')
data_tickers = os.path.join(project_root, 'data', 'tickers')

os.makedirs(data_processed, exist_ok=True)

print(f"Project Root: {project_root}")
print(f"Processed Data: {data_processed}")
print(f"Tickers Data: {data_tickers}")

Project Root: /content/drive/MyDrive/market-sentiment-impact-analysis
Processed Data: /content/drive/MyDrive/market-sentiment-impact-analysis/data/processed
Tickers Data: /content/drive/MyDrive/market-sentiment-impact-analysis/data/tickers


---

## 2. Load Input Data

In [5]:
news_path = os.path.join(data_processed, 'daily_news_cleaned.csv')

print(f'Loading news data from: {news_path}')
news_df = pd.read_csv(news_path)

news_df['date'] = pd.to_datetime(news_df['date'])

print(f'\nNews data loaded:')
print(f'  Records: {len(news_df):,}')
print(f'  Date range: {news_df['date'].min().date()} to {news_df['date'].max().date()}')
print(f'  Sources: {news_df['source'].value_counts().to_dict()}')
print(f'\nColumns: {list(news_df.columns)}')
print(f'\nSample:')
print(news_df.head(3))

Loading news data from: /content/drive/MyDrive/market-sentiment-impact-analysis/data/processed/daily_news_cleaned.csv

News data loaded:
  Records: 52,974
  Date range: 2018-01-02 to 2020-07-18
  Sources: {'Reuters': 32673, 'Guardian': 17516, 'CNBC': 2785}

Columns: ['date', 'source', 'final_text']

Sample:
                       date    source                                         final_text
0 2018-01-02 00:00:00+00:00  Guardian  Former advertising executive reveals junk food...
1 2018-01-02 00:00:00+00:00  Guardian  Transport secretary ‘running scared’ as he fli...
2 2018-01-02 00:00:00+00:00  Guardian  Good for factories, bad for shoppers: a Brexit...


In [6]:
all_betas_path = os.path.join(data_tickers, 'all_betas.csv')

print(f'Loading ticker metadata from all_betas.csv...')
all_tickers_df = pd.read_csv(all_betas_path)

stock_returns_path = os.path.join(data_processed, 'stock_returns_60.csv')
stock_returns = pd.read_csv(stock_returns_path)
ticker_list = stock_returns['Ticker'].unique().tolist()

all_tickers_df = all_tickers_df[all_tickers_df['Ticker'].isin(ticker_list)].copy()

print(f'\nTickers loaded:')
print(f'  Total: {len(ticker_list)}')
print(f'  Unique tickers in metadata: {len(all_tickers_df)}')
print(f'\nSample tickers: {ticker_list[:10]}')

Loading ticker metadata from all_betas.csv...

Tickers loaded:
  Total: 60
  Unique tickers in metadata: 60

Sample tickers: ['AEP', 'AMP', 'APA', 'BA', 'C', 'CAG', 'CCL', 'CHD', 'CHRW', 'CL']


---

## 3. Entity Resolution

In [7]:
print('Fetching company names from Yahoo Finance...')

company_names = {}
failed_tickers = []

for ticker in tqdm(ticker_list):
    try:
        stock = yf.Ticker(ticker)
        info = stock.info

        name = info.get('shortName') or info.get('longName') or ticker
        company_names[ticker] = name

    except Exception as e:
        company_names[ticker] = ticker
        failed_tickers.append(ticker)

print(f'\n  Downloaded names for {len(company_names)} tickers')
if failed_tickers:
    print(f"   Failed to fetch: {failed_tickers}")

print(f'\nSample company names:')
for ticker in list(company_names.keys())[:5]:
    print(f'  {ticker}: {company_names[ticker]}')

Fetching company names from Yahoo Finance...


100%|██████████| 60/60 [00:02<00:00, 21.15it/s]



  Downloaded names for 60 tickers

Sample company names:
  AEP: American Electric Power Company
  AMP: Ameriprise Financial, Inc.
  APA: APA Corporation
  BA: Boeing Company (The)
  C: Citigroup, Inc.


In [9]:
def clean_company_name(name):
    if not isinstance(name, str):
        return str(name)

    suffixes = [
        r',?\s+Inc\.?$',
        r',?\s+Corporation$',
        r',?\s+Corp\.?$',
        r',?\s+Company$',
        r',?\s+Co\.?$',
        r',?\s+Ltd\.?$',
        r',?\s+Limited$',
        r',?\s+Plc$',
        r',?\s+PLC$',
        r',?\s+LLC$',
        r',?\s+L\.P\.$',
        r',?\s+LP$',
        r',?\s+Group$',
        r',?\s+Holdings?$',
        r'\s+\(The\)$',
        r',?\s+& Co\.?$'
    ]

    cleaned = name
    for suffix in suffixes:
        cleaned = re.sub(suffix, '', cleaned, flags=re.IGNORECASE)

    return cleaned.strip()

company_names_clean = {ticker: clean_company_name(name)
                        for ticker, name in company_names.items()}

print('Company name cleaning:')
print('\nBefore → After:')
for ticker in list(company_names.keys())[:10]:
    original = company_names[ticker]
    cleaned = company_names_clean[ticker]
    if original != cleaned:
        print(f'  {ticker}: {original} → {cleaned}')
    else:
        print(f'  {ticker}: {original} (no change)')

Company name cleaning:

Before → After:
  AEP: American Electric Power Company → American Electric Power
  AMP: Ameriprise Financial, Inc. → Ameriprise Financial
  APA: APA Corporation → APA
  BA: Boeing Company (The) → Boeing Company
  C: Citigroup, Inc. → Citigroup
  CAG: ConAgra Brands, Inc. → ConAgra Brands
  CCL: Carnival Corporation → Carnival
  CHD: Church & Dwight Company, Inc. → Church & Dwight
  CHRW: C.H. Robinson Worldwide, Inc. → C.H. Robinson Worldwide
  CL: Colgate-Palmolive Company → Colgate-Palmolive


In [10]:
def create_keyword_dict(ticker_list, company_names_clean):
    keywords = {}

    for ticker in ticker_list:
        name = company_names_clean.get(ticker, ticker)

        keyword_list = [ticker, name]

        # Add common abbreviations and variations
        words = name.split()
        if len(words) > 1:
            first_word = words[0]
            if len(first_word) > 3:
                keyword_list.append(first_word)

        manual_mappings = {
            'C': ['Citigroup', 'Citi'],
            'PG': ['P&G', 'Procter', 'Procter & Gamble', 'The Procter & Gamble Company'],
            'WMT': ['Walmart', 'Wal-Mart'],
            'BA': ['Boeing'],
            'VZ': ['Verizon'],
            'COF': ['Capital One'],
            'COST': ['Costco'],
            'DG': ['Dollar General'],
            'AMP': ['Ameriprise'],
            'APA': ['APA Corp'],
            'CVNA': ['Carvana'],
            'CCL': ['Carnival'],
            'RCL': ['Royal Caribbean'],
            'MGM': ['MGM Resorts'],
            'HAL': ['Halliburton'],
            'OXY': ['Occidental'],
            'DVN': ['Devon'],
            'MPC': ['Marathon', 'Marathon Petroleum'],
            'WDC': ['Western Digital'],
            'PRU': ['Prudential', 'Prudential Financial'],
            'LRCX': ['Lam Research'],
            'SYF': ['Synchrony'],
            'WYNN': ['Wynn'],
            'OKE': ['Oneok'],
            'KEY': ['KeyCorp', 'KeyBank'],
            'IVZ': ['Invesco'],
            'FANG': ['Diamondback', 'Diamondback Energy'],
            'TTD': ['Trade Desk'],
            'URI': ['United Rentals'],
            'FCX': ['Freeport', 'Freeport-McMoRan'],
            'GIS': ['General Mills'],
            'HRL': ['Hormel', 'Hormel Foods'],
            'CAG': ['Conagra', 'Conagra Brands'],
            'CHD': ['Church & Dwight'],
            'KMB': ['Kimberly-Clark', 'Kimberly Clark', 'Kimberly-Clark Corp'],
            'CLX': ['Clorox', 'Clorox Co', 'The Clorox Company'],
            'KR': ['Kroger', 'Kroger Co'],
            'CPB': ['Campbell', 'Campbell\'s', 'Campbell Soup'],
            'SJM': ['Smucker', 'J.M. Smucker', 'J.M. Smucker'],
            'DPZ': ['Domino\'s', 'Dominos'],
            'KDP': ['Keurig', 'Keurig Dr Pepper'],
            'ED': ['Con Ed', 'ConEd', 'Consolidated Edison'],
            'AEP': ['American Electric', 'American Electric Power', 'AEP'],
            'CMS': ['CMS Energy'],
            'WEC': ['WEC Energy'],
            'LNT': ['Alliant'],
            'GILD': ['Gilead'],
            'MKC': ['McCormick', 'McCormick & Co'],
            'CL': ['Colgate', 'Colgate-Palmolive'],
            'GEN': ['Gen Digital'],
            'CHRW': ['C.H. Robinson'],
            'PSA': ['Public Storage'],
            'EXR': ['Extra Space', 'Extra Space Storage'],
            'NEM': ['Newmont', 'Newmont Corp'],
            'SW': ['Smurfit', 'Smurfit Westrock'],
            'ON': ['ON Semi', 'ON Semiconductor'],
            'NCLH': ['Norwegian Cruise'],
            'TPR': ['Tapestry'],
            'TRGP': ['Targa', 'Targa Resources']
        }

        if ticker in manual_mappings:
            keyword_list.extend(manual_mappings[ticker])

        keyword_list = list(set([k.strip() for k in keyword_list if k.strip()]))

        keywords[ticker] = keyword_list

    return keywords

# keyword dictionary
keywords = create_keyword_dict(ticker_list, company_names_clean)

print(f'Keyword dictionary created for {len(keywords)} tickers\n')
print('Sample keyword mappings:')
for ticker in list(keywords.keys())[:10]:
    print(f'  {ticker}: {keywords[ticker]}')

Keyword dictionary created for 60 tickers

Sample keyword mappings:
  AEP: ['AEP', 'American', 'American Electric', 'American Electric Power']
  AMP: ['Ameriprise Financial', 'Ameriprise', 'AMP']
  APA: ['APA Corp', 'APA']
  BA: ['Boeing', 'BA', 'Boeing Company']
  C: ['Citi', 'C', 'Citigroup']
  CAG: ['ConAgra', 'Conagra', 'ConAgra Brands', 'CAG', 'Conagra Brands']
  CCL: ['CCL', 'Carnival']
  CHD: ['Church', 'CHD', 'Church & Dwight']
  CHRW: ['C.H.', 'CHRW', 'C.H. Robinson', 'C.H. Robinson Worldwide']
  CL: ['Colgate-Palmolive', 'CL', 'Colgate']
