In [1]:
import os
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm   # ✅ text-only progress bar
from IPython.display import display

# ---- Environment + lexicon ----
# Set defensively so tokenizer-parallelism warnings (if any) stay out of the output.
os.environ.update(TOKENIZERS_PARALLELISM="false")
nltk.download("vader_lexicon")  # lexicon must be present before the analyzer is built

# ---- Headlines ----
df = pd.read_csv("sp500_headlines.csv")

# ---- VADER scorer shared by get_sentiment() ----
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Classify one headline as "positive", "negative" or "neutral" with VADER.

    The label comes from the compound score of the module-level ``analyzer``:
    >= 0.05 is positive, <= -0.05 is negative, anything in between is neutral.

    Missing values (``None`` or float NaN, which is what pandas yields for an
    empty CSV field) and blank strings are reported as "neutral" without
    calling the analyzer. Any other non-string value is converted with
    ``str()`` first, so a stray number in the column cannot crash the apply.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return "neutral"
    if not isinstance(text, str):
        text = str(text)
    if not text.strip():
        return "neutral"

    score = analyzer.polarity_scores(text)["compound"]
    if score >= 0.05:
        return "positive"
    elif score <= -0.05:
        return "negative"
    else:
        return "neutral"

# ---- Score every headline (plain-text progress bar) ----
tqdm.pandas(desc="Analyzing headlines")  # registers Series.progress_apply
df["sentiment"] = df["title"].progress_apply(get_sentiment)

# ---- Share of each label, in percent ----
distribution = df["sentiment"].value_counts(normalize=True).mul(100)
print("\nSentiment distribution (%):")
display(distribution.round(2))  # rich rendering in Jupyter

# ---- First five headlines of each label ----
for sentiment in ("positive", "neutral", "negative"):
    print(f"\n{sentiment.upper()} EXAMPLES:")
    ex = df.loc[df["sentiment"] == sentiment, "title"].head(5)
    if not ex.empty:
        for headline in ex:
            print(" -", headline)
    else:
        print(" (none found)")

# ---- Persist the labelled frame ----
df.to_csv("sp500_headlines_with_vader.csv", index=False)


[nltk_data] Downloading package vader_lexicon to /Users/x/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Analyzing headlines: 100%|██████████| 1460433/1460433 [01:24<00:00, 17367.69it/s]



Sentiment distribution (%):


neutral     50.19
positive    30.73
negative    19.08
Name: sentiment, dtype: float64


POSITIVE EXAMPLES:
 - Cumbria - Entertainment - Tom Hingley "Un-guarded" - BBC
 - Gift idea : pre-packaged, bioengineered pets - Core77
 - Another day, another exotic supercar: the Edran Enigma - Motor Authority
 - Survivor: Cook Islands - Box Office Prophets
 - Healdsburg restaurant has a new chef but its claim to fame remains the same - The Press Democrat

NEUTRAL EXAMPLES:
 - Interview: Margaret Morrison, Founder And Director of Cybercandy - Londonist
 - Leicester - Features - Return of the Blaby Tomatoes - BBC
 - Paul Smith At Borough Market - Londonist
 - Marcie Shatula Interview- Mmm Blueberries - Pinkbike
 - Interstate '82 - Eurogamer

NEGATIVE EXAMPLES:
 - Computer feels your rage - Australian Broadcasting Corporation
 - Mmm…. Teaser - empireonline.com
 - "Flushed Away": DreamWorks' $100 Million Dump - TMZ
 - Taking Violence to a New, Technological Absurdity (Published 2007) - The New York Times
 - Mavrodi Convicted of Fraud in MMM Trial - The Moscow Times


In [2]:
import os
import sys
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm import tqdm  # ✅ text-only progress bar (no widgets)

# Keep notebook output tidy
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ---- Speed/Device setup ----
def pick_device_for_pipeline():
    """
    Choose the ``device`` value handed to transformers.pipeline.

    Preference order:
      1. torch.device('mps') when torch reports the MPS backend as available
      2. 0 (first CUDA GPU) when torch reports CUDA as available
      3. -1 (CPU) otherwise
    """
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return 0 if torch.cuda.is_available() else -1

device = pick_device_for_pipeline()

# ---- Load data ----
df = pd.read_csv("sp500_headlines.csv")

# ---- Model: binary SST-2 DistilBERT ----
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

pipeline_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    framework="pt",
    device=device,  # MPS/CUDA when available, otherwise CPU
)
nlp = pipeline("sentiment-analysis", **pipeline_kwargs)

# ---- Run inference with text-only progress ----
texts = df["title"].astype(str).tolist()
batch_size = 128
results = []

# Tokenizer options shared by every call; headlines are short, so 48 tokens keeps it fast.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=48)

# ASCII tqdm bar written to stdout so it shows up as plain text in the cell output.
batch_starts = tqdm(
    range(0, len(texts), batch_size),
    desc="Processing headlines",
    ascii=True,
    dynamic_ncols=True,
    leave=True,
    file=sys.stdout,
)
for start_idx in batch_starts:
    chunk = texts[start_idx:start_idx + batch_size]
    results += nlp(chunk, batch_size=batch_size, **tokenize_kwargs)

# ---- Map to 3 classes (add neutral band around 0.5) ----
def map_to_trinary(label, score, neutral_band=0.06):
    """Turn a binary (label, score) prediction into one of three classes.

    Scores within ``neutral_band`` of 0.5 are reported as "neutral". Otherwise
    the result is "positive" when ``label`` equals "POSITIVE" (case-insensitive)
    and "negative" for every other label.
    """
    near_midpoint = abs(score - 0.5) <= neutral_band
    if near_midpoint:
        return "neutral"
    if label.upper() == "POSITIVE":
        return "positive"
    return "negative"

df["sentiment"] = [map_to_trinary(pred["label"], pred["score"]) for pred in results]

# ---- Distribution (fixed class order; absent classes show as 0) ----
class_order = ["positive", "neutral", "negative"]
shares = df["sentiment"].value_counts(normalize=True) * 100
dist = shares.reindex(class_order).fillna(0).round(2)
print("\nSentiment distribution (%):")
print(dist.to_string())

# ---- Examples: first five headlines per class ----
for cls in class_order:
    print(f"\n{cls.upper()} EXAMPLES:")
    sample = df.loc[df["sentiment"] == cls, "title"].head(5)
    if sample.empty:
        print(" (none found)")
        continue
    for headline in sample:
        print(" -", headline)

# ---- Optional: save results ----
df.to_csv("headlines_with_sentiment_DistilBERT.csv", index=False)


Device set to use mps


Processing headlines: 100%|##########| 11410/11410 [2:21:28<00:00,  1.34it/s]    

Sentiment distribution (%):
positive    46.28
neutral      2.06
negative    51.67

POSITIVE EXAMPLES:
 - Interview: Margaret Morrison, Founder And Director of Cybercandy - Londonist
 - Paul Smith At Borough Market - Londonist
 - Mmm…. Teaser - empireonline.com
 - Another day, another exotic supercar: the Edran Enigma - Motor Authority
 - Interstate '82 - Eurogamer

NEUTRAL EXAMPLES:
 - Two and a Half Men Recap: "Mmm, fish. Yum." - TV Fanatic
 - Maison Martin Margiela 20 - Dazed
 - Miley Cyrus: Pink Polka Dot Bikini! - Just Jared Jr
 - It's Food Truck Heaven Every Wednesday Night in Cerritos - Patch
 - EXCLUSIVE! Chris Evans Naked! We Repeat: CHRIS EVANS NAKED! - Perez Hilton

NEGATIVE EXAMPLES:
 - Cumbria - Entertainment - Tom Hingley "Un-guarded" - BBC
 - Leicester - Features - Return of the Blaby Tomatoes - BBC
 - Computer feels your rage - Australian Broadcasting Corporation
 - Marcie Shatula Intervie

In [3]:
# BERTweet 3-class sentiment with calibrated neutral (no widgets)
import os, sys, time
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# ---- Settings ----
# Input CSV; the rest of the cell reads its "title" column.
CSV_PATH   = "sp500_headlines.csv"
MODEL_NAME = "finiteautomata/bertweet-base-sentiment-analysis"  # 3-class
# Optional cap on the number of rows scored; None scores every row.
MAX_ROWS   = None     # e.g., 1000
# Headlines per pipeline call (also passed through as the pipeline's batch_size).
BATCH_SIZE = 128
# Passed to the pipeline as max_length, together with truncation=True.
MAX_LEN    = 48
# The progress line is refreshed every PRINT_EVERY batches.
PRINT_EVERY= 1

# --- Neutral calibration (tune these) ---
# All five values are read by decide_label(); the "was" notes record the previous setting.
#   NEU_MIN     - minimum neutral probability required to return "neutral".
#   NEU_MARGIN  - neutral must also beat max(positive, negative) by at least this much.
#   SIDE_MIN    - if positive or negative reaches this probability, the larger side wins.
#   ALPHA       - multiple of the neutral probability subtracted from both side scores.
#   VADER_DELTA - |VADER compound| needed for the tie-breaker to pick a side.
NEU_MIN    = 0.78    # was 0.70
NEU_MARGIN = 0.25    # was 0.20
SIDE_MIN   = 0.45    # was 0.40
ALPHA      = 0.60    # was 0.50
VADER_DELTA = 0.08   # was 0.06

# Tie-breaker
# When True, VADER is loaded further down and consulted by decide_label() on near-ties.
USE_VADER_TIEBREAKER = True

# Tidy logs
# setdefault leaves an already-set TOKENIZERS_PARALLELISM value untouched.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

def pick_device_for_pipeline():
    """Return the pipeline device: torch.device('mps') if available, else 0 for CUDA, else -1."""
    has_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    if has_mps:
        return torch.device("mps")
    if torch.cuda.is_available():
        return 0
    return -1

device = pick_device_for_pipeline()

# --- Load data ---
# nrows=None reads the whole file; an integer stops parsing after that many rows.
# Limiting at read time avoids parsing rows that would be thrown away, and yields
# a standalone frame, so adding the "sentiment" column later does not write into
# a slice of a larger frame (which can trigger SettingWithCopyWarning).
df = pd.read_csv(CSV_PATH, nrows=MAX_ROWS)
texts = df["title"].astype(str).tolist()  # astype(str) turns missing titles into "nan"
total = len(texts)

# --- Load model (force safetensors to avoid torch>=2.6 requirement) ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, use_safetensors=True)

# Optional CPU speed-up: int8 dynamic quantization of the Linear layers.
# Only applied when no accelerator was picked (device == -1).
if device == -1:
    model.eval()
    model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, framework="pt", device=device)

# Optional VADER tie-breaker: imported lazily so nltk is only needed when enabled.
# `vader` is defined only in this branch; decide_label() checks the same flag before using it.
if USE_VADER_TIEBREAKER:
    import nltk
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    nltk.download("vader_lexicon", quiet=True)
    vader = SentimentIntensityAnalyzer()

# --- Helpers ---
def print_progress(done, total, start_time):
    """Overwrite the current stdout line with a progress readout.

    Shows ``done/total``, the percentage complete (100.0 when ``total`` is 0)
    and the average rate since ``start_time`` (a ``time.time()`` timestamp).
    The line starts with a carriage return and has no trailing newline.
    """
    if total:
        pct = done / total * 100
    else:
        pct = 100.0
    elapsed = time.time() - start_time
    rate = done / elapsed if elapsed > 0 else 0
    line = f"\rScoring headlines: {done}/{total} ({pct:5.1f}%)  {rate:6.1f} it/s"
    sys.stdout.write(line)
    sys.stdout.flush()

def to_prob_dict(items):
    """Convert one prediction's per-class score list into a probability dict.

    ``items`` is a list of ``{'label': ..., 'score': ...}`` dicts. Labels are
    matched case-insensitively against LABEL_0/NEGATIVE/NEG, LABEL_1/NEUTRAL/NEU
    and LABEL_2/POSITIVE/POS. The result always contains "positive", "neutral"
    and "negative" (0.0 when absent); an unrecognised label is stored under
    its lower-cased name.
    """
    aliases = {
        "negative": ("LABEL_0", "NEGATIVE", "NEG"),
        "neutral": ("LABEL_1", "NEUTRAL", "NEU"),
        "positive": ("LABEL_2", "POSITIVE", "POS"),
    }
    canonical = {alias: name for name, group in aliases.items() for alias in group}

    probs = dict.fromkeys(("positive", "neutral", "negative"), 0.0)
    for entry in items:
        raw_label = entry["label"]
        key = canonical.get(raw_label.upper())
        if key is None:
            key = raw_label.lower()
        probs[key] = float(entry["score"])
    return probs

def decide_label(p, text=None, tie_window=0.06):
    """Pick "positive", "neutral" or "negative" from class probabilities.

    Parameters
    ----------
    p : dict
        Probabilities under the keys "positive", "neutral" and "negative".
    text : str, optional
        The headline itself; only needed for the VADER tie-breaker.
    tie_window : float, default 0.06
        The two side scores count as tied when they differ by less than this.
        Previously a hard-coded 0.06 inside the function.

    Decision steps
    --------------
    1. "neutral" if the neutral probability is at least NEU_MIN and exceeds
       the stronger side by at least NEU_MARGIN.
    2. Otherwise, if the stronger side reaches SIDE_MIN, that side wins
       (positive on an exact tie).
    3. Otherwise compare the ALPHA-penalised side scores. If they are within
       ``tie_window``, the tie-breaker is enabled and ``text`` is given, VADER
       decides when its compound score reaches +/-VADER_DELTA. Failing that,
       the larger side score wins (positive on a tie).

    Reads the module-level NEU_MIN, NEU_MARGIN, SIDE_MIN, ALPHA, VADER_DELTA,
    USE_VADER_TIEBREAKER and, when the tie-breaker is enabled, ``vader``.
    """
    p_pos, p_neu, p_neg = p["positive"], p["neutral"], p["negative"]
    best_non_neu = max(p_pos, p_neg)

    # 1) Strict neutral gate
    if (p_neu >= NEU_MIN) and ((p_neu - best_non_neu) >= NEU_MARGIN):
        return "neutral"

    # 2) A side with decent probability wins outright
    if best_non_neu >= SIDE_MIN:
        return "positive" if p_pos >= p_neg else "negative"

    # 3) Apply neutral penalty and pick side by adjusted scores.
    # NOTE(review): the same ALPHA * p_neu term is subtracted from both sides,
    # so it cancels in every comparison below; ALPHA currently has no effect on
    # the result and this step can never return "neutral". Confirm whether a
    # comparison against the neutral probability was intended.
    s_pos = p_pos - ALPHA * p_neu
    s_neg = p_neg - ALPHA * p_neu
    if abs(s_pos - s_neg) < tie_window and USE_VADER_TIEBREAKER and text is not None:
        compound = vader.polarity_scores(text)["compound"]
        if compound >= VADER_DELTA:
            return "positive"
        if compound <= -VADER_DELTA:
            return "negative"
    return "positive" if s_pos >= s_neg else "negative"

# --- Inference (text-only progress) ---
# Headlines are scored in slices of BATCH_SIZE; each prediction is reduced to a
# single label by decide_label() and appended in input order.
results = []
start = time.time()
done = 0

for i in range(0, total, BATCH_SIZE):
    batch = texts[i:i+BATCH_SIZE]
    scored = nlp(
        batch,
        batch_size=BATCH_SIZE,
        truncation=True,
        padding=True,
        max_length=MAX_LEN,
        # top_k=None returns every class score for each input; it replaces the
        # deprecated return_all_scores=True. to_prob_dict() does not depend on
        # the order of the returned scores.
        top_k=None,
    )
    for item, text in zip(scored, batch):
        p = to_prob_dict(item)
        results.append(decide_label(p, text))
    done += len(batch)
    # Refresh every PRINT_EVERY batches, and always after the last batch so the
    # final line shows the completed count even when PRINT_EVERY > 1.
    if ((i // BATCH_SIZE) % PRINT_EVERY) == 0 or done >= total:
        print_progress(done, total, start)

sys.stdout.write("\n")

# --- Attach labels ---
df["sentiment"] = results

# --- Class shares in percent, fixed order, missing classes shown as 0 ---
label_order = ["positive", "neutral", "negative"]
pct_by_label = df["sentiment"].value_counts(normalize=True) * 100
dist = pct_by_label.reindex(label_order).fillna(0).round(2)

print("\nSentiment distribution (%):")
print(dist.to_string())

# --- First five headlines for each label ---
for label in label_order:
    print(f"\n{label.upper()} EXAMPLES:")
    picks = df.loc[df["sentiment"] == label, "title"].head(5)
    if not picks.empty:
        for headline in picks:
            print(" -", headline)
    else:
        print(" (none found)")

# --- Save ---
out_path = "sp500_headlines_with_bertweet_calibrated.csv"
df.to_csv(out_path, index=False)
print(f"\nSaved results to {out_path}")


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Device set to use mps


Scoring headlines: 1460433/1460433 (100.0%)    83.2 it/s

Sentiment distribution (%):
positive    22.38
neutral     63.80
negative    13.82

POSITIVE EXAMPLES:
 - Gift idea : pre-packaged, bioengineered pets - Core77
 - Another day, another exotic supercar: the Edran Enigma - Motor Authority
 - Lathrop's 'CHAKA' has zest for life and marinades - The Stockton Record
 - MMC Welcomes 14 New Faculty Members • News & Events - Marymount Manhattan College
 - Mmm, mobile: ‘Simpsons’ goes cellular - NBC News

NEUTRAL EXAMPLES:
 - Cumbria - Entertainment - Tom Hingley "Un-guarded" - BBC
 - Interview: Margaret Morrison, Founder And Director of Cybercandy - Londonist
 - Leicester - Features - Return of the Blaby Tomatoes - BBC
 - Paul Smith At Borough Market - Londonist
 - Computer feels your rage - Australian Broadcasting Corporation

NEGATIVE EXAMPLES:
 - "Flushed Away": DreamWorks' $100 Million Dump - TMZ
 - Mmm, sweaty! Women aroused by male scent - NBC News
 - Taking Violence to a New, Techno

In [4]:
import os
import sys
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm import tqdm  # text-only progress bar

# Keep notebook output tidy
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# ---- Device pick (MPS > CUDA > CPU) ----
def pick_device_for_pipeline():
    """Pick the pipeline device, preferring MPS, then CUDA device 0, then CPU (-1)."""
    mps_ready = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    if not mps_ready:
        return 0 if torch.cuda.is_available() else -1
    return torch.device("mps")

# ---- Config ----
CSV_PATH = "sp500_headlines.csv"
PRIMARY_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"   # 3-class
FALLBACK_MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"     # 3-class, safetensors present

# ---- Device ----
device = pick_device_for_pipeline()

# ---- Load data ----
df = pd.read_csv(CSV_PATH)
title_strings = df["title"].astype(str)  # every title as text before it reaches the tokenizer
texts = title_strings.tolist()

# ---- Load model (force safetensors to avoid torch.load restriction) ----
def load_model_and_tokenizer(model_name: str):
    """Load the fast tokenizer and the sequence-classification model for ``model_name``.

    The model is requested with ``use_safetensors=True`` so the weights are not
    read through the ``torch.load`` path. Returns ``(tokenizer, model)``.
    """
    return (
        AutoTokenizer.from_pretrained(model_name, use_fast=True),
        AutoModelForSequenceClassification.from_pretrained(
            model_name,
            use_safetensors=True,
        ),
    )

# Try the primary checkpoint first; on any load failure, report it and use the fallback.
try:
    tokenizer, model = load_model_and_tokenizer(PRIMARY_MODEL)
except Exception as err:
    print(f"[INFO] Could not load '{PRIMARY_MODEL}' with safetensors: {err}")
    print(f"[INFO] Falling back to '{FALLBACK_MODEL}' ...")
    tokenizer, model = load_model_and_tokenizer(FALLBACK_MODEL)

pipeline_options = {
    "model": model,
    "tokenizer": tokenizer,
    "framework": "pt",
    "device": device,  # MPS/CUDA if available, else CPU
}
nlp = pipeline("sentiment-analysis", **pipeline_options)

# ---- Inference with text-only progress ----
batch_size = 128
results = []
progress_bar = tqdm(
    range(0, len(texts), batch_size),
    desc="Processing headlines",
    ascii=True,
    dynamic_ncols=True,
    leave=True,
    file=sys.stdout,   # keep the bar in the cell/terminal output
)
for offset in progress_bar:
    chunk = texts[offset:offset + batch_size]
    chunk_preds = nlp(
        chunk,
        batch_size=batch_size,
        truncation=True,
        padding=True,
        max_length=48,  # headlines are short; keeps it fast
    )
    results.extend(chunk_preds)

# ---- Normalise labels to 'negative'/'neutral'/'positive' ----
# LABEL_0/1/2 are translated via label_map; any other label string is passed
# through unchanged. Either way the result is lower-cased.
label_map = {"LABEL_0": "negative", "LABEL_1": "neutral", "LABEL_2": "positive"}
raw_labels = (r.get("label", "") for r in results)
df["sentiment"] = [label_map.get(raw, raw).lower() for raw in raw_labels]

# ---- Distribution ----
ordered_classes = ["positive", "neutral", "negative"]
class_pct = df["sentiment"].value_counts(normalize=True) * 100
dist = class_pct.reindex(ordered_classes).fillna(0).round(2)

print("\nSentiment distribution (%):")
print(dist.to_string())

# ---- Show 5 examples per class ----
for cls in ordered_classes:
    print(f"\n{cls.upper()} EXAMPLES:")
    first_five = df.loc[df["sentiment"] == cls, "title"].head(5)
    if first_five.empty:
        print(" (none found)")
        continue
    for headline in first_five:
        print(" -", headline)

# ---- Save results ----
out_path = "sp500_headlines_with_cardiffnlp.csv"
df.to_csv(out_path, index=False)
print(f"\nSaved results to {out_path}")


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps


Processing headlines:  19%|#8        | 2134/11410 [36:02<2:36:41,  1.01s/it] 



KeyboardInterrupt



In [None]:
# import os
# import sys
# import pandas as pd
# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# from tqdm import tqdm  # text-only progress bar

# # --- Clean output in notebooks/terminal ---
# os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# # === Choose your 3-class model (pick one) ===
# # A) CardiffNLP RoBERTa (3-class, general short text)
# MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# # B) BERTweet (often a bit “spicier” than RoBERTa on headlines)
# # MODEL_NAME = "finiteautomata/bertweet-base-sentiment-analysis"
# # C) FinBERT-tone (finance; 3-class; usually faster than Prosus finbert)
# # MODEL_NAME = "yiyanghkust/finbert-tone"

# CSV_PATH = "sp500_headlines.csv"

# # --- Device selection for the pipeline (MPS > CUDA > CPU) ---
# def pick_device_for_pipeline():
#     if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
#         return torch.device("mps")
#     elif torch.cuda.is_available():
#         return 0
#     else:
#         return -1

# device = pick_device_for_pipeline()

# # --- Load data ---
# # df = pd.read_csv(CSV_PATH)
# df = pd.read_csv(CSV_PATH).head(1000)

# texts = df["title"].astype(str).tolist()

# # --- Load tokenizer & model (force safetensors to avoid torch.load restriction) ---
# def load_model(model_name: str):
#     tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
#     mdl = AutoModelForSequenceClassification.from_pretrained(
#         model_name,
#         use_safetensors=True,   # <- avoids torch.load CVE restriction
#     )
#     return tok, mdl

# try:
#     tokenizer, model = load_model(MODEL_NAME)
# except Exception as e:
#     # Fallback to a multilingual 3-class model with safetensors if needed
#     print(f"[INFO] Could not load '{MODEL_NAME}' with safetensors: {e}")
#     fallback = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
#     print(f"[INFO] Falling back to '{fallback}' ...")
#     tokenizer, model = load_model(fallback)

# # --- Build pipeline ---
# nlp = pipeline(
#     "sentiment-analysis",
#     model=model,
#     tokenizer=tokenizer,
#     framework="pt",
#     device=device,
# )

# # --- Neutral calibration parameters (tune these) ---
# NEU_MIN = 0.55    # require at least this much neutral prob to consider neutral
# NEU_MARGIN = 0.12 # neutral must exceed max(pos,neg) by this margin

# # Helper to convert pipeline output (list of dicts with LABEL_X) to probs dict
# def to_prob_dict(items):
#     # items like: [{'label':'LABEL_0','score':0.12},{'label':'LABEL_1','score':0.70},{'label':'LABEL_2','score':0.18}]
#     mapping = {"LABEL_0": "negative", "LABEL_1": "neutral", "LABEL_2": "positive",
#                "NEGATIVE": "negative", "NEUTRAL": "neutral", "POSITIVE": "positive"}
#     out = {}
#     for d in items:
#         out[mapping.get(d["label"].upper(), d["label"].lower())] = float(d["score"])
#     # Ensure all keys exist
#     for k in ("positive", "neutral", "negative"):
#         out.setdefault(k, 0.0)
#     return out

# def decide_label(p):
#     # p is dict: {'positive': p_pos, 'neutral': p_neu, 'negative': p_neg}
#     p_pos, p_neu, p_neg = p["positive"], p["neutral"], p["negative"]
#     best_non_neu = max(p_pos, p_neg)

#     # Only assign neutral if it clears both a minimum AND a margin over pos/neg
#     if (p_neu >= NEU_MIN) and ((p_neu - best_non_neu) >= NEU_MARGIN):
#         return "neutral"
#     # Otherwise go with the non-neutral argmax
#     return "positive" if p_pos >= p_neg else "negative"

# # --- Inference with text-only progress (and return_all_scores for calibration) ---
# batch_size = 128
# results = []
# for i in tqdm(
#     range(0, len(texts), batch_size),
#     desc="Scoring headlines",
#     ascii=True,
#     dynamic_ncols=True,
#     leave=True,
#     file=sys.stdout,
# ):
#     batch = texts[i:i+batch_size]
#     # Ask pipeline for all class scores to apply our rule
#     # Note: return_all_scores=True makes pipeline return a list per input
#     scored = nlp(
#         batch,
#         batch_size=batch_size,
#         truncation=True,
#         padding=True,
#         max_length=48,
#         return_all_scores=True,
#     )
#     # Convert each item's LABEL_X list to probabilities dict and decide label
#     for item in scored:
#         p = to_prob_dict(item)
#         results.append(decide_label(p))

# df["sentiment"] = results

# # --- Distribution ---
# dist = (df["sentiment"].value_counts(normalize=True) * 100).reindex(
#     ["positive", "neutral", "negative"]
# ).fillna(0).round(2)

# print("\nSentiment distribution (%):")
# print(dist.to_string())

# # --- Show 5 examples per class ---
# for s in ["positive", "neutral", "negative"]:
#     print(f"\n{s.upper()} EXAMPLES:")
#     ex = df.loc[df["sentiment"] == s, "title"].head(5)
#     if ex.empty:
#         print(" (none found)")
#     else:
#         for t in ex:
#             print(" -", t)

# # --- Save ---
# out_path = "sp500_headlines_with_calibrated_sentiment.csv"
# df.to_csv(out_path, index=False)
# print(f"\nSaved results to {out_path}")
