<a href="https://colab.research.google.com/github/renzungo/Clarin_Covers_Sent_Analysis/blob/sentiment/05_entity_connotation_beto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Mount Google Drive at /content/drive; the work, output and cache paths
# configured in this cell all live under that mount point.
from google.colab import drive; drive.mount('/content/drive', force_remount=True)
# IPython shell escape (valid only inside a notebook): install the runtime
# dependencies quietly. Only transformers is pinned to an exact version.
!pip -q install transformers==4.43.3 torch pandas unidecode tqdm

import os, math, hashlib, time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from unidecode import unidecode
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ---------------- Paths ----------------
# Pipeline inputs (parquet files) are read from WORK_DIR, results are written
# to OUT_DIR, and downloaded model files are cached in CACHE.
WORK_DIR = "/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_pipeline_work"
OUT_DIR = "/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_analytics_out"
CACHE = "/content/drive/MyDrive/hf_cache"

# Create each directory up front; exist_ok makes re-running the cell harmless.
os.makedirs(WORK_DIR, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(CACHE, exist_ok=True)

# Aggregated per-entity sentiment rows (CSV, appended to while streaming).
ENT_OUT = os.path.join(OUT_DIR, "entities_sentiment.csv")
# Progress log: one processed file name per line.
DONE_LOG = os.path.join(OUT_DIR, "entities_done.txt")

# ---------------- Speed knobs (tuned) ----------------
# Sentiment checkpoint: fast Spanish POS/NEU/NEG classifier.
MODEL_ID = "pysentimiento/robertuito-sentiment-analysis"

# Entity selection and windowing.
KEEP_ENTITY_TYPES = {"PER"}     # people only (fastest); add "ORG", "LOC", "GPE" if needed
CAP_WINDOWS_PER_ENTITY = 2      # at most N windows per (file, entity, type)
DEDUP_WINDOWS = True            # classify identical windows within a file only once
CHAR_RADIUS = 100               # characters kept on each side of a mention

# Tokenisation and batching.
MAX_LEN_TOKENS = 128
BATCH = 256

# Pick the device once; on GPU also allow TF32 for float32 matmuls.
if torch.cuda.is_available():
    DEVICE = "cuda"
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.set_float32_matmul_precision("high")
else:
    DEVICE = "cpu"

# ---------------- Load inputs ----------------
# base.parquet columns:      file, date, text
# ner_spans.parquet columns: file, date, entity_text, entity_type, start, end
df_base = pd.read_parquet(os.path.join(WORK_DIR, "base.parquet"))
df_spans = pd.read_parquet(os.path.join(WORK_DIR, "ner_spans.parquet"))
print(f"[DEBUG] df_base shape: {df_base.shape} | df_spans shape: {df_spans.shape}")  # Debug: input sizes

# An empty KEEP_ENTITY_TYPES disables the entity-type filter.
if KEEP_ENTITY_TYPES:
    df_spans = df_spans.loc[df_spans["entity_type"].isin(KEEP_ENTITY_TYPES)]

# file name -> cover text; falsy texts (e.g. None) become the empty string.
text_map = {
    cover_file: (cover_text or "")
    for cover_file, cover_text in zip(df_base["file"], df_base["text"])
}

# Covers that have at least one (kept) span, in first-seen order.
files_with_spans = df_spans["file"].unique().tolist()

# Progress log helpers
# Read log of already processed files
def load_done(log_path):
    """Return the set of file names recorded in the progress log.

    Each non-blank line of ``log_path`` is one entry, with surrounding
    whitespace stripped. A missing log yields an empty set.
    """
    if os.path.exists(log_path):
        with open(log_path, "r", encoding="utf-8") as log_file:
            stripped_lines = (raw.strip() for raw in log_file)
            return {entry for entry in stripped_lines if entry}
    return set()
# Append a filename to the processed log
def append_done(log_path, fname):
    """Append ``fname`` as a new line at the end of the progress log.

    The log is opened in append mode, so it is created if it does not exist.
    """
    with open(log_path, mode="a", encoding="utf-8") as log_file:
        log_file.write(fname + "\n")

# Resume support: skip every cover already listed in the progress log.
done = load_done(DONE_LOG)
todo = [cover for cover in files_with_spans if cover not in done]
print(f"Total covers with entities: {len(files_with_spans)} | Already done: {len(done)} | To do: {len(todo)}")

# Create the output CSV with only a header row on the first run; later
# writes append data rows without a header.
if not os.path.exists(ENT_OUT):
    pd.DataFrame(
        columns=[
            "file",
            "date",
            "entity_norm",
            "entity_type",
            "mentions",
            "avg_score",
            "pos_share",
            "neu_share",
            "neg_share",
        ]
    ).to_csv(ENT_OUT, index=False, encoding="utf-8")

# ---------------- Load model (direct, no pipeline) ----------------
# On GPU the weights are additionally loaded in float16.
load_kwargs = (
    {"cache_dir": CACHE, "torch_dtype": torch.float16}
    if DEVICE == "cuda"
    else {"cache_dir": CACHE}
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, **load_kwargs)
model = model.to(DEVICE)
model = model.eval()
# Class index -> label string, taken from the model config.
id2label = model.config.id2label

# Efficient batched sentiment prediction without gradient tracking
@torch.no_grad()
def predict(texts):
    """Classify ``texts`` and return the top label and its probability.

    The texts are tokenised and run through the model in slices of ``BATCH``
    items, truncated to ``MAX_LEN_TOKENS`` tokens.

    Args:
        texts: list of strings to classify.

    Returns:
        ``(labels, scores)``: two lists aligned with ``texts``. Each label is
        looked up in ``id2label``; each score is the softmax probability of
        that label. Both lists are empty when ``texts`` is empty.
    """
    labels, scores = [], []
    for offset in range(0, len(texts), BATCH):
        chunk = texts[offset:offset + BATCH]
        encoded = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=MAX_LEN_TOKENS,
            return_tensors="pt",
        )
        encoded = {
            name: tensor.to(DEVICE, non_blocking=True)
            for name, tensor in encoded.items()
        }
        # Autocast is only enabled when running on CUDA.
        with torch.autocast(device_type="cuda", enabled=(DEVICE == "cuda"), dtype=torch.float16):
            logits = model(**encoded).logits
        # Turn the logits into class probabilities and keep the winning class.
        probs = torch.softmax(logits, dim=-1)
        top_idx = torch.argmax(probs, dim=-1)
        top_prob = probs.gather(1, top_idx.unsqueeze(1)).squeeze(1)
        labels += [id2label[int(class_id)] for class_id in top_idx.tolist()]
        scores += top_prob.tolist()
    return labels, scores

# Build sentiment windows around entities for a single cover
def process_one_file(fname: str, debug=False):
    """Score the sentiment around every entity mention of one cover.

    For each span of ``fname`` in ``df_spans``, a window of ``CHAR_RADIUS``
    characters on each side of the mention is cut from the cover text. The
    windows are classified with ``predict`` and the predictions are aggregated
    per (file, date, entity_norm, entity_type).

    Args:
        fname: value of the ``file`` column identifying the cover.
        debug: when true, print a sample of the windows and of the aggregate.

    Returns:
        A DataFrame with columns file, date, entity_norm, entity_type,
        mentions, avg_score, pos_share, neu_share, neg_share, or ``None`` when
        the file has no spans or none of its windows contains text.
        ``mentions`` counts the windows left after the per-entity cap, and
        ``avg_score`` is the mean probability of the predicted label.
    """
    # Spans for this file (read-only, so no defensive copy is needed).
    spans = df_spans[df_spans["file"] == fname]
    if spans.empty:
        return None
    date = spans["date"].iloc[0]
    text = text_map.get(fname, "")

    # Build one text window per mention.
    rows = []
    for r in spans.itertuples(index=False):
        lo = max(0, int(r.start) - CHAR_RADIUS)
        hi = min(len(text), int(r.end) + CHAR_RADIUS)
        win = text[lo:hi].strip()
        if not win:
            continue
        rows.append({"file": fname,
                     "date": date,
                     "entity_norm": unidecode(r.entity_text).lower().strip(),
                     "entity_type": r.entity_type,
                     "window": win})
    if not rows:
        return None
    df = pd.DataFrame(rows)
    if debug:
        print(f"[DEBUG] {fname} windows sample: {df.head()}")  # Debug: show sample windows

    # Cap windows per entity for this file.
    if CAP_WINDOWS_PER_ENTITY is not None:
        # groupby().head() returns a subset of ``df``; take an explicit copy so
        # the column assignments below do not raise SettingWithCopyWarning.
        df = (df.groupby(["entity_norm", "entity_type"], as_index=False, sort=False)
                .head(CAP_WINDOWS_PER_ENTITY)
                .copy())

    # Key every window by a hash; with DEDUP_WINDOWS identical windows are
    # classified only once and the prediction is shared.
    if DEDUP_WINDOWS:
        df["win_hash"] = df["window"].map(lambda t: hashlib.sha1(t.encode("utf-8")).hexdigest())
        uniq = df.drop_duplicates("win_hash", keep="first")
    else:
        df["win_hash"] = pd.util.hash_pandas_object(df["window"], index=False).astype(str)
        uniq = df

    # Predict once per selected window.
    lbls, scs = predict(uniq["window"].tolist())
    hashes = uniq["win_hash"].tolist()
    label_by_hash = dict(zip(hashes, lbls))
    score_by_hash = dict(zip(hashes, scs))

    # Fan the predictions back out to all windows.
    df["label"] = df["win_hash"].map(label_by_hash)
    df["score"] = df["win_hash"].map(score_by_hash)

    # 0/1 indicator columns: their group mean is the share of each label.
    df["is_pos"] = df["label"].eq("POS").astype(float)
    df["is_neu"] = df["label"].eq("NEU").astype(float)
    df["is_neg"] = df["label"].eq("NEG").astype(float)

    # Aggregate to (file, date, entity_norm, entity_type). dropna=False keeps
    # covers whose date is missing; with the default they would be dropped
    # from the result without any error.
    agg = (df.groupby(["file", "date", "entity_norm", "entity_type"],
                      as_index=False, dropna=False)
             .agg(mentions=("label", "count"),
                  avg_score=("score", "mean"),
                  pos_share=("is_pos", "mean"),
                  neu_share=("is_neu", "mean"),
                  neg_share=("is_neg", "mean")))
    if debug:
        print(f"[DEBUG] Aggregated sentiment: {agg.head()}")  # Debug: sample aggregated
    return agg

# ---------------- Stream: process file-by-file and append ----------------
# Each cover is processed on its own. Its rows are appended to the CSV first
# and only then is the cover recorded in the progress log.
start = time.time()
for idx, fname in enumerate(tqdm(todo, desc="Streaming entities per cover")):
    # Verbose output is only produced for the first cover of the run.
    agg = process_one_file(fname, debug=(idx == 0))
    if idx == 0:
        print(f"[DEBUG] First file processed: {fname}")  # Debug: shows file name
    if not (agg is None or agg.empty):
        # The header already exists in the file, so append data rows only.
        agg.to_csv(ENT_OUT, mode="a", index=False, header=False, encoding="utf-8")
    # Covers that produced no rows are logged as done too.
    append_done(DONE_LOG, fname)

elapsed = time.time() - start
print(f"Done streaming. Appended {len(todo)} file(s). Elapsed: {elapsed/60:.1f} min")
if DEVICE == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))


Mounted at /content/drive
Total covers with entities: 652 | Already done: 0 | To do: 652


Streaming entities per cover:   0%|          | 0/652 [00:00<?, ?it/s]

Done streaming. Appended 652 file(s). Elapsed: 38.7 min
