<a href="https://colab.research.google.com/github/renzungo/Clarin_Covers_Sent_Analysis/blob/main/News_Cover_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =============================================================
# News Cover Analytics (Spanish)
# From OCR .txt files -> dashboard-ready CSVs
# - NER (people/orgs/locations)
# - Topic tagging (zero-shot, Spanish labels)
# - Sentiment (overall + around entities) with BETO (offline)
# - Top positive/negative words (lexicon-based)
# - Economy vs Government coverage share + sentiment
# - n-grams (bigrams/trigrams)
# - Timeseries aggregations
# =============================================================

!pip uninstall -y spacy transformers torch pandas numpy scikit-learn unidecode tqdm rapidfuzz torchvision torchaudio
!pip -q install spacy==3.7.5 transformers==4.43.3 pandas==2.2.2 numpy==1.26.4 scikit-learn unidecode tqdm rapidfuzz
!pip -q install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118

# Download Spanish spaCy model (NER)
import sys, os, re, json, math, ast
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from pathlib import Path

import spacy, subprocess, pkg_resources
try:
    spacy.load("es_core_news_md")
except OSError:
    subprocess.run([sys.executable, "-m", "spacy", "download", "es_core_news_md"], check=True)

import spacy
nlp = spacy.load("es_core_news_md")

from unidecode import unidecode
from collections import Counter, defaultdict

from transformers import pipeline

Found existing installation: spacy 3.7.5
Uninstalling spacy-3.7.5:
  Successfully uninstalled spacy-3.7.5
Found existing installation: transformers 4.43.3
Uninstalling transformers-4.43.3:
  Successfully uninstalled transformers-4.43.3
Found existing installation: torch 2.2.2+cu118
Uninstalling torch-2.2.2+cu118:
  Successfully uninstalled torch-2.2.2+cu118
Found existing installation: pandas 2.2.2
Uninstalling pandas-2.2.2:
  Successfully uninstalled pandas-2.2.2
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: scikit-learn 1.7.1
Uninstalling scikit-learn-1.7.1:
  Successfully uninstalled scikit-learn-1.7.1
Found existing installation: Unidecode 1.4.0
Uninstalling Unidecode-1.4.0:
  Successfully uninstalled Unidecode-1.4.0
Found existing installation: tqdm 4.67.1
Uninstalling tqdm-4.67.1:
  Successfully uninstalled tqdm-4.67.1
Found existing installation: RapidFuzz 3.13.0
Uninstalling RapidFuzz-3.

  import spacy, subprocess, pkg_resources


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# =============================================================
# CONFIG
# =============================================================

# Folder with your OCR .txt files (one per cover)
# Example: "/content/drive/MyDrive/odc_ocr_out/txt"
INPUT_TXT_DIR = "/content/drive/MyDrive/odc_ocr_out/txt"

# Where to write outputs
OUTPUT_DIR = "/content/drive/MyDrive/odc_analytics_out"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Regex to infer date from filename. Adjust to your naming.
# Examples it can handle:
#   clarin_20250806.txt  -> 2025-08-06
#   2025-08-06_clarin.txt -> 2025-08-06
DATE_PATTERNS = [
    (re.compile(r'(?P<y>20\d{2})(?P<m>\d{2})(?P<d>\d{2})'), "%Y%m%d"),
    (re.compile(r'(?P<y>20\d{2})[-_](?P<m>\d{2})[-_](?P<d>\d{2})'), "%Y-%m-%d"),
]

# Topic labels (Spanish). Multi-label zero-shot with XNLI (supports Spanish).
TOPIC_LABELS = [
    "economía", "gobierno", "política", "justicia", "seguridad",
    "deportes", "internacionales", "sociedad", "cultura",
    "salud", "educación", "tecnología"
]
TOPIC_THRESHOLD = 0.35  # include topics with score >= threshold

# People you especially care about (optional; lowercase, no accents recommended)
PERSONS_OF_INTEREST = [
    # e.g., "javier milei", "sergio massa"
]

# Economy/Government keyword sets (extend freely)
ECONOMY_KWS = {
    "inflación","inflacion","dólar","dolar","salario","precios","devaluación","devaluacion",
    "pbi","actividad","desempleo","impuestos","tarifas","deuda","fmi","paritarias","exportaciones","importaciones"
}
GOVERNMENT_KWS = {
    "presidente","presidenta","gobierno","ministro","ministra","gabinete","decreto",
    "congreso","senado","diputados","casa rosada","boletín oficial","boletin oficial"
}

# Positive/Negative word seeds (quick start; extend for better coverage)
POS_LEX = {
    "histórico","historico","récord","record","mejora","baja","récord","exitoso","logro",
    "avance","acuerdo","crece","crecimiento","ganó","gano","victoria","alivio","optimismo","favorable"
}
NEG_LEX = {
    "crisis","derrota","suba","caída","caida","escándalo","escandalo","corrupción","corrupcion",
    "inflación","inflacion","devaluación","devaluacion","paro","conflicto","denuncia",
    "violencia","delito","inseguridad","déficit","deficit","ajuste","recesión","recesion"
}

# n-gram settings
NGRAM_MIN_COUNT = 2
MAX_NGRAMS_PER_COVER = 20

In [4]:
# =============================================================
# MODELS (offline pipelines)
# =============================================================

# BETO sentiment (Spanish) — outputs NEG/NEU/POS
SENT_CLF = pipeline(
    "text-classification",
    model="finiteautomata/beto-sentiment-analysis",
    tokenizer="finiteautomata/beto-sentiment-analysis",
    top_k=None
)

# Zero-shot topic classifier (XNLI multilingual)
ZSL = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli"
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [5]:
# =============================================================
# HELPERS
# =============================================================

def parse_date_from_name(name: str):
    base = os.path.basename(name)
    for rx, fmt in DATE_PATTERNS:
        m = rx.search(base)
        if m:
            y, mth, d = m.group('y'), m.group('m'), m.group('d')
            return f"{y}-{mth}-{d}"
    return None

def normalize_text(t: str):
    # keep original for sentiment; use normalized for keyword/lexicon
    t = re.sub(r'\s+', ' ', t).strip()
    return t

def tokenize_simple(text: str):
    # Accents-stripped, lowercase tokens for lexicon & kw matching
    t = unidecode(text).lower()
    return re.findall(r"[a-záéíóúñ]+", t)

def lines(text: str):
    return [l.strip() for l in text.split("\n") if l.strip()]

def get_top_words(token_list, lexicon_set, top_k=5):
    cnt = Counter(tok for tok in token_list if tok in {unidecode(w).lower() for w in lexicon_set})
    return [w for w,_ in cnt.most_common(top_k)]

def get_share_and_sentiment_for_kwset(text: str, kws: set):
    toks = tokenize_simple(text)
    kws_norm = {unidecode(k).lower() for k in kws}
    hits = [i for i, tok in enumerate(toks) if tok in kws_norm]
    share = 0.0 if not toks else len(hits) / len(toks)
    # sentiment around kw windows (±30 tokens)
    snippets = []
    for idx in hits:
        lo = max(0, idx-30); hi = min(len(toks), idx+31)
        snippets.append(" ".join(toks[lo:hi]))
    if not snippets:
        return share, None, None
    joined = " ".join(snippets)[:900]
    pred = SENT_CLF(joined)[0]
    return share, pred["label"], float(pred["score"])

def sentiment_overall(text: str):
    # BETO returns list sorted by score; pick best
    out = SENT_CLF(text[:1000])[0]
    return out["label"], float(out["score"])

def zsl_topics(text: str, labels):
    res = ZSL(text[:1000], candidate_labels=labels, multi_label=True)
    # res = {'labels': [...], 'scores': [...]}
    topics = {lab: float(score) for lab, score in zip(res["labels"], res["scores"]) if score >= TOPIC_THRESHOLD}
    # also record top topic
    if res["labels"]:
        top_lab = res["labels"][np.argmax(res["scores"])]
        top_score = float(np.max(res["scores"]))
    else:
        top_lab, top_score = None, None
    return topics, top_lab, top_score

def entity_spans(doc):
    ents = []
    for ent in doc.ents:
        if ent.label_ in {"PER","ORG","LOC","GPE"}:
            ents.append((ent.text, ent.label_, ent.start_char, ent.end_char))
    return ents

def entity_connotation(doc_text, ents):
    # Sentiment on a small window around each entity mention (char-based window ±140 chars)
    ent_rows = []
    for txt, lbl, s, e in ents:
        lo = max(0, s-140); hi = min(len(doc_text), e+140)
        window = doc_text[lo:hi]
        pred = SENT_CLF(window)[0]
        ent_rows.append({
            "entity": txt,
            "entity_type": lbl,
            "label": pred["label"],
            "score": float(pred["score"])
        })
    # aggregate per entity
    agg = defaultdict(lambda: {"type": None, "n":0, "pos":0, "neu":0, "neg":0, "avg_score":0.0})
    for r in ent_rows:
        key = (unidecode(r["entity"]).lower(), r["entity_type"])
        rec = agg[key]
        rec["type"] = r["entity_type"]
        rec["n"] += 1
        rec["avg_score"] += r["score"]
        if r["label"] == "POS": rec["pos"] += 1
        elif r["label"] == "NEU": rec["neu"] += 1
        else: rec["neg"] += 1
    rows = []
    for (ent_norm, etype), rec in agg.items():
        rows.append({
            "entity_norm": ent_norm,
            "entity_type": etype,
            "mentions": rec["n"],
            "avg_score": rec["avg_score"]/rec["n"],
            "pos_share": rec["pos"]/rec["n"],
            "neu_share": rec["neu"]/rec["n"],
            "neg_share": rec["neg"]/rec["n"],
        })
    return rows

def extract_ngrams(tokens, n=2):
    grams = [" ".join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    return Counter(grams)


In [6]:
# =============================================================
# MAIN
# =============================================================

txt_dir = Path(INPUT_TXT_DIR)
files = sorted([p for p in txt_dir.glob("*.txt") if p.is_file()])

master_rows = []
entity_rows_all = []
ngram_rows_all = []

for fp in tqdm(files, desc="Analyzing covers"):
    raw = fp.read_text(encoding="utf-8", errors="ignore")
    text = normalize_text(raw)
    if not text:
        # still record empty row
        master_rows.append({
            "file": fp.name, "date": parse_date_from_name(fp.name),
            "overall_sentiment": "VACIO", "overall_score": 0.0,
            "top_topic": None, "top_topic_score": None,
            "topics_json": json.dumps({}),
            "people_json": json.dumps({}), "orgs_json": json.dumps({}), "places_json": json.dumps({}),
            "top_pos_words_json": json.dumps([]), "top_neg_words_json": json.dumps([]),
            "eco_share": 0.0, "eco_sentiment": None, "eco_sent_score": None,
            "gov_share": 0.0, "gov_sentiment": None, "gov_sent_score": None,
        })
        continue

    # -------- Overall sentiment
    s_label, s_score = sentiment_overall(text)

    # -------- Topics (zero-shot)
    topics, top_topic, top_topic_score = zsl_topics(text, TOPIC_LABELS)

    # -------- NER
    doc = nlp(text)
    ents = entity_spans(doc)

    people = Counter()
    orgs = Counter()
    places = Counter()
    for etxt, etype, _, _ in ents:
        key = unidecode(etxt).lower().strip()
        if etype == "PER": people[key] += 1
        elif etype == "ORG": orgs[key] += 1
        elif etype in {"LOC","GPE"}: places[key] += 1

    # -------- Entity connotation
    ent_summary = entity_connotation(text, ents)
    for r in ent_summary:
        r.update({"file": fp.name, "date": parse_date_from_name(fp.name)})
        entity_rows_all.append(r)

    # -------- Lexicon words & n-grams
    toks_norm = tokenize_simple(text)
    top_pos = get_top_words(toks_norm, POS_LEX, top_k=5)
    top_neg = get_top_words(toks_norm, NEG_LEX, top_k=5)

    bi = extract_ngrams(toks_norm, n=2)
    tri = extract_ngrams(toks_norm, n=3)
    # keep top and with min count
    for g, cnt in bi.most_common(MAX_NGRAMS_PER_COVER):
        if cnt >= NGRAM_MIN_COUNT:
            ngram_rows_all.append({"file": fp.name, "date": parse_date_from_name(fp.name), "ngram": g, "n": 2, "count": cnt})
    for g, cnt in tri.most_common(MAX_NGRAMS_PER_COVER):
        if cnt >= NGRAM_MIN_COUNT:
            ngram_rows_all.append({"file": fp.name, "date": parse_date_from_name(fp.name), "ngram": g, "n": 3, "count": cnt})

    # -------- Economy / Government shares + sentiment near those terms
    eco_share, eco_sent, eco_sent_score = get_share_and_sentiment_for_kwset(text, ECONOMY_KWS)
    gov_share, gov_sent, gov_sent_score = get_share_and_sentiment_for_kwset(text, GOVERNMENT_KWS)

    # -------- Row
    master_rows.append({
        "file": fp.name,
        "date": parse_date_from_name(fp.name),
        "overall_sentiment": s_label,         # NEG | NEU | POS
        "overall_score": s_score,             # 0..1
        "top_topic": top_topic,
        "top_topic_score": top_topic_score,
        "topics_json": json.dumps(topics, ensure_ascii=False),
        "people_json": json.dumps(dict(people.most_common(50)), ensure_ascii=False),
        "orgs_json": json.dumps(dict(orgs.most_common(50)), ensure_ascii=False),
        "places_json": json.dumps(dict(places.most_common(50)), ensure_ascii=False),
        "top_pos_words_json": json.dumps(top_pos, ensure_ascii=False),
        "top_neg_words_json": json.dumps(top_neg, ensure_ascii=False),
        "eco_share": eco_share,
        "eco_sentiment": eco_sent,
        "eco_sent_score": eco_sent_score,
        "gov_share": gov_share,
        "gov_sentiment": gov_sent,
        "gov_sent_score": gov_sent_score,
    })


Analyzing covers: 0it [00:00, ?it/s]

In [7]:
# =============================================================
# SAVE PER-COVER TABLES
# =============================================================
df_master = pd.DataFrame(master_rows)
df_entities = pd.DataFrame(entity_rows_all)
df_ngrams = pd.DataFrame(ngram_rows_all)

# fill empties for consistent schema
if df_entities.empty:
    df_entities = pd.DataFrame(columns=["file","date","entity_norm","entity_type","mentions","avg_score","pos_share","neu_share","neg_share"])
if df_ngrams.empty:
    df_ngrams = pd.DataFrame(columns=["file","date","ngram","n","count"])

master_csv = os.path.join(OUTPUT_DIR, "covers_master.csv")
entities_csv = os.path.join(OUTPUT_DIR, "entities_sentiment.csv")
ngrams_csv = os.path.join(OUTPUT_DIR, "ngrams.csv")

df_master.to_csv(master_csv, index=False, encoding="utf-8")
df_entities.to_csv(entities_csv, index=False, encoding="utf-8")
df_ngrams.to_csv(ngrams_csv, index=False, encoding="utf-8")

print("Saved:")
print(master_csv)
print(entities_csv)
print(ngrams_csv)

Saved:
/content/drive/MyDrive/odc_analytics_out/covers_master.csv
/content/drive/MyDrive/odc_analytics_out/entities_sentiment.csv
/content/drive/MyDrive/odc_analytics_out/ngrams.csv


In [8]:
# =============================================================
# DASHBOARD-FRIENDLY AGGREGATIONS
# =============================================================

# --- Timeseries: sentiment by date
ts_sent = (df_master
           .dropna(subset=["date"])
           .groupby("date")
           .agg(overall_pos=("overall_sentiment", lambda s: np.mean([1 if x=="POS" else 0 for x in s])),
                overall_neu=("overall_sentiment", lambda s: np.mean([1 if x=="NEU" else 0 for x in s])),
                overall_neg=("overall_sentiment", lambda s: np.mean([1 if x=="NEG" else 0 for x in s])),
                eco_share=("eco_share","mean"),
                gov_share=("gov_share","mean"))
           .reset_index())
ts_sent_csv = os.path.join(OUTPUT_DIR, "timeseries_sentiment.csv")
ts_sent.to_csv(ts_sent_csv, index=False, encoding="utf-8")

# --- Topic distribution over time (explode topics_json)
def explode_topics(df):
    rows = []
    for _, r in df.iterrows():
        try:
            topics = json.loads(r["topics_json"]) if pd.notna(r["topics_json"]) else {}
        except Exception:
            topics = {}
        for k,v in topics.items():
            rows.append({"date": r["date"], "file": r["file"], "topic": k, "score": v})
    return pd.DataFrame(rows)

df_topics = explode_topics(df_master)
topic_ts = (df_topics.dropna(subset=["date"])
                    .groupby(["date","topic"])
                    .agg(avg_topic_score=("score","mean"),
                         covers=("file","count"))
                    .reset_index())
topic_ts_csv = os.path.join(OUTPUT_DIR, "timeseries_topics.csv")
topic_ts.to_csv(topic_ts_csv, index=False, encoding="utf-8")

# --- Most-mentioned people over time
def explode_entities_for_people(df_entities):
    # df_entities is already aggregated per entity_norm per file in our pipeline
    return df_entities[df_entities["entity_type"]=="PER"].copy()

people_ts = (explode_entities_for_people(df_entities)
             .dropna(subset=["date"])
             .groupby(["date","entity_norm"])
             .agg(mentions=("mentions","sum"),
                  pos_share=("pos_share","mean"),
                  neg_share=("neg_share","mean"))
             .reset_index())
people_ts_csv = os.path.join(OUTPUT_DIR, "timeseries_people.csv")
people_ts.to_csv(people_ts_csv, index=False, encoding="utf-8")

print("Also saved:")
print(ts_sent_csv)
print(topic_ts_csv)
print(people_ts_csv)

KeyError: ['date']