<a href="https://colab.research.google.com/github/renzungo/Clarin_Covers_Sent_Analysis/blob/main/02_topics_zero_shot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Drive
from google.colab import drive; drive.mount('/content/drive', force_remount=True)

# Install deps (small + fast)
!pip -q install sentence-transformers==3.0.1 torch pandas numpy tqdm

import os, json, math
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from sentence_transformers import SentenceTransformer, util

# -------------------------
# Config
# -------------------------
# Drive folders: pipeline working files, analytics output, and the download
# cache handed to SentenceTransformer below. Each is created if missing.
WORK_DIR = "/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_pipeline_work"
OUT_DIR = "/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_analytics_out"
CACHE = "/content/drive/MyDrive/hf_cache"
for _folder in (WORK_DIR, OUT_DIR, CACHE):
    os.makedirs(_folder, exist_ok=True)

# Candidate topics (Spanish); every text is scored against each of them.
TOPIC_LABELS = [
    "economía",
    "gobierno",
    "política",
    "justicia",
    "seguridad",
    "deportes",
    "internacionales",
    "sociedad",
    "cultura",
    "salud",
    "educación",
    "tecnología",
]

# Scoring controls
# Texts are cut to this many characters before being encoded.
MAX_CHARS = 800
# Number of texts scored per iteration of the scoring loop.
BATCH_SIZE = 64
# A topic is kept in topics_json when its score is >= THRESHOLD.
# NOTE(review): scores come from a softmax over the 12 labels, so they sum to 1
# per text and a uniform spread is ~0.083 each. Confirm that 0.35 is reachable
# in practice; otherwise topics_json will be mostly empty.
THRESHOLD = 0.35
# Softmax temperature: similarities are divided by it, so lower = peakier.
TEMP = 0.5

# Multilingual sentence-embedding model, run on the GPU when one is available.
MODEL_ID = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

model = SentenceTransformer(
    MODEL_ID,
    device=DEVICE,
    cache_folder=CACHE,
)

# -------------------------
# Read base texts
# -------------------------
# Load the base table from WORK_DIR, replace missing "text" values with an
# empty string, and add a "text_trim" column holding at most the first
# MAX_CHARS characters of each text.
base_path = os.path.join(WORK_DIR, "base.parquet")
df_base = pd.read_parquet(base_path).fillna({"text": ""})
df_base = df_base.assign(text_trim=df_base["text"].str.slice(0, MAX_CHARS))

# Every text is wrapped in a short Spanish instruction before being embedded.
def make_input(txt: str) -> str:
    """Return *txt* prefixed with a fixed Spanish topic-summary instruction.

    The text itself is appended unchanged after the prefix.
    """
    return "Resumen de temas del texto en español: {}".format(txt)

# One prompted string per row of df_base, in row order.
inputs = list(map(make_input, df_base["text_trim"].tolist()))

# -------------------------
# Precompute label embeddings
# -------------------------
# Each label is embedded inside a short Spanish sentence rather than as a bare
# word. The label set is fixed, so it is encoded once here and reused for
# every batch of texts.
label_prompts = [f"Este texto trata sobre: {topic}." for topic in TOPIC_LABELS]
label_emb = model.encode(
    label_prompts,
    convert_to_tensor=True,
    normalize_embeddings=True,
)

# -------------------------
# Encode in batches, score topics
# -------------------------
# For each chunk of BATCH_SIZE texts: embed the texts, compare them with the
# precomputed label embeddings, turn the similarities into per-text scores and
# collect one result row per text in `rows`.
rows = []
for start in tqdm(range(0, len(inputs), BATCH_SIZE), desc="Scoring topics"):
    stop = start + BATCH_SIZE
    batch_inputs = inputs[start:stop]
    batch_files = df_base["file"].iloc[start:stop].tolist()
    batch_dates = df_base["date"].iloc[start:stop].tolist()

    # batch_size is passed explicitly so BATCH_SIZE also controls the size of
    # the model's forward passes; without it encode() splits each chunk using
    # its own default (32 per the sentence-transformers documentation).
    text_emb = model.encode(
        batch_inputs,
        batch_size=BATCH_SIZE,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    # Both sides were requested with normalize_embeddings=True, so the plain
    # matrix product is used as the cosine similarity: (batch_size x n_labels).
    sims = text_emb @ label_emb.T

    # Temperature-scaled softmax across the labels of each text; the ranking
    # of labels is unchanged, only the spread of the scores.
    probs = torch.softmax(sims / TEMP, dim=1).detach().cpu().numpy()

    for file_name, date, scores in zip(batch_files, batch_dates, probs):
        # Labels whose score reaches THRESHOLD (may be empty).
        keep = {
            label: float(score)
            for label, score in zip(TOPIC_LABELS, scores)
            if score >= THRESHOLD
        }
        # Best-scoring label; left as None when there are no labels at all.
        if scores.size:
            top_idx = int(scores.argmax())
            top_topic = TOPIC_LABELS[top_idx]
            top_score = float(scores[top_idx])
        else:
            top_topic = None
            top_score = None

        rows.append({
            "file": file_name,
            "date": date,
            # ensure_ascii=False keeps accented label names readable in the JSON.
            "topics_json": json.dumps(keep, ensure_ascii=False),
            "top_topic": top_topic,
            "top_topic_score": top_score,
        })

# Assemble the per-text results with a fixed column order and write them to
# topics.parquet inside WORK_DIR (without the DataFrame index).
topic_columns = ["file", "date", "topics_json", "top_topic", "top_topic_score"]
df_topics = pd.DataFrame(rows, columns=topic_columns)
out_path = os.path.join(WORK_DIR, "topics.parquet")
df_topics.to_parquet(out_path, index=False)

# Short run summary: output location, device/model used, number of rows.
print("Wrote:", out_path)
print("Device:", DEVICE, "| Model:", MODEL_ID)
print("Rows:", df_topics.shape[0])


Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Scoring topics:   0%|          | 0/11 [00:00<?, ?it/s]

Wrote: /content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_pipeline_work/topics.parquet
Device: cpu | Model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
Rows: 652
