<a href="https://colab.research.google.com/github/renzungo/Clarin_Covers_Sent_Analysis/blob/sentiment/03_overall_sentiment_beto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive; drive.mount('/content/drive', force_remount=True)

!pip -q install transformers==4.43.3 torch pandas unidecode tqdm

import os, re
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from unidecode import unidecode
from transformers import pipeline

# --- Paths --------------------------------------------------------------------
# WORK_DIR holds intermediate pipeline artefacts; OUT_DIR is created here as well.
# Both live on the mounted Google Drive.
WORK_DIR = "/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_pipeline_work"
OUT_DIR = "/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_analytics_out"
for _directory in (WORK_DIR, OUT_DIR):
    os.makedirs(_directory, exist_ok=True)

# --- Sentiment classifier -----------------------------------------------------
# The same Hugging Face checkpoint supplies both the model and the tokenizer.
# top_k=None is passed so the pipeline is not limited to a single label per input.
_SENTIMENT_CHECKPOINT = "finiteautomata/beto-sentiment-analysis"
SENT_CLF = pipeline(
    "text-classification",
    model=_SENTIMENT_CHECKPOINT,
    tokenizer=_SENTIMENT_CHECKPOINT,
    top_k=None,
)

# --- Topic keyword lists ------------------------------------------------------
# Raw keyword lists as written by hand (accented and unaccented spellings).
ECONOMY_KWS = {
    "inflación", "inflacion",
    "dólar", "dolar",
    "salario", "precios",
    "devaluación", "devaluacion",
    "pbi", "actividad", "desempleo", "impuestos", "tarifas",
    "deuda", "fmi", "paritarias", "exportaciones", "importaciones",
}
GOVERNMENT_KWS = {
    "presidente", "presidenta",
    "gobierno",
    "ministro", "ministra",
    "gabinete", "decreto",
    "congreso", "senado", "diputados",
    "casa rosada",
    "boletín oficial", "boletin oficial",
}

# Normalised lookup sets: every keyword passed through unidecode and lower-cased,
# i.e. the same normalisation tokenize_simple applies to the cover text.
ECO = set(unidecode(keyword).lower() for keyword in ECONOMY_KWS)
GOV = set(unidecode(keyword).lower() for keyword in GOVERNMENT_KWS)

def tokenize_simple(text: str):
    """Split ``text`` into lower-case alphabetic word tokens.

    The text is first passed through ``unidecode`` and lower-cased; every maximal
    run of characters matched by the letter class below becomes one token, so
    digits, punctuation and whitespace act as separators.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of token strings in order of appearance.
    """
    normalized = unidecode(text).lower()
    # NOTE(review): the accented letters in the class are presumably never hit
    # once unidecode has transliterated the text — confirm before simplifying.
    word_pattern = r"[a-záéíóúñ]+"
    return re.findall(word_pattern, normalized)

def window_sentiment(text_tokens, hits, radius=30, clf=None, max_chars=900):
    """Classify the sentiment of the text surrounding keyword hits.

    A window of ``radius`` tokens on each side is taken around every hit index.
    Windows that overlap or touch are merged first, so tokens shared by nearby
    hits are sent to the classifier once instead of being repeated (repetition
    would both skew the input and eat into the ``max_chars`` budget).

    Args:
        text_tokens: List of token strings for the whole document.
        hits: Indices into ``text_tokens`` where a keyword was found.
        radius: Number of tokens kept on each side of a hit.
        clf: Callable classifier; defaults to the module-level ``SENT_CLF``.
            It is called as ``clf(text, truncation=True, max_length=512)`` and
            the first entry of its first result must be a dict with ``"label"``
            and ``"score"`` keys.
        max_chars: The joined window text is cut to this many characters.

    Returns:
        ``(label, score)`` taken from the first prediction, or ``(None, None)``
        when ``hits`` is empty.
    """
    if not hits:
        return None, None

    # Build [lo, hi) token spans and merge any that overlap or are adjacent.
    spans = []
    n_tokens = len(text_tokens)
    for i in sorted(hits):
        lo = max(0, i - radius)
        hi = min(n_tokens, i + radius + 1)
        if spans and lo <= spans[-1][1]:
            spans[-1][1] = max(spans[-1][1], hi)
        else:
            spans.append([lo, hi])

    joined = " ".join(" ".join(text_tokens[lo:hi]) for lo, hi in spans)[:max_chars]

    if clf is None:
        clf = SENT_CLF
    # Character slicing alone does not bound the number of model tokens, so also
    # ask the tokenizer to truncate.
    # NOTE(review): 512 assumes a BERT-style position limit — confirm for the model in use.
    pred = clf(joined, truncation=True, max_length=512)[0][0]
    return pred["label"], float(pred["score"])

def _keyword_phrases(keywords):
    """Turn keyword strings into tuples of words so multi-word keywords can be matched."""
    return {tuple(k.split()) for k in keywords if k.split()}


def find_keyword_hits(tokens, phrases):
    """Return the token indices at which any keyword phrase starts.

    Args:
        tokens: List of token strings.
        phrases: Set of word tuples, e.g. ``{("gobierno",), ("casa", "rosada")}``.

    Returns:
        Ascending list of start indices; each index appears at most once even
        if several phrases begin there.
    """
    lengths = sorted({len(p) for p in phrases})
    return [
        i for i in range(len(tokens))
        if any(tuple(tokens[i:i + n]) in phrases for n in lengths)
    ]


# Keywords such as "casa rosada" span two tokens and can never equal a single
# token, so matching is done on word tuples rather than by `token in set`.
ECO_PHRASES = _keyword_phrases(ECO)
GOV_PHRASES = _keyword_phrases(GOV)

# Limits for the whole-cover classifier call.
OVERALL_MAX_CHARS = 1000
MODEL_MAX_TOKENS = 512  # NOTE(review): assumes a BERT-style 512-position limit — confirm

df_base = pd.read_parquet(os.path.join(WORK_DIR, "base.parquet"))

rows = []
for r in tqdm(df_base.itertuples(index=False), total=len(df_base)):
    # Missing text may arrive as None or NaN; NaN is truthy, so `r.text or ""`
    # would let it through and fail on slicing. Only real strings are analysed.
    text = r.text if isinstance(r.text, str) else ""
    if not text.strip():
        # Placeholder row for covers without usable text.
        rows.append({"file": r.file, "date": r.date,
                     "overall_sentiment": "VACIO", "overall_score": 0.0,
                     "eco_share": 0.0, "eco_sentiment": None, "eco_sent_score": None,
                     "gov_share": 0.0, "gov_sentiment": None, "gov_sent_score": None})
        continue

    # Overall sentiment from the first characters of the cover; the first entry
    # of the first result is the prediction dict that gets stored.
    pred = SENT_CLF(
        text[:OVERALL_MAX_CHARS], truncation=True, max_length=MODEL_MAX_TOKENS
    )[0][0]

    tokens = tokenize_simple(text)
    hits_eco = find_keyword_hits(tokens, ECO_PHRASES)
    hits_gov = find_keyword_hits(tokens, GOV_PHRASES)

    # Share = keyword occurrences per token (0.0 when nothing was tokenized).
    n_tokens = len(tokens)
    eco_share = len(hits_eco) / n_tokens if n_tokens else 0.0
    gov_share = len(hits_gov) / n_tokens if n_tokens else 0.0

    # Topic-specific sentiment from the text around the keyword hits.
    eco_lbl, eco_sc = window_sentiment(tokens, hits_eco)
    gov_lbl, gov_sc = window_sentiment(tokens, hits_gov)

    rows.append({"file": r.file, "date": r.date,
                 "overall_sentiment": pred["label"], "overall_score": float(pred["score"]),
                 "eco_share": eco_share, "eco_sentiment": eco_lbl, "eco_sent_score": eco_sc,
                 "gov_share": gov_share, "gov_sentiment": gov_lbl, "gov_sent_score": gov_sc})

df_sent = pd.DataFrame(rows)
sentiment_path = os.path.join(WORK_DIR, "sentiment.parquet")
df_sent.to_parquet(sentiment_path, index=False)
print("Wrote:", sentiment_path)

Mounted at /content/drive


  0%|          | 0/652 [00:00<?, ?it/s]

Wrote: /content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_pipeline_work/sentiment.parquet
