<a href="https://colab.research.google.com/github/renzungo/Clarin_Covers_Sent_Analysis/blob/sentiment/01_prepare_texts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Colab setup
# Mount Google Drive at /content/drive so the Drive paths used in this notebook
# resolve. NOTE(review): force_remount=True presumably re-mounts an existing
# mount - confirm against the google.colab documentation.
from google.colab import drive; drive.mount('/content/drive', force_remount=True)

# Install the third-party packages imported below (pandas, numpy, unidecode, tqdm).
# The leading "!" is notebook shell-escape syntax, so this line only runs inside
# a notebook cell, not as plain Python.
!pip -q install pandas numpy unidecode tqdm

import os, re, json
from pathlib import Path
from collections import Counter
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from unidecode import unidecode

# --- Basic Spanish stop words
# High-frequency Spanish function words passed to tokenize_simple() as the
# stop-word set. Where a word carries an accent, both the accented and the
# unaccented spelling are listed.
STOP_WORDS = {
    "de", "la", "que", "el", "en", "y", "a", "los", "del", "se",
    "las", "por", "un", "para", "con", "no", "una", "su", "al", "lo",
    "como", "mas", "más", "pero", "sus", "le", "ya", "o", "este", "si",
    "sí", "porque", "esta", "entre", "cuando", "muy", "sin", "sobre",
    "tambien", "también", "me", "hasta", "hay", "donde", "quien",
}

# --- Config
# Google Drive locations used by this notebook:
#   INPUT_TXT_DIR - folder scanned for the *.txt files to process
#   WORK_DIR      - receives base.parquet and lexicon.parquet
#   OUT_DIR       - receives ngrams.csv
INPUT_TXT_DIR = "/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/OCR_Out/txt"
WORK_DIR = "/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_pipeline_work"
OUT_DIR = "/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_analytics_out"

# Make sure both output folders exist (missing parents are created, existing
# folders are left untouched).
Path(WORK_DIR).mkdir(parents=True, exist_ok=True)
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

# --- Date patterns
# Each entry pairs a compiled regex (named groups y / m / d) with the
# strptime-style layout it corresponds to. Both patterns only accept years
# 2000-2099; the second one allows "-" or "_" as the separator.
DATE_PATTERNS = [
    (re.compile(r'(?P<y>20\d{2})(?P<m>\d{2})(?P<d>\d{2})'), "%Y%m%d"),
    (re.compile(r'(?P<y>20\d{2})[-_](?P<m>\d{2})[-_](?P<d>\d{2})'), "%Y-%m-%d"),
]


def parse_date_from_name(name: str):
    """Extract a ``YYYY-MM-DD`` string from a file name such as ``20200131.txt``.

    Only the final path component of *name* is inspected. The patterns in
    ``DATE_PATTERNS`` are tried in order, and within each pattern every
    non-overlapping match is considered from left to right.

    A match is accepted only if it is a real calendar date. The regexes alone
    accept any two digits for month and day, so without this check a name
    like ``20201345.txt`` would yield the impossible date ``2020-13-45``.

    Args:
        name: File name or path.

    Returns:
        The first valid date found, formatted ``YYYY-MM-DD``, or ``None`` if
        the name contains no valid date.
    """
    # Function-scope import so this block does not depend on the file header.
    import datetime as _dt

    base = os.path.basename(name)
    for rx, _ in DATE_PATTERNS:
        for m in rx.finditer(base):
            year, month, day = m.group('y'), m.group('m'), m.group('d')
            try:
                _dt.date(int(year), int(month), int(day))
            except ValueError:
                # Digits matched but are not a date (month 13, 30 February...);
                # keep scanning for a later candidate.
                continue
            return f"{year}-{month}-{day}"
    return None

# Squeeze every run of whitespace down to one space and drop leading/trailing blanks
def normalize_text(t: str):
    """Return *t* with whitespace runs collapsed to single spaces and the ends trimmed."""
    # str.split() with no argument splits on whitespace runs and discards
    # empty leading/trailing pieces, so re-joining with one space gives the
    # collapsed, trimmed text in a single pass.
    words = t.split()
    return " ".join(words)

# Convert to lowercase ASCII tokens for basic analysis
def tokenize_simple(text: str, stopwords=None):
    """Split *text* into lowercase, accent-free alphabetic tokens.

    The text is transliterated to ASCII with ``unidecode`` and lower-cased;
    every maximal run of the letters a-z then becomes one token. Digits,
    punctuation and whitespace act as separators.

    Args:
        text: Input text.
        stopwords: Optional iterable of words to drop. Each entry is folded
            the same way as the text (``unidecode`` + lower-case) before
            comparison, so accented or upper-case entries such as ``"más"``
            still remove the token ``"mas"``.

    Returns:
        List of tokens in order of appearance, without stop words.
    """
    ascii_text = unidecode(text).lower()
    # Only ASCII is left after unidecode, so a-z covers every letter that can
    # occur; accented characters in the pattern could never match.
    tokens = re.findall(r"[a-z]+", ascii_text)
    if stopwords:
        # Tokens are already accent-free; comparing them against raw accented
        # stop words would silently never match, so fold the stop words too.
        folded = {unidecode(word).lower() for word in stopwords}
        tokens = [tok for tok in tokens if tok not in folded]
    return tokens

# Count contiguous token sequences of length n
def extract_ngrams(tokens, n=2):
    """Count every contiguous run of *n* tokens.

    Args:
        tokens: Sequence of string tokens (must support ``len`` and slicing).
        n: Number of tokens per n-gram.

    Returns:
        A ``Counter`` mapping each n-gram (tokens joined by a single space) to
        its number of occurrences. The result is empty when *tokens* holds
        fewer than *n* items or when *n* is smaller than 1.
    """
    # A non-positive n has no meaningful n-grams; without this guard n == 0
    # would count empty strings and negative n would count arbitrary slices.
    if n < 1:
        return Counter()
    return Counter(
        " ".join(tokens[start:start + n])
        for start in range(len(tokens) - n + 1)
    )

# --- Lexicons (extend freely)
# Hand-picked positive / negative cue words. Accented words are listed next to
# their unaccented spelling.
POS_LEX = {
    'histórico', 'historico', 'récord', 'record', 'mejora', 'baja',
    'exitoso', 'logro', 'avance', 'acuerdo', 'crece', 'crecimiento',
    'ganó', 'gano', 'victoria', 'alivio', 'optimismo', 'favorable',
    'beneficio', 'feliz', 'prosperidad', 'apoyo', 'ganancia', 'exito',
    'logros',
}
NEG_LEX = {
    'crisis', 'derrota', 'suba', 'caída', 'caida', 'escándalo',
    'escandalo', 'corrupción', 'corrupcion', 'inflación', 'inflacion',
    'devaluación', 'devaluacion', 'paro', 'conflicto', 'denuncia',
    'violencia', 'delito', 'inseguridad', 'déficit', 'deficit', 'ajuste',
    'recesión', 'recesion', 'desempleo', 'problema', 'fraude', 'perdida',
    'pérdida', 'crimen', 'malestar',
}

# Accent-free, lower-case versions of the lexicons: the same folding that
# tokenize_simple() applies to text, so tokens can be looked up directly.
POS_NORM = set(map(str.lower, map(unidecode, POS_LEX)))
NEG_NORM = set(map(str.lower, map(unidecode, NEG_LEX)))

# Per-cover n-gram limits: for each n-gram size keep at most
# MAX_NGRAMS_PER_COVER of the most frequent entries, and among those only the
# ones that occur at least NGRAM_MIN_COUNT times.
NGRAM_MIN_COUNT = 2
MAX_NGRAMS_PER_COVER = 20

# --- Walk input files
# Regular *.txt files directly inside the input folder, in sorted path order.
txt_dir = Path(INPUT_TXT_DIR)
files = sorted(p for p in txt_dir.glob("*.txt") if p.is_file())

# Row accumulators, one list per output table.
base_rows = []
ngram_rows = []
lex_rows = []

# Process one text file per iteration.
for fp in tqdm(files, desc="Preparing texts"):
    # errors="ignore" drops bytes that are not valid UTF-8 instead of raising.
    raw = fp.read_text(encoding="utf-8", errors="ignore")
    text = normalize_text(raw)
    date = parse_date_from_name(fp.name)
    # Empty files produce no tokens at all.
    toks = tokenize_simple(text, STOP_WORDS) if text else []

    # n-grams: bi-grams first, then tri-grams, most frequent first
    for size in (2, 3):
        top_grams = extract_ngrams(toks, size).most_common(MAX_NGRAMS_PER_COVER)
        ngram_rows.extend(
            {"file": fp.name, "date": date, "ngram": gram, "n": size, "count": freq}
            for gram, freq in top_grams
            if freq >= NGRAM_MIN_COUNT
        )

    # Lexicon hits: the five most frequent positive and negative cue words,
    # stored as JSON arrays (ensure_ascii=False keeps non-ASCII text readable).
    pos_counts = Counter(tok for tok in toks if tok in POS_NORM)
    neg_counts = Counter(tok for tok in toks if tok in NEG_NORM)
    pos_top = [word for word, _ in pos_counts.most_common(5)]
    neg_top = [word for word, _ in neg_counts.most_common(5)]
    lex_rows.append({
        "file": fp.name,
        "date": date,
        "top_pos_words_json": json.dumps(pos_top, ensure_ascii=False),
        "top_neg_words_json": json.dumps(neg_top, ensure_ascii=False),
    })

    # Whitespace-normalised text of the cover.
    base_rows.append({"file": fp.name, "date": date, "text": text})

# --- Save intermediates
# Write the cleaned texts and lexicon hits as parquet, and the n-gram table as CSV.
# Every frame gets an explicit column list so the written schema is the same
# even when a row list is empty (no input files, or no n-gram reached the
# minimum count). For non-empty lists the columns equal the row-dict keys in
# their original order, so the written data is unchanged.
base_path = os.path.join(WORK_DIR, "base.parquet")
lexicon_path = os.path.join(WORK_DIR, "lexicon.parquet")
ngrams_path = os.path.join(OUT_DIR, "ngrams.csv")

df_base = pd.DataFrame(base_rows, columns=["file", "date", "text"])
df_base.to_parquet(base_path, index=False)

df_ngrams = pd.DataFrame(ngram_rows, columns=["file", "date", "ngram", "n", "count"])
df_ngrams.to_csv(ngrams_path, index=False, encoding="utf-8")

df_lex = pd.DataFrame(
    lex_rows,
    columns=["file", "date", "top_pos_words_json", "top_neg_words_json"],
)
df_lex.to_parquet(lexicon_path, index=False)

# Report the files that were written, in the same order as before.
print("Wrote:")
print(base_path)
print(lexicon_path)
print(ngrams_path)


Mounted at /content/drive


Preparing texts:   0%|          | 0/652 [00:00<?, ?it/s]

Wrote:
/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_pipeline_work/base.parquet
/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_pipeline_work/lexicon.parquet
/content/drive/MyDrive/Data Justicialista/Clarin Cover Sentiment Analysis/odc_analytics_out/ngrams.csv
