In [2]:
# Input stage
# Install the notebook's third-party dependencies into the kernel's environment
# (-q: quiet output, -U: upgrade already-installed packages).
# NOTE(review): versions are not pinned, so a re-run may pull different releases —
# consider pinning for reproducibility.
%pip install -q -U google-genai beautifulsoup4 spacy nltk

import os
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Any, Optional
import requests
import pandas as pd
import nltk
from bs4 import BeautifulSoup
from google import genai

# Lexicon required by nltk's SentimentIntensityAnalyzer.
nltk.download("vader_lexicon")

# Credentials must never be committed inside a notebook. The key is read from the
# GEMINI_API_KEY environment variable; when it is missing, ask for it with a
# non-echoing prompt so it does not end up in the saved cell source or output.
# NOTE(review): a key was previously hardcoded here — treat it as leaked and revoke it.
if not os.environ.get("GEMINI_API_KEY"):
    from getpass import getpass

    os.environ["GEMINI_API_KEY"] = getpass("GEMINI_API_KEY: ")

# Shared Gemini client used by the agents defined in later cells.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

@dataclass
class SessionState:
    """Mutable container passed from agent to agent through the pipeline.

    Every field has an empty default, so ``SessionState()`` is a valid blank
    session; mutable fields use ``default_factory`` so instances never share
    the same list or dict.
    """

    # Natural-language request that started the session.
    query: str = ""
    # Scraped headlines; each item is a dict with "titulo" and "url" keys.
    articles: List[Dict[str, str]] = field(default_factory=list)
    # Model-generated summary of the headlines.
    summary: str = ""
    # Named entities; each item has "texto", "etiqueta" and "frecuencia" keys.
    entities: List[Dict[str, Any]] = field(default_factory=list)
    # Sentiment results, stored under "promedio_compound" and "detalle".
    sentiment: Dict[str, Any] = field(default_factory=dict)
    # Free-form bookkeeping (e.g. "input_timestamp", "num_articulos").
    metadata: Dict[str, Any] = field(default_factory=dict)

class InputAgent:
    """Create the initial session state from the user's query."""

    # Query used when the caller supplies nothing (or only whitespace).
    DEFAULT_QUERY = "Resumir y analizar las noticias políticas más recientes de Colombia."

    def run(self, query: Optional[str] = None) -> SessionState:
        """Build a fresh ``SessionState`` for ``query``.

        Args:
            query: Free-text request. ``None``, an empty string or a
                whitespace-only string selects ``DEFAULT_QUERY``.

        Returns:
            A new ``SessionState`` whose ``query`` is set and whose
            ``metadata["input_timestamp"]`` holds the current UTC time as an
            ISO-8601 string.
        """
        if query is None or not query.strip():
            query = self.DEFAULT_QUERY
        state = SessionState()
        state.query = query
        # Timestamp.utcnow() is deprecated in recent pandas releases;
        # Timestamp.now(tz="UTC") yields the same timezone-aware value.
        state.metadata["input_timestamp"] = pd.Timestamp.now(tz="UTC").isoformat()
        return state

# Ask for a free-text query; InputAgent substitutes its default query when the
# reply is empty or only whitespace.
user_query = input("Ingresa tu consulta en lenguaje natural (o deja vacío para usar una por defecto): ")
# Start the pipeline: every later cell reads and updates this `state` object.
state = InputAgent().run(user_query)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Ingresa tu consulta en lenguaje natural (o deja vacío para usar una por defecto): 


In [3]:
# Adquisición (Web)
class WebScraperAgent:
    """Collect politics headlines (title + link) from a news section page."""

    def __init__(self, base_url: str, max_articles: int = 10):
        """Store the scraping configuration.

        Args:
            base_url: Page to download and scan for article links.
            max_articles: Upper bound on the number of articles kept.
        """
        self.base_url = base_url
        self.max_articles = max_articles

    def run(self, state: SessionState) -> SessionState:
        """Download ``base_url`` and store the headlines found in ``state``.

        A link is kept when its href contains ``/politica/`` and its visible
        text has more than four words. Links are de-duplicated by absolute URL,
        preserving page order, and truncated to ``max_articles``.

        Args:
            state: Session to update; ``articles`` and
                ``metadata["num_articulos"]`` are overwritten.

        Returns:
            The same ``state`` object.

        Raises:
            requests.RequestException: On connection problems, timeouts, or a
                4xx/5xx response (previously an error page was parsed silently).
        """
        from urllib.parse import urljoin

        response = requests.get(self.base_url, timeout=10)
        # Fail loudly on HTTP errors instead of scraping an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        articles = []
        seen_urls = set()
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            title = anchor.get_text(strip=True)
            if "/politica/" not in href or len(title.split()) <= 4:
                continue
            # Resolve relative links against the page actually scraped rather
            # than a hardcoded host; absolute hrefs pass through unchanged.
            url = urljoin(self.base_url, href)
            if url in seen_urls:
                continue
            seen_urls.add(url)
            articles.append({"titulo": title, "url": url})

        state.articles = articles[: self.max_articles]
        state.metadata["num_articulos"] = len(state.articles)
        return state

# Scrape the politics section, keeping at most 10 articles.
scraper_agent = WebScraperAgent(base_url="https://www.eltiempo.com/politica", max_articles=10)
# Fills state.articles and state.metadata["num_articulos"].
state = scraper_agent.run(state)

In [4]:
# Procesamiento (Text Analytics)
import spacy
from spacy.cli import download as spacy_download
from nltk.sentiment import SentimentIntensityAnalyzer

# Load the Spanish pipeline, downloading it only when it is not installed yet.
# Downloading unconditionally re-installs the package on every run and makes
# spaCy ask for a kernel restart each time.
try:
    nlp = spacy.load("es_core_news_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy_download("es_core_news_sm")
    nlp = spacy.load("es_core_news_sm")

class SummarizerAgent:
    """Summarise the scraped headlines with a Gemini model."""

    def __init__(self, model_name: str = "gemini-2.5-flash"):
        """Remember which model to call.

        Args:
            model_name: Model identifier passed to ``generate_content``.
        """
        self.model_name = model_name

    def run(self, state: SessionState) -> SessionState:
        """Generate a short analytical summary and store it in ``state.summary``.

        With no articles the summary is set to an empty string and the model
        is not called. Otherwise the headlines are sent as a numbered list
        through the module-level ``client``.

        Args:
            state: Session whose ``articles`` supply the headlines.

        Returns:
            The same ``state`` object with ``summary`` updated.
        """
        if not state.articles:
            state.summary = ""
            return state

        # Numbered list, one headline per line, each line newline-terminated.
        corpus = "".join(
            f"{idx}. {art['titulo']}\n"
            for idx, art in enumerate(state.articles, start=1)
        )
        prompt = (
            "Eres un analista que resume noticias políticas colombianas.\n"
            "A partir de la lista de titulares, genera un resumen analítico breve "
            "en máximo 200 palabras y destaca los temas principales.\n\n"
            "Titulares:\n"
            f"{corpus}"
        )
        response = client.models.generate_content(model=self.model_name, contents=prompt)
        # response.text can be None when the reply carries no text part (for
        # example a blocked answer); keep `summary` a str as its type declares.
        state.summary = response.text or ""
        return state

class TextAnalyticsAgent:
    """Extract named entities and per-headline sentiment from the articles."""

    def __init__(self):
        self.sia = SentimentIntensityAnalyzer()

    def run(self, state: SessionState) -> SessionState:
        """Populate ``state.entities`` and ``state.sentiment``.

        Entities come from running the module-level ``nlp`` pipeline over all
        headlines joined one per line; they are counted by (text, label) and
        listed most frequent first. Sentiment is scored per headline and the
        mean compound score is stored alongside the per-headline detail.

        Returns ``state`` unchanged when there are no articles.
        """
        from collections import Counter

        if not state.articles:
            return state

        titles = [item["titulo"] for item in state.articles]

        # One document for all headlines, each terminated by a newline.
        doc = nlp("".join(title + "\n" for title in titles))
        entity_counts = Counter((ent.text, ent.label_) for ent in doc.ents)
        ranked_entities = [
            {"texto": text, "etiqueta": label, "frecuencia": freq}
            for (text, label), freq in entity_counts.most_common()
        ]

        # NOTE(review): the VADER lexicon appears to be English-oriented while
        # the headlines are Spanish — confirm the scores are meaningful.
        per_title = []
        compound_total = 0.0
        for title in titles:
            polarity = self.sia.polarity_scores(title)
            per_title.append(
                {
                    "titulo": title,
                    "compound": polarity["compound"],
                    "positivo": polarity["pos"],
                    "negativo": polarity["neg"],
                    "neutro": polarity["neu"],
                }
            )
            # Explicit running sum keeps the float result independent of how
            # the built-in sum() accumulates.
            compound_total += polarity["compound"]

        if per_title:
            mean_compound = compound_total / len(per_title)
        else:
            mean_compound = 0.0

        state.entities = ranked_entities
        state.sentiment = {"promedio_compound": mean_compound, "detalle": per_title}
        return state

# Instantiate both processing agents with their defaults.
summarizer_agent = SummarizerAgent()
analytics_agent = TextAnalyticsAgent()
# Summarise first (fills state.summary), then add entities and sentiment.
state = summarizer_agent.run(state)
state = analytics_agent.run(state)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
# Persistencia/Memoria
import json

class MemoryAgent:
    """Persist a ``SessionState`` to a JSON file and read it back."""

    def __init__(self, path: str = "session_state.json"):
        self.path = path

    def save(self, state: SessionState) -> SessionState:
        """Write ``state`` to ``self.path`` as indented UTF-8 JSON.

        Non-ASCII characters are written as-is rather than escaped. Returns the
        same ``state`` so the call can be chained.
        """
        payload = asdict(state)
        with open(self.path, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)
        return state

    def load(self) -> SessionState:
        """Rebuild the saved state, or return a blank one if no file exists."""
        if not os.path.exists(self.path):
            return SessionState()
        with open(self.path, encoding="utf-8") as handle:
            stored = json.load(handle)
        return SessionState(**stored)

# Persist the fully processed session to the default file, session_state.json.
memory_agent = MemoryAgent()
state = memory_agent.save(state)

In [6]:
# Salida (Output)
# Consolidated view of the session for downstream use.
report = dict(
    consulta=state.query,
    num_articulos=state.metadata.get("num_articulos", 0),
    resumen=state.summary,
    entidades=state.entities,
    sentimiento_promedio_compound=state.sentiment.get("promedio_compound", 0.0),
)

# Tabular views of the articles, the entities and the per-headline sentiment.
df_articulos, df_entidades, df_sentimiento = (
    pd.DataFrame(rows)
    for rows in (state.articles, state.entities, state.sentiment.get("detalle", []))
)

# Each section is a heading line followed by its content; sections are
# separated by one blank line, with a single trailing newline at the end.
print(
    "\n\n".join(
        f"{heading}\n{body}"
        for heading, body in (
            ("=== CONSULTA DE ENTRADA ===", report["consulta"]),
            ("=== RESUMEN AUTOMÁTICO ===", report["resumen"]),
            ("=== ARTÍCULOS ANALIZADOS ===", df_articulos),
            ("=== ENTIDADES NOMBRADAS ===", df_entidades),
            ("=== SENTIMIENTO POR TITULAR ===", df_sentimiento),
        )
    )
)

=== CONSULTA DE ENTRADA ===
Resumir y analizar las noticias políticas más recientes de Colombia.

=== RESUMEN AUTOMÁTICO ===
El panorama político colombiano se centra intensamente en la antesala de las **elecciones al Congreso y las presidenciales**. Los partidos están definiendo sus **listas al Congreso**, con pujas internas por los cabezas de lista, y se observa una **fractura en el Centro Democrático** que plantea desafíos para la derecha, pese al respaldo de Álvaro Uribe a su lista cerrada al Senado.

En cuanto a las estrategias presidenciales, **Sergio Fajardo ha optado por ir directamente a primera vuelta**, descartando consultas interpartidistas y reconfigurando el tablero para el centro y la izquierda. Nuevas coaliciones como "Ahora Colombia" ya anuncian ambiciosas metas.

Paralelamente, el Gobierno enfrenta un momento crítico al **jugar sus últimas cartas para evitar el archivo de su reforma tributaria** en el Congreso. También se reportan **tensiones en la Corte Suprema de Ju