<a href="https://colab.research.google.com/github/siwarbouali25/News-Agent/blob/siwar-bouali/news_retriever_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# News Retriever Agent (Dataset-only)

This notebook provides:

- **Category preview**: show the N most recent articles in selected categories.
- **Grounded Q&A**: answer questions using ONLY retrieved context from your local dataset.

> Minimal, robust, and Colab-ready.

In [13]:
!pip -q install "transformers>=4.44" accelerate bitsandbytes langchain langchain-community dateparser


In [14]:
pip install faiss-cpu



In [15]:
!pip -q install -U --prefer-binary bitsandbytes

In [16]:
pip install -U bitsandbytes



## 1) Load your dataset
- If you already have a DataFrame `df` in memory, **skip this cell**.
- Otherwise, set `DATA_PATH` and the text column (`TEXT_COL`).

In [17]:
import pandas as pd

# Dataset location and text column. If `df` already exists in memory, you can
# skip this cell; an empty DATA_PATH also leaves any existing `df` untouched.
DATA_PATH =  "/content/merged_articles (5).csv"
TEXT_COL  = "content"  # or "text"

# Pick the pandas reader from the file extension.
if DATA_PATH:
    if DATA_PATH.endswith(".csv"):
        df = pd.read_csv(DATA_PATH)
    elif DATA_PATH.endswith(".jsonl") or DATA_PATH.endswith(".json"):
        # NOTE(review): lines=True reads JSON Lines (one object per line); a plain
        # `.json` array file may not load this way — confirm the expected format.
        df = pd.read_json(DATA_PATH, lines=True)
    else:
        raise ValueError("Use a .csv or .jsonl file, or provide df beforehand.")

# Summarize the frame that later cells will use (loaded above or pre-existing).
if 'df' in globals():
    print("Columns:", list(df.columns))
    print("Rows:", len(df))
else:
    print("No DataFrame named `df` found. Provide it or set DATA_PATH.")

Columns: ['id_article', 'title', 'content', 'url', 'category', 'source', 'image', 'published_date']
Rows: 4199


## 2) Normalize dates, ensure schema, and de-duplicate URLs

In [18]:
import pandas as pd
from datetime import timezone

# Categories the rest of the notebook accepts; anything else is ignored by the preview.
ALLOWED_CATEGORIES = ["Politics","World","Science","Health","Sports","Entertainment","Culture","Society","Technology"]

# Ensure text column: downstream cells always read "content".
assert any(c in df.columns for c in ["content","text"]), "Need a text column named 'content' or 'text'."
if "content" not in df.columns and "text" in df.columns:
    df["content"] = df["text"]

# Ensure category column exists (pre-labeled)
assert "category" in df.columns, "Your df needs a 'category' column with values from ALLOWED_CATEGORIES."


def _parse_dates_utc(raw: pd.Series) -> pd.Series:
    """Parse a column of date-like values into timezone-aware UTC timestamps.

    A vectorized parse is tried first. Values that were present but came back
    as NaT are retried one by one, because a vectorized parse with
    errors="coerce" can reject rows whose format differs from the rest of the
    column. Values that still cannot be parsed stay NaT.
    """
    parsed = pd.to_datetime(raw, errors="coerce", utc=True)
    retry = parsed.isna() & raw.notna()
    if retry.any():
        rescued = raw[retry].map(lambda v: pd.to_datetime(v, errors="coerce", utc=True))
        parsed.loc[retry] = pd.to_datetime(rescued, errors="coerce", utc=True)
    return parsed


# Parse dates into timezone-aware UTC
DATE_COL = "published_date"
if DATE_COL in df.columns:
    df[DATE_COL] = _parse_dates_utc(df[DATE_COL])
else:
    # Create an empty (all-NaT) UTC date column if missing, so `.dt` still works below.
    df[DATE_COL] = pd.to_datetime(pd.Series(pd.NaT, index=df.index), utc=True)

# Keep ISO string for vectorstore metadata (safer to serialize).
# strftime's %z yields "+0000"; rewrite it to the ISO-8601 "+00:00" form.
df["published_date_iso"] = df[DATE_COL].dt.strftime("%Y-%m-%dT%H:%M:%S%z").str.replace(r"(\+0000)$", "+00:00", regex=True)

# Optional: de-duplicate by URL.
# Only rows that actually have a URL can be duplicates of each other; a plain
# drop_duplicates would treat every missing URL as the same value and silently
# collapse all URL-less articles into one row.
if "url" in df.columns:
    before = len(df)
    has_url = df["url"].notna() & df["url"].astype(str).str.strip().ne("")
    is_repeat = has_url & df.duplicated(subset=["url"], keep="first")
    df = df.loc[~is_repeat].reset_index(drop=True)
    print(f"Deduped by URL: {before} → {len(df)}")
else:
    print("No 'url' column found; skipping URL dedupe.")

Deduped by URL: 4199 → 4197


## 3) Build FAISS retriever (MMR for diversity)

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

TEXT_COL = "content"
splitter = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=120)

# The dataset's identifier column is `id_article`; fall back to `id`, then to the row index.
ID_COL = next((c for c in ("id_article", "id") if c in df.columns), None)


def _text_or_empty(value) -> str:
    """Return `value` as a stripped string, mapping None/NaN to ''.

    Plain `str(value)` would turn a missing cell into the literal text "nan",
    which would then be indexed and shown as if it were real content.
    """
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return ""
    return str(value).strip()


# One entry per chunk: `texts[n]` is the chunk text, `metas[n]` its article metadata.
texts, metas = [], []
for i, r in df.iterrows():
    txt = _text_or_empty(r.get(TEXT_COL))
    if not txt:
        continue  # nothing to index for this article

    article_id = (_text_or_empty(r.get(ID_COL)) if ID_COL else "") or str(i)
    iso_date = r.get("published_date_iso")
    article_meta = {
        "id": article_id,
        "title": _text_or_empty(r.get("title")),
        "url": _text_or_empty(r.get("url")),
        # None (not NaN) for unknown dates keeps the metadata serializable.
        "published_date": None if pd.isna(iso_date) else iso_date,
        "source": _text_or_empty(r.get("source")),
        "category": _text_or_empty(r.get("category")),
    }
    for j, ch in enumerate(splitter.split_text(txt)):
        texts.append(ch)
        metas.append({**article_meta, "chunk": j})

print(f"Chunks: {len(texts)}")
assert texts, f"No non-empty text found in column '{TEXT_COL}'; nothing to index."

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = FAISS.from_texts(texts=texts, embedding=emb, metadatas=metas)

# MMR retriever (less duplicate chunks)
retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 12, "fetch_k": 60, "lambda_mult": 0.5}
)

Chunks: 25208


  emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## 4) Utilities — dates, dedupe, category preview, retrieval

In [20]:
def meta_to_timestamp_utc(meta_val):
    """Convert a metadata date value to a UTC `pd.Timestamp`.

    Returns `pd.NaT` when the value is missing (None/NaN) or cannot be parsed,
    so callers always get a timestamp-like object back.
    """
    if meta_val is None:
        # pd.to_datetime(None) returns None rather than NaT; normalize it here.
        return pd.NaT
    try:
        return pd.to_datetime(meta_val, utc=True)
    except Exception:
        # Best effort: an unparseable date is treated as unknown.
        return pd.NaT


def sort_hits_newest_first(hits):
    """Sort `hits` in place by `metadata["published_date"]`, newest first.

    Hits with an unknown date are placed after all dated hits. The same list
    object is returned for convenience.
    """
    # NaT is truthy and compares False against everything, so it must be
    # replaced explicitly; `NaT or default` would keep NaT and make the
    # resulting order undefined.
    oldest = pd.Timestamp(0, tz="UTC")

    def _sort_key(hit):
        ts = meta_to_timestamp_utc(hit.metadata.get("published_date"))
        return oldest if pd.isna(ts) else ts

    hits.sort(key=_sort_key, reverse=True)
    return hits

def dedupe_by_url(hits):
    """Keep only the first hit for each article, preserving input order.

    An article is identified by its metadata `url`, falling back to its
    metadata `id` when the URL is missing or empty.
    """
    first_seen = {}
    for hit in hits:
        meta = hit.metadata
        identity = meta.get("url") or meta.get("id")
        # setdefault keeps the earliest hit and ignores later repeats.
        first_seen.setdefault(identity, hit)
    return list(first_seen.values())

def filter_hits_by_categories(hits, categories):
    """Return the hits whose metadata `category` is one of `categories`.

    When `categories` is empty or None, no filtering is applied and the
    original `hits` object is returned unchanged.
    """
    if categories:
        wanted = frozenset(categories)
        return [hit for hit in hits if hit.metadata.get("category") in wanted]
    return hits

def retrieve_for_query(query, categories=None, k=12):
    """Retrieve up to `k` distinct articles relevant to `query`, newest first.

    Parameters
    ----------
    query : str
        Free-text question or search string.
    categories : list[str] | None
        If given and non-empty, only chunks whose metadata `category` is in
        this list are considered.
    k : int
        Maximum number of hits to return.

    Returns
    -------
    list
        Retrieved documents, de-duplicated per article and sorted by
        published date (newest first).

    Notes
    -----
    The search runs against the retriever's underlying vector store so that
    `k` and the category filter are applied during the search itself. Going
    through `retriever.invoke` always returned the retriever's fixed number
    of chunks and filtered categories afterwards, so `k` above that number
    had no effect and category-restricted queries could come back nearly
    empty. With the default `k` and no categories, the search parameters are
    the same as the retriever's own.
    """
    k = int(k)
    if k <= 0:
        return []

    wanted = list(categories or [])
    base = dict(getattr(retriever, "search_kwargs", None) or {})

    # Never search for fewer candidates than the retriever was configured for,
    # and keep the candidate pool several times larger than the result size so
    # MMR still has room to diversify.
    pool = max(k, int(base.get("k", k)))
    search_kwargs = {
        "k": pool,
        "fetch_k": max(int(base.get("fetch_k", 0)), pool * 5),
        "lambda_mult": base.get("lambda_mult", 0.5),
    }
    if wanted:
        # A list value matches any document whose category is in the list.
        search_kwargs["filter"] = {"category": wanted}

    hits = retriever.vectorstore.max_marginal_relevance_search(query, **search_kwargs)
    hits = filter_hits_by_categories(hits, wanted)  # safety net; normally a no-op
    hits = dedupe_by_url(hits)
    hits = sort_hits_newest_first(hits)
    return hits[:k]

from urllib.parse import urlparse
import pandas as pd

def _canonical_source(row):
    s = (row.get("source") or "").strip()
    if s:
        return s
    # fallback to URL domain if source missing
    url = (row.get("url") or "").strip()
    if url:
        try:
            host = urlparse(url).netloc.lower()
            # collapse common subdomains
            if host.startswith("www."):
                host = host[4:]
            return host or "unknown"
        except Exception:
            pass
    return "unknown"

# Build a stable normalized source column once; the category preview below
# groups on `source_norm` to cap how many articles each source contributes.
df["source_norm"] = df.apply(_canonical_source, axis=1)

def get_recent_articles_diverse(categories, n_per_cat=5, per_source_limit=1, pretty=True):
    """
    Collect the most recent articles per category with a per-source cap.

    For every requested category that appears in ALLOWED_CATEGORIES, the
    articles are ordered newest first (unknown dates last), at most
    `per_source_limit` articles are kept per `source_norm`, and the first
    `n_per_cat` of those are returned.

    Returns a dict mapping category -> list of record dicts with the keys
    title, url, published_date, source and source_norm. When `pretty` is
    true the same records are also printed in a readable form.
    """
    shown_columns = ["title","url","published_date","source","source_norm"]
    results = {}

    for category in categories:
        if category not in ALLOWED_CATEGORIES:
            continue

        # newest first within this category
        pool = df[df["category"] == category].copy()
        pool = pool.sort_values("published_date", ascending=False, na_position="last")

        if per_source_limit == 1:
            # one per source: the first row per source is its most recent
            pool = pool.drop_duplicates(subset=["source_norm"], keep="first")
        else:
            # several per source: take the top rows of each source, then
            # restore the global newest-first order
            capped = pool.groupby("source_norm", group_keys=False).head(per_source_limit)
            pool = capped.sort_values("published_date", ascending=False, na_position="last")

        results[category] = pool.head(n_per_cat)[shown_columns].to_dict(orient="records")

    if not pretty:
        return results

    for cat, items in results.items():
        print(f"\n=== {cat} — {len(items)} most recent (diverse sources) ===")
        for r in items:
            dt = r.get("published_date")
            if pd.notna(dt):
                date_str = dt.strftime("%Y-%m-%d %H:%M UTC")
            else:
                date_str = "Unknown date"
            shown_source = r.get("source") or r.get("source_norm")
            print(f"- {r['title']} ({shown_source})")
            print(f"  {r['url']}  [{date_str}]")
    return results


## 5) Load Llama 3.1 8B Instruct (4-bit)
> You must have accepted the model license on Hugging Face and be logged in to Hugging Face in this runtime.

In [21]:
# If needed:
!pip -q install huggingface_hub
!huggingface-cli login  # paste your hf_ token

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Hugging Face model id of the generator used for grounded answers.
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # long context (license-gated)

# 4-bit NF4 quantization settings, passed to `from_pretrained` below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # use torch.bfloat16 on A100 if preferred
    bnb_4bit_use_double_quant=True,
)

# `tok` and `mdl` are module-level names used by the prompt/answer helpers later on.
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
mdl = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)
print("Model loaded on:", mdl.device)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: read).
The token `agent-token` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authent

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Model loaded on: cuda:0


## 6) Build the dataset-only answerer

In [22]:
# System prompt sent with every question. It restricts the model to the supplied
# CONTEXT and fixes the exact refusal string ("I don't know.") used when the
# context does not contain the answer.
SYSTEM_MSG = (
  "You are the Dataset Retrieval Agent for a private news corpus. "
  "Use ONLY the provided CONTEXT from the local dataset. "
  "If the answer is not present, reply exactly: \"I don't know.\" "
  "Be concise, neutral, and always cite article titles and URLs."
)

def pack_context_unique(docs, max_ctx_tokens=6000):
    """Build the CONTEXT string from `docs`, one block per distinct article.

    Each block lists the article title, URL and the chunk text. Articles are
    identified by metadata `url` (or `id` when the URL is empty); repeats are
    skipped. Blocks are appended in order until the next one would push the
    running token count (measured with `tok`) past `max_ctx_tokens`, at which
    point packing stops. Returns "NO_MATCH" when no block was added.
    """
    blocks = []
    tokens_used = 0
    visited = set()

    for doc in docs:
        meta = doc.metadata
        identity = meta.get("url") or meta.get("id")
        if identity in visited:
            continue
        visited.add(identity)

        block = "".join([
            f"Title: {meta.get('title','')}\n",
            f"URL: {meta.get('url','')}\n",
            f"Snippet: {doc.page_content}\n\n",
        ])
        cost = len(tok(block).input_ids)
        if tokens_used + cost > max_ctx_tokens:
            break  # budget exhausted; later docs are not considered
        blocks.append(block)
        tokens_used += cost

    if not blocks:
        return "NO_MATCH"
    return "".join(blocks)

def build_llama_prompt(question: str, context: str):
    """Render the chat prompt (as text) for one grounded question.

    The system turn is SYSTEM_MSG; the user turn carries the CONTEXT, the
    QUESTION and the citation/refusal instructions. The tokenizer's chat
    template is applied without tokenizing and with the generation prompt
    appended, so the returned string ends where the model's reply begins.
    """
    user_turn = "".join([
        f"CONTEXT:\n{context}\n\n",
        f"QUESTION: {question}\n\n",
        "Respond with a concise factual answer. Cite sources like: (Title — URL). ",
        "If multiple sources agree, cite up to 2. If not answerable from CONTEXT, reply exactly: \"I don't know.\"",
    ])
    conversation = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": user_turn},
    ]
    return tok.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

@torch.inference_mode()
def answer_from_dataset(question: str, docs, max_ctx_tokens=6000, max_new_tokens=220):
    """Generate an answer to `question` grounded in `docs`.

    Parameters
    ----------
    question : str
        The user's question.
    docs : iterable
        Retrieved documents; packed into the prompt's CONTEXT section.
    max_ctx_tokens : int
        Token budget for the packed context.
    max_new_tokens : int
        Maximum number of tokens to generate.

    Returns
    -------
    str
        Only the model's reply (the prompt is not included), stripped of
        surrounding whitespace.
    """
    context = pack_context_unique(docs, max_ctx_tokens=max_ctx_tokens)
    prompt  = build_llama_prompt(question, context)

    # The chat template has already inserted the special tokens into `prompt`;
    # letting the tokenizer add them again would prepend a second BOS token.
    enc = tok(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=16384,
        add_special_tokens=False,
    ).to(mdl.device)

    # Greedy decoding; no temperature is passed because it is only used when sampling.
    out = mdl.generate(
        **enc,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tok.eos_token_id
    )

    # `generate` returns prompt + continuation; decode only the new tokens so
    # the caller gets the answer rather than an echo of the whole prompt.
    prompt_len = enc["input_ids"].shape[-1]
    return tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

## 7) Unified `ask()` — with or without categories

In [23]:
def ask(question: str, categories: list | None = None, k: int = 12,
        max_ctx_tokens: int = 6000, max_new_tokens: int = 220, return_docs: bool = False):
    """Answer a question from the dataset, optionally limited to categories.

    - `categories` None or empty: retrieval covers the whole dataset.
    - `categories` given: retrieval is restricted to those categories.
    - When nothing is retrieved the answer is the fixed string "I don't know."
      and the model is not called.
    - With `return_docs=True` a `(answer, docs)` tuple is returned instead of
      just the answer.
    """
    docs = retrieve_for_query(question, categories=categories or [], k=k)

    if docs:
        answer = answer_from_dataset(
            question,
            docs,
            max_ctx_tokens=max_ctx_tokens,
            max_new_tokens=max_new_tokens,
        )
    else:
        answer = "I don't know."

    if return_docs:
        return answer, docs
    return answer

## 8) Demo — most recent by category

In [24]:
# Categories to preview; values outside ALLOWED_CATEGORIES are ignored by the helper.
user_categories = ["Technology", "Politics"]  # change as you like
# Prints the preview and, as the cell's last expression, also displays the returned dict.
get_recent_articles_diverse(user_categories, n_per_cat=5)


=== Technology — 5 most recent (diverse sources) ===
- Jimmy Wales Thinks the World Should Be More Like Wikipedia (New York Times)
  https://www.nytimes.com/2025/10/18/magazine/jimmy-wales-interview.html  [2025-10-18 09:15 UTC]
- Home Depot Deals on Decor and Cookware—Plus BOGO Milwaukee Power Tools (wired)
  https://www.wired.com/story/home-depot-fall-savings-2025  [2025-10-17 22:37 UTC]
- These Comfortable Luxury Earbuds Are $100 Off (Washington Post)
  https://www.wired.com/story/beyerdynamic-amiron-300-deal-1025  [2025-10-15 17:41 UTC]
- iPhone Air review: Apple’s pursuit of absolute thinness (The Guardian)
  https://www.theguardian.com/technology/2025/oct/15/iphone-air-review-apple-pursuit-absolute-thinness  [Unknown date]
- Friday’s global computer meltdown (CNN)
  https://www.cnn.com/videos/tech/2024/07/20/smr-clarke-on-summmer-2024-blackout.cnn  [Unknown date]

=== Politics — 2 most recent (diverse sources) ===
- Thames Water administration call from Lib Dem leader (BBC)
  htt

{'Technology': [{'title': 'Jimmy Wales Thinks the World Should Be More Like Wikipedia',
   'url': 'https://www.nytimes.com/2025/10/18/magazine/jimmy-wales-interview.html',
   'published_date': Timestamp('2025-10-18 09:15:04.766000+0000', tz='UTC'),
   'source': 'New York Times',
   'source_norm': 'New York Times'},
  {'title': 'Home Depot Deals on Decor and Cookware—Plus BOGO Milwaukee Power Tools',
   'url': 'https://www.wired.com/story/home-depot-fall-savings-2025',
   'published_date': Timestamp('2025-10-17 22:37:31.239000+0000', tz='UTC'),
   'source': 'wired',
   'source_norm': 'wired'},
  {'title': 'These Comfortable Luxury Earbuds Are $100 Off',
   'url': 'https://www.wired.com/story/beyerdynamic-amiron-300-deal-1025',
   'published_date': Timestamp('2025-10-15 17:41:49.989000+0000', tz='UTC'),
   'source': 'Washington Post',
   'source_norm': 'Washington Post'},
  {'title': 'iPhone Air review: Apple’s pursuit of absolute thinness',
   'url': 'https://www.theguardian.com/technol

In [25]:
!pip -q install streamlit pyngrok pandas


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/10.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/10.1 MB[0m [31m55.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m10.1/10.1 MB[0m [31m157.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m108.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m6.6/6.9 MB[0m [31m199.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m118.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [26]:
import os
from getpass import getpass

# SECURITY: never store the ngrok token in the notebook. A token that was ever
# committed in a notebook must be considered leaked and revoked in the ngrok
# dashboard. The token is read from the NGROK_AUTH_TOKEN environment variable,
# or prompted for with hidden input when the variable is not set.
NGROK_AUTH_TOKEN = (os.environ.get("NGROK_AUTH_TOKEN") or "").strip()
if not NGROK_AUTH_TOKEN:
    NGROK_AUTH_TOKEN = getpass("ngrok auth token (input hidden): ").strip()
assert NGROK_AUTH_TOKEN, "Set NGROK_AUTH_TOKEN"


In [27]:
%%writefile app.py
import pandas as pd
import streamlit as st
from urllib.parse import urlparse

# Categories offered in the sidebar and accepted by the preview helper.
ALLOWED_CATEGORIES = [
    "Politics","World","Science","Health","Sports",
    "Entertainment","Culture","Society","Technology"
]
# Name of the publication-date column used for sorting and display.
DATE_COL = "published_date"
# Column names checked, in this order, when looking for the article text.
TEXT_COL_CANDIDATES = ["content","text","body"]

# Page setup and title.
st.set_page_config(page_title="News Retriever (Dataset-only)", layout="wide")
st.title("📰 News Retriever — Category Preview")

# ---------- Helpers ----------
@st.cache_data(show_spinner=False)
def load_df(file) -> pd.DataFrame:
    """Read an uploaded dataset into a DataFrame.

    `.csv` files are read with `pd.read_csv`; `.jsonl`/`.json` files are read
    as line-delimited JSON. An empty DataFrame is returned when no file was
    uploaded or when the extension is not supported (an error is shown in the
    latter case).
    """
    if file is None:
        return pd.DataFrame()

    lowered = file.name.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv(file)
    if lowered.endswith((".jsonl", ".json")):
        return pd.read_json(file, lines=True)

    st.error("Please upload .csv or .jsonl/.json")
    return pd.DataFrame()

def ensure_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with the columns the app relies on.

    Guarantees the columns content, title, category, url, source,
    DATE_COL (timezone-aware UTC) and source_norm, then removes rows that
    repeat an earlier row's non-empty URL. An empty frame is returned as is.
    The input frame is not modified.
    """
    if df.empty:
        return df

    df = df.copy()  # keep the caller's (possibly cached) frame untouched

    # text column: expose the first available candidate as "content"
    text_col = next((c for c in TEXT_COL_CANDIDATES if c in df.columns), None)
    if text_col is None:
        df["content"] = ""
    elif text_col != "content":
        df["content"] = df[text_col]

    # basic columns used by the preview; `title` is selected later, so it
    # must exist as well
    for col in ("title", "category", "url", "source"):
        if col not in df.columns:
            df[col] = ""

    # dates (UTC)
    if DATE_COL not in df.columns:
        df[DATE_COL] = pd.NaT
    raw_dates = df[DATE_COL]
    parsed = pd.to_datetime(raw_dates, errors="coerce", utc=True)
    # A vectorized parse can reject rows whose format differs from the rest of
    # the column; retry those individually before giving up on them.
    retry = parsed.isna() & raw_dates.notna()
    if retry.any():
        rescued = raw_dates[retry].map(lambda v: pd.to_datetime(v, errors="coerce", utc=True))
        parsed.loc[retry] = pd.to_datetime(rescued, errors="coerce", utc=True)
    df[DATE_COL] = parsed

    # normalized source for diversity
    df["source_norm"] = df.apply(_canonical_source, axis=1)

    # drop duplicate URLs — but only among rows that actually have one.
    # Missing/empty URLs are all equal to each other, so a plain
    # drop_duplicates would collapse every URL-less article into a single row
    # (the whole dataset when the `url` column had to be created above).
    url_text = df["url"].fillna("").astype(str).str.strip()
    is_repeat = url_text.ne("") & url_text.duplicated(keep="first")
    df = df.loc[~is_repeat].reset_index(drop=True)

    return df

def _canonical_source(row):
    s = str(row.get("source") or "").strip()
    if s:
        return s
    url = str(row.get("url") or "").strip()
    if url:
        try:
            host = urlparse(url).netloc.lower()
            if host.startswith("www."):
                host = host[4:]
            return host or "unknown"
        except Exception:
            pass
    return "unknown"

def pretty_date(dt):
    """Format a timestamp as "YYYY-MM-DD HH:MM UTC".

    Returns "Unknown date" for missing values (None/NaN/NaT) and for any
    value that cannot be formatted with `strftime`.
    """
    fallback = "Unknown date"
    if not pd.isna(dt):
        try:
            return dt.strftime("%Y-%m-%d %H:%M UTC")
        except Exception:
            # Not a datetime-like object; fall through to the placeholder.
            pass
    return fallback

def get_recent_articles_diverse(df: pd.DataFrame, categories, n_per_cat=5, per_source_limit=1):
    """Pick the most recent articles per category with a per-source cap.

    For each requested category found in ALLOWED_CATEGORIES, rows are ordered
    newest first (unknown dates last), at most `per_source_limit` rows are
    kept per `source_norm`, and the first `n_per_cat` are returned.

    Returns a dict mapping category -> list of record dicts with the keys
    title, url, DATE_COL, source and source_norm (an empty list when the
    category has no rows).
    """
    shown_columns = ["title","url",DATE_COL,"source","source_norm"]
    results = {}

    for category in categories:
        if category not in ALLOWED_CATEGORIES:
            continue

        pool = df[df["category"] == category].copy()
        if pool.empty:
            results[category] = []
            continue

        # newest → oldest
        pool = pool.sort_values(DATE_COL, ascending=False, na_position="last")

        if per_source_limit == 1:
            # one per source: the first row per source is its most recent
            pool = pool.drop_duplicates(subset=["source_norm"], keep="first")
        else:
            # several per source, then restore the global newest-first order
            capped = pool.groupby("source_norm", group_keys=False).head(per_source_limit)
            pool = capped.sort_values(DATE_COL, ascending=False, na_position="last")

        results[category] = pool.head(n_per_cat)[shown_columns].to_dict(orient="records")

    return results

def article_card(item: dict):
    """Render one article as an HTML card.

    `item` is a record dict with the keys title, url, source, source_norm and
    DATE_COL. All values come from the uploaded dataset and are therefore
    untrusted: every piece of text is HTML-escaped before being placed in the
    markup, and the link is only rendered for http(s) URLs.
    """
    import html  # local import: only this function needs it

    def _text(value, default=""):
        # Missing cells arrive as None or NaN; show the default instead of "nan".
        if value is None or (not isinstance(value, str) and pd.isna(value)):
            return default
        return str(value).strip() or default

    title = _text(item.get("title"), "(Untitled)")
    src   = _text(item.get("source")) or _text(item.get("source_norm"), "unknown")
    url   = _text(item.get("url"))
    dt    = item.get(DATE_COL)

    # Only link to web URLs; this keeps schemes such as `javascript:` out of href.
    try:
        scheme = urlparse(url).scheme.lower()
    except ValueError:
        scheme = ""
    if scheme not in ("http", "https"):
        url = ""

    # Built line by line (no blank or indented lines) so the Markdown renderer
    # treats the whole card as a single HTML block.
    parts = [
        '<div style="border:1px solid #e9ecef;border-radius:14px;padding:16px;height:100%">',
        '<div style="font-weight:600;font-size:1.05rem;line-height:1.3;margin-bottom:6px">'
        f"{html.escape(title)}</div>",
        '<div style="color:#6c757d;font-size:0.9rem;margin-bottom:8px">'
        f"{html.escape(src)} • {html.escape(pretty_date(dt))}</div>",
    ]
    if url:
        parts.append(
            f'<a href="{html.escape(url, quote=True)}" target="_blank" '
            'rel="noopener noreferrer" style="text-decoration:none">Open article ↗</a>'
        )
    parts.append("</div>")

    st.markdown("\n".join(parts), unsafe_allow_html=True)

# ---------- Sidebar ----------
# Collects the uploaded file and the preview options used by the main section.
with st.sidebar:
    st.header("Upload dataset")
    up = st.file_uploader("CSV or JSONL/JSON", type=["csv","jsonl","json"])
    st.caption("Needs: category, title, url, and a text column (content/text). Optional: published_date, source.")

    st.header("Filters")
    chosen = st.multiselect("Categories", ALLOWED_CATEGORIES, default=["Technology","Politics"])
    n_per_cat = st.slider("Articles per category", 1, 10, 5)
    per_source_limit = st.slider("Max per source (diversity)", 1, 3, 1)

    show_table = st.checkbox("Show raw table", value=False)

# ---------- Main ----------
# Stop early (with a hint) until a dataset has been uploaded.
df = load_df(up)
if df.empty:
    st.info("Upload a dataset to begin.")
    st.stop()

# Normalize columns, dates and sources before building the preview.
df = ensure_schema(df)

if not chosen:
    st.warning("Choose at least one category.")
    st.stop()

results = get_recent_articles_diverse(df, categories=chosen, n_per_cat=n_per_cat, per_source_limit=per_source_limit)

# One section per chosen category, with cards laid out in three columns.
for cat in chosen:
    items = results.get(cat, [])
    st.subheader(f"{cat} — {len(items)} most recent (diverse sources)")
    if not items:
        st.caption("No articles found.")
        continue

    cols = st.columns(3)
    for i, item in enumerate(items):
        # i % 3 cycles through the three columns, filling them row by row.
        with cols[i % 3]:
            article_card(item)

    if show_table:
        st.dataframe(pd.DataFrame(items))


Writing app.py


In [None]:
import subprocess, time, re, os, sys
from pyngrok import ngrok

# Kill old tunnels/servers so re-running this cell does not leave stale processes behind
ngrok.kill()
!pkill -f "streamlit run app.py" || true

# Start Streamlit (background)
port = 8501
# stdout and stderr are merged into one text pipe so the log lines can be read below.
# NOTE(review): only the first few lines of this pipe are ever read; if Streamlit
# keeps logging, the pipe could presumably fill up — consider a log file. Confirm.
proc = subprocess.Popen(
    ["streamlit", "run", "app.py", "--server.port", str(port), "--server.address", "0.0.0.0"],
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)

# ngrok tunnel (requires NGROK_AUTH_TOKEN from the earlier cell)
ngrok.set_auth_token(NGROK_AUTH_TOKEN)
public_url = ngrok.connect(port, "http").public_url
print("Public URL:", public_url)

# Optional: stream a few lines of Streamlit logs to know it's up.
# readline() waits for the next line; an empty string means the process closed its output.
for _ in range(10):
    line = proc.stdout.readline()
    if not line: break
    print(line.strip())
time.sleep(2)
print("✅ Open the URL above in your browser.")


^C
Public URL: https://029a7153d349.ngrok-free.app

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.


You can now view your Streamlit app in your browser.

URL: http://0.0.0.0:8501



In [31]:
!git config --global user.name "siwarbouali25"
!git config --global user.email "siwar.bouali@esprit.tn"


In [32]:
!git clone https://github.com/siwarbouali25/News-Agent.git


Cloning into 'News-Agent'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 6 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), 7.28 MiB | 12.00 MiB/s, done.
Resolving deltas: 100% (1/1), done.
