In [1]:
"""
Minimal Google News RSS → Full-text → DataFrame pipeline
- Step 1: Collect news (RSS via feedparser)
- Step 2: Extract article body (trafilatura)
- Step 3: Tidy into a DataFrame (with dedup)
- Step 4: (Optional) Summarize via ChatOllama (Mistral)

Requirements:
  pip install feedparser trafilatura pandas tldextract tqdm
"""

'\nMinimal Google News RSS → Full-text → DataFrame pipeline\n- Step 1: Collect news (RSS via feedparser)\n- Step 2: Extract article body (trafilatura)\n- Step 3: Tidy into a DataFrame (with dedup)\n- Step 4: (Optional) Summarize via ChatOllama (Mistral)\n\nRequirements:\n  pip install feedparser trafilatura pandas tldextract tqdm\n'

In [26]:
# --- LLM Configuration (LangChain + Ollama) ---
# Settings for the chat model used by the optional summarization step.
LLM_NAME = "mistral"
TEMPERATURE = 0.1
TIMEOUT = 60.0  # seconds

from langchain_ollama import ChatOllama

# 1) Build the module-level `llm` client; summarize_with_llm() calls llm.invoke(...) on it.
# NOTE(review): confirm that the installed ChatOllama accepts `timeout` as a direct
# keyword; if it does not, the value may be ignored instead of enforced — TODO verify.
llm = ChatOllama(
    model=LLM_NAME,
    temperature=TEMPERATURE,
    timeout=TIMEOUT
)

print("✅ ChatOllama ready")

# 2) Smoke test: send a single prompt and print the reply text.
# The reply is free-form model output and is not checked for accuracy.
response = llm.invoke("Hello! Can you summarize what LangChain is?")
print(response.content)


✅ ChatOllama ready
 LangChain is an open-source project that aims to create a decentralized, blockchain-based platform for building and deploying natural language processing (NLP) models. The goal of LangChain is to make it easier for developers to build and share NLP models, while also ensuring that the data used to train these models is secure, private, and transparent.

LangChain uses a combination of smart contracts on the Ethereum blockchain and machine learning algorithms to enable the creation and deployment of NLP models in a decentralized manner. The platform allows developers to train models using their own data, while also providing tools for collaborating with others and sharing models. LangChain also includes mechanisms for incentivizing participation in the network through the use of its native cryptocurrency, LGNT.

Overall, LangChain is designed to address some of the challenges associated with building and deploying NLP models, such as data privacy, security, and scala

In [125]:
# Debug aid: print the path of the Python interpreter running this kernel.
# NOTE: the printed path is machine-specific; clear this output before sharing the notebook.
import sys
print(sys.executable)

/Users/jessicahong/nlp_env/bin/python


In [5]:
# --- Imports ---
import pandas as pd
import feedparser
from urllib.parse import quote


In [28]:
# --- Step 1: RSS Function ---
import pandas as pd
import feedparser
from urllib.parse import quote

def google_news_rss_min_df_en(query: str = "Harry Potter", limit: int = 2) -> pd.DataFrame:
    """
    Query the Google News (US/English) RSS search feed and tabulate the entries.

    Args:
        query: Search phrase; it is URL-quoted before being placed in the feed URL.
        limit: Number of leading feed entries to read (applied before deduplication,
            so the result can contain fewer rows than `limit`).

    Returns:
        DataFrame[title, link, published, source] with duplicate titles and then
        duplicate links removed (first occurrence kept) and a fresh 0..n-1 index.
    """
    columns = ["title", "link", "published", "source"]
    feed_url = f"https://news.google.com/rss/search?q={quote(query)}&hl=en&gl=US&ceid=US:en"
    parsed = feedparser.parse(feed_url)

    def to_record(item) -> dict:
        # Missing or falsy attributes collapse to "" so every cell is a string.
        publisher = getattr(item, "source", None)
        return {
            "title": (getattr(item, "title", "") or "").strip(),
            "link": getattr(item, "link", "") or "",
            "published": getattr(item, "published", "") or "",
            "source": getattr(publisher, "title", "") or "",
        }

    records = [to_record(item) for item in parsed.entries[:limit]]

    # Deduplicate by title first, then by link.
    return (
        pd.DataFrame(records, columns=columns)
        .drop_duplicates(subset="title", keep="first")
        .drop_duplicates(subset="link", keep="first")
        .reset_index(drop=True)
    )

# --- Test run ---
# Smoke test: fetch the feed with the default arguments and print the resulting frame.
if __name__ == "__main__":
    df = google_news_rss_min_df_en()  # Default query = "Harry Potter", 2 articles
    print(df)


                                                                                title  \
0                   Watch: Filming of new Harry Potter series spotted in London - BBC   
1  See photos of new-look Weasley siblings in HBO's 'Harry Potter' series - USA Today   

                                                                                                  link  \
0  https://news.google.com/rss/articles/CBMiV0FVX3lxTE1RdWpFVGI0cXVrdDh4b181V0dpWWdvcTZBREdwUmNpbm9...   
1  https://news.google.com/rss/articles/CBMipAFBVV95cUxNaGJGb2xJa2J5UmRzQVJYdkFQck1pVTFkZW9pRkVTXzV...   

                       published     source  
0  Tue, 19 Aug 2025 14:38:00 GMT        BBC  
1  Tue, 19 Aug 2025 19:30:00 GMT  USA Today  


In [29]:
# --- Step 2: Article Extraction---
import trafilatura

def extract_article_text(url: str) -> str:
    """
    Download a page and return its main body text via trafilatura.

    Comments and tables are excluded from the extraction. No length filtering
    happens here; that is left to the pipeline step.

    Args:
        url: Address of the article page.

    Returns:
        The stripped body text, or "" when `url` is falsy, the download yields
        nothing, nothing can be extracted, or any exception is raised.
    """
    if not url:
        return ""
    try:
        page = trafilatura.fetch_url(url)
        if page:
            extracted = trafilatura.extract(
                page,
                include_comments=False,
                include_tables=False,
            )
            return (extracted or "").strip()
        return ""
    except Exception:
        # Best-effort by design: a failed fetch/parse means "no content".
        return ""


In [32]:
# --- Step 2: Article Extraction test using Step 1 output ---
# Prereqs: google_news_rss_min_df_en, resolve_final_url, extract_article_text are defined.
# NOTE(review): resolve_final_url is defined *below* this block in the same cell, so on a
# fresh kernel this test raises NameError unless a helpers cell has already been run.

if __name__ == "__main__":
    # 1) Get links from Step 1
    df = google_news_rss_min_df_en("Harry Potter", limit=2)
    print(df[["title", "link"]])

    if not df.empty:
        # Pick the first link (use index 1 if you want the second)
        test_url = df.iloc[0]["link"]

        # 2) Try to resolve to the real article URL (not the Google News redirect)
        final_url = resolve_final_url(test_url)

        print("\n=== Step 2: Article Extraction Test ===")
        print("Original:", test_url)
        print("Final   :", final_url)

        # 3) Try extraction (final URL first, then fallback to original)
        body = extract_article_text(final_url) or extract_article_text(test_url)

        if body:
            print("Length:", len(body))
            print("Preview:", body[:300], "...")
        else:
            print("(No content extracted)")
    else:
        print("(No rows from Step 1)")
# --- Helpers ---
import time
import pandas as pd
import requests
from urllib.parse import urlparse, parse_qs, unquote

UA = {"User-Agent": "Mozilla/5.0"}  # Simple UA to avoid basic blocks

def dedup_by_title(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop repeated articles: first by normalized title, then by link.

    Titles are compared after replacing missing values with "", casefolding and
    trimming surrounding whitespace. The first occurrence wins, row order is
    kept, and the result gets a fresh 0..n-1 index. The input frame is not
    modified; an empty frame is returned as-is.
    """
    if df.empty:
        return df
    normalized_titles = df["title"].fillna("").str.casefold().str.strip()
    return (
        df.assign(**{"__title_key": normalized_titles})  # temporary comparison key
        .drop_duplicates(subset="__title_key", keep="first")
        .drop_duplicates(subset="link", keep="first")
        .drop(columns="__title_key")
        .reset_index(drop=True)
    )

def resolve_gnews_link(url: str) -> str:
    """
    Unwrap a Google News redirect link by reading its '?url=' query parameter.

    Args:
        url: Any URL string.

    Returns:
        The target carried in the 'url' query parameter when the host contains
        'news.google.com' and the parameter is present and non-empty; otherwise
        the input unchanged (this includes Google News links that carry no
        'url' parameter). Any parsing error also returns the input unchanged.

    Note:
        parse_qs already percent-decodes parameter values, so the value is
        returned as-is. Decoding it a second time would corrupt targets that
        contain an encoded '%' (e.g. '%2520' must become '%20', not a space).
    """
    try:
        parts = urlparse(url)
        if "news.google.com" in parts.netloc:
            targets = parse_qs(parts.query).get("url")
            if targets:
                return targets[0]
        return url
    except Exception:
        # Best-effort: an unparsable input is handed back untouched.
        return url

def resolve_final_url(url: str) -> str:
    """
    Best-effort resolution of a link to the URL it finally lands on.

    1) Strip the Google News '?url=' wrapper if present (resolve_gnews_link).
    2) Issue a GET and follow HTTP redirects; return the URL of the last response.

    Args:
        url: Link to resolve (typically a Google News RSS link).

    Returns:
        The URL of the final response, or the unwrapped URL from step 1 if the
        request fails for any reason.

    Note:
        Only HTTP-level redirects are followed. In the run recorded in this
        notebook the "Final" URL for an RSS article link was still on
        news.google.com.
    """
    u = resolve_gnews_link(url)
    try:
        # stream=True: only the final URL is needed, so the body is not downloaded;
        # the context manager releases the connection when done.
        with requests.get(u, headers=UA, timeout=12, allow_redirects=True, stream=True) as r:
            return r.url or u
    except Exception:
        return u


                                                                                title  \
0  See photos of new-look Weasley siblings in HBO's 'Harry Potter' series - USA Today   
1                   Watch: Filming of new Harry Potter series spotted in London - BBC   

                                                                                                  link  
0  https://news.google.com/rss/articles/CBMipAFBVV95cUxNaGJGb2xJa2J5UmRzQVJYdkFQck1pVTFkZW9pRkVTXzV...  
1  https://news.google.com/rss/articles/CBMiV0FVX3lxTE1RdWpFVGI0cXVrdDh4b181V0dpWWdvcTZBREdwUmNpbm9...  

=== Step 2: Article Extraction Test ===
Original: https://news.google.com/rss/articles/CBMipAFBVV95cUxNaGJGb2xJa2J5UmRzQVJYdkFQck1pVTFkZW9pRkVTXzVoWkJULVYyRlYxeUFMMG9TTjc1NU1NbjB1NWZvc3pWeF9YdHBJLURnMjZ1Y0E2Tlk4Q0UtQmxrV21pZS1yNVh1RFBPaFA1S2NsNGpReDFZNVVVcU9nYUUzM2xYVERSYjN5LWRfTERTbE0wM2U5dkFQWjZuUFMyYjEzcTR0Yw?oc=5
Final   : https://news.google.com/rss/articles/CBMipAFBVV95cUxNaGJGb2xJa2J5UmRzQVJYdkFQck1pVTFkZW

In [24]:
# --- Helpers ---
# NOTE(review): this cell repeats the helper definitions that also appear at the end
# of the Step 2 test cell; whichever copy runs last is the one in effect.
import time
import pandas as pd
import requests                             # ✅ added
from urllib.parse import urlparse, parse_qs, unquote

UA = {"User-Agent": "Mozilla/5.0"}          # ✅ added: minimal User-Agent header

def dedup_by_title(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop repeated articles: first by normalized title, then by link.

    Titles are compared after replacing missing values with "", casefolding and
    trimming surrounding whitespace. The first occurrence wins, row order is
    kept, and the result gets a fresh 0..n-1 index. The input frame is not
    modified; an empty frame is returned as-is.
    """
    if df.empty:
        return df
    normalized_titles = df["title"].fillna("").str.casefold().str.strip()
    return (
        df.assign(**{"__title_key": normalized_titles})  # temporary comparison key
        .drop_duplicates(subset="__title_key", keep="first")
        .drop_duplicates(subset="link", keep="first")
        .drop(columns="__title_key")
        .reset_index(drop=True)
    )

def resolve_gnews_link(url: str) -> str:
    """
    Unwrap a Google News redirect link by reading its '?url=' query parameter.

    Args:
        url: Any URL string.

    Returns:
        The target carried in the 'url' query parameter when the host contains
        'news.google.com' and the parameter is present and non-empty; otherwise
        the input unchanged (this includes Google News links that carry no
        'url' parameter). Any parsing error also returns the input unchanged.

    Note:
        parse_qs already percent-decodes parameter values, so the value is
        returned as-is. Decoding it a second time would corrupt targets that
        contain an encoded '%' (e.g. '%2520' must become '%20', not a space).
    """
    try:
        parts = urlparse(url)
        if "news.google.com" in parts.netloc:
            targets = parse_qs(parts.query).get("url")
            if targets:
                return targets[0]
        return url
    except Exception:
        # Best-effort: an unparsable input is handed back untouched.
        return url

def resolve_final_url(url: str) -> str:
    """
    Best-effort resolution of a link to the URL it finally lands on.

    1) Strip the Google News '?url=' wrapper if present (resolve_gnews_link).
    2) Issue a GET and follow HTTP redirects; return the URL of the last response.

    Args:
        url: Link to resolve (typically a Google News RSS link).

    Returns:
        The URL of the final response, or the unwrapped URL from step 1 if the
        request fails for any reason.

    Note:
        Only HTTP-level redirects are followed. In the run recorded in this
        notebook the "Final" URL for an RSS article link was still on
        news.google.com.
    """
    u = resolve_gnews_link(url)
    try:
        # stream=True: only the final URL is needed, so the body is not downloaded;
        # the context manager releases the connection when done.
        with requests.get(u, headers=UA, timeout=12, allow_redirects=True, stream=True) as r:
            return r.url or u
    except Exception:
        return u


In [25]:
# --- Quick sanity test for the helpers ---
if __name__ == "__main__":
    import pandas as pd

    # Sample DataFrame
    sample = pd.DataFrame([
        {"title": "AI News Today", "link": "http://example.com/1"},
        {"title": "AI News Today", "link": "http://example.com/1"},  # duplicate
        {"title": "Another AI Update", "link": "http://example.com/2"},
    ])

    print("=== dedup_by_title ===")
    print(dedup_by_title(sample))

    # Pure string handling: no network access involved.
    print("\n=== resolve_gnews_link ===")
    gnews_url = "https://news.google.com/articles/xxx?url=https%3A%2F%2Foriginalsite.com%2Farticle"
    print("Before:", gnews_url)
    print("After :", resolve_gnews_link(gnews_url))

    # resolve_final_url issues a real HTTP request to the unwrapped URL and
    # falls back to that URL if the request fails.
    print("\n=== resolve_final_url ===")
    print("Before:", gnews_url)
    print("After :", resolve_final_url(gnews_url))


=== dedup_by_title ===
               title                  link
0      AI News Today  http://example.com/1
1  Another AI Update  http://example.com/2

=== resolve_gnews_link ===
Before: https://news.google.com/articles/xxx?url=https%3A%2F%2Foriginalsite.com%2Farticle
After : https://originalsite.com/article

=== resolve_final_url ===
Before: https://news.google.com/articles/xxx?url=https%3A%2F%2Foriginalsite.com%2Farticle
After : https://originalsite.com/article


In [33]:
# --- Step 3: Pipeline ---
import time
import pandas as pd

def build_news_df_simple(query: str, limit: int = 2, min_chars: int = 300) -> pd.DataFrame:
    """
    Google News RSS → fulltext → DataFrame(title, link, content)

    Steps:
      1. Fetch feed metadata (google_news_rss_min_df_en) and dedup by title/link.
      2. For each row, resolve the link (resolve_final_url) and extract the body,
         trying the resolved URL first and the raw RSS link as a fallback.
      3. Keep only rows whose body has at least `min_chars` characters.

    Args:
        query: Search phrase passed to the RSS step.
        limit: Number of feed entries to read.
        min_chars: Minimum body length for an article to be kept.

    Returns:
        DataFrame[title, link, content]; `link` holds the resolved URL. The frame
        is empty (with those columns) when no article passes the length filter.

    Side effects:
        Prints one progress line per article and sleeps 0.2 s between articles.

    NOTE(review): in the runs recorded in this notebook every article came back
    with len=0 and no rows were kept — presumably the resolved URLs still point at
    Google News rather than the publisher; confirm before relying on the output.
    """
    # 1) fetch metadata, 2) dedup
    meta = dedup_by_title(google_news_rss_min_df_en(query, limit=limit))

    rows = []
    # 3) extract article body
    for i, r in meta.iterrows():
        raw_link = r["link"]
        final_url = resolve_final_url(raw_link)
        resolved = final_url != raw_link

        # Try the final URL first; fall back to the raw RSS link only when it is
        # a different address, so the same page is never downloaded twice.
        body = extract_article_text(final_url)
        if not body and resolved:
            body = extract_article_text(raw_link)

        print(f"[{i}] len={len(body):4d}  title={r['title'][:40]}...  final={'Y' if resolved else 'N'}")

        # 4) length filter
        if body and len(body) >= min_chars:
            rows.append({
                "title": r["title"].strip(),
                "link": final_url,  # store the resolved URL
                "content": body
            })

        time.sleep(0.2)  # polite delay

    # 5) return final DataFrame (explicit columns keep the schema when rows is empty)
    return pd.DataFrame(rows, columns=["title", "link", "content"])


# --- Run: Pipeline Test ---
# Runs the whole pipeline against the live feed and prints the rows that were kept.
if __name__ == "__main__":
    df = build_news_df_simple("Harry Potter", limit=3, min_chars=300)
    print("\n=== Final DataFrame (title, link) ===")
    print(df[["title", "link"]])



[0] len=   0  title=See photos of new-look Weasley siblings ...  final=Y
[1] len=   0  title=Watch: Filming of new Harry Potter serie...  final=Y
[2] len=   0  title='Harry Potter' TV Series First Look: Dom...  final=Y

=== Final DataFrame (title, link) ===
Empty DataFrame
Columns: [title, link]
Index: []


In [34]:
# --- Step 4: Summarize with LLM  ---
def summarize_with_llm(text: str) -> str:
    """
    Ask the module-level `llm` for a 3-bullet summary of an article body.

    Args:
        text: Article body to summarize.

    Returns:
        The reply's `content` attribute (or `str(reply)` if it has none);
        "" when `text` is falsy, in which case the model is not called.
    """
    if text:
        instructions = (
            "Summarize the following news article in 3 concise bullet points.\n"
            "Focus on facts (who/what/when/where), key numbers, and outcomes.\n\n"
        )
        # `llm` must already exist (created in the LLM configuration cell).
        reply = llm.invoke(f"{instructions}{text}")
        return getattr(reply, "content", str(reply))
    return ""


In [35]:
# End-to-end check: build the article frame, then summarize the first article (if any).
if __name__ == "__main__":
    df = build_news_df_simple("Harry Potter", limit=2, min_chars=300)
    print("\n=== Step 3 DataFrame ===")
    print(df[["title", "link"]])

    # The summary step is skipped entirely when no article passed the length filter.
    if not df.empty:
        first_article = df.iloc[0]["content"]
        print("\n=== Step 4: LLM Summary ===")
        summary = summarize_with_llm(first_article)
        print(summary)


[0] len=   0  title=HBO’s Harry Potter Series Casts More Wea...  final=Y
[1] len=   0  title=Watch: Filming of new Harry Potter serie...  final=Y

=== Step 3 DataFrame ===
Empty DataFrame
Columns: [title, link]
Index: []
