In [2]:
# --- LLM Configuration (LangChain + Ollama) ---
from langchain_ollama import ChatOllama

# Settings a reader may want to tune, kept together in one place.
LLM_NAME = "mistral"
TEMPERATURE = 0.1
TIMEOUT = 60.0  # seconds

# 1) Build the chat model client from the settings above.
#    `llm` is a notebook-level global that later cells call via llm.invoke().
llm = ChatOllama(model=LLM_NAME, temperature=TEMPERATURE, timeout=TIMEOUT)
print("✅ ChatOllama ready")

# 2) Smoke test: send one prompt and print the text of the reply.
response = llm.invoke("Hello! Can you summarize what LangChain is?")
print(response.content)


✅ ChatOllama ready
 LangChain is an open-source project that aims to create a decentralized, blockchain-based platform for building and deploying artificial intelligence models. The goal of LangChain is to make it easier for developers to build, share, and monetize AI applications by providing a secure, transparent, and scalable infrastructure.

The LangChain platform uses smart contracts on the Ethereum blockchain to manage the lifecycle of AI models, including training, deployment, and execution. This allows for decentralized storage and computation of AI models, reducing the reliance on centralized cloud providers and enabling more efficient use of resources.

LangChain also includes a marketplace where developers can buy and sell AI models, as well as tools for creating and managing AI applications. The project is still in development, but it has the potential to revolutionize the way AI is developed and deployed by providing a decentralized, open-source alternative to traditional 

In [3]:
# --- Imports ---
import pandas as pd
import feedparser
from urllib.parse import quote


In [10]:
# --- Step 1: RSS ---
import pandas as pd
import feedparser
from urllib.parse import quote

def google_news_rss_min_df_en(query: str = "Harry Potter", limit: int = 2) -> pd.DataFrame:
    """
    Collect minimal article metadata from the Google News RSS search feed.

    The feed is requested with English / US locale parameters. Only the first
    `limit` entries are read; rows are then de-duplicated by title and
    afterwards by link, so fewer than `limit` rows may be returned.

    Args:
        query (str): Search keyword (default = "Harry Potter")
        limit (int): Number of feed entries to read
    Returns:
        DataFrame with columns [title, link, published, source]; missing
        fields are stored as empty strings.
    """
    columns = ["title", "link", "published", "source"]

    # URL-encode the keyword and request the English / US edition of the feed
    feed_url = f"https://news.google.com/rss/search?q={quote(query)}&hl=en&gl=US&ceid=US:en"
    parsed = feedparser.parse(feed_url)

    def to_record(entry) -> dict:
        # Each field falls back to "" when the attribute is absent or falsy
        publisher = getattr(entry, "source", None)  # e.g. BBC, USA Today
        return {
            "title": (getattr(entry, "title", "") or "").strip(),
            "link": getattr(entry, "link", "") or "",
            "published": getattr(entry, "published", "") or "",
            "source": getattr(publisher, "title", "") or "",
        }

    records = [to_record(entry) for entry in parsed.entries[:limit]]
    frame = pd.DataFrame(records, columns=columns)

    # Keep the first occurrence of each title, then of each link
    frame = frame.drop_duplicates(subset="title")
    frame = frame.drop_duplicates(subset="link")
    return frame.reset_index(drop=True)


In [5]:
# --- Helpers ---
import requests
from urllib.parse import urlparse, parse_qs, unquote

UA = {"User-Agent": "Mozilla/5.0"}

def dedup_by_title(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop rows whose title or link repeats an earlier row.

    Titles are compared case-insensitively and ignoring surrounding
    whitespace (missing titles count as ""); links are compared exactly.
    The first occurrence is kept in both passes.

    Args:
        df: Frame with at least "title" and "link" columns.
    Returns:
        A new, re-indexed frame. An empty input is returned as-is.
    """
    if df.empty:
        return df

    # Normalised comparison key; lives in a temporary "__k" column
    normalized = df["title"].fillna("").str.casefold().str.strip()
    keyed = df.assign(**{"__k": normalized})

    unique_titles = keyed.drop_duplicates(subset="__k")
    unique_links = unique_titles.drop_duplicates(subset="link")
    return unique_links.drop(columns="__k").reset_index(drop=True)

def resolve_final_url(url: str) -> str:
    """
    Resolve a (possibly redirecting) link to the URL it finally lands on.

    Best-effort: on any parse or network failure the best URL known so far
    is returned instead of raising.

    Args:
        url (str): Link to resolve, e.g. a Google News RSS link.
    Returns:
        str: URL after following HTTP redirects, or the input (or its
        embedded ?url= target) if the request fails. Empty input is
        returned unchanged without touching the network.
    """
    if not url:
        return url

    # 1) If a news.google.com link carries a ?url= parameter, use that value
    try:
        p = urlparse(url)
        if "news.google.com" in p.netloc:
            q = parse_qs(p.query)
            if q.get("url"):
                # parse_qs has already percent-decoded the value; decoding it a
                # second time would corrupt targets that contain an encoded "%"
                # (e.g. "%2520" must become "%20", not a space).
                url = q["url"][0]
    except ValueError:
        # Malformed URL: fall through and let the HTTP step try it as given
        pass

    # 2) Follow HTTP redirects
    try:
        # stream=True: only the final URL is needed, so the body is not downloaded;
        # the context manager releases the connection.
        with requests.get(url, headers=UA, timeout=12, allow_redirects=True, stream=True) as r:
            return r.url or url
    except requests.RequestException:
        # Not a bare except, so KeyboardInterrupt / SystemExit still propagate
        return url


In [6]:
# --- Step 2: Extract ---
import trafilatura
import requests   # make sure requests is imported (used for fetching the webpage content)

def extract_article_text(url: str) -> str:
    """
    Extract the main text from a news article URL.

    Best-effort: any download or extraction failure yields "" instead of
    raising, so callers can treat an empty string as "no usable content".

    Args:
        url (str): Article web link
    Returns:
        str: Clean article text, or empty string if failed
    """
    if not url:
        return ""
    try:
        # Download the webpage (follow redirects)
        r = requests.get(url, headers=UA, timeout=15, allow_redirects=True)

        # If failed (error code or no content), return empty
        if r.status_code >= 400 or not r.text:
            return ""

        # Extract article text (remove comments, tables, metadata)
        txt = trafilatura.extract(
            r.text,
            include_comments=False,
            include_tables=False,
            favor_recall=True,      # try to capture as much as possible
            with_metadata=False     # skip extra metadata
        )

        # Return stripped text (or empty if None)
        return (txt or "").strip()
    except Exception:
        # In case of error (e.g. timeout, parse error), return empty.
        # `except Exception` (not a bare except) so that KeyboardInterrupt /
        # SystemExit still stop a long-running notebook cell.
        return ""


In [11]:
# --- Step 3: Pipeline ---
import time
import pandas as pd

def build_news_df_simple(query: str = "Harry Potter", limit: int = 2, min_chars: int = 300,
                         delay: float = 0.2) -> pd.DataFrame:
    """
    Full pipeline:
    1) Fetch news metadata from Google News RSS
    2) Remove duplicate titles/links
    3) Extract article body text
    4) Filter out short articles (by min_chars)
    5) Return as DataFrame

    Args:
        query (str): Search keyword passed to the RSS step
        limit (int): Number of feed entries to read
        min_chars (int): Minimum body length for an article to be kept
        delay (float): Seconds to pause between articles (0 disables the pause)
    Returns:
        DataFrame with columns [title, link, content]; may be empty.
    """
    # Steps 1-2: fetch metadata (title, link, source, etc.) and drop duplicates
    meta = dedup_by_title(google_news_rss_min_df_en(query, limit=limit))

    rows = []
    last_pos = len(meta) - 1
    for pos, (i, r) in enumerate(meta.iterrows()):
        raw_url = r["link"]

        # Step 3: Resolve redirect to get final article URL
        final_url = resolve_final_url(raw_url)

        # Step 4: Try the final URL first; fall back to the raw RSS link only
        # when it is a different URL (re-fetching the same one cannot help).
        body = extract_article_text(final_url)
        if not body and final_url != raw_url:
            body = extract_article_text(raw_url)

        # Debug print: show text length and part of the title
        print(f"[{i}] len={len(body):4d}  {r['title'][:40]}...")

        # Step 5: Only keep if article is long enough
        if body and len(body) >= min_chars:
            rows.append({
                "title": r["title"].strip(),
                "link": final_url,
                "content": body
            })

        # Small delay to avoid overloading servers (not needed after the last article)
        if delay > 0 and pos < last_pos:
            time.sleep(delay)

    # Step 6: Return DataFrame with clean results
    return pd.DataFrame(rows, columns=["title", "link", "content"])


In [12]:
# --- Step 4: Summarize ---
def summarize_with_llm(text: str) -> str:
    """
    Ask the notebook-level `llm` for a 3-bullet summary of an article.

    Args:
        text (str): Article body; a falsy value (e.g. "" or None) skips the LLM call.
    Returns:
        str: The `.content` of the LLM reply, or "" when there is no text.
    """
    if text:
        # Simple prompt asking for 3 concise bullets, followed by the article
        prompt = "Summarize in 3 short bullets:\n\n" + text
        reply = llm.invoke(prompt)
        return reply.content
    return ""


In [13]:
if __name__ == "__main__":
    # 0) Prepare the LLM (Ollama must be running: ollama serve / ollama pull mistral)
    from langchain_ollama import ChatOllama
    llm = ChatOllama(model="mistral", temperature=0.1, timeout=60.0)

    # 1) Collect news and extract the article bodies
    df = build_news_df_simple(query="Harry Potter", limit=2, min_chars=300)
    print("\n=== Titles ===")
    print(df[["title", "link"]])

    # 2) Summarize each article that survived the length filter
    if df.empty:
        print("\n(No rows passed the min_chars filter)")
    else:
        print("\n=== Summaries ===")
        for i, row in df.iterrows():
            print(f"\n[{i}] {row['title']}")
            print(summarize_with_llm(row["content"]))


[0] len=   0  ‘Harry Potter’ HBO Series Reveals Weasle...
[1] len=   0  Watch: Filming of new Harry Potter serie...

=== Titles ===
Empty DataFrame
Columns: [title, link]
Index: []

(No rows passed the min_chars filter)
