In [None]:
# Research Agent

import re
from datetime import datetime
from collections import Counter

from langchain.tools import WikipediaQueryRun
from langchain.utilities import WikipediaAPIWrapper

from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_community.document_loaders import WebBaseLoader

# --- Tools ---
def search_wikipedia(query: str) -> str:
    """Run ``query`` through the LangChain Wikipedia tool and return its text.

    Args:
        query: Free-text search string handed to ``WikipediaQueryRun.run``.

    Returns:
        The tool's output, or an ``"Error searching Wikipedia: ..."`` message
        carrying the ``repr`` of whatever exception was raised. This function
        never raises for ordinary failures.
    """
    try:
        wrapper = WikipediaAPIWrapper()
        wikipedia = WikipediaQueryRun(api_wrapper=wrapper)
        return wikipedia.run(query)
    except Exception as exc:
        return f"Error searching Wikipedia: {exc!r}"


def search_duckduckgo(query: str) -> str:
    """Run ``query`` through the LangChain DuckDuckGo search tool.

    The wrapper is configured for the ``us-en`` region, moderate safe-search,
    no time restriction and at most 8 results.

    Args:
        query: Free-text search string.

    Returns:
        The tool's text output, or an ``"Error searching DuckDuckGo: ..."``
        message carrying the ``repr`` of the exception that was raised.
    """
    settings = {
        "region": "us-en",
        "safesearch": "moderate",
        "time": None,
        "max_results": 8,
    }
    try:
        wrapper = DuckDuckGoSearchAPIWrapper(**settings)
        searcher = DuckDuckGoSearchRun(api_wrapper=wrapper)
        # The tool is a Runnable, so it is called through .invoke().
        return searcher.invoke(query)
    except Exception as exc:
        return f"Error searching DuckDuckGo: {exc!r}"


def _trim_trailing_punctuation(url: str) -> str:
    """Strip punctuation that surrounding prose glues onto the end of a URL.

    A closing parenthesis is removed only while it is unbalanced, so links
    such as ``https://en.wikipedia.org/wiki/XZ_(software)`` stay intact while
    ``(https://example.com/a),`` still becomes ``https://example.com/a``.
    """
    while url and url[-1] in ").,]":
        if url[-1] == ")" and url.count("(") >= url.count(")"):
            break
        url = url[:-1]
    return url


def extract_urls(query: str, ddg_text: str, max_results: int = 8) -> list[str]:
    """Collect candidate URLs for ``query``, preferring ones in ``ddg_text``.

    Args:
        query: Search string used only when ``ddg_text`` contains no URL.
        ddg_text: Text returned by the DuckDuckGo search step.
        max_results: Result limit passed to the fallback structured search.
            It does not cap the number of URLs found directly in ``ddg_text``.

    Returns:
        URLs in first-seen order, with trailing punctuation trimmed and
        duplicates removed. The list is empty when nothing is found or the
        fallback search fails.
    """
    urls = re.findall(r"https?://[^\s\]]+", ddg_text or "")
    if not urls:
        # Best effort, like the other tools in this file: a failed fallback
        # search yields "no URLs" rather than aborting the whole run.
        try:
            api = DuckDuckGoSearchAPIWrapper(
                region="us-en", safesearch="moderate", time=None, max_results=max_results
            )
            results = api.results(query, max_results=max_results)
        except Exception:
            results = []
        # Result dicts are expected to expose the address under "link"
        # (or "href" as an alternative key); entries with neither are skipped.
        links = (r.get("link") or r.get("href") for r in results)
        urls = [link for link in links if link]

    trimmed = (_trim_trailing_punctuation(u) for u in urls)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(u for u in trimmed if u))


def scrape_url(url: str, max_len: int = 3000) -> str:
    """Load a web page and return its text with whitespace collapsed.

    Args:
        url: Address to load. It must start with ``http://`` or ``https://``
            (scheme matched case-insensitively).
        max_len: Maximum number of characters of page text to keep. Longer
            text is cut and suffixed with a truncation marker.

    Returns:
        The cleaned page text, ``"Invalid URL: <url>"`` when the scheme check
        fails, or ``"Error scraping <url>: ..."`` with the ``repr`` of any
        exception raised while loading or cleaning.
    """
    try:
        # Check the full scheme prefix: a bare startswith("http") would also
        # accept strings such as "httpfoo://..." that are not web URLs.
        if not url.lower().startswith(("http://", "https://")):
            return f"Invalid URL: {url}"
        docs = WebBaseLoader(url).load()
        text = "\n".join(d.page_content for d in docs)
        text = re.sub(r"\s+", " ", text).strip()
        if len(text) > max_len:
            return text[:max_len] + "... [Content truncated for length]"
        return text
    except Exception as e:
        return f"Error scraping {url}: {repr(e)}"


def summarize_texts(texts: list[str], max_sentences: int = 5) -> str:
    """Build a simple frequency-based extractive summary (English only).

    Sentences are scored by the summed corpus frequency of their non-stopword
    words. The ``max_sentences`` best are returned in their original order.

    Args:
        texts: Scraped page texts. Empty entries and scraper failure messages
            (those starting with ``"Error scraping"`` or ``"Invalid URL:"``)
            are ignored.
        max_sentences: Maximum number of sentences to include.

    Returns:
        A bullet list in the form ``"\\n- s1\\n- s2"``, or
        ``"(No content to summarize)"`` when there is no usable text or
        ``max_sentences`` is not positive.
    """
    # Both prefixes are failure strings produced by the scraper, not content.
    failure_prefixes = ("Error scraping", "Invalid URL:")
    joined = " ".join(
        t for t in texts if t and not t.startswith(failure_prefixes)
    ).strip()
    if not joined or max_sentences <= 0:
        return "(No content to summarize)"

    # Split after sentence-ending punctuation followed by whitespace.
    # `joined` is non-empty and stripped, so at least one sentence survives.
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", joined) if s.strip()]

    # Very basic English stopwords.
    stop = frozenset("""
        a an the and or for of to in on with from by as is are was were be been being
        this that these those it its into at over under about you your their they we us our
        not no yes if but so than then may might can could should would will
    """.split())

    def keywords(text: str) -> list[str]:
        """Return lower-cased ASCII words longer than two letters, minus stopwords."""
        return [
            w for w in re.findall(r"[A-Za-z]+", text.lower())
            if len(w) > 2 and w not in stop
        ]

    # Tokenise each sentence once and reuse the result for both the global
    # frequency table and the per-sentence score.
    sentence_words = [keywords(s) for s in sentences]
    freq = Counter()
    for words in sentence_words:
        freq.update(words)

    if not freq:
        # Nothing but stopwords/short words: fall back to the first sentences.
        return "\n- " + "\n- ".join(sentences[:max_sentences])

    scores = [sum(freq[w] for w in words) for words in sentence_words]
    # sorted() is stable, so equal scores keep the earlier sentence first.
    best = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)
    chosen = sorted(best[:max_sentences])  # restore reading order

    return "\n- " + "\n- ".join(sentences[i] for i in chosen)

def save_report(content: str, filename: str | None = None, title: str = "RESEARCH REPORT") -> str:
    try:
        if filename is None:
            filename = f"research_report_{datetime.now():%Y%m%d_%H%M%S}.txt"
        report = (
            f"{title}\n"
            f"Generated on: {datetime.now():%Y-%m-%d %H:%M:%S}\n"
            + "=" * 47 + "\n\n"
            + content + "\n\n"
            + "=" * 47 + "\nEnd of Report\n"
        )
        with open(filename, "w", encoding="utf-8") as f:
            f.write(report)
        return f"Saved: {filename}"
    except Exception as e:
        return f"Error saving file: {repr(e)}"

# --- Research pipeline (runs the tools above end to end) ---

def run_research(query: str, scrape_top_k: int = 3) -> str:
    """Run the end-to-end research pipeline and save the result to a file.

    Steps:
    1) Wikipedia search
    2) DuckDuckGo search
    3) Extract URLs from the search output and scrape the first
       ``scrape_top_k`` of them
    4) Auto-summary of the scraped text
    5) Save everything to a timestamped .txt report

    Args:
        query: Topic to research.
        scrape_top_k: How many of the extracted URLs to scrape. Values below
            zero are treated as zero.

    Returns:
        The full report body (the same text that is written to disk).
    """
    parts = []

    # 1) Wikipedia
    wiki = search_wikipedia(query)
    parts.append(f"WIKIPEDIA RESULTS:\n{wiki}\n")

    # 2) DuckDuckGo
    ddg_text = search_duckduckgo(query)
    parts.append(f"DUCKDUCKGO RESULTS:\n{ddg_text}\n")

    # 3) Extract URLs and scrape. Clamp the count so a negative value cannot
    #    turn the slice into "everything except the last N".
    urls = extract_urls(query, ddg_text, max_results=8)
    scraped_texts = []
    for position, url in enumerate(urls[:max(scrape_top_k, 0)], start=1):
        body = scrape_url(url)
        parts.append(f"WEBSITE {position} ({url}):\n{body}\n")
        scraped_texts.append(body)
    if not scraped_texts:
        parts.append("No valid URLs found or scraping failed.\n")

    # 4) Auto-summary. The summary text must be written into the report,
    #    not just the heading.
    summary = summarize_texts(scraped_texts, max_sentences=5)
    parts.append(f"FINAL SUMMARY:\n{summary.strip()}\n")

    # 5) Save
    full = "\n".join(parts)
    print(save_report(full))  # Prints the "Saved: ..." / error status line
    return full

# --- Execute with the required query ---
if __name__ == "__main__":
    # Topic for this run; the report is written to a timestamped .txt file.
    QUERY = "Research about the XZ backdoor"
    run_research(query=QUERY, scrape_top_k=3)

  with DDGS() as ddgs:
  ddgs_gen = ddgs.text(
  with DDGS() as ddgs:
  ddgs_gen = ddgs.text(
