In [6]:
import requests
import wikipediaapi
import json
from urllib.parse import urlparse, parse_qs
from typing import Type
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool
from pydantic import BaseModel, Field
from langchain.agents import initialize_agent, AgentType
from bs4 import BeautifulSoup


# Chat model handed to the agent below (gpt-4o-mini, temperature 0.1).
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

class SearchToolArgsSchema(BaseModel):
    # Argument schema shared by both search tools: a single free-text query.
    query: str = Field(description="The query you will search for.")


# DuckDuckGo search tool
class CustomDDGSearchTool(BaseTool):
    """Search DuckDuckGo's HTML endpoint and scrape text from the top results.

    The tool requests the result page for a query, follows up to five result
    links, collects the paragraph text of every page that could be fetched,
    appends the combined text to ``research_DDG.txt`` and returns it.
    """

    name: str = "DDGSearchTool"
    description: str = """
    Use this tool to find documents using DuckDuckGo Search Engine.
    It takes a query as an argument and fetches content from the resulting websites.
    """
    args_schema: Type[SearchToolArgsSchema] = SearchToolArgsSchema

    def _run(self, query: str) -> str:
        """Run the search and return the scraped text, or an error message.

        Args:
            query: Free-text search query.

        Returns:
            The paragraph text of the fetched result pages joined by blank
            lines, or a human-readable message when the search failed or
            produced no usable content.
        """
        try:
            # Let requests build the query string so characters such as
            # '&', '#' or '+' in the query are encoded instead of corrupting
            # the URL. The timeout keeps a stalled connection from hanging
            # the agent indefinitely.
            response = requests.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query},
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=10,
            )
            if response.status_code != 200:
                return f"Error fetching results from DuckDuckGo with status code {response.status_code}."

            soup = BeautifulSoup(response.text, 'html.parser')
            results = soup.find_all('a', class_='result__a')

            # Follow each result link and extract the page content.
            extracted_content = []
            print(len(results))
            for result in results[:5]:
                # An anchor without href is skipped rather than aborting the
                # whole search with a KeyError.
                href = result.get('href')
                if not href:
                    continue
                actual_url = self.extract_actual_url(href)
                print(actual_url)
                if not actual_url:
                    continue
                website_content = self.fetch_website_content(actual_url)
                if website_content:
                    extracted_content.append(website_content)

            if not extracted_content:
                # Give the agent an explicit observation instead of "".
                return f"No content could be extracted from DuckDuckGo results for {query}."

            all_content = "\n\n".join(extracted_content)
            save_to_txt("research_DDG.txt", all_content)
            return all_content
        except Exception as e:
            # Tool boundary: report the failure to the agent as text.
            return f"Error fetching results from DuckDuckGo: {str(e)}"

    def extract_actual_url(self, duckduckgo_url: str):
        """Extract the real target URL from a DuckDuckGo redirect URL.

        Args:
            duckduckgo_url: The ``href`` of a result link; the target is read
                from its ``uddg`` query parameter.

        Returns:
            The target URL, or ``None`` when there is no ``uddg`` parameter.
        """
        parsed_url = urlparse(duckduckgo_url)
        query_params = parse_qs(parsed_url.query)
        actual_url = query_params.get('uddg', [None])[0]

        # Prepend an HTTPS scheme when the extracted URL has none.
        if actual_url and not actual_url.startswith('http'):
            actual_url = 'https://' + actual_url
        return actual_url

    def fetch_website_content(self, url: str):
        """Fetch a page and return the text of its ``<p>`` elements.

        Args:
            url: Address of the page to download.

        Returns:
            The paragraph texts joined by newlines, or ``None`` when the page
            could not be fetched (non-200 status or any error). Failures never
            return an error string, so error messages cannot end up inside the
            collected research text.
        """
        try:
            response = requests.get(
                url,
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=10,
            )
            if response.status_code != 200:
                return None
            soup = BeautifulSoup(response.text, 'html.parser')
            paragraphs = soup.find_all('p')  # main text content
            return "\n".join(p.get_text() for p in paragraphs)
        except Exception as e:
            # Best effort per site: report the problem and let the caller
            # continue with the remaining results.
            print(f"Error fetching content from {url}: {str(e)}")
            return None


class WikiPediaSearchTool(BaseTool):
    # Tool that looks up the page titled exactly like the query through
    # wikipediaapi, appends its text to research_WIKI.txt and returns it.
    name = "WikiPediaSearchTool"
    description = """
    Use this tool to find the documents using Wikipedia.
    It takes a query as an argument.
    """
    args_schema: Type[SearchToolArgsSchema] = SearchToolArgsSchema

    def _run(self, query: str):
        """Return the text of the Wikipedia page for *query*, or a message.

        A missing page and any raised exception are both reported as plain
        text so the agent always receives an observation.
        """
        try:
            client = wikipediaapi.Wikipedia('en', headers={'User-Agent': 'Mozilla/5.0'})
            article = client.page(query)
            if not article.exists():
                return f"No Wikipedia page found for {query}."
            save_to_txt("research_WIKI.txt", article.text)
            return article.text
        except Exception as err:
            return f"Error fetching results from Wikipedia: {err}"


# Agent of type OPENAI_FUNCTIONS wired to the two search tools defined above.
agent = initialize_agent(
    tools=[CustomDDGSearchTool(), WikiPediaSearchTool()],
    llm=llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    handle_parsing_errors=True,
    verbose=True,
)


def save_to_txt(filename, content):
    """Append *content* to the UTF-8 text file *filename*.

    Args:
        filename: Path of the file to append to; created if it does not exist.
        content: Data to write. A ``str`` is written unchanged. A ``dict`` or
            ``list`` is written as indented JSON with non-ASCII characters
            kept as-is. Anything else is written as ``str(content)``.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    if isinstance(content, (dict, list)):
        # default=str is deliberate: this is a human-readable dump, so values
        # JSON cannot represent natively are stringified rather than making
        # the save fail.
        content = json.dumps(content, indent=4, ensure_ascii=False, default=str)
    elif not isinstance(content, str):
        # f.write() accepts only str; convert instead of raising TypeError.
        content = str(content)
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(content)
    print(f"Results saved to {filename}")


# Topic to research; it is interpolated into both the prompt and the name of
# the output file below.
query = "XZ backdoor"
prompt = f"Research about the {query}. You can use DuckDuckGoSearch Tool and WikipediaSearchTool."
# Run the agent on the prompt.
result = agent.invoke(prompt)

# Append the agent's result to research_<query>.txt via save_to_txt.
save_to_txt(f"research_{query}.txt", result)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `DDGSearchTool` with `{'query': 'XZ backdoor'}`


[0m24
https://en.wikipedia.org/wiki/XZ_Utils_backdoor
https://www.wired.com/story/xz-backdoor-everything-you-need-to-know/
https://www.wired.com/story/jia-tan-xz-backdoor/
https://www.kali.org/blog/about-the-xz-backdoor/
https://cybernews.com/editorial/xz-linux-backdoor-explained/
Results saved to research_DDG.txt
[36;1m[1;3m

In February 2024, a malicious backdoor was introduced to the Linux utility xz within the liblzma library in versions 5.6.0 and 5.6.1 by an account using the name "Jia Tan".[b][2] The backdoor gives an attacker who possesses a specific Ed448 private key remote code execution capabilities on the affected Linux system. The issue has been given the Common Vulnerabilities and Exposures number CVE-2024-3094 and has been assigned a CVSS score of 10.0, the highest possible score.[3][4]

While xz is commonly present in most Linux distributions, at t