In [15]:
import os
from getpass import getpass

# Ask for the OpenAI API key interactively (getpass hides the typed input)
# and store it in this process's environment under OPENAI_API_KEY.
os.environ["OPENAI_API_KEY"] = getpass("Введите ваш OpenAI API ключ: ")

Введите ваш OpenAI API ключ:  ········


In [25]:
from smolagents import tool
import arxiv
from pydantic import BaseModel, Field
from smolagents import ToolCallingAgent, OpenAIServerModel

In [17]:
# Validated record describing a single arXiv article; search_arxiv builds it
# with model_validate from the title, summary and PDF link of a search hit.
class ArxivArticle(BaseModel):
    # Title of the article.
    title: str = Field(description="Заголовок статьи")
    # Short summary (abstract) of the article.
    summary: str = Field(description="Краткое содержание статьи")
    # Link to the article's PDF file.
    pdf_url: str = Field(description="Ссылка на PDF-файл статьи")
    
def search_arxiv(query: str) -> ArxivArticle:
    """Return the top arXiv hit for a free-text query.

    Args:
        query: Search string sent to the arXiv API.

    Returns:
        An ``ArxivArticle`` with the title, summary and PDF link of the
        first result.

    Raises:
        LookupError: If the search yields no results.
    """
    search = arxiv.Search(query=query, max_results=1)
    # Client.results() is the supported entry point; Search.results() is
    # deprecated in arxiv 2.x. The None default avoids a bare StopIteration
    # escaping when nothing matches.
    result = next(arxiv.Client().results(search), None)
    if result is None:
        raise LookupError(f"По запросу ничего не найдено: {query!r}")

    pdf_url = getattr(result, "pdf_url", None)
    if not pdf_url:
        # No PDF link on the result: derive it from entry_id,
        # e.g. https://arxiv.org/abs/2301.12345.
        entry_id = result.entry_id
        # Keep everything after "/abs/" so that old-style ids containing a
        # slash (hep-th/9901001) are not truncated to their last segment.
        _, marker, tail = entry_id.rpartition("/abs/")
        arxiv_id = tail if marker else entry_id.rsplit("/", 1)[-1]
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"

    return ArxivArticle.model_validate(
        {"title": result.title, "summary": result.summary, "pdf_url": pdf_url}
    )

In [18]:
def download_pdf(pdf_url: str, *, timeout: float = 30.0) -> str:
    """Download a PDF and save it as ``temp.pdf`` in the working directory.

    Args:
        pdf_url: Link to the PDF file.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The path of the saved file, always ``"temp.pdf"``.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(pdf_url, timeout=timeout)
    # Fail before writing so an HTML error page is never stored as a PDF.
    response.raise_for_status()
    with open("temp.pdf", "wb") as f:
        f.write(response.content)
    return "temp.pdf"

def load_pdf(pdf_path: str):
    """Load the PDF at ``pdf_path`` and split it into pages.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load_and_split()`` produces
        (presumably a list of document chunks -- confirm against the loader).
    """
    return PyPDFLoader(pdf_path).load_and_split()

In [19]:
class PDFQA:
    """Index a PDF and answer questions about the article it contains."""

    def __init__(self):
        # Stays None until index_pdf() has built a chain.
        self.qa_chain = None

    def index_pdf(self, pages):
        """Embed ``pages`` into a FAISS store and build a retrieval QA chain.

        Args:
            pages: Documents to index (e.g. the output of ``load_pdf``).
        """
        store = FAISS.from_documents(pages, OpenAIEmbeddings())
        self.qa_chain = RetrievalQA.from_chain_type(
            # The retriever is configured with k=3 matches per query.
            retriever=store.as_retriever(search_kwargs={"k": 3}),
            llm=OpenAI(),
        )

    def ask(self, question: str) -> str:
        """Answer ``question`` with the QA chain.

        Returns a fixed notice instead when no chain has been built yet.
        """
        if self.qa_chain:
            return self.qa_chain.run(question)
        return "PDF не загружен и не проиндексирован."

# Module-level instance shared by the tools: tool_load_and_index indexes
# into it and tool_ask_pdf queries it.
pdf_qa = PDFQA()

In [20]:
def summarize_pdf(pdf_path: str) -> str:
    """Summarize the PDF at ``pdf_path`` with a map-reduce summarize chain.

    Args:
        pdf_path: Path of the PDF file to summarize.

    Returns:
        The output of running the chain over the loaded pages.
    """
    documents = load_pdf(pdf_path)
    # NOTE(review): if OpenAI here is a completion-style LLM wrapper, confirm
    # it accepts the chat model 'o3-mini'.
    summarizer = load_summarize_chain(
        OpenAI(model_name="o3-mini"),
        chain_type="map_reduce",
    )
    return summarizer.run(documents)

In [14]:
@tool
def tool_search_arxiv(query: str) -> str:
    """Tool for searching arxiv articles by text query.
    Returns a JSON string with the fields title, summary and pdf_url of the top result.
    Args:
        query: the topic to search for
    """
    # The tool is declared to return a string, so serialise the article to
    # JSON instead of handing back the pydantic object itself.
    return search_arxiv(query).model_dump_json()
@tool
def tool_load_and_index(pdf_url: str) -> str:
    """Tool for downloading a PDF file of an article from a link.
    Args:
        pdf_url: Arxiv paper url
    """
    # Docstring kept verbatim: presumably @tool reads it as the tool description.
    # Fetch the file, split it, and feed the pieces to the shared pdf_qa index.
    chunks = load_pdf(download_pdf(pdf_url))
    pdf_qa.index_pdf(chunks)
    return f"PDF загружен и проиндексирован: {len(chunks)} страниц"
@tool
def tool_ask_pdf(question: str) -> str:
    """ A tool for answering questions about an uploaded article.
    Args:
        question: Question about the article
    """
    # Docstring kept verbatim: presumably @tool reads it as the tool description.
    # Delegates to the shared pdf_qa instance, which returns a fixed notice
    # when nothing has been indexed yet.
    answer = pdf_qa.ask(question)
    return answer
@tool
def tool_summarize(pdf_url: str) -> str:
    """Tool for summarizing the uploaded article.
    Args:
        pdf_url: Arxiv paper url
    """
    # Docstring kept verbatim: presumably @tool reads it as the tool description.
    # Download to the local temp file first, then summarise that file.
    return summarize_pdf(download_pdf(pdf_url))


In [26]:
# Tool-calling agent wired to the four tools defined in this notebook and
# driven by the "o3-mini" model.
agent = ToolCallingAgent(
    tools=[
        tool_summarize,
        tool_ask_pdf,
        tool_load_and_index,
        tool_search_arxiv,
    ],
    model=OpenAIServerModel(model_id="o3-mini"),
)