In [1]:
import os
from getpass import getpass

# Prompt for the OpenAI API key (getpass does not echo the input) and store it
# in the OPENAI_API_KEY environment variable for the rest of the session.
os.environ["OPENAI_API_KEY"] = getpass("Введите ваш OpenAI API ключ: ")

Введите ваш OpenAI API ключ:  ········


In [3]:
from langchain.agents import Tool, initialize_agent, AgentType
from langchain_openai import ChatOpenAI as OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
from langchain_core.tools.structured import StructuredTool

from pydantic import BaseModel, Field
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import arxiv
import requests
import re
from pydantic import BaseModel

In [4]:
# Structured record for a single arXiv paper: title, summary and PDF link.
# The Field descriptions are runtime metadata and are kept in their original language.
class ArxivArticle(BaseModel):
    title: str = Field(description="Заголовок статьи")  # paper title
    summary: str = Field(description="Краткое содержание статьи")  # short summary of the paper
    pdf_url: str = Field(description="Ссылка на PDF-файл статьи")  # link to the paper's PDF file
    
def search_arxiv(query: str) -> ArxivArticle:
    """Look up ``query`` on arXiv and return the top hit.

    Args:
        query: Search string passed to ``arxiv.Search``.

    Returns:
        An ``ArxivArticle`` holding the title, summary and PDF link of the
        first search result.

    Raises:
        ValueError: If the search returns no results.
    """
    search = arxiv.Search(query=query, max_results=1)
    # Client.results() replaces the deprecated Search.results(). The None
    # default turns an empty result set into the explicit error below instead
    # of a bare StopIteration.
    result = next(arxiv.Client().results(search), None)
    if result is None:
        raise ValueError(f"No arXiv results found for query: {query!r}")

    pdf_url = getattr(result, "pdf_url", None)
    if not pdf_url:
        # No PDF link on the result: derive it from entry_id, e.g.
        # https://arxiv.org/abs/2301.12345. Splitting on "/abs/" keeps
        # old-style identifiers that contain a slash (hep-th/9901001) intact.
        arxiv_id = result.entry_id.split("/abs/")[-1]
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"

    return ArxivArticle.model_validate(
        {"title": result.title, "summary": result.summary, "pdf_url": pdf_url}
    )

def download_pdf(pdf_url: str) -> str:
    """Download the PDF at ``pdf_url`` and save it as ``temp.pdf``.

    Args:
        pdf_url: HTTP(S) address of the PDF file.

    Returns:
        The path of the saved file, always ``"temp.pdf"`` (overwritten on
        every call).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # A timeout keeps the notebook from hanging forever on a stalled server.
    response = requests.get(pdf_url, timeout=60)
    # Fail here rather than saving an HTML error page under a .pdf name.
    response.raise_for_status()
    with open("temp.pdf", "wb") as f:
        f.write(response.content)
    return "temp.pdf"

def load_pdf(pdf_path: str):
    """Read the PDF at ``pdf_path`` and return the result of ``PyPDFLoader.load_and_split()``."""
    return PyPDFLoader(pdf_path).load_and_split()

class PDFQA:
    """Indexes the pages of a PDF and answers questions against that index."""

    def __init__(self):
        # Stays None until index_pdf() has built a chain.
        self.qa_chain = None

    def index_pdf(self, pages):
        """Embed ``pages`` into a FAISS store and build a RetrievalQA chain on top of it."""
        store = FAISS.from_documents(pages, OpenAIEmbeddings())
        # The retriever is configured with k=3.
        retriever = store.as_retriever(search_kwargs={"k": 3})
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=OpenAI(),
            retriever=retriever,
        )

    def ask(self, question: str) -> str:
        """Run ``question`` through the QA chain, or return a notice if nothing is indexed yet."""
        if self.qa_chain:
            return self.qa_chain.run(question)
        return "PDF не загружен и не проиндексирован."

# Module-level PDFQA instance; it holds no QA chain until index_pdf() is called on it.
pdf_qa = PDFQA()

def summarize_pdf(pdf_path: str) -> str:
    """Load the PDF at ``pdf_path`` and summarise it with a map-reduce summarize chain."""
    documents = load_pdf(pdf_path)
    summarizer = load_summarize_chain(
        OpenAI(model_name='o3-mini'),
        chain_type="map_reduce",
    )
    return summarizer.run(documents)



In [5]:
# --- Инструменты для ReAct агента ---
def tool_search_arxiv(query: str) -> "ArxivArticle":
    """Agent tool wrapper around ``search_arxiv``.

    Args:
        query: arXiv search query.

    Returns:
        The ``ArxivArticle`` produced by ``search_arxiv`` (an object, not a
        string), so its ``title``, ``summary`` and ``pdf_url`` attributes
        remain accessible to the caller.
    """
    return search_arxiv(query)

def tool_load_and_index(pdf_url: str) -> str:
    """Agent tool: download the PDF at ``pdf_url``, load it and index it in ``pdf_qa``.

    Returns a status message containing ``len()`` of the loaded pages.
    """
    pages = load_pdf(download_pdf(pdf_url))
    pdf_qa.index_pdf(pages)
    return f"PDF загружен и проиндексирован: {len(pages)} страниц"

def tool_ask_pdf(question: str) -> str:
    """Agent tool: forward ``question`` to the shared ``pdf_qa`` instance and return its answer."""
    answer = pdf_qa.ask(question)
    return answer

def tool_summarize(pdf_url: str) -> str:
    """Agent tool: download the PDF at ``pdf_url`` and return the output of ``summarize_pdf``."""
    return summarize_pdf(download_pdf(pdf_url))

# Tools offered to the agent; each entry wraps one of the tool_* functions.
tools = [
    # return_direct=True is set on this tool only.
    StructuredTool.from_function(
        name="SearchArxiv",
        func=tool_search_arxiv,
        description="Поиск статьи в arXiv по запросу. Ввод: поисковый запрос, вывод: title, summary и pdf ссылка.",
        return_direct=True,
    ),
    Tool(
        name="LoadAndIndexPDF",
        func=tool_load_and_index,
        description="Загружает PDF по ссылке, разбивает на страницы и индексирует для поиска.",
    ),
    Tool(
        name="AskPDF",
        func=tool_ask_pdf,
        description="Отвечает на вопросы по загруженному PDF.",
    ),
    Tool(
        name="SummarizePDF",
        func=tool_summarize,
        description="Делает краткое резюме статьи по PDF-ссылке.",
    ),
]

In [6]:
# --- Инициализация агента ReAct ---

# Chat model that drives the agent (OpenAI is the file's alias for ChatOpenAI).
llm = OpenAI(model_name="o3-mini")

# initialize_agent() selects the agent implementation through its `agent`
# parameter. The previous `agent_type=` keyword is not that parameter, so the
# requested type never took effect and the default ReAct agent was built
# (the recorded run shows Thought/Action/Observation traces). Name that ReAct
# agent explicitly so the code states what actually runs.
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

  agent = initialize_agent(


In [7]:
# --- Example usage ---

# 1. Search for a paper
query = "diffusion models 2024"
print(f"Поиск статьи: {query}")
search_result = agent.run(f"SearchArxiv: {query}")

print("\nРезультат поиска:")
print(search_result)
# The attribute access below assumes agent.run() handed back the ArxivArticle
# object itself (SearchArxiv is registered with return_direct=True) rather
# than a plain string.
pdf_url = search_result.pdf_url
print(f"PDF ссылка: {pdf_url}")

# 2. Download and index the PDF
print("Загрузка и индексирование PDF...")
load_index_result = agent.run(f"LoadAndIndexPDF: {pdf_url}")
print(load_index_result)

# 3. Summarization
print("Генерация резюме статьи...")
summary = agent.run(f"SummarizePDF: {pdf_url}")
print(summary)

# 4. Questions about the paper
questions = [
    "Что такое диффузионные модели?",
    "Какие основные результаты в статье?",
    "Какова методология исследования в статье?"
]

# Each question is sent through the agent with an "AskPDF:" prefix.
for q in questions:
    print(f"Вопрос: {q}")
    answer = agent.run(f"AskPDF: {q}")
    print(f"Ответ: {answer}")

🔍 Поиск статьи: diffusion models 2024


[1m> Entering new AgentExecutor chain...[0m


  search_result = agent.run(f"SearchArxiv: {query}")


[32;1m[1;3mQuestion: SearchArxiv: diffusion models 2024
Thought: I need to search arXiv for diffusion models papers from 2024. I'll use the SearchArxiv tool with the query "diffusion models 2024".
Action: SearchArxiv
Action Input: diffusion models 2024[0m

  result = next(search.results())



Observation: [36;1m[1;3mtitle="Rebuttal of Morris' criticism of the diffusive compressible Euler model" summary='This short note addresses the criticism of the diffusive compressible Euler\nmodel regarding heat diffusion, sound attenuation and material frame\nindifference put forward by M. Morris.' pdf_url='http://arxiv.org/pdf/2406.18241v1'[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m

Результат поиска:
title="Rebuttal of Morris' criticism of the diffusive compressible Euler model" summary='This short note addresses the criticism of the diffusive compressible Euler\nmodel regarding heat diffusion, sound attenuation and material frame\nindifference put forward by M. Morris.' pdf_url='http://arxiv.org/pdf/2406.18241v1'

📥 PDF ссылка: http://arxiv.org/pdf/2406.18241v1

⏳ Загрузка и индексирование PDF...


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mQuestion: LoadAndIndexPDF: http://arxiv.org/pdf/2406.18241v1
Thought: The task is to load and index the provided PDF fr

  embeddings = OpenAIEmbeddings()



Observation: [33;1m[1;3mPDF загружен и проиндексирован: 8 страниц[0m
Thought:[32;1m[1;3mThought: The PDF from the given URL has been loaded and indexed successfully, and it contains 8 pages.
Final Answer: The PDF has been loaded and indexed successfully, and it contains 8 pages.[0m

[1m> Finished chain.[0m
The PDF has been loaded and indexed successfully, and it contains 8 pages.

📝 Генерация резюме статьи...


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mQuestion: SummarizePDF: http://arxiv.org/pdf/2406.18241v1

Thought: I need to obtain a summary of the provided PDF by using the SummarizePDF tool.

Action: SummarizePDF
Action Input: http://arxiv.org/pdf/2406.18241v1
[0m
Observation: [36;1m[1;3mThe paper defends the diffusive compressible Euler (dcE) model—an alternative to the traditional Navier–Stokes–Fourier (NSF) system—against Morris’ criticisms regarding heat diffusion, sound attenuation, and material frame indifference. The dcE model modifies the class

  result = next(search.results())



Observation: [36;1m[1;3mtitle='RGB-D-Fusion: Image Conditioned Depth Diffusion of Humanoid Subjects' summary='We present RGB-D-Fusion, a multi-modal conditional denoising diffusion\nprobabilistic model to generate high resolution depth maps from low-resolution\nmonocular RGB images of humanoid subjects. RGB-D-Fusion first generates a\nlow-resolution depth map using an image conditioned denoising diffusion\nprobabilistic model and then upsamples the depth map using a second denoising\ndiffusion probabilistic model conditioned on a low-resolution RGB-D image. We\nfurther introduce a novel augmentation technique, depth noise augmentation, to\nincrease the robustness of our super-resolution model.' pdf_url='http://arxiv.org/pdf/2307.15988v1'[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
🤖 Ответ: title='RGB-D-Fusion: Image Conditioned Depth Diffusion of Humanoid Subjects' summary='We present RGB-D-Fusion, a multi-modal conditional denoising diffusion\nprobabilistic model to generate h