In [28]:
import arxiv
from langchain_core.documents import Document

search = arxiv.Search(
    query="cat:cs.AI",
    max_results=500,
    sort_by=arxiv.SortCriterion.SubmittedDate
)

docs = []
for result in search.results():
    metadata = {
        "title": result.title,
        "authors": ", ".join([author.name for author in result.authors]),
        "published": result.published.strftime("%Y-%m-%d"),
        "url": result.entry_id
    }
    content = result.summary.strip()
    doc = Document(page_content=content, metadata=metadata)
    docs.append(doc)



  for result in search.results():


In [29]:
docs

[Document(metadata={'title': 'Evaluating Memory in LLM Agents via Incremental Multi-Turn Interactions', 'authors': 'Yuanzhe Hu, Yu Wang, Julian McAuley', 'published': '2025-07-07', 'url': 'http://arxiv.org/abs/2507.05257v1'}, page_content='Recent benchmarks for Large Language Model (LLM) agents primarily focus on\nevaluating reasoning, planning, and execution capabilities, while another\ncritical component-memory, encompassing how agents memorize, update, and\nretrieve long-term information-is under-evaluated due to the lack of\nbenchmarks. We term agents with memory mechanisms as memory agents. In this\npaper, we identify four core competencies essential for memory agents: accurate\nretrieval, test-time learning, long-range understanding, and conflict\nresolution. Existing datasets either rely on limited context lengths or are\ntailored for static, long-context settings like book-based QA, which do not\nreflect the interactive, multi-turn nature of memory agents that incrementally\nac

In [30]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 200
)
splits = text_splitter.split_documents(docs)

In [31]:
print(splits[0])
print(splits[1])

page_content='Recent benchmarks for Large Language Model (LLM) agents primarily focus on
evaluating reasoning, planning, and execution capabilities, while another
critical component-memory, encompassing how agents memorize, update, and
retrieve long-term information-is under-evaluated due to the lack of
benchmarks. We term agents with memory mechanisms as memory agents. In this
paper, we identify four core competencies essential for memory agents: accurate
retrieval, test-time learning, long-range understanding, and conflict
resolution. Existing datasets either rely on limited context lengths or are
tailored for static, long-context settings like book-based QA, which do not
reflect the interactive, multi-turn nature of memory agents that incrementally
accumulate information. Furthermore, no existing benchmarks cover all four
competencies. Therefore, we introduce MemoryAgentBench, a new benchmark
specifically designed for memory agents. Our benchmark combines reformulated
existing datas

In [32]:
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004",google_api_key="XXXX")
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=embeddings)

In [33]:
print(vectorstore._collection.get())

{'ids': ['f616cba3-1f1c-49aa-a15f-353961c6691b', 'e8d71aa1-df18-4fa2-8486-64af089dab97', '90f65a49-c4d5-4dbd-bdbc-3002cdecf128', 'dc72c0f5-0b9e-4fcb-9cc6-ee5f259f6fdb', '7e0d661d-6494-47cc-a4eb-9af9df0c673d', '1546852f-9f69-4891-ae4f-315b86986c4d', '4b32919d-3035-462a-a1f6-f57aa32396d1', '80f51a8a-0589-43d7-948a-2794c4489548', '4a512b90-2965-4d6a-9a4a-7890db42d572', '7e4d8a78-25ca-4a13-b47b-b88f89736f82', 'd5426581-a24b-4d4e-9af0-67f3ebed0599', '76bfa924-f9f8-4149-a290-4f1c31677736', '86d8d39d-d780-4353-a3cf-3213e94eee8d', '1d8a7f7b-78c3-40c1-b460-390470fb3144', 'f4f473e1-a2fe-45ae-b543-d6e38e719e3c', 'b85d5491-73d9-4d9a-bbf7-ff84c4bab0f3', 'e3be6de4-7333-4ee2-a716-40ac8ab76c99', '94c007c3-3abd-47fd-b98c-31df95265900', 'eb2d285e-c2dc-41d1-821e-b71b44233960', '1fb8a7fc-5cb9-4cc6-9550-afae701681a9', 'b76a6713-1770-41d4-82e5-5836379d4940', '2cadfce3-a984-47ba-9939-e59fcfbfd35b', '22f155b9-18c9-48aa-8175-b9ed1c97c1f2', 'd6e48cd1-2dc1-41d7-b2e7-dd30567beeb9', 'eebdf9e9-7567-44d6-85eb-43cc4e

In [34]:
retriever = vectorstore.as_retriever()

In [35]:
from langchain import hub 
prompt = hub.pull("rlm/rag-prompt")



In [36]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    google_api_key="XXXXX"
)

In [37]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [38]:
def format_docs(docs):
    return "\n".join(doc.page_content for doc in docs)

In [39]:
rag_chain = (
    {'context':retriever | format_docs,"question":RunnablePassthrough()}
    | prompt
    | llm 
    | StrOutputParser()
)

In [41]:
rag_chain.invoke("What novel architectures are being proposed for language models?")

'One novel architecture proposed is LILITH, which combines developmental training of modular language models with brain-inspired token-based communication protocols. Another is the 2-simplicial Transformer, which generalizes standard dot-product attention to trilinear functions. Additionally, research suggests exploring models with frozen embedding layers derived from visual structures, challenging the traditional view of trainable embeddings.'