#  Build a Research Agent with LangGraph, GPT-4o, RAG, Pinecone, ArXiv and Google SerpAPI

### Install Required Libraries

In [6]:
!pip install -r requirements.txt -q

In [64]:
!pip install semantic_router -q

###  Extracting Data from ArXiv into a Pandas DataFrame and Saving it as JSON

In [8]:
import requests
import pandas as pd
import json
import xml.etree.ElementTree as ET

ARXIV_NAMESPACE = '{http://www.w3.org/2005/Atom}'

def extract_from_arxiv(search_query='cat:cs.AI', max_results=100, json_file_path='files/arxiv_dataset.json', timeout=30):
    """
    Fetches papers from the ArXiv API based on a search query, saves them as JSON,
    and returns a pandas DataFrame.

    Args:
        search_query (str): The search query for ArXiv (default is 'cat:cs.AI').
        max_results (int): The maximum number of results to retrieve (default is 100).
        json_file_path (str): File path where JSON data will be saved. Missing
            parent directories are created.
        timeout (float): Seconds to wait for the ArXiv API before giving up
            (default is 30).

    Returns:
        pd.DataFrame: One row per paper with the columns 'title', 'summary',
        'authors', 'arxiv_id', 'url' and 'pdf_link'. The columns are present
        even when the query returns no entries.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the API answers with an HTTP error status.
        xml.etree.ElementTree.ParseError: If the response is not well-formed XML.
    """
    import os  # local import: this cell's import block does not provide os

    columns = ['title', 'summary', 'authors', 'arxiv_id', 'url', 'pdf_link']

    url = f'http://export.arxiv.org/api/query?search_query={search_query}&max_results={max_results}'

    # Fail fast on network problems instead of hanging or parsing an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    root = ET.fromstring(response.content)

    def _stripped_text(parent, tag):
        """Return the stripped text of the child `tag`, or None if it is missing or empty."""
        node = parent.find(f'{ARXIV_NAMESPACE}{tag}')
        if node is None or node.text is None:
            return None
        return node.text.strip()

    papers = []

    for entry in root.findall(f'{ARXIV_NAMESPACE}entry'):
        paper_url = _stripped_text(entry, 'id')
        # The identifier is the last path segment of the entry URL,
        # e.g. '.../abs/cs/9308101v1' -> '9308101v1'.
        arxiv_id = paper_url.split('/')[-1] if paper_url else None

        name_nodes = (author.find(f'{ARXIV_NAMESPACE}name')
                      for author in entry.findall(f'{ARXIV_NAMESPACE}author'))
        authors = [node.text for node in name_nodes if node is not None and node.text]

        # The PDF location is the <link> element whose title attribute is 'pdf'.
        pdf_link = next((link.attrib.get('href') for link in entry.findall(f'{ARXIV_NAMESPACE}link')
                         if link.attrib.get('title') == 'pdf'), None)

        papers.append({
            'title': _stripped_text(entry, 'title'),
            'summary': _stripped_text(entry, 'summary'),
            'authors': authors,
            'arxiv_id': arxiv_id,
            'url': paper_url,
            'pdf_link': pdf_link
        })

    df = pd.DataFrame(papers, columns=columns)

    # Create the target folder so a fresh checkout does not fail on open().
    output_dir = os.path.dirname(json_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=4)
    print(f'Data saved to {json_file_path} ...')

    return df

### Testing the function

In [13]:
# Fetch 20 papers with the default 'cat:cs.AI' query; this also writes files/arxiv_dataset.json.
df = extract_from_arxiv(max_results=20)

Data saved to files/arxiv_dataset.json ...


In [15]:
import json
file_name = 'files/arxiv_dataset.json'
# The dataset is written as UTF-8 with ensure_ascii=False, so read it back with the
# same encoding instead of relying on the platform default.
with open(file_name, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Preview only the first records; printing the full list floods the notebook output.
print(data[:2])

[{'title': 'Dynamic Backtracking', 'summary': 'Because of their occasional need to return to shallow points in a search\ntree, existing backtracking methods can sometimes erase meaningful progress\ntoward solving a search problem. In this paper, we present a method by which\nbacktrack points can be moved deeper in the search space, thereby avoiding this\ndifficulty. The technique developed is a variant of dependency-directed\nbacktracking that uses only polynomial space while still providing useful\ncontrol information and retaining the completeness guarantees provided by\nearlier approaches.', 'authors': ['M. L. Ginsberg'], 'arxiv_id': '9308101v1', 'url': 'http://arxiv.org/abs/cs/9308101v1', 'pdf_link': 'http://arxiv.org/pdf/cs/9308101v1'}, {'title': 'A Market-Oriented Programming Environment and its Application to\n  Distributed Multicommodity Flow Problems', 'summary': 'Market price systems constitute a well-understood class of mechanisms that\nunder certain conditions provide effec

In [17]:
import pandas as pd
# Rebuild the DataFrame from the records loaded from JSON (replaces the earlier `df`).
df = pd.DataFrame(data)
# Show five randomly chosen rows; the selection differs on every run.
df.sample(n=5)

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link
18,Operations for Learning with Graphical Models,This paper is a multidisciplinary review of em...,[W. L. Buntine],9412102v1,http://arxiv.org/abs/cs/9412102v1,http://arxiv.org/pdf/cs/9412102v1
12,Applying GSAT to Non-Clausal Formulas,In this paper we describe how to modify GSAT s...,[R. Sebastiani],9406102v1,http://arxiv.org/abs/cs/9406102v1,http://arxiv.org/pdf/cs/9406102v1
15,A System for Induction of Oblique Decision Trees,This article describes a new system for induct...,"[S. K. Murthy, S. Kasif, S. Salzberg]",9408103v1,http://arxiv.org/abs/cs/9408103v1,http://arxiv.org/pdf/cs/9408103v1
19,Total-Order and Partial-Order Planning: A Comp...,"For many years, the intuitions underlying part...","[S. Minton, J. Bresina, M. Drummond]",9412103v1,http://arxiv.org/abs/cs/9412103v1,http://arxiv.org/pdf/cs/9412103v1
7,Learning the Past Tense of English Verbs: The ...,Learning the past tense of English verbs - a s...,[C. X. Ling],9402101v1,http://arxiv.org/abs/cs/9402101v1,http://arxiv.org/pdf/cs/9402101v1


#### Downloading Research Papers (PDFs)

In [26]:
import pandas as pd
import requests
import os

def download_pdfs(df, download_folder='files', timeout=60):
    """
    Downloads the PDF of every paper listed in a DataFrame and records where
    each file was saved.

    Parameters
    ----------
    df : pandas.DataFrame
        Paper metadata with a required 'pdf_link' column holding the PDF URLs.
        The DataFrame is modified in place.
    download_folder : str, optional
        Target directory for PDF storage (default: 'files'). Will be created if
        it doesn't exist.
    timeout : float, optional
        Seconds to wait for each download before treating it as failed
        (default: 60).

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object with an additional 'pdf_file_name' column
        containing:
        - The file path for each successfully downloaded PDF
        - None for rows without a link or whose download failed

    Notes
    -----
    Network errors, HTTP error statuses and file-system errors are reported
    with a printed message and do not stop the remaining downloads. A file
    path is recorded only after the file has been written.
    """

    os.makedirs(download_folder, exist_ok=True)

    pdf_file_names = []

    for index, row in df.iterrows():
        pdf_link = row['pdf_link']

        # Entries without a PDF link arrive as None/NaN; skip them explicitly
        # instead of relying on the HTTP client to reject an invalid URL.
        if not isinstance(pdf_link, str) or not pdf_link:
            print(f'Failed to download the PDF: no pdf_link for row {index}')
            pdf_file_names.append(None)
            continue

        # Name the file after the last URL segment, adding '.pdf' only if absent.
        base_name = pdf_link.rstrip('/').split('/')[-1]
        if not base_name.lower().endswith('.pdf'):
            base_name += '.pdf'
        file_name = os.path.join(download_folder, base_name)

        try:
            response = requests.get(pdf_link, timeout=timeout)
            response.raise_for_status()

            with open(file_name, 'wb') as f:
                f.write(response.content)

        except (requests.exceptions.RequestException, OSError) as e:
            print(f'Failed to download the PDF: {e}')
            pdf_file_names.append(None)

        else:
            pdf_file_names.append(file_name)
            print(f'PDF downloaded successfully and saved as {file_name}')

    df['pdf_file_name'] = pdf_file_names

    return df

In [28]:
# Download every paper into the default 'files' folder; adds the 'pdf_file_name' column to df.
df = download_pdfs(df)

PDF downloaded successfully and saved as files/9308101v1.pdf
PDF downloaded successfully and saved as files/9308102v1.pdf
PDF downloaded successfully and saved as files/9309101v1.pdf
PDF downloaded successfully and saved as files/9311101v1.pdf
PDF downloaded successfully and saved as files/9311102v1.pdf
PDF downloaded successfully and saved as files/9312101v1.pdf
PDF downloaded successfully and saved as files/9401101v1.pdf
PDF downloaded successfully and saved as files/9402101v1.pdf
PDF downloaded successfully and saved as files/9402102v1.pdf
PDF downloaded successfully and saved as files/9402103v1.pdf
PDF downloaded successfully and saved as files/9403101v1.pdf
PDF downloaded successfully and saved as files/9406101v1.pdf
PDF downloaded successfully and saved as files/9406102v1.pdf
PDF downloaded successfully and saved as files/9408101v1.pdf
PDF downloaded successfully and saved as files/9408102v1.pdf
PDF downloaded successfully and saved as files/9408103v1.pdf
PDF downloaded successfu

In [30]:
# Inspect the metadata frame, which now carries the 'pdf_file_name' column set by download_pdfs.
df

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link,pdf_file_name
0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,http://arxiv.org/pdf/cs/9308101v1,files/9308101v1.pdf
1,A Market-Oriented Programming Environment and ...,Market price systems constitute a well-underst...,[M. P. Wellman],9308102v1,http://arxiv.org/abs/cs/9308102v1,http://arxiv.org/pdf/cs/9308102v1,files/9308102v1.pdf
2,An Empirical Analysis of Search in GSAT,We describe an extensive study of search in GS...,"[I. P. Gent, T. Walsh]",9309101v1,http://arxiv.org/abs/cs/9309101v1,http://arxiv.org/pdf/cs/9309101v1,files/9309101v1.pdf
3,The Difficulties of Learning Logic Programs wi...,As real logic programmers normally use cut (!)...,"[F. Bergadano, D. Gunetti, U. Trinchero]",9311101v1,http://arxiv.org/abs/cs/9311101v1,http://arxiv.org/pdf/cs/9311101v1,files/9311101v1.pdf
4,Software Agents: Completing Patterns and Const...,To support the goal of allowing users to recor...,"[J. C. Schlimmer, L. A. Hermens]",9311102v1,http://arxiv.org/abs/cs/9311102v1,http://arxiv.org/pdf/cs/9311102v1,files/9311102v1.pdf
5,Decidable Reasoning in Terminological Knowledg...,Terminological knowledge representation system...,"[M. Buchheit, F. M. Donini, A. Schaerf]",9312101v1,http://arxiv.org/abs/cs/9312101v1,http://arxiv.org/pdf/cs/9312101v1,files/9312101v1.pdf
6,Teleo-Reactive Programs for Agent Control,A formalism is presented for computing and org...,[N. Nilsson],9401101v1,http://arxiv.org/abs/cs/9401101v1,http://arxiv.org/pdf/cs/9401101v1,files/9401101v1.pdf
7,Learning the Past Tense of English Verbs: The ...,Learning the past tense of English verbs - a s...,[C. X. Ling],9402101v1,http://arxiv.org/abs/cs/9402101v1,http://arxiv.org/pdf/cs/9402101v1,files/9402101v1.pdf
8,Substructure Discovery Using Minimum Descripti...,The ability to identify interesting and repeti...,"[D. J. Cook, L. B. Holder]",9402102v1,http://arxiv.org/abs/cs/9402102v1,http://arxiv.org/pdf/cs/9402102v1,files/9402102v1.pdf
9,Bias-Driven Revision of Logical Domain Theories,The theory revision problem is the problem of ...,"[M. Koppel, R. Feldman, A. M. Segre]",9402103v1,http://arxiv.org/abs/cs/9402103v1,http://arxiv.org/pdf/cs/9402103v1,files/9402103v1.pdf


### Loading and Splitting PDF Files into Chunks, Expanding the DataFrame

In [33]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_chunk_pdf(pdf_file_name, chunk_size=512, chunk_overlap=64):
    """
    Loads a PDF and splits its text into overlapping chunks.

    The file is read with LangChain's PyPDFLoader and the resulting documents
    are split with RecursiveCharacterTextSplitter, which cuts on character
    count (not on semantic boundaries).

    Parameters
    ----------
    pdf_file_name : str
        Path to the target PDF file for processing
    chunk_size : int, optional
        Maximum character length for each text chunk (default: 512)
    chunk_overlap : int, optional
        Number of characters shared between consecutive chunks (default: 64),
        so that text cut at a chunk boundary still appears whole in one chunk

    Returns
    -------
    List[Document]
        LangChain Document objects as returned by the splitter; each exposes
        the chunk text as `page_content`. Any metadata attached by the loader
        (presumably source path and page number -- verify against the
        installed LangChain version) is carried over by the splitter.

    Raises
    ------
    Exception
        Whatever the loader or splitter raises for unreadable files or
        invalid size/overlap combinations; nothing is caught here.
    """

    print(f'Loading and splitting into chunks: {pdf_file_name}')

    loader = PyPDFLoader(pdf_file_name)
    data = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(data)

    return chunks

In [35]:
def expand_df(df):
    """
    Expands a DataFrame of paper metadata into one row per PDF text chunk.

    Each paper's PDF is loaded and split with `load_and_chunk_pdf`; every
    chunk becomes a row that repeats the paper metadata and links to its
    neighbouring chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame containing document metadata with required columns:
        - pdf_file_name: Path to PDF file (None/NaN when the download failed)
        - arxiv_id: Unique identifier for the paper
        - title: Paper title
        - summary: Paper abstract
        - authors: List of authors
        - url: Source URL

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns:
        - id: '<arxiv_id>#<chunk index>'
        - title, summary, authors, arxiv_id, url: copied from the paper
        - chunk: the chunk text
        - prechunk_id / postchunk_id: id of the previous / next chunk of the
          same paper, or '' at the first / last chunk
        The columns are present even when no chunk could be produced.

    Notes
    -----
    Papers without a downloaded PDF are skipped, and papers whose PDF cannot
    be loaded are reported with a printed message and skipped, so one bad
    file does not abort the whole run.
    """

    columns = ['id', 'title', 'summary', 'authors', 'arxiv_id', 'url',
               'chunk', 'prechunk_id', 'postchunk_id']
    expanded_rows = []

    for _, row in df.iterrows():
        pdf_file_name = row['pdf_file_name']
        arxiv_id = row['arxiv_id']

        # Failed downloads leave None/NaN here; there is nothing to load.
        if not isinstance(pdf_file_name, str) or not pdf_file_name:
            print(f"Skipping {arxiv_id}: no downloaded PDF")
            continue

        try:
            chunks = load_and_chunk_pdf(pdf_file_name)
        except Exception as e:
            # Deliberately broad: any unreadable PDF is skipped, not fatal.
            print(f"Error processing file {pdf_file_name}: {e}")
            continue

        last_index = len(chunks) - 1

        for i, chunk in enumerate(chunks):
            expanded_rows.append({
                'id': f"{arxiv_id}#{i}",
                'title': row['title'],
                'summary': row['summary'],
                'authors': row['authors'],
                'arxiv_id': arxiv_id,
                'url': row['url'],
                'chunk': chunk.page_content,
                'prechunk_id': f"{arxiv_id}#{i - 1}" if i > 0 else '',
                'postchunk_id': f"{arxiv_id}#{i + 1}" if i < last_index else ''
            })

    return pd.DataFrame(expanded_rows, columns=columns)


In [37]:
# Split every downloaded PDF into chunks; the result has one row per chunk.
expanded_df = expand_df(df)

Loading and splitting into chunks: files/9308101v1.pdf
Loading and splitting into chunks: files/9308102v1.pdf
Loading and splitting into chunks: files/9309101v1.pdf
Loading and splitting into chunks: files/9311101v1.pdf
Loading and splitting into chunks: files/9311102v1.pdf
Loading and splitting into chunks: files/9312101v1.pdf
Loading and splitting into chunks: files/9401101v1.pdf
Loading and splitting into chunks: files/9402101v1.pdf
Loading and splitting into chunks: files/9402102v1.pdf
Loading and splitting into chunks: files/9402103v1.pdf
Loading and splitting into chunks: files/9403101v1.pdf
Loading and splitting into chunks: files/9406101v1.pdf
Loading and splitting into chunks: files/9406102v1.pdf
Loading and splitting into chunks: files/9408101v1.pdf
Loading and splitting into chunks: files/9408102v1.pdf


Illegal character in Name Object (b'/\x84')
Illegal character in Name Object (b'/\x84')
Illegal character in Name Object (b'/\xd8')
Illegal character in Name Object (b'/\xd8')
Illegal character in Name Object (b'/\x99')
Illegal character in Name Object (b'/\x99')
Illegal character in Name Object (b'/\x8f')
Illegal character in Name Object (b'/\x8f')
Illegal character in Name Object (b'/\xb9')
Illegal character in Name Object (b'/\xb9')
Illegal character in Name Object (b'/\xda')
Illegal character in Name Object (b'/\xaa')
Illegal character in Name Object (b'/\xaa')
Illegal character in Name Object (b'/\xda')
Illegal character in Name Object (b'/\xd2')
Illegal character in Name Object (b'/\xd2')
Illegal character in Name Object (b'/\xb1')
Illegal character in Name Object (b'/\x99')
Illegal character in Name Object (b'/\x99')
Illegal character in Name Object (b'/\xb1')
Illegal character in Name Object (b'/\xb5')
Illegal character in Name Object (b'/\xfd')
Illegal character in Name Object

Loading and splitting into chunks: files/9408103v1.pdf
Loading and splitting into chunks: files/9409101v1.pdf
Loading and splitting into chunks: files/9412101v1.pdf
Loading and splitting into chunks: files/9412102v1.pdf
Loading and splitting into chunks: files/9412103v1.pdf


In [39]:
# Inspect the chunk-level frame (ids, paper metadata, chunk text and neighbour links).
expanded_df

Unnamed: 0,id,title,summary,authors,arxiv_id,url,chunk,prechunk_id,postchunk_id
0,9308101v1#0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,Journal of Arti/cial In telligence Researc h ...,,9308101v1#1
1,9308101v1#1,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,"In this pap er/, w e presen t a metho d b y wh...",9308101v1#0,9308101v1#2
2,9308101v1#2,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,to solv e some constrain t/-satisfaction probl...,9308101v1#1,9308101v1#3
3,9308101v1#3,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,"try /, coloring p erhaps half a dozen of them ...",9308101v1#2,9308101v1#4
4,9308101v1#4,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,states/.W e successfully color the eastern sta...,9308101v1#3,9308101v1#5
...,...,...,...,...,...,...,...,...,...
4778,9412103v1#281,Total-Order and Partial-Order Planning: A Comp...,"For many years, the intuitions underlying part...","[S. Minton, J. Bresina, M. Drummond]",9412103v1,http://arxiv.org/abs/cs/9412103v1,"UCPOP /: A sound/, complete/, partial/-order p...",9412103v1#280,9412103v1#282
4779,9412103v1#282,Total-Order and Partial-Order Planning: A Comp...,"For many years, the intuitions underlying part...","[S. Minton, J. Bresina, M. Drummond]",9412103v1,http://arxiv.org/abs/cs/9412103v1,"ligenc e /, pp/. /1/0/0/{/1/1/1 SanktAugustin/...",9412103v1#281,9412103v1#283
4780,9412103v1#283,Total-Order and Partial-Order Planning: A Comp...,"For many years, the intuitions underlying part...","[S. Minton, J. Bresina, M. Drummond]",9412103v1,http://arxiv.org/abs/cs/9412103v1,"Minton/, Bresina/, /& Dr ummondSacerdoti/, E/....",9412103v1#282,9412103v1#284
4781,9412103v1#284,Total-Order and Partial-Order Planning: A Comp...,"For many years, the intuitions underlying part...","[S. Minton, J. Bresina, M. Drummond]",9412103v1,http://arxiv.org/abs/cs/9412103v1,"Univ/. of Edin burgh/, Mac hine In telligenceR...",9412103v1#283,9412103v1#285


### Building a Knowledge Base for the RAG System Using Embeddings

In [42]:
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file; with override=True they are meant to
# replace values already present in the environment (per python-dotenv -- confirm if relied upon).
load_dotenv(find_dotenv(), override=True)

True

In [74]:
import os
from getpass import getpass
from openai import OpenAI  

# Reuse the key from the environment when it is set; otherwise ask for it interactively.
if not os.getenv('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

# The client is created without arguments; the key was placed in the environment above.
client = OpenAI()

# Embed a short sample string purely to discover the vector size of the model.
test_text = "hello hallo hola salut"
response = client.embeddings.create(input=test_text, model="text-embedding-3-small")

# The length of the returned vector is the dimension the vector index must use.
dims = len(response.data[0].embedding)
print(f"Embedding dimensions: {dims}")

Embedding dimensions: 1536


### Creating the Pinecone Index

In [77]:
from pinecone import Pinecone, ServerlessSpec

# Take the Pinecone key from the environment, falling back to an interactive prompt.
api_key = os.getenv('PINECONE_API_KEY')
if not api_key:
    api_key = getpass('Pinecone API key: ')

pc = Pinecone(api_key=api_key)

# Serverless deployment target used when the index is created.
spec = ServerlessSpec(cloud='aws', region='us-east-1')

In [79]:
import time

index_name = 'langgraph-research-agent'
index_ready_timeout_s = 300  # upper bound on waiting for a new index to become ready

# Create the index only when it does not exist yet, so re-running the cell is safe.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=dims,  # Embedding dimension (1536)
        metric='cosine',
        spec=spec
    )

    # Poll until the index reports ready, but give up after the timeout
    # instead of blocking the notebook forever.
    deadline = time.monotonic() + index_ready_timeout_s
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() > deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after {index_ready_timeout_s} seconds"
            )
        time.sleep(1)

index = pc.Index(index_name)

# Short pause before querying stats on the freshly connected index.
time.sleep(1)

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Populating the knowledge base and uploading it to Pinecone

In [82]:
# Preview the first five chunk rows that will be embedded and uploaded.
expanded_df.iloc[:5]

Unnamed: 0,id,title,summary,authors,arxiv_id,url,chunk,prechunk_id,postchunk_id
0,9308101v1#0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,Journal of Arti/cial In telligence Researc h ...,,9308101v1#1
1,9308101v1#1,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,"In this pap er/, w e presen t a metho d b y wh...",9308101v1#0,9308101v1#2
2,9308101v1#2,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,to solv e some constrain t/-satisfaction probl...,9308101v1#1,9308101v1#3
3,9308101v1#3,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,"try /, coloring p erhaps half a dozen of them ...",9308101v1#2,9308101v1#4
4,9308101v1#4,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,states/.W e successfully color the eastern sta...,9308101v1#3,9308101v1#5


In [86]:
from tqdm.auto import tqdm
data = expanded_df
batch_size = 64  # number of chunks embedded and upserted per request

for i in tqdm(range(0, len(data), batch_size)):
    # Positional slice; .iloc clamps at the end of the frame, so the last
    # (shorter) batch needs no special handling.
    batch = data.iloc[i:i + batch_size].to_dict(orient='records')

    ids = [r['id'] for r in batch]
    chunks = [r['chunk'] for r in batch]

    # Metadata stored next to each vector so search hits can be shown without
    # a second lookup.
    metadata = [{
        'arxiv_id': r['arxiv_id'],
        'title': r['title'],
        'chunk': r['chunk'],
    } for r in batch]

    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=chunks
    )
    embeds = [e.embedding for e in response.data]

    # Pass a concrete list of (id, values, metadata) tuples rather than a
    # one-shot zip iterator, which cannot be measured or re-read by the client.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

  0%|          | 0/75 [00:00<?, ?it/s]

In [90]:
# Check the index statistics again after the upload to confirm the vectors arrived.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4783}},
 'total_vector_count': 4783}