### Import Libraries, Set keys, Load data

In [4]:
from backend.api.arxiv import load_data
import os
import requests
import json


from google.generativeai import GenerativeModel
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
# Count chunks per paper
from collections import Counter

from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

from fpdf import FPDF
import re


In [3]:
top_5_rag_papers = load_data(file_path="data/RAG/text/top_5_rag_papers.json")

Data successfully loaded from data/RAG/text/top_5_rag_papers.json


In [4]:
top_5_rag_papers

[{'paper_id': 1,
  'title': 'Advanced System Integration: Analyzing OpenAPI Chunking for\n  Retrieval-Augmented Generation',
  'text': 'Advanced System Integration: Analyzing OpenAPI Chunking for Retrieval-Augmented Generation Robin D. Pesl1, Jerin G. Mathew2, Massimo Mecella2, and Marco Aiello1 1University of Stuttgart, Stuttgart, Germany 2Sapienza Universit di Roma, Rome, Italy Abstract Integrating multiple (sub-)systems is essential to create ad- vanced Information Systems (ISs). Difficulties mainly arise when integrat- ing dynamic environments across the IS lifecycle, e.g., services not yet existent at design time. A traditional approach is a registry that provides the API documentation of the systems endpoints. Large Language Mod- els (LLMs) have shown to be capable of automatically creating system integrations (e.g., as service composition) based on this documentation but require concise input due to input token limitations, especially regard- ing comprehensive API descriptions. 

In [5]:
# Set OPENAI API KEY

os.environ["OPENAI_API_KEY"] = "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"

In [6]:
# Set OPENAI API KEY

os.environ["GROQ_API_KEY"] = "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"

### Text Chunking and Embedding

In [12]:
# Split the text into chunks for embeddings

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
documents = []

for paper in top_5_rag_papers:
    chunks = text_splitter.split_text(paper["text"])
    for i, chunk in enumerate(chunks):
        documents.append({
            "paper_id": paper["paper_id"],
            "title": paper["title"],
            "content": chunk,
            "chunk_id": i
        })

In [13]:
# Counts the number of chunks for each paper

chunk_counts = Counter(doc["paper_id"] for doc in documents)

# Print results
for paper_id, count in chunk_counts.items():
    print(f"Paper {paper_id}: {count} chunks")

Paper 1: 47 chunks
Paper 66: 78 chunks
Paper 77: 17 chunks
Paper 80: 61 chunks
Paper 87: 25 chunks


In [14]:
documents

[{'paper_id': 1,
  'title': 'Advanced System Integration: Analyzing OpenAPI Chunking for\n  Retrieval-Augmented Generation',
  'content': 'Advanced System Integration: Analyzing OpenAPI Chunking for Retrieval-Augmented Generation Robin D. Pesl1, Jerin G. Mathew2, Massimo Mecella2, and Marco Aiello1 1University of Stuttgart, Stuttgart, Germany 2Sapienza Universit di Roma, Rome, Italy Abstract Integrating multiple (sub-)systems is essential to create ad- vanced Information Systems (ISs). Difficulties mainly arise when integrat- ing dynamic environments across the IS lifecycle, e.g., services not yet existent at design time. A traditional approach is a registry that provides the API documentation of the systems endpoints. Large Language Mod- els (LLMs) have shown to be capable of automatically creating system integrations (e.g., as service composition) based on this documentation but require concise input due to input token limitations, especially regard- ing comprehensive API description

In [15]:
# Initialize embeddings
GOOGLE_API_KEY = "AIzaSyDLBN8OD_rJg93yVAyFP9mNKZQ0YX3IPL4"  # You'll need to set this
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",  # or "text-embedding-004" depending on access
    google_api_key=GOOGLE_API_KEY,
)

In [16]:
# Update the metadata structure when creating the FAISS index
texts = [doc["content"] for doc in documents]
metadata = [{
    "source": f"Paper_{doc['paper_id']}_chunk_{doc['chunk_id']}", # Add source field
    "paper_id": doc["paper_id"],
    "title": doc["title"],
    "chunk_id": doc["chunk_id"]
} for doc in documents]


In [21]:
# Create Vector Embeddings of the Chunks

vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadata)

In [22]:
# Save the index for reuse
vectorstore.save_local("data/RAG/embeddings/top5_papers_faiss_index")


### Defining RAG components and implementing the RAG pipeline

In [25]:
# Load the vector store
retriever = vectorstore.as_retriever(
    search_type="similarity", 
    search_kwargs={
        "k": 3,
        "filter": lambda metadata: metadata["paper_id"] == current_paper_id  # We'll set this dynamically
    }
)

# Define the LLM using ChatOpenAI
llm = ChatOpenAI(
    model="gpt-4o",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    temperature=0
)

# Create the RAG pipeline with the correct chain type
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type="map_reduce"  # or  if dealing with longer texts
)

In [26]:
# Define a function to create a structured prompt for each paper

def create_paper_prompt(question, title):
    return f"""You are an expert at analysing research papers. 
    For the paper with the title {title}, please answer this query.
    {question}.
    Be specific to this paper only."""

# Define the base questions
questions = [
    "Provide a comprehensive summary of the paper from the info that you have",
    "what specific problem does this paper solve?",
    "how does the paper solve this problem?",
    "what are the next steps or future work suggested in this paper?"
]

In [29]:
# Run the chunking, embedding and RAG pipeline for each paper separately

results = {}
for paper in top_5_rag_papers:
    paper_id = paper["paper_id"]
    title = paper["title"]
    paper_text = paper["text"]
    results[paper_id] = {}
    
    # Create a temporary vector store just for this paper
    temp_chunks = text_splitter.split_text(paper_text)
    temp_metadata = [{"source": f"Paper_{paper_id}_chunk_{i}", "paper_id": paper_id, "title": title, "chunk_id": i} 
                    for i in range(len(temp_chunks))]
    
    temp_vectorstore = FAISS.from_texts(
        temp_chunks, 
        embeddings, 
        metadatas=temp_metadata
    )
    
    # Create a new retriever and chain for this paper
    temp_retriever = temp_vectorstore.as_retriever(
        search_type="similarity", 
        search_kwargs={"k": 10}
    )
    
    temp_qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        retriever=temp_retriever,
        return_source_documents=True,
        chain_type="map_reduce"
    )
    
    for question in questions:
        # Create a specific prompt for this paper
        specific_prompt = create_paper_prompt(question, title)
        
        # Query the RAG system
        result = temp_qa_chain({"question": specific_prompt})
        results[paper_id][question] = {
            "answer": result["answer"],
            "sources": result["sources"]
        }

In [8]:
# Read the JSON results file
with open('data/RAG/results/top_5_paper_analysis_results.json', 'r') as f:
    results = json.load(f)

In [9]:
results

{'1': {'Provide a comprehensive summary of the paper from the info that you have': {'answer': 'The paper titled "Advanced System Integration: Analyzing OpenAPI Chunking for Retrieval-Augmented Generation" explores the integration of multiple subsystems in dynamic environments using OpenAPI descriptions. The study focuses on the challenges of preprocessing API documentation to fit within the input token limitations of Large Language Models (LLMs) while maintaining relevant information. The authors propose the use of Retrieval-Augmented Generation (RAG) for endpoint discovery, employing various chunking strategies to optimize the retrieval process.\n\nThe paper introduces two main approaches: the OpenAPI RAG and the Discovery Agent. The OpenAPI RAG applies RAG for endpoint discovery using seven chunking strategies, validated through the RestBench benchmark. The Discovery Agent further enhances the process by breaking down queries into smaller tasks, improving precision and F1 scores whil

In [10]:
# Load fetched paper data from JSON 

rag_papers = load_data(file_path="data/RAG/text/rag_papers.json")

Data successfully loaded from data/RAG/text/rag_papers.json


In [13]:
rag_papers

[{'paper_id': 1,
  'title': 'Advanced System Integration: Analyzing OpenAPI Chunking for\n  Retrieval-Augmented Generation',
  'summary': 'Integrating multiple (sub-)systems is essential to create advanced\nInformation Systems (ISs). Difficulties mainly arise when integrating dynamic\nenvironments across the IS lifecycle. A traditional approach is a registry that\nprovides the API documentation of the systems\' endpoints. Large Language Models\n(LLMs) have shown to be capable of automatically creating system integrations\n(e.g., as service composition) based on this documentation but require concise\ninput due to input token limitations, especially regarding comprehensive API\ndescriptions. Currently, it is unknown how best to preprocess these API\ndescriptions. Within this work, we (i) analyze the usage of Retrieval Augmented\nGeneration (RAG) for endpoint discovery and the chunking, i.e., preprocessing,\nof OpenAPIs to reduce the input token length while preserving the most relevant\

In [23]:
def create_rag_pdf(results, papers_data):
    # Initialize PDF with custom page size and margins
    pdf = FPDF(format='A4')
    pdf.set_auto_page_break(auto=True, margin=30)
    pdf.add_page()
    
    # Set margins
    margin = 25
    pdf.set_margins(margin, margin, margin)
    
    # Calculate effective width for text
    effective_width = pdf.w - (2 * margin)
    
    # Set font for main header
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(effective_width, 10, 'Top 5 papers in RAG posted in November 2024', ln=True, align='C')
    pdf.ln(10)
    
    # For each paper in results
    for paper_id_str, paper_content in results.items():
        paper_id = int(paper_id_str)
        paper_data = next((p for p in papers_data if p['paper_id'] == paper_id), None)
        if not paper_data:
            continue
        
        # Paper title
        pdf.set_font('Arial', 'B', 12)
        title = paper_data['title'].strip()
        pdf.multi_cell(effective_width, 8, title)
        
        # Hyperlinked text for paper link - aligned left
        pdf.set_font('Arial', '', 9)
        pdf.set_text_color(0, 0, 255)
        # Move cursor to left margin
        pdf.set_x(margin)
        pdf.cell(30, 8, 'Link to paper', ln=True, link=paper_data.get('link', ''), align='L')
        pdf.set_text_color(0, 0, 0)
        pdf.ln(5)
        
        # Process questions and answers
        for question, qa_content in paper_content.items():
            # Check if need new page
            if pdf.get_y() > pdf.h - 60:
                pdf.add_page()
            
            # Question
            pdf.set_font('Arial', 'B', 11)
            pdf.multi_cell(effective_width, 8, question)
            pdf.ln(2)
            
            # Answer
            answer = qa_content.get('answer', '')
            # Clean title references
            title_pattern = f"in the paper with( the title)? ['\"]?{paper_data['title']}['\"]?"
            answer = re.sub(title_pattern, "in the paper", answer, flags=re.IGNORECASE)
            
            pdf.set_font('Arial', '', 10)
            pdf.multi_cell(effective_width, 8, answer.strip())
            pdf.ln(8)
        
        # Add page break between papers
        if paper_id_str != list(results.keys())[-1]:
            pdf.add_page()
    
    # Save the PDF
    pdf.output('top_5_rag_papers_analysis.pdf')

In [24]:
# Create the PDF
create_rag_pdf(results, rag_papers)

  pdf.set_font('Arial', 'B', 16)
  pdf.cell(effective_width, 10, 'Top 5 papers in RAG posted in November 2024', ln=True, align='C')
  pdf.set_font('Arial', 'B', 12)
  pdf.set_font('Arial', '', 9)
  pdf.cell(30, 8, 'Link to paper', ln=True, link=paper_data.get('link', ''), align='L')
  pdf.set_font('Arial', 'B', 11)
  pdf.set_font('Arial', '', 10)
