# Building a RAG system

## A Basic RAG Architecture

In [53]:
 from IPython.display import Markdown, display


# Location of the Mermaid source for the architecture diagram.
mermaid_file = "rag-system-architecture.mermaid"

# Read the complete diagram definition as text.
with open(mermaid_file, mode="r") as file:
    mermaid_code = file.read()

# Wrap the source in a ```mermaid fence and hand it to the notebook as Markdown.
display(Markdown("```mermaid\n" + mermaid_code + "\n```"))


```mermaid
graph TD
    A[User Input] -->|Query| B(Retriever)
    B <-->|Fetch/Return Documents| C[(Knowledge Base / Document Store)]
    B -->|Retrieved Context| D(Language Model LLM)
    A -->|Original Query| D
    D --> E[Generated Response]

    classDef input fill:#e1f5fe,stroke:#01579b,stroke-width:2px,color:black;
    classDef process fill:#e8f5e9,stroke:#2e7d32,stroke-width:2px,color:black;
    classDef data fill:#fff3e0,stroke:#ff6f00,stroke-width:2px,color:black;
    classDef output fill:#fce4ec,stroke:#c2185b,stroke-width:2px,color:black;

    class A input;
    class B,D process;
    class C data;
    class E output;
```

### Let's take a quick peek into the data!

In [54]:
!ls

archive.zip                      rag-system-architecture.mermaid
arxiv-metadata-oai-snapshot.json tfidf_index_arxiv.pkl
building_basic_RAG_system.ipynb


In [55]:
import json


# Method 1: Quick peek at first few records
def peek_multiple_records(num_records=3, json_path='arxiv-metadata-oai-snapshot.json'):
    """Pretty-print the first few records of a JSON Lines file.

    Only the lines that are printed get read, so this is safe to use on a
    very large file.

    Args:
        num_records: Maximum number of records to print.
        json_path: File with one JSON object per line. Defaults to the arXiv
            metadata snapshot used throughout this notebook.

    Raises:
        json.JSONDecodeError: If one of the inspected lines is not valid JSON.
    """
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(json_path, 'r', encoding='utf-8') as f:
        for i in range(num_records):
            line = next(f, None)
            if line is None:
                # The file holds fewer records than were requested.
                print("Reached end of file")
                break
            data = json.loads(line)
            print(f"\nRecord {i+1}:")
            print(json.dumps(data, indent=2))

In [56]:
peek_multiple_records(3)


Record 1:
{
  "id": "0704.0001",
  "submitter": "Pavel Nadolsky",
  "authors": "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  "title": "Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies",
  "comments": "37 pages, 15 figures; published version",
  "journal-ref": "Phys.Rev.D76:013009,2007",
  "doi": "10.1103/PhysRevD.76.013009",
  "report-no": "ANL-HEP-PR-07-12",
  "categories": "hep-ph",
  "license": null,
  "abstract": "  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemons

## Basic Retrieval without LLMs

### Step 1: Create an index and compute [TF-IDF](https://www.capitalone.com/tech/machine-learning/understanding-tf-idf/) vectors for each article (title + abstract)

In [25]:
import json
import pickle
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

def create_index(json_path, index_save_path='tfidf_index_arxiv.pkl'):
    """Build a TF-IDF index over a JSON Lines file of articles and pickle it.

    Each line of ``json_path`` must be a JSON object with ``title`` and
    ``abstract`` keys. The text that gets vectorized for an article is its
    title followed by its abstract.

    Args:
        json_path: Path to the JSON Lines input file.
        index_save_path: Where to write the pickled index. The pickle is a
            dict with the keys ``vectorizer``, ``tfidf_matrix`` and
            ``metadata``.

    Returns:
        Tuple ``(vectorizer, tfidf_matrix, metadata)`` where ``metadata[i]``
        holds the title and abstract of the i-th indexed document.
    """
    documents = []  # text fed to the vectorizer, one entry per article
    metadata = []   # title/abstract per article, in the same order as `documents`

    # Stream the file line by line so the raw JSON text is never held in full.
    print("Reading documents...")
    # Explicit UTF-8 so reading does not depend on the platform's locale.
    with open(json_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing in json.loads.
                continue
            article = json.loads(line)
            title = article['title']
            abstract = article['abstract']
            documents.append(f"{title} {abstract}")
            metadata.append({
                'title': title,
                'abstract': abstract
            })

    print("Creating TFIDF vectors...")
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    print("Saving index...")
    with open(index_save_path, 'wb') as f:
        pickle.dump({
            'vectorizer': vectorizer,
            'tfidf_matrix': tfidf_matrix,
            'metadata': metadata
        }, f)

    print("Index created and saved!")
    return vectorizer, tfidf_matrix, metadata

In [57]:
vectorizer, tfidf_matrix, metadata = create_index('arxiv-metadata-oai-snapshot.json', index_save_path='tfidf_index_arxiv.pkl')

Reading documents...


2631725it [00:31, 84045.08it/s] 


Creating TFIDF vectors...
Saving index...
Index created and saved!


### Step 2: Create an interface for retrieval

In [58]:
import gradio as gr
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class Searcher:
    """Rank indexed documents against a free-text query by cosine similarity.

    The index is a pickled dict with the keys ``vectorizer``,
    ``tfidf_matrix`` and ``metadata``.
    """

    def __init__(self, index_path='tfidf_index_arxiv.pkl'):
        """Load the pickled index from ``index_path``."""
        print("Loading index...")
        # NOTE(security): unpickling can execute arbitrary code. Only load
        # index files that you created yourself.
        with open(index_path, 'rb') as f:
            data = pickle.load(f)
        self.vectorizer = data['vectorizer']
        self.tfidf_matrix = data['tfidf_matrix']
        self.metadata = data['metadata']

    def search(self, query, top_k=5):
        """Return the ``top_k`` best matches for ``query``, best first.

        Args:
            query: Free-text search string.
            top_k: Maximum number of results. Zero or a negative value
                yields an empty list.

        Returns:
            List of dicts with the keys ``title``, ``abstract`` and ``score``.
        """
        if top_k <= 0:
            # Without this guard `[-0:]` below would select *every* document.
            return []

        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]
        # argsort is ascending: keep the last `top_k` positions, then flip
        # them so the highest score comes first.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        return [
            {
                'title': self.metadata[idx]['title'],
                'abstract': self.metadata[idx]['abstract'],
                'score': similarities[idx]
            }
            for idx in top_indices
        ]

# Gradio interface
def setup_gradio():
    """Create the Gradio search page backed by a newly loaded ``Searcher``.

    Returns:
        The ``gr.Interface`` object; the caller is responsible for launching it.
    """
    searcher = Searcher()

    def search_documents(query):
        """Render the hits for ``query`` as a single plain-text report."""
        hits = searcher.search(query)
        sections = [
            f"Result {rank} (Score: {hit['score']:.3f})\n"
            f"Title: {hit['title']}\n"
            f"Abstract: {hit['abstract']}\n\n"
            for rank, hit in enumerate(hits, 1)
        ]
        return "".join(sections)

    query_box = gr.Textbox(lines=2, placeholder="Enter your search query...")
    report_box = gr.Textbox(lines=20)

    return gr.Interface(
        fn=search_documents,
        inputs=query_box,
        outputs=report_box,
        title="ArXiv Paper Search using TF-IDF",
        description="Search through ArXiv papers using TF-IDF based similarity",
    )

In [59]:
iface = setup_gradio()
iface.launch()

Loading index...
* Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




# API Keys and Secrets:

In [64]:
# Add Azure openAI credentials

from openai import AzureOpenAI

import os  # imported in this cell so it stays runnable on its own

# Read the credentials from the environment instead of pasting them into the
# notebook, where they would be saved (and shared) along with the file.
# Export these variables before starting Jupyter. A variable that is not set
# falls back to an empty string.
client = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", ""),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", ""),
    api_version=os.environ.get("OPENAI_API_VERSION", ""),
)



# arXiv paper chatbot using GPT-4o mini

In [60]:
# Azure OpenAI deployment name; ArxivBot passes it as the `model` argument of
# every chat completion request.
deployment_name = "gpt-4o-mini"

In [61]:
from openai import AzureOpenAI
import json
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
import pickle

class ArxivBot:
    """Chatbot that answers questions using papers retrieved from a TF-IDF index.

    For every user message the bot retrieves the best-matching papers from
    the pickled index, places them in the system prompt, and asks the chat
    model for an answer.
    """

    def __init__(self, index_path='tfidf_index_arxiv.pkl', deployment_name=None, client=None):
        """Load the index and remember which client/deployment to call.

        Args:
            index_path: Pickled dict with the keys ``vectorizer``,
                ``tfidf_matrix`` and ``metadata``.
            deployment_name: Model/deployment name sent with each request.
                Defaults to the notebook-level ``deployment_name``.
            client: Chat client exposing ``chat.completions.create``.
                Defaults to the notebook-level ``client``.

        Raises:
            NameError: If an argument is omitted and the matching
                notebook-level variable has not been defined.
        """
        # Look the defaults up when the bot is created, not when the class is
        # defined: otherwise re-running the credentials cell would not affect
        # bots created afterwards.
        if client is None:
            client = globals().get('client')
            if client is None:
                raise NameError("No `client` passed and no notebook-level `client` is defined")
        if deployment_name is None:
            deployment_name = globals().get('deployment_name')
            if deployment_name is None:
                raise NameError(
                    "No `deployment_name` passed and no notebook-level `deployment_name` is defined"
                )

        self.client = client
        self.deployment_name = deployment_name

        # Load TFIDF index
        print("Loading index...")
        # NOTE(security): unpickling can execute arbitrary code. Only load
        # index files that you created yourself.
        with open(index_path, 'rb') as f:
            data = pickle.load(f)
        self.vectorizer = data['vectorizer']
        self.tfidf_matrix = data['tfidf_matrix']
        self.metadata = data['metadata']

    def search(self, query, top_k=3):
        """Return the ``top_k`` best matches for ``query``, best first.

        Each result is a dict with the keys ``title``, ``abstract`` and
        ``score``. Zero or a negative ``top_k`` yields an empty list.
        """
        if top_k <= 0:
            # Without this guard `[-0:]` below would select *every* document.
            return []

        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]
        # argsort is ascending: keep the last `top_k` positions, then flip
        # them so the highest score comes first.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        return [
            {
                'title': self.metadata[idx]['title'],
                'abstract': self.metadata[idx]['abstract'],
                'score': similarities[idx]
            }
            for idx in top_indices
        ]

    def get_response(self, query, history):
        """Answer ``query`` using retrieved papers plus the earlier conversation.

        Args:
            query: The user's current message. Retrieval uses only this text.
            history: Earlier turns as ``[user_message, bot_message]`` pairs;
                a falsy ``bot_message`` is skipped.

        Returns:
            The content of the first choice returned by the chat model.
        """
        # Search relevant papers
        search_results = self.search(query)

        # Build the context once. An empty result list gives an empty string
        # rather than leaving the variable unbound.
        retrieved_papers = "\n".join(
            f"[Paper {i}]\n"
            f"Title: {res['title']}\n"
            f"Abstract: {res['abstract']}\n"
            f"Relevance Score: {res['score']:.2f}\n"
            for i, res in enumerate(search_results, 1)
        )

        # Prepare messages for the chat model
        messages = [
            { "role": "system",
        "content": f"""You are ArxivBot, a specialized research assistant focusing on arXiv papers. You are currently analyzing these relevant papers:

                                {retrieved_papers}
                                
                                Your goal is to provide informative and helpful responses:
            1. Use the information from the papers to provide detailed, substantive answers
            2. When papers contain relevant information, explain the concepts thoroughly
            3. If papers partially address the question, share what information is available and explain how it relates
            4. Only mention limitations if the papers truly contain no relevant information
            
            When responding:
            - Synthesize information across multiple papers when relevant
            - Explain technical concepts in an accessible way
            - Cite papers using "Paper X mentions/discusses/shows..."
            - Focus on being helpful while maintaining accuracy
                                """},
        ]

        # Add chat history
        for h in history:
            messages.append({"role": "user", "content": h[0]})
            if h[1]:
                messages.append({"role": "assistant", "content": h[1]})

        # Add current query
        messages.append({"role": "user", "content": query})

        # Get response from the chat model
        response = self.client.chat.completions.create(
            model=self.deployment_name,
            messages=messages,
            temperature=0.7,
            max_tokens=800
        )

        return response.choices[0].message.content

def chat_with_arxiv():
    """Assemble the Gradio chat page that talks to a new ``ArxivBot``.

    Returns:
        The ``gr.Blocks`` app; the caller is responsible for launching it.
    """
    arxiv_bot = ArxivBot()

    with gr.Blocks() as app:
        chat_window = gr.Chatbot(label="Chat with ArxivBot")
        message_box = gr.Textbox(label="Message")
        clear_button = gr.Button("Clear")

        def user(user_message, history):
            """Empty the textbox and append the message as an unanswered turn."""
            extended_history = history + [[user_message, None]]
            return "", extended_history

        def bot_response(history):
            """Fill in the bot's reply for the most recent (unanswered) turn."""
            pending_turn = history[-1]
            earlier_turns = history[:-1]
            pending_turn[1] = arxiv_bot.get_response(pending_turn[0], earlier_turns)
            return history

        # Step 1 shows the user's message immediately (unqueued); step 2 then
        # asks the bot for its answer.
        submitted = message_box.submit(
            user,
            [message_box, chat_window],
            [message_box, chat_window],
            queue=False,
        )
        submitted.then(bot_response, chat_window, chat_window)

        # Returning None for the chat component clears the conversation.
        clear_button.click(lambda: None, None, chat_window, queue=False)

    return app

In [63]:
demo = chat_with_arxiv()
demo.launch()

Loading index...




* Running on local URL:  http://127.0.0.1:7868

To create a public link, set `share=True` in `launch()`.


