In [1]:
import os
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Path to your data folder
DATA_PATH = "Data/"

# Function to extract text from PDFs
def load_pdfs(data_path):
    texts = []
    for file in os.listdir(data_path):
        if file.endswith(".pdf"):
            doc = fitz.open(os.path.join(data_path, file))
            pdf_text = ""
            for page in doc:
                pdf_text += page.get_text()
            texts.append({"file": file, "text": pdf_text})
    return texts

# Load PDFs
documents = load_pdfs(DATA_PATH)

print(f"Loaded {len(documents)} PDFs")
print("Example file:", documents[0]["file"])
print("First 500 characters:\n", documents[0]["text"][:500])


Loaded 6 PDFs
Example file: Attention Is All You Need.pdf
First 500 characters:
 Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu
Łukasz K


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

all_chunks = []
for doc in documents:
    chunks = splitter.split_text(doc["text"])
    for i, chunk in enumerate(chunks):
        all_chunks.append({
            "file": doc["file"],
            "chunk_id": i,
            "text": chunk
        })

print(f"Total chunks created: {len(all_chunks)}")
print("Example chunk:\n", all_chunks[0]["text"][:300])


Total chunks created: 663
Example chunk:
 Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Par


In [3]:
import chromadb
from chromadb.utils import embedding_functions

chroma_client = chromadb.Client()

embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"  
)

collection = chroma_client.get_or_create_collection(
    name="arxiv_papers",
    embedding_function=embedding_func
)

for chunk in all_chunks:
    collection.add(
        documents=[chunk["text"]],
        metadatas=[{"file": chunk["file"], "chunk_id": chunk["chunk_id"]}],
        ids=[f'{chunk["file"]}_{chunk["chunk_id"]}']
    )

print("✅ All chunks stored in ChromaDB!")
print("Collection size:", collection.count())


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✅ All chunks stored in ChromaDB!
Collection size: 663


In [4]:
# Example query
query = "What is the main contribution of the Transformer model?"

# Search in ChromaDB
results = collection.query(
    query_texts=[query],
    n_results=3
)

for i, doc in enumerate(results["documents"][0]):
    print(f"\nResult {i+1}:")
    print(doc[:300], "...")
    print("From:", results["metadatas"][0][i]["file"])



Result 1:
use publicly available data, making our work com-
patible with open-sourcing, while most existing
models rely on data which is either not publicly
available or undocumented (e.g. “Books – 2TB” or
“Social media conversations”). There exist some
exceptions, notably OPT (Zhang et al., 2022),
GPT-NeoX ( ...
From: LLaMa.pdf

Result 2:
Transformer Language Models.
Transformer (Vaswani et al., 2017) is a sequence-to-sequence
architecture that makes heavy use of self-attention. Radford et al. (a) applied it to autoregressive lan-
guage modeling by using a stack of Transformer decoders. Since then, Transformer-based language
models h ...
From: LoRA.pdf

Result 3:
the universal transformer [DGV+18]. Our work focuses on the ﬁrst approach (scaling compute and parameters together,
by straightforwardly making the neural net larger), and increases model size 10x beyond previous models that employ
this strategy.
Several efforts have also systematically studied the  ...
From: GPT-3.pdf


In [6]:
import ollama

response = ollama.chat(
    model="mistral",
    messages=[
        {"role": "system", "content": "You are an expert AI/ML assistant."},
        {"role": "user", "content": "Explain Transformers in simple terms."}
    ]
)

print(response)


model='mistral' created_at='2025-09-03T21:08:08.0035798Z' done=True done_reason='stop' total_duration=58944573700 load_duration=15315563600 prompt_eval_count=24 prompt_eval_duration=2362500400 eval_count=332 eval_duration=41255714700 message=Message(role='assistant', content=' Sure! Transformers are a type of model used in artificial intelligence, particularly in the field of natural language processing (NLP). They were introduced in a paper called "Attention is All You Need" by Vaswani et al., published in 2017.\n\nThe key concept behind Transformers is self-attention, which allows the model to focus on different words or parts of a sentence when generating an output, rather than simply considering each word sequentially. This makes Transformers more efficient and powerful compared to traditional recurrent neural networks (RNNs) for certain tasks, such as translation and language modeling.\n\nIn essence, Transformers work by using a mechanism called "attention" to weigh the importance

In [None]:
def ask_arxiv_bot_with_sources(question, top_k=3):
    if not question.strip():
        return " Please enter a question."
    
    # Retrieve top-k chunks
    results = collection.query(
        query_texts=[question],
        n_results=top_k
    )
    
    if not results["documents"][0]:
        return " No relevant content found in the PDFs."
    
    context = "\n\n".join(results["documents"][0])
    
    # Ollama chat
    try:
        messages = [
            {"role": "system", "content": "You are an expert AI/ML assistant."},
            {"role": "user", "content": f"Answer the question based ONLY on the context below. If not in the context, say 'I don't know'.\n\nContext:\n{context}\n\nQuestion:\n{question}"}
        ]
        response = ollama.chat(model="mistral", messages=messages)
        generated_text = response.message.content  # <-- extract text only
    except Exception as e:
        return f" Error generating answer: {e}"
    
    # Collect sources
    sources = [md["file"] for md in results["metadatas"][0]]
    
    return f"{generated_text}\n\n Sources: {', '.join(set(sources))}"


In [None]:
def add_pdfs_to_collection(file_paths):
    for path in file_paths:
        # path is now a string path
        doc = fitz.open(path)  
        text = ""
        for page in doc:
            text += page.get_text()
        chunks = splitter.split_text(text)
        for i, chunk in enumerate(chunks):
            collection.add(
                documents=[chunk],
                metadatas=[{"file": os.path.basename(path), "chunk_id": i}],
                ids=[f'{os.path.basename(path)}_{i}']
            )
    return " PDFs added to collection!"


In [33]:
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("## Arxiv RAG AI Assistant")
    
    # Upload PDFs (fixed)
    pdf_input = gr.File(file_types=[".pdf"], type="filepath", label="Upload new PDFs", file_count="multiple")
    
    # Question input
    question_input = gr.Textbox(lines=2, placeholder="Type your question here...", label="Ask a question")
    
    # Answer output
    answer_output = gr.Textbox(label="Answer")
    
    # Buttons
    upload_btn = gr.Button("Add PDFs")
    ask_btn = gr.Button("Ask Question")

    upload_btn.click(lambda files: add_pdfs_to_collection(files), inputs=[pdf_input], outputs=[answer_output])
    ask_btn.click(
    fn=ask_arxiv_bot_with_sources,  # must return a string
    inputs=[question_input],
    outputs=[answer_output]
)

    
demo.launch()


* Running on local URL:  http://127.0.0.1:7868
* To create a public link, set `share=True` in `launch()`.


