In [None]:
import sys
!{sys.executable} -m pip install arxiv pymupdf sentence-transformers faiss-cpu SentenceTransformer tqdm




ERROR: Could not find a version that satisfies the requirement SentenceTransformer (from versions: none)
ERROR: No matching distribution found for SentenceTransformer

[notice] A new release of pip is available: 24.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import arxiv
from sentence_transformers import SentenceTransformer
import pymupdf
import faiss
import numpy as np
import os
import json
from tqdm import tqdm #Progress bar
import pickle #For saving and loading the FAISS index

  from .autonotebook import tqdm as notebook_tqdm


**Sentence-Transformers:** It is a populor open-source framework by Hugging Face community designed to make semantic embeddings. These embeddings capture semantic meaning rather than just word-level similarity

## Data Collection:

Retriving the most recent papers from arXiv in cs.CL catagoriy

In [None]:
def data_clection(max_results=50, categories="cs.CL", save_paperlist=True, paperlist_filename="paperlist.json"):

    query = f"cat:{categories}"

    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending
    )

    results = search.results()

    if not results:
        print("No results found.")  
        return None
    else:    
        #print(f"Found {len(list(results))} results.")

        # Create a dir to save PDFs if it doesn't exist
        os.makedirs("papers", exist_ok=True)

        paperlist = []
        print(f"Starting download of {max_results} papers in category {categories}...")
        for index, result in enumerate(results):
            text = f"{index+1}. Title: {result.title} downloading ..."

            result.download_pdf(dirpath=os.getcwd()+"\\papers", filename=f"paper_{index}.pdf")
            paperlist.append({
                "paper_title": result.title,
                "filename": f"papers/paper_{index+1}.pdf",
                "summary": result.summary[:200]
            })
            print(f"{text} ... Done")
            index += 1

        
        if save_paperlist:
            with open(paperlist_filename, 'w', encoding="utf-8") as f:
                json.dump(paperlist, f, ensure_ascii=False, indent=4)
            print(f"Paper list saved to {paperlist_filename}")

        print("Download completed!")
        return(paperlist)

print(data_clection(20))


  results = search.results()


Starting download of 20 papers in category cs.CL...
1. Title: Searching for Privacy Risks in LLM Agents via Simulation downloading ... ... Done
2. Title: A Survey on Diffusion Language Models downloading ... ... Done
3. Title: SSRL: Self-Search Reinforcement Learning downloading ... ... Done
4. Title: From Black Box to Transparency: Enhancing Automated Interpreting Assessment with Explainable AI in College Classrooms downloading ... ... Done
5. Title: Psyche-R1: Towards Reliable Psychological LLMs through Unified Empathy, Expertise, and Reasoning downloading ... ... Done
6. Title: Reinforced Language Models for Sequential Decision Making downloading ... ... Done
7. Title: Memory-Augmented Transformers: A Systematic Review from Neuroscience Principles to Technical Solutions downloading ... ... Done
8. Title: Beyond "Not Novel Enough": Enriching Scholarly Critique with LLM-Assisted Feedback downloading ... ... Done
9. Title: Pass@k Training for Adaptively Balancing Exploration and Exploi

## RAG Pipeline

In [44]:

class ragpipeline:
    def __init__(self, paperlist_filename="paperlist.json", model_name="all-MiniLM-L6-v2"):
        self.paperlist_filename = paperlist_filename
        self.model_name = model_name
        self.paperlist = self.load_paperlist()
        self.model = SentenceTransformer(self.model_name)
        self.index = None  # Initialize FAISS index
        self.chunks = []  
        self.chunk_metadata = []
        self.total_chunks = 0
        self.total_vectors = 0
        self.total_indexed = 0
        # --- File Paths for Saved Data ---
        self.FAISS_INDEX_FILE = "index.faiss"
        self.CHUNKS_FILE = "chunks.pkl"
        self.METADATA_FILE = "metadata.pkl"



        # New: Check for and load existing data on startup
        self.load_index()
        if self.index is None:
            print("No saved index found. The pipeline needs to be built first.")
        else:
            print("FAISS index and data loaded successfully.")


    def load_paperlist(self):
        if os.path.exists(self.paperlist_filename):
            with open(self.paperlist_filename, 'r', encoding="utf-8") as f:
                paperlist = json.load(f)
            return paperlist
        else:
            print(f"Paper list file {self.paperlist_filename} not found.")
            return []
    
    
    def extract_text_from_pdf(self, pdf_path:str) -> str:
        """ Text Extraction: Extract raw text from each PDF. Clean and concatenate the page text into full-document strings"""
        doc = None
        try:
            #Opening a document
            doc = pymupdf.open(pdf_path)
            pages=[]
            for page in doc:
                page_text = page.get_text().strip() # Get raw text from the page
                pages.append(page_text)
            full_text = "\n".join(pages)  # Concatenate all page texts into a single string
            return full_text
        except Exception as e:
            print(f"Error reading {pdf_path}: {e}")
        finally:
            if doc:
                doc.close()


    def chunk_text_sliding_window(self, text:str, max_token:int =512, overlap: int =50) -> list[str]:
        """Chunking Logic (Sliding Windows)"""
        tokens = text.split()  # Simple tokenization by whitespace
        chunks = []
        step = max_token - overlap
        for i in range(0, len(tokens), step):
            chunk = tokens[i:i + max_token]
            chunks.append(" ".join(chunk))
            if i + max_token >= len(tokens):
                break
        return chunks
    

    def embadding_text_chunks(self, chunks: list[str]) -> list[tuple[str, list[float]]]:
        """Embedding Logic: Convert text chunks into embeddings using a pre-trained model.
           Sample return format: [("This is chunk 1", embedding1), ("This is chunk 2", embedding2)]
        """
        if not chunks:
            print("No text chunks to embed.")
            return []
        embeddings = self.model.encode(chunks, show_progress_bar=True) # GPU and pyTorch: convert_to_tensor=True  
        return embeddings


    def build_index(self, embeddings: np.ndarray) -> list[str]:
        """Build FAISS Index
            embeddings should be a 2D numpy array of shape (num_chunks, dimension), example: (100, 384)
        """
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)  # FAISS needs to know the dimensionality of the vectors it will be indexing. L2 distance (Euclidean distance)
        self.index.add(embeddings.astype(np.float32))  # Ensure embeddings are in float32 format
        # print(f"FAISS index built with {embeddings.shape[0]} vectors of dimension {dim}.")
        # print(f"Number of vectors in index: {self.index.ntotal}")

    def save_index(self,index_path="faiss_data"):
        """
        Saves the FAISS index, chunks, and metadata to disk.
        """

        if self.index is None:
            print("No index to save.")
            return

        try:
            # Create directory if it doesn't exist
            os.makedirs(index_path, exist_ok=True)
            
            # Define file paths
            faiss_index_file = os.path.join(index_path, self.FAISS_INDEX_FILE)
            chunks_file = os.path.join(index_path, self.CHUNKS_FILE)
            metadata_file = os.path.join(index_path, self.METADATA_FILE)

            # Save the FAISS index
            faiss.write_index(self.index, faiss_index_file)
            print(f"\nFAISS index saved to {faiss_index_file}")

            # Save the chunks list
            with open(chunks_file, 'wb') as f:
                pickle.dump(self.chunks, f)
            print(f"Chunks saved to {chunks_file}")

            # Save the chunk metadata
            with open(metadata_file, 'wb') as f:
                pickle.dump(self.chunk_metadata, f)
            print(f"Metadata saved to {metadata_file}")

        except Exception as e:
            print(f"Error saving files: {e}")

        
    def load_index(self, index_path="faiss_data") -> bool:
        """
        Loads the FAISS index, chunks, and metadata from disk if they exist.
        """
        FAISS_INDEX_FILE = os.path.join(index_path, self.FAISS_INDEX_FILE)
        CHUNKS_FILE = os.path.join(index_path, self.CHUNKS_FILE)
        METADATA_FILE = os.path.join(index_path, self.METADATA_FILE)
        
        if os.path.exists(FAISS_INDEX_FILE) and os.path.exists(CHUNKS_FILE) and os.path.exists(METADATA_FILE):
            try:
                # Load the FAISS index
                self.index = faiss.read_index(FAISS_INDEX_FILE)
                
                # Load the chunks list
                with open(CHUNKS_FILE, 'rb') as f:
                    self.chunks = pickle.load(f)

                # Load the chunk metadata
                with open(METADATA_FILE, 'rb') as f:
                    self.chunk_metadata = pickle.load(f)
                
                return True
            except Exception as e:
                print(f"Error loading saved files: {e}")
                self.index = None
                self.chunks = []
                self.chunk_metadata = []
                return False
        return False


    def search(self, query: str, k: int = 3) -> tuple[np.ndarray, np.ndarray]:
        """Search the FAISS index for the k nearest neighbors of the query embedding.
           Returns distances and indices of the nearest neighbors.
        """
        if self.index is None:
            raise ValueError("FAISS index is not built. Call build_faiss_index() first.")
        
        # Get query_embedding from the text
        query_embedding = self.model.encode([query], show_progress_bar=False)  # Encode the query
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)
        if query_embedding.shape[0] != 1:
            raise ValueError("Query embedding should be a single vector, but got shape: {}".format(query_embedding.shape))
        if query_embedding.shape[1] != self.index.d:
            raise ValueError(f"Query embedding dimension {query_embedding.shape[1]} does not match index dimension {self.index.d}.")
        
        query_embedding = query_embedding.astype('float32')
        distances, indices = self.index.search(query_embedding, k)  # Search the index
        print(f"Search completed. Found {len(distances[0])} nearest neighbors.")

        result=[]
        for indice, distance in zip(indices[0], distances[0]):
            result.append({
                "distance": float(distance),
                "chunk": self.chunks[indice],
                "metadata":self.chunk_metadata[indice]
            })

        return result
        

    def build_rag_runner(self):
        """ Process papers """
       
        if not self.paperlist:
            print("No papers found in the paper list. Please run data clection first.")
            return
        
        print(f"Building RAG from {len(self.paperlist)} PDF files ...")

        idx =1
        all_embeddings = []
        for paper in tqdm(self.paperlist, desc="Processing papers"):

            text = self.extract_text_from_pdf(paper['filename'])
            if text:
                chunks = self.chunk_text_sliding_window(text)
                embeddings = self.embadding_text_chunks(chunks)
                
                # Store chunks and metadata, and append embeddings to a single list
                for i, chunk in enumerate(chunks):
                    self.chunks.append(chunk)
                    self.chunk_metadata.append({
                        "paper_title": paper['paper_title'],
                        "filename": paper['filename'],
                        "chunk_index_in_paper": i,
                    })
                all_embeddings.append(embeddings)
                # For statistics
                self.total_chunks += len(chunks)
                self.total_vectors += len(embeddings)
               
                idx += 1  

        if all_embeddings:
            final_embeddings =np.concatenate(all_embeddings, axis=0)  # Concatenate all embeddings into a single array

            # Build the FAISS index with the final embeddings
            self.build_index(final_embeddings)

            # Finally, save the built index and collected data
            self.save_index()


## Build RAG

In [None]:
print("\nRAG Pipeline Starting ...")
rag = ragpipeline(paperlist_filename="paperlist.json", model_name="all-MiniLM-L6-v2")
rag.build_rag_runner()

#Statistics
print("\nTotal papers processed: ", len(rag.paperlist))
print("Total text chunks created: ", rag.total_chunks)
print("Total embeddings generated: ", rag.total_vectors)
#print("Total vectors indexed in FAISS: ", rag.total_indexed)

print("\nRAG Pipeline Completed.") 


RAG Pipeline Starting ...
No saved index found. The pipeline needs to be built first.
Building RAG from 20 PDF files ...


Batches: 100%|██████████| 2/2 [00:01<00:00,  1.50it/s]/s]
Batches: 100%|██████████| 2/2 [00:01<00:00,  1.65it/s]  1.46s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]  1.43s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.06it/s]  1.10s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]  1.14it/s]
Batches: 100%|██████████| 2/2 [00:01<00:00,  1.92it/s]  1.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]  1.10it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  2.15it/s]  1.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.43it/s]  1.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.48it/s]  1.31it/s]
Batches: 100%|██████████| 2/2 [00:01<00:00,  1.21it/s],  1.31it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.81it/s],  1.07s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.49it/s],  1.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.41it/s],  1.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.48it/s],  1.54it/s]
Batches: 100%

FAISS index saved to faiss_data\index.faiss
Chunks saved to faiss_data\chunks.pkl
Metadata saved to faiss_data\metadata.pkl

Total papers processed:  20
Total text chunks created:  552
Total embeddings generated:  552

RAG Pipeline Completed.





## Search

Search through the saved faiss index data

In [46]:
rag = ragpipeline(paperlist_filename="paperlist.json", model_name="all-MiniLM-L6-v2")

query = "What is BERT and how does it work?"
#query ="What are transformers in NLP?"
results = rag.search(query, k=3)

print("\nQuestion:", query)
for idx, result in enumerate(results):
    #print(result)
    print(f"\nResult: {idx + 1}")
    print(f"Title: {result['metadata']['paper_title']}")
    print(f"Filename: {result['metadata']['filename']}")
    print(f"Distance: {result['distance']:.4f}")
    print(f"Chunk Choosed: {result['chunk'][:200]}...")
   

FAISS index and data loaded successfully.
Search completed. Found 3 nearest neighbors.

Question: What is BERT and how does it work?

Result: 1
Title: Searching for Privacy Risks in LLM Agents via Simulation
Filename: papers/paper_1.pdf
Distance: 1.0834
Chunk Choosed: volume 1 (long and short papers), 2019, pp. 4171–4186. [131] Y. Liu, M. Ott, N. Goyal, J. Du, M. Joshi, D. Chen, O. Levy, M. Lewis, L. Zettlemoyer, and V. Stoyanov, “Roberta: A ro- bustly optimized be...

Result: 2
Title: Continuous Bangla Sign Language Translation: Mitigating the Expense of Gloss Annotation with the Assistance of Graph
Filename: papers/paper_12.pdf
Distance: 1.1304
Chunk Choosed: that fine-tunes BERT-based models to predict human judgments of text quality. It combines semantic similarity, flu- ency, and grammaticality into a single score, and has been widely used to evaluate m...

Result: 3
Title: Pass@k Training for Adaptively Balancing Exploration and Exploitation of Large Reasoning Models
Filename: pa