The goal of this step (step 1) is to ingest the documents, split them into chunks, embed them, and store the embeddings in a vector database, e.g. FAISS (Facebook AI Similarity Search).

In [5]:
#Installing the lib and importing them

!pip install -qU langchain cohere


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: C:\Users\sri19\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [8]:
!pip install -qU "faiss-cpu" "tiktoken" "PyPDF2"
!pip install  -qU "sentence-transformers" "python-dotenv"


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: C:\Users\sri19\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: C:\Users\sri19\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Setting up API key 

In [14]:
import os

import cohere
from dotenv import load_dotenv

# Copy the variables from a local .env file into the process environment.
load_dotenv()

# NOTE(review): assumes the key is stored under COHERE_API_KEY in .env —
# adjust the variable name if your .env uses a different one.
api_key = os.getenv("COHERE_API_KEY")
if not api_key:
    raise RuntimeError("COHERE_API_KEY is not set; add it to your .env file.")

# Shared Cohere client; the embedding cells call `co.embed(...)` and the
# RAG cell passes `api_key` to the LLM wrapper.
co = cohere.Client(api_key)


True

In [None]:
!pip install cryptography


Loading the PDF / Documents

In [None]:
import json
from pathlib import Path
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader

# Every loaded item (arXiv records first, then PDF documents) ends up in this list.
documents = []

# Cap on the number of arXiv records ingested while testing.
# Set to None to ingest the full snapshot.
MAX_ARXIV_PAPERS = 100

# === 1. Load from arXiv JSON ===
# The snapshot is read line by line: each non-empty line is one JSON record.
json_path = Path("../../archive/arxiv-metadata-oai-snapshot.json")
if json_path.exists():
    with json_path.open("r", encoding="utf-8") as f:
        for line in f:
            if MAX_ARXIV_PAPERS is not None and len(documents) >= MAX_ARXIV_PAPERS:
                break
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads

            entry = json.loads(line)

            # `or ""` also covers fields that are present but null in the record.
            title = (entry.get("title") or "").strip()
            abstract = (entry.get("abstract") or "").strip()
            categories = (entry.get("categories") or "").strip()
            authors = (entry.get("authors") or "").strip()
            update_date = entry.get("update_date", "")

            # The searchable text is title + abstract + categories;
            # everything else travels along as metadata.
            content = f"Title: {title}\n\nAbstract: {abstract}\n\nCategories: {categories}"
            metadata = {
                "arxiv_id": entry.get("id", ""),
                "authors": authors,
                "update_date": update_date
            }

            documents.append(Document(page_content=content, metadata=metadata))
    print(f" Loaded {len(documents)} papers from arXiv JSON")
else:
    print(" arXiv JSON file not found")

# === 2. Load new PDFs ===
pdf_path = Path("../NewPapers")
if pdf_path.exists():
    # Sorted so the ingestion order (and therefore the index) is reproducible.
    pdf_files = sorted(pdf_path.glob("*.pdf"))
    for file in pdf_files:
        loader = PyPDFLoader(str(file))
        documents.extend(loader.load())
    print(f" Loaded {len(pdf_files)} PDFs with {len(documents)} total documents (JSON + PDF)")
else:
    print(" PDF folder ../NewPapers not found.")


 Loaded 100 papers from arXiv JSON


Now splitting the documents into chunks

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# The same documents are split at two granularities. The overlap between
# neighbouring chunks keeps context that would otherwise be cut at a chunk
# boundary. small_chunks are used for factual lookups, large_chunks for
# summarization tasks.

# Small chunks for precise retrieval
small_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)
small_chunks = small_splitter.split_documents(documents)

# Large chunks for broader context
large_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1500)
large_chunks = large_splitter.split_documents(documents)





## Embedding Small chunks first 

In [9]:
from tqdm import tqdm  # progress bar
import time

def embed_in_batches(
    texts,
    batch_size=96,  # 96 is safe for Cohere's free tier
    input_type="search_document",
    desc=None,
    max_retries=3,
    retry_delay=5,
):
    """Embed `texts` with Cohere in batches, retrying a batch that fails.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts sent per `co.embed` call.
        input_type: Value forwarded to `co.embed` as `input_type`.
        desc: Optional label for the progress bar.
        max_retries: Attempts made per batch before giving up (must be >= 1).
        retry_delay: Seconds to wait between attempts.

    Returns:
        A list with one embedding per input text, in the same order as `texts`.

    Raises:
        ValueError: If `max_retries` is smaller than 1.
        Exception: The last error from `co.embed` once a batch has failed
            `max_retries` times. Skipping the batch instead would leave the
            result shorter than `texts` and no longer aligned with it.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc=desc):
        batch = texts[i:i + batch_size]
        for attempt in range(1, max_retries + 1):
            try:
                response = co.embed(
                    texts=batch,
                    model="embed-english-v3.0",
                    input_type=input_type
                )
            except Exception as e:
                print(f" Failed on batch {i // batch_size + 1} (attempt {attempt}/{max_retries}): {e}")
                if attempt == max_retries:
                    raise
                time.sleep(retry_delay)  # Retry buffer for rate limits or server errors
            else:
                all_embeddings.extend(response.embeddings)
                break
    return all_embeddings

#  Prepare texts: the raw string of every small chunk, in chunk order
small_texts = [chunk.page_content for chunk in small_chunks]

#  Run batching
small_embeddings = embed_in_batches(small_texts)

print("Embedded", len(small_embeddings), "small chunks.")


100%|██████████| 5/5 [00:02<00:00,  2.22it/s]

Embedded 387 small chunks.





## Embedding larger chunks

In [10]:
from tqdm import tqdm
import time

def embed_in_batches(
    texts,
    batch_size=96,
    input_type="search_document",
    desc="Embedding large chunks",
    max_retries=3,
    retry_delay=5,
):
    """Embed `texts` with Cohere in batches, retrying a batch that fails.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts sent per `co.embed` call.
        input_type: Value forwarded to `co.embed` as `input_type`.
        desc: Label shown on the progress bar.
        max_retries: Attempts made per batch before giving up (must be >= 1).
        retry_delay: Seconds to wait between attempts.

    Returns:
        A list with one embedding per input text, in the same order as `texts`.

    Raises:
        ValueError: If `max_retries` is smaller than 1.
        Exception: The last error from `co.embed` once a batch has failed
            `max_retries` times. Skipping the batch instead would leave the
            result shorter than `texts` and no longer aligned with it.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc=desc):
        batch = texts[i:i + batch_size]
        for attempt in range(1, max_retries + 1):
            try:
                response = co.embed(
                    texts=batch,
                    model="embed-english-v3.0",
                    input_type=input_type
                )
            except Exception as e:
                print(f" Error on batch {i // batch_size + 1} (attempt {attempt}/{max_retries}): {e}")
                if attempt == max_retries:
                    raise
                time.sleep(retry_delay)  # wait a bit before retrying
            else:
                all_embeddings.extend(response.embeddings)
                break
    return all_embeddings

# Collect the raw string of every large chunk, keeping chunk order.
large_texts = [doc.page_content for doc in large_chunks]

# Send the texts through the batched embedding helper.
large_embeddings = embed_in_batches(large_texts)

# Report how many embeddings came back.
print("Embedded", len(large_embeddings), "large chunks.")


Embedding large chunks: 100%|██████████| 2/2 [00:01<00:00,  1.91it/s]

Embedded 126 large chunks.





## Saving Small and Large Embeddings to FAISS

In [11]:
from langchain.schema import Document

# Build the LangChain documents that go into the vector stores.
# They are created from the chunks (not from bare strings) so that each
# chunk's metadata is stored next to its text and comes back with retrieved
# source documents.
small_docs = [
    Document(page_content=chunk.page_content, metadata=dict(chunk.metadata))
    for chunk in small_chunks
]
large_docs = [
    Document(page_content=chunk.page_content, metadata=dict(chunk.metadata))
    for chunk in large_chunks
]


In [12]:
# Embedding wrapper

from langchain_core.embeddings import Embeddings

class WorkingCohereEmbeddings(Embeddings):
    """LangChain `Embeddings` adapter around the notebook's Cohere client `co`.

    Documents and queries are embedded with different `input_type` values:
    Cohere's v3 embedding models distinguish between text that is stored in
    an index ("search_document") and text used to search it ("search_query").
    """

    # Cohere embedding model used for both documents and queries.
    model = "embed-english-v3.0"

    def _embed(self, texts, input_type):
        """Return one embedding per entry of `texts` (an empty list for no texts)."""
        if not texts:
            return []  # nothing to embed; skip the API call
        response = co.embed(
            texts=texts,
            model=self.model,
            input_type=input_type
        )
        return response.embeddings

    def embed_documents(self, texts):
        """Embed texts that will be stored in the vector index."""
        return self._embed(texts, "search_document")

    def embed_query(self, text):
        """Embed a single search query and return its vector."""
        return self._embed([text], "search_query")[0]

embedding_model = WorkingCohereEmbeddings()


In [13]:
from langchain.vectorstores import FAISS

# Build one FAISS index per chunk size and persist each to its own folder.
# NOTE(review): the texts are embedded through `embedding_model` here; the
# precomputed small_embeddings / large_embeddings are not passed in.

# Small chunk index
faiss_small = FAISS.from_documents(documents=small_docs, embedding=embedding_model)
faiss_small.save_local(folder_path="vectorstore/faiss_small", index_name="small_index")
print("Saved small chunks to FAISS at vectorstore/faiss_small")

# Large chunk index
faiss_large = FAISS.from_documents(documents=large_docs, embedding=embedding_model)
faiss_large.save_local(folder_path="vectorstore/faiss_large", index_name="large_index")
print(" Saved large chunks to FAISS at vectorstore/faiss_large")


Saved small chunks to FAISS at vectorstore/faiss_small
 Saved large chunks to FAISS at vectorstore/faiss_large


## Load the embeddings

In [14]:
# Reload both indexes from disk with the same embedding model that built them.
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing
# the stored index files — only load folders this notebook created itself.

# Index over small chunks, used for factual queries
vector_store_small = FAISS.load_local(
    embeddings=embedding_model,
    folder_path="vectorstore/faiss_small",
    index_name="small_index",
    allow_dangerous_deserialization=True,
)

# Index over large chunks, used for summary/reasoning queries
vector_store_large = FAISS.load_local(
    embeddings=embedding_model,
    folder_path="vectorstore/faiss_large",
    index_name="large_index",
    allow_dangerous_deserialization=True,
)

print(" Both FAISS indexes loaded.")


 Both FAISS indexes loaded.


## Retrieval

In [15]:
# Each retriever returns the 3 most similar chunks (k=3) from its index.
retriever_small = vector_store_small.as_retriever(
    search_kwargs=dict(k=3),
)
retriever_large = vector_store_large.as_retriever(
    search_kwargs=dict(k=3),
)


## Creating RAG Chains

In [16]:
from langchain.chains import RetrievalQA
from langchain.llms import Cohere as CohereLLM

# A single Cohere LLM instance backs both chains.
llm = CohereLLM(cohere_api_key=api_key)

# Factual Q&A: answers are grounded in the small-chunk retriever.
rag_chain_small = RetrievalQA.from_chain_type(
    retriever=retriever_small,
    return_source_documents=True,
    llm=llm,
)

# Summary-level reasoning: answers are grounded in the large-chunk retriever.
rag_chain_large = RetrievalQA.from_chain_type(
    retriever=retriever_large,
    return_source_documents=True,
    llm=llm,
)


  llm = CohereLLM(cohere_api_key=api_key)


In [None]:
# Ask a question through the small-chunk (factual) chain.
query = "How many papers do you have with you using the json file that I providedd ?"
# The chain's output is indexed by key; "result" holds the generated answer text.
result = rag_chain_small.invoke(query)
print("🔹 Answer:\n", result["result"])
