In [None]:
import os
import glob
from uuid import uuid4
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

from pinecone import Pinecone, ServerlessSpec

from langchain import hub
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker

from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
os.environ['HF_TOKEN']=os.getenv("HF_TOKEN")
os.environ['GOOGLE_API_KEY']=os.getenv("GOOGLE_API_KEY")
os.environ['PINECONE_API_KEY']=os.getenv("PINECONE_API_KEY")

In [3]:
# Step 1: Load all PDFs from directory
pdf_dir = "./data/"
pdf_files = glob.glob(os.path.join(pdf_dir, "*.pdf"))

In [4]:
all_docs = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    all_docs.extend(loader.load())

In [5]:
print(f"Loaded {len(all_docs)} page documents (should cover >200 pages).")

Loaded 247 page documents (should cover >200 pages).


In [7]:
# Step 2: Semantic Chunking
embedder = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
chunker = SemanticChunker(embedder, breakpoint_threshold_type="standard_deviation", min_chunk_size=200)
semantic_chunks = chunker.create_documents([doc.page_content for doc in all_docs])

print(f"Total semantic chunks: {len(semantic_chunks)}")

Total semantic chunks: 308


In [8]:
semantic_chunks[0].page_content

'Journal of Machine Learning Research 21 (2020) 1-67 Submitted 1/20; Revised 6/20; Published 6/20\nExploring the Limits of Transfer Learning with a Unified\nText-to-Text Transformer\nColin Raffel∗ craffel@gmail.com\nNoam Shazeer∗ noam@google.com\nAdam Roberts∗ adarob@google.com\nKatherine Lee∗ katherinelee@google.com\nSharan Narang sharannarang@google.com\nMichael Matena mmatena@google.com\nYanqi Zhou yanqiz@google.com\nWei Li mweili@google.com\nPeter J. Liu peterjliu@google.com\nGoogle, Mountain View, CA 94043, USA\nEditor: Ivan Titov\nAbstract\nTransfer learning, where a model is first pre-trained on a data-rich task before being fine-\ntuned on a downstream task, has emerged as a powerful technique in natural language\nprocessing (NLP). The effectiveness of transfer learning has given rise to a diversity of\napproaches, methodology, and practice. In this paper, we explore the landscape of transfer\nlearning techniques for NLP by introducing a unified framework that converts all text

In [9]:
texts = [chunk.page_content for chunk in semantic_chunks]
embeddings = embedder.embed_documents(texts)

In [10]:
len(embeddings)

308

In [13]:

pc=Pinecone()

index_name="agenticbatch2-assignment"

if not pc.has_index(index_name):
    pc.create_index(
    name=index_name,
    dimension=768,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws",region="us-east-1")    
)

In [15]:
#loading the index
index=pc.Index(index_name)
vector_store=PineconeVectorStore(index=index,embedding=embedder)

In [18]:
# Each item: (id, vector, metadata)
items = [
    (str(uuid4()), emb, {"text": text}) for emb, text in zip(embeddings, texts)
]
# uuids = [str(uuid4()) for _ in range(len(texts))]

In [19]:
batch_size = 100
for i in range(0, len(items), batch_size):
    index.upsert(vectors=items[i:i+batch_size])

In [35]:
def retrieve(query, top_k=5, threshold=0.7):
    query_emb = embedder.embed_query(query)
    # Query Pinecone
    result = index.query(
        vector=query_emb,
        top_k=top_k,
        include_metadata=True,
        filter=None,  # Optionally filter on metadata
        include_values=True
    )
    # Filter by score threshold
    matches = [m for m in result['matches'] if m['score'] >= threshold]
    return matches, query_emb


In [36]:
import time

test_query = "What are the main findings about transformer models?"

start = time.time()
matches, _ = retrieve(test_query)
elapsed = time.time() - start

print(f"Retrieval time: {elapsed:.4f}s")
for m in matches:
    print(f"Score: {m['score']:.2f}, Text: {m['metadata']['text'][:200]}...")


Retrieval time: 1.0249s
Score: 0.72, Text: V. P OPULAR DATASETS FOR LLM S
Large language models exhibit promising accomplish-
ments, but the main question that arises is how effectively
they function and how their performance can be assessed i...
Score: 0.72, Text: 7. A figure showing the training time of the custom recurrent model 
VSGRU and the transformer-based model CDGPT2 . O. Alfarghaly et al....
Score: 0.70, Text: Contents
1 Introduction 3
1.1 Key Results . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3
1.2 Notation . . . . . . . . . . . . . . . . . . . . . . . . . . . . ....


In [37]:
# Reranking

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def mmr(query_emb, doc_embs, texts, top_k=5, lambda_param=0.7):
    doc_embs = np.array(doc_embs)
    query_emb = np.array(query_emb).reshape(1, -1)
    sim = cosine_similarity(doc_embs, query_emb).flatten()
    selected, candidates = [], list(range(len(texts)))
    while len(selected) < top_k and candidates:
        if not selected:
            idx = int(np.argmax(sim))
        else:
            sim_to_query = sim[candidates]
            sim_to_selected = np.max(cosine_similarity(doc_embs[candidates], doc_embs[selected]), axis=1)
            mmr_scores = lambda_param * sim_to_query - (1 - lambda_param) * sim_to_selected
            idx = candidates[int(np.argmax(mmr_scores))]
        selected.append(idx)
        candidates.remove(idx)
    return [texts[i] for i in selected]


In [38]:
# 1. Get matches and query embedding
matches, query_emb = retrieve("What are the main findings about transformer models?", top_k=10)

# 2. Prepare for MMR
doc_embs = [m['values'] for m in matches]
texts = [m['metadata']['text'] for m in matches]

# 3. Rerank
reranked_mmr = mmr(query_emb, doc_embs, texts, top_k=5)
print("Top reranked doc:", reranked_mmr[0][:300])

Top reranked doc: V. P OPULAR DATASETS FOR LLM S
Large language models exhibit promising accomplish-
ments, but the main question that arises is how effectively
they function and how their performance can be assessed in
specific tasks or applications.


In [39]:
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

prompt_template = """You are a research assistant. Use the context below to answer the question concisely.
Context:
{context}
Question: {question}
Answer:"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

context = "\n\n".join(reranked_mmr[:3])
question = test_query

llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash')
final_prompt = prompt.format(context=context, question=question)
llm_response = llm.invoke(final_prompt)
print("LLM output:\n", llm_response)


LLM output:
 content='The provided text does not offer findings about transformer models beyond mentioning CDGPT2 as a transformer-based model whose training time is compared to another model (VSGRU).  No specific performance results or conclusions about transformer models are given.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []} id='run--9e59c749-047a-476f-bf65-e203b52620ff-0' usage_metadata={'input_tokens': 229, 'output_tokens': 47, 'total_tokens': 276, 'input_token_details': {'cache_read': 0}}


In [48]:
from docx import Document as DocxDoc

docx = DocxDoc()
docx.add_heading('RAG Output', 0)
docx.add_heading('Question', level=1)
docx.add_paragraph(question)
docx.add_heading('Answer', level=1)
docx.add_paragraph(str(llm_response))

docx.save("rag_llm_output.docx")
print("Saved output to rag_llm_output.docx")

Saved output to rag_llm_output.docx
