## Assignment-2: RAG

In [31]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [32]:
os.environ["GROQ_API_KEY"]=os.getenv("GROQ_API_KEY")

## Langsmith Tracking And Tracing
os.environ["LANGCHAIN_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"]=os.getenv("LANGCHAIN_PROJECT")
os.environ["PINECONE_API_KEY"]=os.getenv("PINECONE_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"]="true"

In [33]:
os.environ['HF_TOKEN']=os.getenv("HF_TOKEN")

In [34]:
FILE_PATH = r"D:\Agentic_AI\Assignments\NISM_ResearchAnalyst.pdf"

In [35]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [36]:
loader = PyPDFLoader(FILE_PATH)
docs = loader.load()

In [37]:
len(docs)

277

In [38]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
split_docs = splitter.split_documents(docs)

In [39]:
len(split_docs)

871

In [40]:
from langchain_huggingface import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

In [41]:
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(split_docs))]

In [42]:
from pinecone import Pinecone
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

In [43]:
index_name = 'nism-rag'
index=pc.Index(index_name)

In [44]:
from langchain_pinecone import PineconeVectorStore
vectorstore = PineconeVectorStore(index=index,embedding=embedding)

In [50]:
MAX_BYTES = 4194304  # 4 MiB

def is_valid_doc(doc):
    size = len(doc.page_content.encode('utf-8'))
    return size <= MAX_BYTES

# Filter out oversized documents
valid_docs = [doc for doc in split_docs if is_valid_doc(doc)]
oversized_docs = [doc for doc in split_docs if not is_valid_doc(doc)]

print(f"Valid docs: {len(valid_docs)}, Oversized docs: {len(oversized_docs)}")

# If you have UUIDs, sync them with the filtered docs
valid_ids = [uuid for doc, uuid in zip(split_docs, uuids) if is_valid_doc(doc)]

BATCH_SIZE = 50  # try 20–50 if the problem persists

for i in range(0, len(valid_docs), BATCH_SIZE):
    batch_docs = valid_docs[i:i + BATCH_SIZE]
    batch_ids = valid_ids[i:i + BATCH_SIZE]
    try:
        vectorstore.add_documents(documents=batch_docs, ids=batch_ids)
        print(f"Uploaded batch {i // BATCH_SIZE + 1}")
    except Exception as e:
        print(f"Failed to upload batch {i // BATCH_SIZE + 1}: {e}")



Valid docs: 871, Oversized docs: 0
Uploaded batch 1
Uploaded batch 2
Uploaded batch 3
Uploaded batch 4
Uploaded batch 5
Uploaded batch 6
Uploaded batch 7
Uploaded batch 8
Uploaded batch 9
Uploaded batch 10
Uploaded batch 11
Uploaded batch 12
Uploaded batch 13
Uploaded batch 14
Uploaded batch 15
Uploaded batch 16
Uploaded batch 17
Uploaded batch 18


In [51]:
retriever = vectorstore.as_retriever(
    search_type="similarity", 
    search_kwargs={"k": 5}      
)


In [52]:
query = "Analysis and Decision making, the two imperative parameters of a research analyst’s role are affected by which of the following factors? "
results = retriever.get_relevant_documents(query)

for i, doc in enumerate(results):
    print(f"\n--- Document {i+1} ---")
    print(doc.page_content[:300])  


  results = retriever.get_relevant_documents(query)



--- Document 1 ---
14 
 
CHAPTER 1: INTRODUCTION TO RESEARCH ANALYST PROFESSION 
 
 
 
 
 
1.1 Primary Role of a Research Analyst 
Imagine you've decided to buy a  new phone. What would be your  process of selection? For the price 
range decided, you would short list a set of brands,  compare various technical specifi

--- Document 2 ---
sources including the financial statements/Annual reports filed by the companies as part of regulatory 
compliance requirements, meeting officials of the company authorized to provide it and other sources 
such as plant/factory visits, market surveys and employee/stakeholder interviews. 
Analysis an

--- Document 3 ---
20 
 
Sample Questions 
1. What is the role of Research Analyst? 
a. RAs are only involved in the analysis of data 
b. RAs are only involved in collection of the data 
c. RAs help their clients take informed decisions 
d. RAs help in financial planning of their client 
 
2. Analysis and Decision mak

--- Document 4 ---
expected that RAs

In [53]:
from langchain_google_genai import ChatGoogleGenerativeAI
model=ChatGoogleGenerativeAI(model='gemini-1.5-flash')

In [54]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [55]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [56]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [57]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [59]:
rag_chain.invoke("Which of these Qualities are desired in a good research analyst? ")

'A good research analyst possesses strong quantitative skills, including comfort with numbers and data analysis tools like Excel.  They are also methodical, inquisitive, and discerning in finding relevant information, understanding business models and competitive dynamics.  Finally, honesty and ethical conduct are crucial.'