In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
import numpy as np
import faiss
import pickle

In [3]:
# Step 1: Load the PDF document
loader = PyPDFLoader("./data/document.pdf")  # Use the correct path to your PDF
documents = loader.load()

# Step 2: Split the document
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

# Step 3: Create embeddings and FAISS index
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Get the actual vector size from the embeddings
sample_vector = embeddings.embed_query("test")
vector_size = len(sample_vector)
print(f"Embedding vector size: {vector_size}")

# Create FAISS index with the correct vector size
faiss_index = faiss.IndexFlatL2(vector_size)

# Step 4: Add vectors to FAISS index
vectors = [embeddings.embed_query(doc.page_content) for doc in texts]
vectors_np = np.array(vectors, dtype='float32')  # Ensure float32 type for FAISS
print(f"Shape of vectors_np: {vectors_np.shape}")

# Add vectors to the index
faiss_index.add(vectors_np)
print("Vectors added to FAISS index successfully!")

# Step 5: Create a docstore and index_to_docstore_id mapping
docstore = InMemoryDocstore({str(i): texts[i] for i in range(len(texts))})
index_to_docstore_id = {i: str(i) for i in range(len(texts))}

# Step 6: Initialize the FAISS vector store
vectordb = FAISS(
    index=faiss_index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    embedding_function=embeddings.embed_query
)

print("FAISS vector store initialized successfully!")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


Embedding vector size: 384


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


Shape of vectors_np: (8, 384)
Vectors added to FAISS index successfully!
FAISS vector store initialized successfully!


In [4]:
faiss_index_file = "faiss_index.index"
embeddings_file = "embeddings.pkl"

# Save the FAISS index
faiss.write_index(faiss_index, faiss_index_file)
print(f"FAISS index saved to {faiss_index_file}")

# Save the metadata
with open(embeddings_file, "wb") as f:
    pickle.dump(texts, f)
print(f"Metadata saved to {embeddings_file}")


FAISS index saved to faiss_index.index
Metadata saved to embeddings.pkl


In [5]:
import os
from langchain_groq import ChatGroq
# from langchain.prompts import PromptTemplate
# from langchain.chains import RetrievalQA

In [6]:
groq_api_key = "YOUR_GROQ_API_KEY"

llm = ChatGroq(
    model="llama-3.1-70b-versatile",
    temperature=0.3,
    api_key=groq_api_key,
)

answer = llm.invoke("Hello there!")
print(answer.content)

Hello. It's nice to meet you. Is there something I can help you with or would you like to chat?


In [7]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

In [8]:
def process_qa_retrieval_chain(chain, query):
    response = chain.invoke({'query': query})
    
    result_str = f'Query: {response["query"]}\n\n'
    result_str += f'Result: {response["result"]}\n\n'
    
    relevant_docs = response['source_documents']
    for i in range(len(relevant_docs)):
        result_str += f'Relevant Doc {i+1}:\n'
        result_str += relevant_docs[i].page_content + '\n'
        result_str += str(relevant_docs[i].metadata) + '\n\n'
    
    return result_str

In [9]:
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, 
just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. 
Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [10]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={'prompt': QA_CHAIN_PROMPT}
)

In [11]:
query = "What is her educational background?"

In [12]:
result = process_qa_retrieval_chain(qa_chain, query)
print(result)

Query: What is her educational background?

Result: Rabail Anees is pursuing a Bachelor of Data Science at the University of the Punjab, Lahore, from 2021-2025. Relevant coursework includes Programming, Advanced Statistics, Machine Learning, and Artificial Intelligence.

Relevant Doc 1:
● DevelopedaresponsivefrontendinterfaceusingHTMLandCSS.● PortfolioWebsite:● BuiltaresponsivepersonalportfoliousingTailwindCSSandJavaScript.
AwardsandAchievements
● Winner:DataAnalyticscompetitionatNutec2024.● Top5Finalist:AIcompetitionatNAScon2024.● Participatedinspeedprogrammingcompetitions,including:● CodeBees.● Softec.● CodeFest.
Certifications
● MicrosoftOfficeSpecialist(MOS).● IntroductiontoFrontendDevelopment(Coursera).
CommunityWork
{'source': './data/document.pdf', 'page': 2}

Relevant Doc 2:
● DataAnalysisandVisualization:Excel,Plotly, Matplotlib,Seaborn,NumPy.● DatabaseManagement:MySQLWorkbench,SQLite3,SQLServer.● AIandMachineLearning:Pandas,Scikit-learn.
EducationalBackground
BachelorofDataSc