#### Library installation

In [44]:
# Install the sentence_transformers library for state-of-the-art sentence embeddings
!pip install sentence_transformers

# Install the pypdf library to handle PDF file reading and manipulation
!pip install pypdf

# Install faiss-gpu for efficient similarity search and clustering with GPUs
# !pip install faiss-gpu
# Install the FAISS library optimized for CPU usage
!pip install faiss-cpu

# Install langchain, a framework for developing applications with large language models
!pip install langchain

# Install langchain-openai, which provides OpenAI integration for LangChain
!pip install langchain-openai

# langchain-community: Includes additional community-contributed modules for LangChain, such as document loaders.
# langchain-core: The core LangChain library required for foundational functionality.
!pip install langchain-community langchain-core




[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


#### Importing Necessary Libraries

In [20]:
# Import the PyPDFLoader for loading PDF documents
from langchain_community.document_loaders import PyPDFLoader

# Import the Document schema for structuring document data
from langchain.schema import Document

# Import the RecursiveCharacterTextSplitter for splitting text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Import the HuggingFaceEmbeddings for generating text embeddings
# (taken from langchain_community directly; `langchain.embeddings` is only a deprecated re-export)
from langchain_community.embeddings import HuggingFaceEmbeddings

# Import the FAISS vector store for storing and querying embeddings
# (taken from langchain_community directly; `langchain.vectorstores` is only a deprecated re-export)
from langchain_community.vectorstores import FAISS

# Import the ChatOpenAI model for generating conversational responses from OpenAI
# (provided by the langchain-openai package installed above; the `langchain.chat_models`
# class is deprecated and triggers a deprecation warning when instantiated)
from langchain_openai import ChatOpenAI

# Import function to create a chain that combines documents using a "stuff" approach
from langchain.chains.combine_documents import create_stuff_documents_chain

# Import the ChatPromptTemplate for formatting prompts in the chat model
from langchain_core.prompts import ChatPromptTemplate

# Import the function to create a retrieval-based document chain
from langchain.chains import create_retrieval_chain

# # Import the os module to set environment variables for OpenAI API key
# import os

# # Set the OpenAI API key for authentication
# os.environ["OPENAI_API_KEY"] = "Your-OpenAI-API-Key"


#### Load PDF

In [3]:
# Specify the path to your PDF file
# NOTE(review): this is an absolute, machine-specific Windows path — adjust it before running elsewhere.
pdf_path = r"C:\F\ml_projexts\chatbot_rag_llm_langchain\content\SaimAhmed_BI.pdf"

# Load PDF documents using the PyPDFLoader

# Create a loader instance with the specified path
loader = PyPDFLoader(pdf_path)

# Load the documents from the PDF; the result is a list of Document objects
# carrying 'source' and 'page' metadata alongside the extracted page text
documents = loader.load()

In [4]:
# Display the loaded Documents to inspect the raw extracted text.
# NOTE(review): the rendered output contains personal data (email, phone number) — clear it before sharing.
documents

[Document(metadata={'source': 'C:\\F\\ml_projexts\\chatbot_rag_llm_langchain\\content\\SaimAhmed_BI.pdf', 'page': 0}, page_content="SAIM\nAHMED\nSQL\nDEVELOPER\nAddress:\nAm\nEhrenberg\n6,\nilmenau,\nThüringen\nEmail:\nsaim.ahmed92@hotmail.com\nPhone:\n+4915737811565\nGithub:\nhttps://github.com/saimboxer\nProfessional\nProfile\nDedicated\nsoftware\nengineer\nwith\n3\nyears\nof\nexperience\nspecializing\nin\ndatabase\ndesign,\nETL\nprocesses,\nand\ndata\nanalysis.\nProficient\nin\nMicrosoft\nPower\nBI,\nTableau\nand\nQlikView,\nwith\na\nsolid\nfoundation\nin\ndata\nvisualization,\ndata\nwarehousing,\nand\nquery\noptimization.\nEager\nto\nleverage\nmy\nskills\nin\na\nwork\nstudent\nposition\nto\ncontribute\nto\ndata-driven\ndecision-making\nprocesses.\nWork\nHistory\nSoftware\nEngineer\nData\nCheck\nLimited\n●\nDeveloped\nan\nend-to-end\nsolution\nto\nanalyze\nfinancial\nperformance\nacross\nthe\ncompany's\ndiverse\nportfolio,\ncreating\nvisualizations\nwith\nPower\nBI.\n●\nImproved\nda

In [5]:
def clean_text(text):
    """Flatten PDF-extracted text into a single clean line.

    Args:
        text: Raw page text; may contain newlines and '●' bullet markers.

    Returns:
        The text with every '●' removed and each run of whitespace
        (newlines, tabs, repeated spaces) collapsed to a single space,
        without leading or trailing whitespace.
    """
    # Replace bullets with a space (not an empty string) so a bullet wedged
    # between two words cannot fuse them; this also catches bullets that are
    # not followed by a space, e.g. at the very end of the text.
    without_bullets = text.replace('●', ' ')
    # split() with no arguments breaks on any whitespace run and drops empty
    # pieces, so joining with one space normalises newlines and double spaces.
    return ' '.join(without_bullets.split())

#### Preprocess data

In [6]:
# Build a cleaned counterpart of every loaded page: the metadata is carried
# over unchanged and only the page text is passed through clean_text.
cleaned_documents = [
    Document(
        page_content=clean_text(page.page_content),
        metadata=page.metadata,
    )
    for page in documents
]

In [7]:
# Display the cleaned Documents to verify that newlines and bullets were removed.
# NOTE(review): the rendered output contains personal data (email, phone number) — clear it before sharing.
cleaned_documents

[Document(metadata={'source': 'C:\\F\\ml_projexts\\chatbot_rag_llm_langchain\\content\\SaimAhmed_BI.pdf', 'page': 0}, page_content="SAIM AHMED SQL DEVELOPER Address: Am Ehrenberg 6, ilmenau, Thüringen Email: saim.ahmed92@hotmail.com Phone: +4915737811565 Github: https://github.com/saimboxer Professional Profile Dedicated software engineer with 3 years of experience specializing in database design, ETL processes, and data analysis. Proficient in Microsoft Power BI, Tableau and QlikView, with a solid foundation in data visualization, data warehousing, and query optimization. Eager to leverage my skills in a work student position to contribute to data-driven decision-making processes. Work History Software Engineer Data Check Limited Developed an end-to-end solution to analyze financial performance across the company's diverse portfolio, creating visualizations with Power BI. Improved data accessibility for stakeholders by creating and managing interactive Power BI dashboards that visuali

In [50]:
# Sanity check on the raw pages: container type and number of entries
print(
    f"Type of 'documents': {type(documents)}",
    f"Number of documents: {len(documents)}",
    sep="\n",
)

# Same check on the cleaned pages
print(
    f"Type of 'cleaned_documents': {type(cleaned_documents)}",
    f"Number of cleaned_documents: {len(cleaned_documents)}",
    sep="\n",
)

Type of 'documents': <class 'list'>
Number of documents: 2
Type of 'cleaned_documents': <class 'list'>
Number of cleaned_documents: 2


#### Split the documents into chunks

In [8]:
# Create an instance of RecursiveCharacterTextSplitter
# This splitter will handle the division of text into smaller, more manageable pieces based on character length.
# No arguments are passed here, so the splitter runs with the library's default settings.
text_splitter = RecursiveCharacterTextSplitter()

In [9]:
# Split the cleaned pages into chunks, reusing the `text_splitter` instance
# created in the previous cell instead of constructing a second, identical
# RecursiveCharacterTextSplitter just for this call.
# Note: despite its name, `text` holds a list of Document chunks, not a string.
text = text_splitter.split_documents(cleaned_documents)
text

[Document(metadata={'source': 'C:\\F\\ml_projexts\\chatbot_rag_llm_langchain\\content\\SaimAhmed_BI.pdf', 'page': 0}, page_content="SAIM AHMED SQL DEVELOPER Address: Am Ehrenberg 6, ilmenau, Thüringen Email: saim.ahmed92@hotmail.com Phone: +4915737811565 Github: https://github.com/saimboxer Professional Profile Dedicated software engineer with 3 years of experience specializing in database design, ETL processes, and data analysis. Proficient in Microsoft Power BI, Tableau and QlikView, with a solid foundation in data visualization, data warehousing, and query optimization. Eager to leverage my skills in a work student position to contribute to data-driven decision-making processes. Work History Software Engineer Data Check Limited Developed an end-to-end solution to analyze financial performance across the company's diverse portfolio, creating visualizations with Power BI. Improved data accessibility for stakeholders by creating and managing interactive Power BI dashboards that visuali

#### Embeddings

In [10]:
# Embedding model used to turn text chunks into numerical vectors.
# - model_name selects the pre-trained Hugging Face model "BAAI/bge-small-en-v1.5"
#   (an embedding model for English text).
# - encode_kwargs carries extra arguments for the encoding call; here
#   normalize_embeddings=True requests unit-norm vectors, which keeps
#   similarity / distance comparisons between embeddings on a consistent scale.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=dict(normalize_embeddings=True),
    model_name="BAAI/bge-small-en-v1.5",
)


  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


#### Vector Store using FAISS to store embeddings and text chunks.

In [11]:
# Create a vectorstore: build a FAISS store from the chunk Documents in `text`,
# using `embeddings` as the embedding model
vectorstore = FAISS.from_documents(text, embeddings)

In [12]:
# Save the documents and embeddings to local disk under "vectorstore.db"
# NOTE(review): despite the .db suffix this is presumably treated as a folder path by save_local — confirm.
vectorstore.save_local("vectorstore.db")

#### Create a Retriever Using the Vector Store

In [13]:
# Create retriever: expose the vector store through a retriever object.
# No arguments are passed, so the default search settings apply.
retriever = vectorstore.as_retriever()

#### Load the Language Model (LLM) and Create a Document Chain

In [14]:
# Load the LLM: the OpenAI chat model "gpt-3.5-turbo"
# NOTE(review): presumably authenticates via the OPENAI_API_KEY environment variable
# (see the commented-out setup in the imports cell) — confirm it is set before running.
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

  warn_deprecated(


In [24]:
# Define prompt template
# {input} receives the user's question (the key passed to chain.invoke);
# {context} is presumably filled with the retrieved chunks by the document chain.
# The instruction text below is sent verbatim to the LLM, so it is kept
# grammatical and unambiguous (e.g. "2-3 lines" rather than "2 3 lines").
template = """
You are an HR manager named Micheala, helping me hire a developer who will help me grow my business and take care of all projects. The candidate should be positive-minded and capable of critical thinking. After your critical analysis, give me short answers of 2-3 lines.
Use only the provided context to answer the following question:

<context>
{context}
</context>

Question: {input}
"""

In [26]:
# Create a prompt template: wrap the raw template string in a ChatPromptTemplate
prompt = ChatPromptTemplate.from_template(template)

In [27]:
# Create a chain
# doc_chain: passes documents and the prompt to the LLM using the "stuff" approach
doc_chain = create_stuff_documents_chain(llm, prompt)
# chain: retrieval chain that combines the retriever with doc_chain
chain = create_retrieval_chain(retriever, doc_chain)

#### Query the retrieval chain (retriever combined with the document chain)

In [30]:
# User query: the question is passed under the "input" key, matching the {input} placeholder in the prompt
response = chain.invoke({"input": "What is he doing now?"})

# Get the Answer only: the 'answer' entry of the response holds the generated text
response['answer']

'Saim Ahmed is currently working as a Software Developer at Software Channel, where he is developing data and reporting procedures using SQL and Power BI/Tableau/Excel to provide real-time insights into key performance indicators across various business functions.'