In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.chains import RetrievalQA

## Load the pdf and convert to pages

In [2]:
loader = PyPDFLoader('File/HR.pdf')
pages = loader.load()

## Splits the pages into chunks

In [3]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(pages)
print(f"✅ Loaded {len(pages)} pages and split into {len(docs)} chunks")

✅ Loaded 53 pages and split into 338 chunks


## Embedding model
    - Here m using an embedding hugging face model without the need of any token
    - Embedding model converts the pdf text to dense numerical vectors

In [4]:
embedding_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

  embedding_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


## Store the embedded vectors in chroma_db folder for future use

In [8]:
db = Chroma.from_documents(documents=docs, embedding=embedding_model, persist_directory='./chroma_db')
db.persist()
print("✅ Stored embeddings in Chroma DB.")

✅ Stored embeddings in Chroma DB.


  db.persist()


## Define the LLM (text Generator)
    Using a light weight transformer to genrate an answer
    This wraps the HuggingFace pipeline as a LangChain-compatible LLM

In [14]:

qa_pipeline = pipeline('text-generation', model='distilgpt2', max_new_tokens=100)
llm = HuggingFacePipeline(pipeline=qa_pipeline)

Device set to use cpu


## Convert Chroma db into a retriever
    - it wraps the chroma DB in a search interface
    - when you ask a question, it uses cosine similarity to find the top relevant chunks

In [17]:
retriever = db.as_retriever() 

## Create the Retrieval QA chain
    it connects the LLM with the retriever

In [None]:
qa = RetrievalQA.from_chain_type(llm=llm, retriever = retriever) #it connects the llm with the retirver

**Now LangChain:** 
- Takes your question
- Retrieves the most relevant text from Chroma
- Passes it + the question to the LLM

## Ask a question

In [16]:
query = 'What is this document about?'
answer = qa.run(query)
print("🤖 Answer:", answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


🤖 Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

http://nwthumanrights.ca/wp-content/uploads/2020/05/fn_handbook.pdf

http://nwthumanrights.ca/wp-content/uploads/2020/05/fn_handbook.pdf

https://www.csst.qc.ca/en/Pages/CSST_communications_french_only.aspx
Charter of Human Rights and Freedoms

https://www.csst.qc.ca/en/Pages/CSST_communications_french_only.aspx
Charter of Human Rights and Freedoms

Question: What is this document about?
Helpful Answer:
The document is a document about human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to human rights related to hu

**LangChain handles all:**
- Retrieval
- Combining chunks + question
- Passing it to the model
- Getting output



In [10]:
loader = PyPDFLoader("File/HR.pdf")  
pages = loader.load()   ## conver the pdf into pages

In [11]:
## converting it to chunk of text with 500 words

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = splitter.split_documents(pages)
print(f"✅ Loaded {len(pages)} pages and split into {len(docs)} chunks.")

✅ Loaded 53 pages and split into 338 chunks.


In [28]:
# token free embedding
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

ImportError: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.

In [13]:
! pip install sentence-transformers



In [3]:
! pip install langchain_community

Collecting langchain_community
  Using cached langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<1.0.0,>=0.3.66 (from langchain_community)
  Using cached langchain_core-0.3.68-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain<1.0.0,>=0.3.26 (from langchain_community)
  Using cached langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain_community)
  Using cached sqlalchemy-2.0.41-cp310-cp310-macosx_10_9_x86_64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain_community)
  Using cached aiohttp-3.12.13-cp310-cp310-macosx_10_9_x86_64.whl.metadata (7.6 kB)
Collecting tenacity!=8.4.0,<10,>=8.1.0 (from langchain_community)
  Using cached tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_commun

In [16]:
! pip show sentence-transformers

Name: sentence-transformers
Version: 2.2.2
Summary: Multilingual text embeddings
Home-page: https://github.com/UKPLab/sentence-transformers
Author: Nils Reimers
Author-email: info@nils-reimers.de
License: Apache License 2.0
Location: /Users/subhratarakesh/Documents/UPWORK/PortFolio/LLM/PDF_chatbot/rag_env/lib/python3.10/site-packages
Requires: huggingface-hub, nltk, numpy, scikit-learn, scipy, sentencepiece, torch, torchvision, tqdm, transformers
Required-by: 


In [8]:
!pip install pypdf

Collecting pypdf
  Using cached pypdf-5.7.0-py3-none-any.whl.metadata (7.2 kB)
Using cached pypdf-5.7.0-py3-none-any.whl (305 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.7.0


In [18]:
!which python

/Users/subhratarakesh/Documents/UPWORK/PortFolio/LLM/PDF_chatbot/rag_env/bin/python


In [22]:
pip list | grep sentence

sentence-transformers    2.2.2
sentencepiece            0.2.0
Note: you may need to restart the kernel to use updated packages.


In [21]:
! source rag_env/bin/activate

In [23]:
from sentence_transformers import SentenceTransformer

ImportError: cannot import name 'cached_download' from 'huggingface_hub' (/Users/subhratarakesh/Documents/UPWORK/PortFolio/LLM/PDF_chatbot/rag_env/lib/python3.10/site-packages/huggingface_hub/__init__.py)

In [24]:
pip install huggingface_hub==0.16.4

Collecting huggingface_hub==0.16.4
  Downloading huggingface_hub-0.16.4-py3-none-any.whl.metadata (12 kB)
Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
Installing collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.33.2
    Uninstalling huggingface-hub-0.33.2:
      Successfully uninstalled huggingface-hub-0.33.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.53.1 requires huggingface-hub<1.0,>=0.30.0, but you have huggingface-hub 0.16.4 which is incompatible.[0m[31m
[0mSuccessfully installed huggingface_hub-0.16.4
Note: you may need to restart the kernel to use updated packages.


In [27]:
!pip install --upgrade pip
!pip uninstall huggingface_hub sentence-transformers -y
!pip install sentence-transformers huggingface_hub==0.16.4

Found existing installation: huggingface-hub 0.16.4
Uninstalling huggingface-hub-0.16.4:
  Successfully uninstalled huggingface-hub-0.16.4
Found existing installation: sentence-transformers 2.2.2
Uninstalling sentence-transformers-2.2.2:
  Successfully uninstalled sentence-transformers-2.2.2
Collecting sentence-transformers
  Using cached sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting huggingface_hub==0.16.4
  Using cached huggingface_hub-0.16.4-py3-none-any.whl.metadata (12 kB)
INFO: pip is looking at multiple versions of sentence-transformers to determine which version is compatible with other requirements. This could take a while.
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
  Downloading sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
  Downloading sentence_transformers-4.0.1-py3-none-any.whl.metadata (13 kB)
  Downloading sentence_transformers-4.0.0-py3-none-any.whl.metadata (13

/bin/bash: deactivate: command not found
