In [None]:
# !pip install PyPDF2
# !pip install langchain
# !pip install --upgrade langchain
# !pip install -U langchain-text-splitters
# !pip install faiss-cpu

In [None]:
from PyPDF2 import PdfReader

In [None]:
pdf_path = './data/book.pdf'
reader = PdfReader(pdf_path) # it creates object of pdfreader where pdf is loaded

In [None]:
first_page = reader.pages[30] # .page reads the content 
print(first_page)
text = first_page.extract_text() # .extract_text extract the text from the page and it doesnt read img 
print(text)

In [None]:
print('first page text:\n')
print(text)

In [None]:


from langchain_core.documents import Document
from PyPDF2 import PdfReader

In [None]:
pdf_path = './data/book.pdf'
reader = PdfReader(pdf_path)
print(reader)

In [None]:
document = []

for i, page in enumerate(reader.pages[30:35]):
    text = page.extract_text()
    if text:
        doc = Document(
            page_content=text,
            metadata = {'page_number' : i + 1, 'source': pdf_path}
        )
        document.append(doc)

print('page sonten preview:\n',document[2].page_content)
print('metadata:\n',document[2].metadata)

In [None]:

from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50
)

chunks = splitter.split_documents(document)

print('total chunks created: \n', len(chunks))
print('first chunk preview: \n', chunks[2].page_content)
print('metadata of first chunk: \n', chunks[0].metadata)

In [None]:
from langchain_text_splitters import TokenTextSplitter

# Example second splitter
token_splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=20)

# Let user choose
def chunk_documents(documents, strategy="recursive"):
    if strategy == "recursive":
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    elif strategy == "token":
        splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=20)
    else:
        raise ValueError("Unknown strategy")
    return splitter.split_documents(documents)
You sent
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")  # or connect to a running Qdrant server
vectorstore = Qdrant.from_documents(chunks, embedding_model, client=client, collection_name="my_collection")

In [2]:
from langchain_text_splitters import TokenTextSplitter

In [6]:
with open('../rag_demo/data/educational_corpus.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

token_splitter = TokenTextSplitter(
    chunk_size = 128,
    chunk_overlap = 20
)

chunks = token_splitter.split_text(text)

print('total chunk created', len(chunks))
print('firts chunk preview', chunks[0])
print('second chunk preview', chunks[1])

total chunk created 469
firts chunk preview Physics explores the fundamental forces of nature, including gravity, electromagnetism, and nuclear interactions. It provides the foundation for understanding how the universe behaves at both macroscopic and microscopic levels.

Modern literature explores themes of identity, alienation, and social change. Authors use diverse styles and genres to express contemporary concerns.

The Renaissance was a period of great cultural and intellectual growth in Europe, marked by advancements in art, science, and philosophy. It laid the groundwork for the modern age.

Literature encompasses written works, especially those considered to have artistic or intellectual value. It reflects cultural values, human experiences,
second chunk preview  works, especially those considered to have artistic or intellectual value. It reflects cultural values, human experiences, and societal issues.

The invention of the internet revolutionized how people access informatio

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv
import os

In [None]:
load_dotenv()
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

embedding_model = HuggingFaceEmbeddings(model_name ="sentence-transformers/all-MiniLM-L6-v2")

sample_chunks = chunks[:3]
embeddings = embedding_model.embed_documents([chunk.page_content for chunk in sample_chunks])

print("Number of embeddings created:", len(embeddings)) 
print("Length of one embedding vector:", len(embeddings[0])) 
print("First 10 values of first embedding:\n", embeddings[0][:30])

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
embedding_model = HuggingFaceEmbeddings(model_name ="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = FAISS.from_documents(chunks, embedding_model)

vectorstore.save_local('faiss_index')

print('FAISS index created and saved')

In [None]:
query = 'why machine learning is used'
query1 = 'Why Use Machine Learning?'

results = vectorstore.similarity_search(query, k =3)
results1 = vectorstore.similarity_search(query1, k =3)

for i, res in enumerate(results):
    print(f'\n-------- Results {i + 1} -------')
    print('page content preview: \n', res.page_content[:300])
    print('metadata: \n', res.metadata)

print('\n\n\n')

for i, res in enumerate(results1):
    print(f'\n-------- Results {i + 1} -------')
    print('page content preview: \n', res.page_content[:300])
    print('metadata: \n', res.metadata)

In [None]:
from transformers import pipeline

In [None]:
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")

In [None]:
prompt_template = """ You are a helpful assistant. 
Answer the question strictly using the provided context. 
If the answer is not in the context, say "I don't know." 

Context: 
{context} 

Question: 
{question} 

Answer: 
"""

query = 'Why Use Machine Learning?'

results = vectorstore.similarity_search(query, k=3)
context = '\n\n'.join([res.page_content for res in results])

prompt = prompt_template.format(context=context, question=query)

answer = qa_pipeline(prompt, max_length = 200)[0]['generated_text']

print('final answer:\n', answer)

In [8]:
from PyPDF2 import PdfReader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import pipeline

# -----------------------------
# Define the RAG pipeline
# -----------------------------
def rag_pipeline(pdf_path: str, query: str, start_page: int = 0, end_page: int = 5):
    # Step 1: Read PDF
    reader = PdfReader(pdf_path)
    documents = []
    for i, page in enumerate(reader.pages[start_page:end_page]):
        text = page.extract_text()
        if text:
            doc = Document(
                page_content=text,
                metadata={"page_number": i + start_page + 1, "source": pdf_path}
            )
            documents.append(doc)

    # Step 2: Split into chunks
    character_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    character_chunks = character_splitter.split_documents(documents)

    # Step 3: Create embeddings
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(character_chunks, embedding_model)

    # Step 4: Similarity search
    results = vectorstore.similarity_search(query, k=3)
    context = "\n\n".join([res.page_content for res in results])

    # Step 5: Strict prompt
    prompt_template = """You are a helpful assistant.
Answer the question strictly using the provided context.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:
"""
    prompt = prompt_template.format(context=context, question=query)

    # Step 6: Generation with Flan-T5
    qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")
    answer = qa_pipeline(prompt, max_length=200)[0]["generated_text"]

    return answer, results


# -----------------------------
# Example usage
# -----------------------------
pdf_path = "./data/book.pdf"
query = "Why Use Machine Learning?"

final_answer, retrieved_chunks = rag_pipeline(pdf_path, query, start_page=30, end_page=35)

print("\nFinal Answer:\n", final_answer)
print("\nRetrieved Chunks Metadata:")
for res in retrieved_chunks:
    print(res.metadata)


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Final Answer:
 â€¢Getting insights about complex problems and large amounts of data

Retrieved Chunks Metadata:
{'page_number': 33, 'source': './data/book.pdf'}
{'page_number': 31, 'source': './data/book.pdf'}
{'page_number': 33, 'source': './data/book.pdf'}


In [None]:
from PyPDF2 import PdfReader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import pipeline

# -----------------------------
# Define the RAG pipeline
# -----------------------------
def rag_pipeline(pdf_path: str, query: str, start_page: int = 0, end_page: int = 5):
    print("Step 1: Reading PDF pages...")
    reader = PdfReader(pdf_path)
    documents = []
    for i, page in enumerate(reader.pages[start_page:end_page]):
        text = page.extract_text()
        if text:
            doc = Document(
                page_content=text,
                metadata={"page_number": i + start_page + 1, "source": pdf_path}
            )
            documents.append(doc)
    print(f"âœ… Extracted {len(documents)} pages of text from PDF.")

    print("\nStep 2: Splitting documents into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(documents)
    print(f"âœ… Created {len(chunks)} text chunks.")

    print("\nStep 3: Creating embeddings...")
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(chunks, embedding_model)
    print("âœ… Embeddings generated and stored in FAISS vector database.")

    print("\nStep 4: Performing similarity search...")
    results = vectorstore.similarity_search(query, k=3)
    print(f"âœ… Retrieved {len(results)} most relevant chunks for the query.")

    context = "\n\n".join([res.page_content for res in results])

    print("\nStep 5: Building strict prompt...")
    prompt_template = """You are a helpful assistant.
Answer the question strictly using the provided context.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:
"""
    prompt = prompt_template.format(context=context, question=query)
    print("âœ… Prompt prepared for generation.")

    print("\nStep 6: Generating answer with Flan-T5...")
    qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")
    answer = qa_pipeline(prompt, max_length=200)[0]["generated_text"]
    print("âœ… Answer generated successfully.")

    return answer, results


# -----------------------------
# Example usage
# -----------------------------
pdf_path = "./data/book.pdf"
query = "Why Use Machine Learning?"

print("\nðŸš€ Starting RAG pipeline...\n")
final_answer, retrieved_chunks = rag_pipeline(pdf_path, query, start_page=30, end_page=35)

print("\n==============================")
print("Final Answer:\n", final_answer)
print("==============================")
print("\nRetrieved Chunks Metadata:")
for res in retrieved_chunks:
    print(res.metadata)


In [None]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import pipeline

# -----------------------------
# Define the RAG pipeline for TXT
# -----------------------------
def rag_pipeline_txt(txt_path: str, query: str):
    print("Step 1: Reading TXT file...")
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    print(f"âœ… Loaded text file: {txt_path} (length: {len(text)} characters)")

    print("\nStep 2: Wrapping into a Document...")
    documents = [Document(page_content=text, metadata={"source": txt_path})]
    print(f"âœ… Created {len(documents)} Document object.")

    print("\nStep 3: Splitting into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(documents)
    print(f"âœ… Split into {len(chunks)} chunks.")

    print("\nStep 4: Creating embeddings...")
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(chunks, embedding_model)
    print("âœ… Embeddings generated and stored in FAISS vector database.")

    print("\nStep 5: Performing similarity search...")
    results = vectorstore.similarity_search(query, k=3)
    print(f"âœ… Retrieved {len(results)} most relevant chunks for the query.")

    context = "\n\n".join([res.page_content for res in results])

    print("\nStep 6: Building strict prompt...")
    prompt_template = """You are a helpful assistant.
Answer the question strictly using the provided context.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:
"""
    prompt = prompt_template.format(context=context, question=query)
    print("âœ… Prompt prepared for generation.")

    print("\nStep 7: Generating answer with Flan-T5...")
    qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")
    answer = qa_pipeline(prompt, max_length=200)[0]["generated_text"]
    print("âœ… Answer generated successfully.")

    return answer, results


# -----------------------------
# Example usage
# -----------------------------
txt_path = "./data/educational_corpus.txt"
query = "Explain the Renaissance."

print("\nðŸš€ Starting RAG pipeline for TXT...\n")
final_answer, retrieved_chunks = rag_pipeline_txt(txt_path, query)

print("\n==============================")
print("Final Answer:\n", final_answer)
print("==============================")
print("\nRetrieved Chunks Metadata:")
for res in retrieved_chunks:
    print(res.metadata)


In [4]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import pipeline

# -----------------------------
# Define the RAG pipeline for TXT
# -----------------------------
def rag_pipeline_txt(txt_path: str, query: str):
    print("Step 1: Reading TXT file...")
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    print(f"âœ… Loaded text file: {txt_path} (length: {len(text)} characters)")

    print("\nStep 2: Wrapping into a Document...")
    documents = [Document(page_content=text, metadata={"source": txt_path})]
    print(f"âœ… Created {len(documents)} Document object.")

    print("\nStep 3: Splitting into chunks...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=75)
    chunks = splitter.split_documents(documents)
    print(f"âœ… Split into {len(chunks)} chunks.")

    print("\nStep 4: Creating embeddings...")
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(chunks, embedding_model)
    print("âœ… Embeddings generated and stored in FAISS vector database.")

    print("\nStep 5: Performing similarity search...")
    results = vectorstore.similarity_search(query, k=3)
    # for i, res in enumerate(results):
    #     print(f'\n-------- Results {i + 1} -------')
    #     print('page content preview: \n', res.page_content[:300])
    #     print('metadata: \n', res.metadata)

    print(f"âœ… Retrieved {len(results)} most relevant chunks for the query.")

    context = "\n\n".join([res.page_content for res in results])

    print("\nStep 6: Building strict prompt...")
    prompt_template = """You are a helpful assistant.
Answer the question strictly using the provided context.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:
"""
    prompt = prompt_template.format(context=context, question=query)
    print("âœ… Prompt prepared for generation.")

    print("\nStep 7: Generating answer with Flan-T5...")
    qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")
    answer = qa_pipeline(prompt, max_length=200)[0]["generated_text"]
    print("âœ… Answer generated successfully.")

    return answer, results


# -----------------------------
# Example usage
# -----------------------------
txt_path = "./data/ott_subscription_faq.txt"
query = "Netflix price"

print("\nðŸš€ Starting RAG pipeline for TXT...\n")
final_answer, retrieved_chunks = rag_pipeline_txt(txt_path, query)

print("\n==============================")
print("Final Answer:\n", final_answer)
print("==============================")
print("\nRetrieved Chunks Metadata:")
for res in retrieved_chunks:
    print(res.metadata)



ðŸš€ Starting RAG pipeline for TXT...

Step 1: Reading TXT file...
âœ… Loaded text file: ./data/ott_subscription_faq.txt (length: 2904 characters)

Step 2: Wrapping into a Document...
âœ… Created 1 Document object.

Step 3: Splitting into chunks...
âœ… Split into 5 chunks.

Step 4: Creating embeddings...
âœ… Embeddings generated and stored in FAISS vector database.

Step 5: Performing similarity search...
âœ… Retrieved 3 most relevant chunks for the query.

Step 6: Building strict prompt...
âœ… Prompt prepared for generation.

Step 7: Generating answer with Flan-T5...


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (731 > 512). Running this sequence through the model will result in indexing errors
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


âœ… Answer generated successfully.

Final Answer:
 3500 for 12 months of Netflix (1 screen).

Retrieved Chunks Metadata:
{'source': './data/ott_subscription_faq.txt'}
{'source': './data/ott_subscription_faq.txt'}
{'source': './data/ott_subscription_faq.txt'}


In [37]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import pipeline

# -----------------------------
# Define the RAG pipeline for TXT
# -----------------------------
def rag_pipeline_txt(txt_path: str, query: str):
    print("Step 1: Reading TXT file...")
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    print(f"âœ… Loaded text file: {txt_path} (length: {len(text)} characters)")

    print("\nStep 2: Wrapping into a Document...")
    documents = [Document(page_content=text, metadata={"source": txt_path})]
    print(f"âœ… Created {len(documents)} Document object.")

    # print("\nStep 3: Splitting into chunks...")
    # splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=75)
    # chunks = splitter.split_documents(documents)
    # print(f"âœ… Split into {len(chunks)} chunks.")

    print("\nStep 3: Splitting into chunks...")
    splitter = TokenTextSplitter(chunk_size=25, chunk_overlap=5)
    chunks = splitter.split_documents(documents)
    print(f"âœ… Split into {len(chunks)} chunks.")

    print("\nStep 4: Creating embeddings...")
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-mpnet-base-v2")
    vectorstore = FAISS.from_documents(chunks, embedding_model)
    print("âœ… Embeddings generated and stored in FAISS vector database.")

    print("\nStep 5: Performing similarity search...")
    results = vectorstore.similarity_search(query, k=3)
    # for i, res in enumerate(results):
    #     print(f'\n-------- Results {i + 1} -------')
    #     print('page content preview: \n', res.page_content[:300])
    #     print('metadata: \n', res.metadata)

    print(f"âœ… Retrieved {len(results)} most relevant chunks for the query.")

    context = "\n\n".join([res.page_content for res in results])

    print("\nStep 6: Building strict prompt...")
    prompt_template = """You are a helpful assistant.
Answer the question strictly using the provided context.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:
"""
    prompt = prompt_template.format(context=context, question=query)
    print("âœ… Prompt prepared for generation.")

    print("\nStep 7: Generating answer with Flan-T5...")
    qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")
    answer = qa_pipeline(prompt, max_length=200)[0]["generated_text"]
    print("âœ… Answer generated successfully.")

    return answer, results


# -----------------------------
# Example usage
# -----------------------------
# txt_path = "./data/ott_subscription_faq.txt"
# query = "1 moonth Netflix"

# print("\nðŸš€ Starting RAG pipeline for TXT...\n")
# final_answer, retrieved_chunks = rag_pipeline_txt(txt_path, query)

# print("\n==============================")
# print("Final Answer:\n", final_answer)
# print("==============================")
# print("\nRetrieved Chunks Metadata:")
# for res in retrieved_chunks:
#     print(res.metadata)


In [38]:
txt_path = "./data/ott_subscription_faq.txt"
query = input("please give your query")

print("\nðŸš€ Starting RAG pipeline for TXT...\n")
final_answer, retrieved_chunks = rag_pipeline_txt(txt_path, query)

print("\n==============================")
print("Final Answer:\n", final_answer)
print("==============================")
print("\nRetrieved Chunks Metadata:")
for res in retrieved_chunks:
    print(res.metadata)

please give your query How much does Netflix cost for 3 months?



ðŸš€ Starting RAG pipeline for TXT...

Step 1: Reading TXT file...
âœ… Loaded text file: ./data/ott_subscription_faq.txt (length: 2904 characters)

Step 2: Wrapping into a Document...
âœ… Created 1 Document object.

Step 3: Splitting into chunks...
âœ… Split into 46 chunks.

Step 4: Creating embeddings...
âœ… Embeddings generated and stored in FAISS vector database.

Step 5: Performing similarity search...
âœ… Retrieved 3 most relevant chunks for the query.

Step 6: Building strict prompt...
âœ… Prompt prepared for generation.

Step 7: Generating answer with Flan-T5...


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


âœ… Answer generated successfully.

Final Answer:
 3500 for 12 months of Netflix (1 screen).

Retrieved Chunks Metadata:
{'source': './data/ott_subscription_faq.txt'}
{'source': './data/ott_subscription_faq.txt'}
{'source': './data/ott_subscription_faq.txt'}
