## Import Libraries


In [1]:
# Imports
#INITIAL CODE
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Jupyter-specific imports
from IPython.display import display, Markdown

# Set environment variable for protobuf
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

## Load PDF

In [2]:
from langchain.document_loaders import PyPDFLoader


local_path = "D:\Devlancers\data\HP.pdf"
#D:\Devlancers\data
if local_path:
    loader = PyPDFLoader(local_path)
    data = loader.load()
    print(f"PDF loaded successfully with {len(data)} pages.")

    # Print first page content for verification
    print(data[0].page_content if data else "No data extracted.")
else:
    print("Upload a PDF file")


PDF loaded successfully with 768 pages.



## Split text into chunks

In [3]:
# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=200)
chunks = text_splitter.split_documents(data)
print(f"Text split into {len(chunks)} chunks")

Text split into 1808 chunks


# Batch Processing for Embeddings

In [None]:

from concurrent.futures import ThreadPoolExecutor
import numpy as np
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
embeddings = []
batch_size = 10
with ThreadPoolExecutor() as executor:
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1} / {len(chunks) // batch_size + 1}")  # Debug statement
        batch_texts = [chunk.page_content for chunk in batch]  # Extract texts from chunks
        batch_embeddings = embedding_model.embed_documents(batch_texts)  # Correct method
        embeddings.extend(batch_embeddings)
embeddings = np.array(embeddings, dtype=np.float32)
print("Embeddings generated successfully.")  # Debug statement

Processing batch 1 / 181
Processing batch 2 / 181
Processing batch 3 / 181
Processing batch 4 / 181
Processing batch 5 / 181
Processing batch 6 / 181
Processing batch 7 / 181
Processing batch 8 / 181
Processing batch 9 / 181
Processing batch 10 / 181
Processing batch 11 / 181
Processing batch 12 / 181
Processing batch 13 / 181
Processing batch 14 / 181
Processing batch 15 / 181
Processing batch 16 / 181
Processing batch 17 / 181
Processing batch 18 / 181
Processing batch 19 / 181
Processing batch 20 / 181
Processing batch 21 / 181
Processing batch 22 / 181
Processing batch 23 / 181
Processing batch 24 / 181
Processing batch 25 / 181
Processing batch 26 / 181
Processing batch 27 / 181
Processing batch 28 / 181
Processing batch 29 / 181
Processing batch 30 / 181
Processing batch 31 / 181
Processing batch 32 / 181
Processing batch 33 / 181
Processing batch 34 / 181
Processing batch 35 / 181
Processing batch 36 / 181
Processing batch 37 / 181
Processing batch 38 / 181
Processing batch 39 /

# Create FAISS Index with IndexIVFFlat

In [None]:

from langchain.vectorstores import FAISS
import faiss 
num_clusters = 100
dim = embeddings.shape[1]
quantizer = faiss.IndexFlatL2(dim)  # Quantizer for clustering
index = faiss.IndexIVFFlat(quantizer, dim, num_clusters, faiss.METRIC_L2)
print("Training FAISS index...")  # Debug statement
index.train(embeddings)  # Train clustering
index.add(embeddings)  # Add vectors to index
vector_db = FAISS.from_documents(chunks, embedding_model)

# Save the FAISS index associated with the vector_db
vector_db.save_local("faiss_index")

print("FAISS vector database created and saved successfully.")

Training FAISS index...
FAISS vector database created and saved successfully.


## Set up LLM and Retrieval

In [None]:
local_model = "phi3:medium"
llm = ChatOllama(model=local_model)

In [15]:
# Query prompt template
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate 2
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

# Set up retriever
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)

## Create chain

In [16]:
# RAG prompt template
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [20]:
# Create chain
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | ChatOllama(model="phi3:medium")
    | StrOutputParser()
)

## Chat with PDF

In [21]:
def chat_with_pdf(question):
    """
    Chat with the PDF using the RAG chain.
    """
    return display(Markdown(chain.invoke(question)))

In [None]:
#RESPONSE WITH Phi3:mini
chat_with_pdf("How did Voldemort die in the book?")

According to the given documents that make up a part of J.K. Rowling's "Harry Potter" series, specifically from "The Death Eaters" and related texts where Severus Snape is involved: As per the narrative provided in these fragments, Voldemort did not die within this specific text excerpt; he survives after an apparent fall which causes him to seemingly be on fire for a moment. However, it's important to note that by the end of "Harry Potter and The Deathly Hallows," all copies of Harry are transported back in time using his Patronus Charm when they confront Voldemort at Hogwarts School; therefore, he ultimately meets his demise there later.

In [None]:
#RESPONSE WITH phi3:medium
chat_with_pdf("How did Voldemort die in the book?")

In the book, Voldemort died when a fragment of his own soul rebounded upon him after he attempted to kill Harry Potter with the Killing Curse. This happened on the night Lord Voldemort tried to murder Lily and her son Harry. As Lily cast her life as a shield between them, the curse backfired onto Voldemort, destroying his body but leaving behind a fragment of his soul attached to Harry.

When Harry Potter faced Voldemort for the final battle, this lingering piece of Voldemort's soul was drawn out by Harry using the Elder Wand. This ultimately led to Voldemort's death as he was killed not directly but rather through the fatal effects of his own rebounding curse and the actions that followed. The event is narrated in a document with id='5409da6b-b3b1-44f4-ba82-9938de61ae2f'.

In [23]:
chat_with_pdf("Explain the seven Horcruxes.?. How was it created?")

In J.K. Rowling's Harry Potter series, a Horcrux is an object in which a dark wizard hides part of their soul to achieve immortality. Voldemort made seven Horcruxes by splitting his soul and hiding each piece within these objects.

The process of creating a Horcrux involves the murderer committing cold-blooded murder, which is considered the supreme act of evil in the wizarding world. This act rips their soul apart from itself; however, this doesn't result in immediate death due to dark magic that binds them together again temporarily.

The seven Horcruxes Voldemort made are as follows:
1. His diary (destroyed by Harry Potter) - The first Horcrux created after killing his father and the Dursleys.
2. Marvolo Gaunt's Ring (found in Gringotts Bank, destroyed by Albus Dumbledore) - Made when Voldemort killed the old man who owned it.
3. Salazar Slytherin's Locket (destroyed by Ron Weasley and Hermione Granger using Fiendfyre) - Created after killing Hepzibah Smith, an ancestor of Marvolo Gaunt.
4. Helga Hufflepuff's Cup (destroyed by Hermione Granger in the Room of Requirement) - Made when Voldemort killed Mundungus Fletcher and several other muggles to frame Sirius Black.
5. Tom Riddle Sr.'s Diadem (found at Hogwarts, destroyed by Vincent Crabbe under orders from Snape in the Battle of the Seven Potters) - Created after killing Hepzibah Smith as well.
6. Nagini (the snake killed by Neville Longbottom and Fiendfyre).
7. Harry himself (unintentionally) - The last piece of Voldemort's soul was unintentionally attached to the infant Harry when he tried to kill him, resulting in a fragment of his own soul living within him instead.

It should be noted that creating Horcruxes is extremely dark and dangerous magic; it can never be undone by anyone but the creator themselves, and even then, it's an unstable process that leads to Voldemort's distorted appearance, lack of true human emotions, and a fragmented mind. The creation of more than one Horcrux is considered the highest form of Dark Magic in the series.

In [24]:
chat_with_pdf(" how did harry leave the dursley's house for the last time in the book?") 

In the final scene where Harry leaves the Dursleys' house for the last time in "Harry Potter and the Deathly Hallows", he does so cheerfully despite their solid dislike. The text doesn't describe his departure as an event of great significance or ceremony, but it is implied that they left soon after a couple of events - Harry told Hedwig (his owl) about their imminent departure and Dudley (Harry's cousin), Aunt Petunia, Uncle Vernon, and himself were leaving the house. The text doesn't specify how he physically exited the house. However, it is mentioned that as they approached the lifts in the Atrium to leave, Harry had misgivings about being intercepted due to their arrival with a silver stag (Patronus) and several other people, which could draw attention.