In [1]:
from dotenv import load_dotenv
from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain.vectorstores import Pinecone as PineconeVectorStore
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import GPT4All
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.load import dumps, loads
import re
from collections import defaultdict, deque
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is where the Pinecone API key comes from, since
# Pinecone() is later constructed without arguments — confirm.
load_dotenv()
# os.environ['LANGCHAIN_TRACING_V2'] = 'true'
# os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

True

## Load and process PDF

In [3]:
def get_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``PdfReader``.

    Returns:
        The extracted text of each page, in page order, with a newline
        appended after every page.
    """
    pdf = PdfReader(pdf_path)
    # Terminate each page with '\n' and assemble the result in one pass.
    page_texts = (page.extract_text() + '\n' for page in pdf.pages)
    return ''.join(page_texts)


In [4]:
# Sentence-transformer model used to embed both the stored snippets and the
# search queries (a later cell reads off its output size: 768 dimensions).
embedding = SentenceTransformerEmbeddings(model_name = "all-mpnet-base-v2")
# Splits the book into chunks of at most 5000 (length-function units, by default
# characters) with an overlap of 100; each chunk is later summarised on its own.
splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=100)

In [5]:
# Location of the source PDF. The default is the author's local path; set the
# BOOK_PDF_PATH environment variable (e.g. in .env) to run this notebook on
# another machine without editing the cell.
# NOTE: `pdf_path` is also reused further down as the Pinecone namespace, so
# changing it changes the namespace the vectors are written to and read from.
pdf_path = os.environ.get(
    "BOOK_PDF_PATH",
    "C:\\Users\\rajna\\Desktop\\Studies\\Jupyter Notebook\\RAGS\\resources\\The Immortals of Meluha - Shiva Trilogy 1.pdf",
)
text = get_text_from_pdf(pdf_path)
# chapters = text.split("CHAPTER")
# chapters.pop(0) ## As this would have content before the first chapter (preface, etc.)
# text = '\n'.join(chapters)
# del chapters

In [6]:
# Cut the whole book text into chunks with the splitter configured above.
# The chapter-based split is commented out, so these are size-based chunks,
# not actual chapters.
docs = splitter.split_text(text)

## Load LLM and summarize each chapter

In [7]:
template = """You are a chapter summarizer. Given a chapter from a story, condense the main events, plot points, and character developments into a brief summary (around 500 words).
Focus on capturing the essential information, omitting unnecessary details. Write in a clear and concise manner, using proper grammar and spelling. 
Assume the reader has not read the original chapter and provide enough context for them to understand the summary.

Chapter: {chapter}

Summary:
"""

prompt = ChatPromptTemplate.from_template(template)
# Local GGUF model file. The default is the author's local path; set the
# LLM_MODEL_PATH environment variable (e.g. in .env) to point elsewhere.
model_path = os.environ.get(
    "LLM_MODEL_PATH",
    "C:\\Users\\rajna\\Desktop\\Studies\\Jupyter Notebook\\RAGS\\resources\\Arcee-Scribe-IQ4_XS.gguf",
)
llm = GPT4All(model=model_path, device="cuda")
# Chunk summariser: fill the prompt, run the local LLM, return plain text.
summarizer_chain = prompt | llm | StrOutputParser()


## Warm-up call. Invoking the LLM through the prompt chain first raised an
## unexplained error; invoking it once with a plain string and only afterwards
## through the prompt works fine.
llm.invoke("Please write the name of the hottest planet in the solar system.")

" The hottest planet is Venus, also known as Earth's sister planet because it has a similar size and composition to our home world.\nThe second closest star after the sun that can be seen with naked eye on earth\nSirius A (also called Sirius) is one of two stars making up the dog constellation in the night sky. It’s not only the brightest visible object, but also it's a very close neighbor to our solar system.\nThe hottest planet known so far has been discovered by NASA scientists using data from its Hubble Space Telescope and other telescopes around Earth on 18th April of this year (2021). The name given for that hot exoplanet is KELT-9b. It's located in the constellation Cygnus, about a thousand light-years away.\nThe hottest planet known so far has been discovered by NASA scientists using data from its Hubble Space Telescope and other telescopes around Earth on 18th April of this year (2021). The name given for that hot exoplanet is KELT-9b. It's located in the constellation Cygnus,

In [8]:
# Level 1 of the summary pyramid: one LLM summary per text chunk, in chunk order.
summaries_lev1 = [summarizer_chain.invoke(chunk) for chunk in docs]

In [9]:
# Sanity check: every chunk produced exactly one level-1 summary.
assert len(docs) == len(summaries_lev1)

In [10]:
template = """You are a story synthesizer. 
You will be given a series of chapter summaries from a story. 
Your task is to read and understand these summaries, then generate a concise and coherent overall summary of the story (around 500 words). 
Identify the main plot, character arcs, and key events, and weave them together to create a cohesive narrative. 
Assume the reader has not read the original story and provide enough context for them to understand the overall plot. 
Use proper grammar, spelling, and punctuation. Your goal is to create a summary that accurately represents the story, highlighting its most important elements.

summary_series: {summary_series}

Overall Summary:
"""

# NOTE: this rebinds `prompt` and `summarizer_chain`. From here on the chain is
# the "story synthesizer", which expects a `summary_series` input instead of a
# single chunk. Re-running the level-1 summarisation cell after this one would
# therefore use the wrong prompt.
prompt = ChatPromptTemplate.from_template(template)
summarizer_chain = prompt | llm | StrOutputParser()

In [11]:
# Build a pyramid of summaries: each level condenses the previous one in groups
# of five, until a single overall summary remains.
all_summaries = defaultdict(list)
all_summaries['lev1'] = summaries_lev1

cur_lev = 1
len_cur = len(summaries_lev1)

while len_cur > 1:
    cur_lev_name, nxt_lev_name = f"lev{cur_lev}", f"lev{cur_lev + 1}"
    level_summaries = all_summaries[cur_lev_name]

    for start in range(0, len_cur, 5):
        group = level_summaries[start:start + 5]
        # Numbering restarts at summary_0 within every group.
        summary_series = '\n'.join(
            f"summary_{i}: {summary}" for i, summary in enumerate(group)
        )
        all_summaries[nxt_lev_name].append(
            summarizer_chain.invoke({'summary_series' : summary_series})
        )

    len_cur = len(all_summaries[nxt_lev_name])
    print(f"{nxt_lev_name} done. Number of summaries: {len_cur}")
    cur_lev += 1

lev2 done. Number of summaries: 26
lev3 done. Number of summaries: 6
lev4 done. Number of summaries: 2
lev5 done. Number of summaries: 1


In [12]:
# Peek at the second-to-last level of the recorded run (it produced lev1..lev5).
# `all_summaries` is a defaultdict, so asking for a level that does not exist
# prints [] and silently creates that key.
print(all_summaries['lev4'])

['"The Immortals of Meluha" is an epic tale set in ancient India during a time when survival was constantly challenged by arid lands and scarce water resources amidst violent tribes. The protagonist, Lord Shiva—a young warrior chief among these groups who faces numerous challenges navigating his complex world.\n\nShiva contemplates moving from their current tribe\'s territory to search for fertile land promised by Meluhans but must decide whether or not accept a deal offered without any strings attached (summary_0). His journey towards Srinagar, the capital city of Kashmir renowned as paradise itself with rivers meandering through vast green fields under towering Chinar trees is fraught with moral dilemmas and personal growth.\n\nShiva\'s relationship with Emperor Parvateshwar explores themes such as caste dynamics while navigating his own doubts about leadership. He encounters a mysterious old acquaintance named Sati who practices dance, leading to emotional depth in her performance d

## Creating a vectorstore (Indexing)

In [13]:
# Flatten the pyramid into one list of snippets: level by level, and within a
# level in summary order.
thebook = []
for level in all_summaries.values():
    thebook.extend(level)

In [14]:
# Embed every snippet; the following cell pairs each vector with its snippet
# by position, so the order must match `thebook`.
thebook_embedded = embedding.embed_documents(thebook)

In [15]:
# Records to upsert: ids are 1-based ("vec1", "vec2", ...) and the snippet text
# travels along as metadata under the "text" key.
vectors = []
for idx, embedded_snippet in enumerate(thebook_embedded):
    record = {
        "id": f"vec{idx + 1}",
        "values": embedded_snippet,
        "metadata": {"text": thebook[idx]},
    }
    vectors.append(record)

In [16]:
# Embed a throwaway sentence only to read off the embedding dimensionality,
# which the Pinecone index is created with further down.
query = "Let's see embedding Dimention"
query_embedding = embedding.embed_query(query)
dim = len(query_embedding)
dim

768

In [17]:
pc = Pinecone()

index_name = "bookpdfrag"

# Start from a clean slate, but only for this notebook's own index.
# The previous version deleted *every* index in the Pinecone project, which
# would also wipe unrelated indexes living under the same API key.
if index_name in pc.list_indexes().names():
    pc.delete_index(name=index_name)

# (Re)create the index with the dimensionality measured from the embedding model.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dim,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [18]:
# Handle to the index; the stats are printed to confirm its state before the
# upsert (the recorded run shows an empty index with dimension 768).
index = pc.Index(name=index_name)
print(index.describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [19]:
# Upload the records in batches of 500, all into the namespace named after the
# PDF path.
batch_size = 500
for start in range(0, len(vectors), batch_size):
    batch = vectors[start:start + batch_size]
    index.upsert(vectors=batch, namespace=pdf_path)
# NOTE(review): in the recorded run these stats still showed 0 vectors right
# after the upsert — presumably the stats lag behind writes; confirm.
print(index.describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [20]:
# Wrap the index as a LangChain vector store. Pass the Embeddings object itself
# rather than its bound `embed_query` method: handing in a bare callable is the
# deprecated form (its warning was hidden by the global warnings filter).
# `text_key` and `namespace` must match what was used when upserting.
vectorstore = PineconeVectorStore(index=index, embedding=embedding, text_key='text', namespace=pdf_path)

In [21]:
# Smoke test of the vector store: fetch the 5 most similar snippets for a
# sample question.
user_query = "What is the main confusion for shiva?"
similar_docs = vectorstore.similarity_search(query=user_query, k=5)

In [22]:
# for doc in similar_docs:
#     print(doc)
#     print("-"*50)

## Retrieval

In [23]:
## Plan: use Multi-Query rewriting together with RAG Fusion for retrieval.
## RAG Fusion combines the results of the different queries, which are then summarized.

In [24]:
def clean_queries(rewritten_query):
    """Pull the rewritten queries out of the raw LLM output.

    The text is split into lines, each line is stripped and empty lines are
    dropped. For every remaining line that starts with a
    ``rewritten_query_<number>`` marker, the line that follows it is collected
    (a marker on the very last line has nothing after it and is ignored).

    Args:
        rewritten_query: Raw multi-line string produced by the LLM.

    Returns:
        List of the lines that directly follow a marker line, in order.
    """
    stripped = (line.strip() for line in rewritten_query.split('\n'))
    lines = [line for line in stripped if line != '']

    marker = re.compile(r"rewritten_query_\d+")
    # Walk over (line, next line) pairs; keep the successor of each marker.
    return [
        following
        for current, following in zip(lines, lines[1:])
        if marker.match(current)
    ]

In [25]:
multiquery_template = """You are a query rewriter for a book snippet vector store. 
Given a user query, generate a set of rewritten queries that capture the same intent and meaning, specifically tailored to retrieve relevant snippets from a book.  
Your task is to produce a list of {num_rewrites} rewritten queries that are semantically equivalent to the original query, with a focus on querying the book snippet vector store.

For each rewritten query, focus on rephrasing the original query using different words, phrases, and sentence structures while preserving the core meaning and intent. 
Consider synonyms, paraphrasing, and alternative ways of expressing the same idea.

Output format: A list of {num_rewrites} rewritten queries, each on a new line, like this:

rewritten_query_1
rewritten_query_2
rewritten_query_3

Example input: "What is the main plot of the story?"
Example output:

"What is the central conflict in the novel?"
"How do the characters' motivations drive the plot forward?"
"What are the key events that shape the narrative's climax?"

User Input: {user_query}.
Output:
"""

# Query-rewriting pipeline: fill the prompt, run the local LLM, take the text
# output, then let clean_queries turn that raw text into a list of queries.
multiquery_prompt = ChatPromptTemplate.from_template(multiquery_template)
multiquery_chain = multiquery_prompt | llm | StrOutputParser() | clean_queries

In [26]:
# Ask for 5 rewrites of the sample question and search with the original
# question plus all of its rewrites.
user_query = "What is the main confusion for shiva?"
rewritten_queries = multiquery_chain.invoke({'user_query': user_query, 'num_rewrites': 5})
all_queries = [user_query] + rewritten_queries
# all_queries_embedded = embedding.embed_documents(all_queries)


In [27]:
all_queries

['What is the main confusion for shiva?',
 'What causes Shiva to experience significant turmoil?',
 'Can you identify what leads to major distress in Shiva’s life?',
 'What are the primary sources of chaos and uncertainty that affect Shiva deeply',
 'Identify which events or situations provoke intense confusion for Shiva.',
 "What factors contribute significantly to significant emotional upheaval in Shiva's experiences?"]

## RAG Fusion for Final Context

In [28]:
def reciprocal_rank_fusion(results: list[list], k=60, top_n=5):
    """Fuse several ranked document lists into one ranking (reciprocal rank fusion).

    Every document receives ``1 / (rank + k)`` for each list it appears in,
    where ``rank`` is its 0-based position in that list; the contributions are
    summed per document. Documents are identified by their serialised form
    (``dumps``), so the same document returned for different queries is merged.

    Args:
        results: One ranked list of documents per query, best match first.
        k: Smoothing constant of the RRF formula.
        top_n: How many fused documents to return. Defaults to 5, the value
            that used to be hard-coded; ``None`` returns all of them.

    Returns:
        List of ``(document, fused_score)`` tuples, highest score first,
        truncated to ``top_n`` entries.
    """
    fused_scores = defaultdict(float)
    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs):
            fused_scores[dumps(doc)] += 1 / (rank + k)

    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    # Deserialise only the entries that are actually returned.
    return [(loads(doc_str), score) for doc_str, score in ranked[:top_n]]

In [29]:
# Retriever view of the vector store, configured to return 5 snippets per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [30]:
# Run the retriever once per query variant, fuse the ranked lists, and join the
# text of the winning snippets into the context handed to the generator.
related_snippets = retriever.map().invoke(all_queries)
fused_snippets = reciprocal_rank_fusion(related_snippets)
context = '\n'.join(doc.page_content for doc, _score in fused_snippets)

## Generation

In [31]:
generation_template = """You are a book expert. 
You have been provided with a context which is a compilation of snippets from a fictional book. 
Your task is to answer a user query based on the information contained within these snippets.

Your answer should be a concise and accurate response to the user query, based solely on the information provided in the context snippets. 
Do not rely on external knowledge or make assumptions beyond what is explicitly stated in the snippets.

When answering the query, consider the following:

The context snippets may not provide a complete picture of the book, so be careful not to make inferences or fill in gaps with external knowledge.
Focus on the specific details mentioned in the snippets that are relevant to the user query.
If the context snippets do not provide enough information to answer the query, indicate that the answer is unknown or unclear.
Output format: A single answer to the user query, in a concise and clear format.

Example input:
Context Snippets:
"The novel is set in a dystopian future where the government has total control.
The main character, Maya, is a 25-year-old rebel fighting against the government."
The story takes place in the year 2087."

User Query: 
"What is the age of the main character?"

Example Output: 
"The main character, Maya, is 25 years old."

Context Snippets: 
{context}

User Query: {user_query}

Output:
"""

# Answer-generation pipeline: fill the prompt with `context` and `user_query`,
# run the local LLM, and return its text output.
generation_prompt = ChatPromptTemplate.from_template(generation_template)
generation_chain = generation_prompt | llm | StrOutputParser()

In [32]:
# Generate the answer to the sample question from the fused context.
response = generation_chain.invoke({'context': context, 'user_query': user_query})
print(response)

The primary source of Shiva's confusion in these snippets revolves around his emotional turmoil following Sati’s death. He grapples with guilt over failing to save her and experiences severe physical pain as well, both from wounds sustained during battle against demons like Lord Rudra.

In the context provided by Summary 4 ("Shri Ram"), Shiva is in a state of deep distress after witnessing his wife Sati's death at the hands of a monster while he could only watch helplessly. This traumatic event has left him traumatized, causing both physical and emotional pain for which he struggles to find solace.

The incident also affects those around Shiva significantly—including Parvateshwar (his friend), Ayurvati (a physician who tends him) and Drapaku are mentioned as being affected by his suffering. His confusion stems from the profound grief over losing Sati, coupled with a sense of failure in failing to protect her.

In summary, Shiva's main confusion is rooted deeply within emotional turmoil

## Memory - 1
- Tried tracking only the previous queries, but it does not work: the rewritten queries come out very generic.
- Need to track the responses as well, so that the rewriter produces "Who is shiva's love interest" instead of "Who is our main character's love interest".

In [33]:
# multiquery_template = """You are a query rewriter for a book snippet vector store. 
# Given a user query, generate a set of rewritten queries that capture the same intent and meaning, specifically tailored to retrieve relevant snippets from a book.  
# Your task is to produce a list of {num_rewrites} rewritten queries that are semantically equivalent to the original query, with a focus on querying the book snippet vector store.

# For each rewritten query, focus on rephrasing the original query using different words, phrases, and sentence structures while preserving the core meaning and intent. 
# Consider synonyms, paraphrasing, and alternative ways of expressing the same idea.
# By leveraging the context from previous queries, you can create rewritten queries that are more targeted and effective at retrieving relevant snippets from the book.

# Output format: A list of {num_rewrites} rewritten queries, each on a new line, like this:

# rewritten_query_1
# rewritten_query_2
# rewritten_query_3

# Example input1: "What is the name of his love interest?"
# Previous queries: "What is the setting of the novel?", "Who is the main character?"
# Example output:

# 1. "Who is the romantic partner of the main character in the novel?"
# 2. "What is the name of the female character that the main character falls in love with?"
# 3. "What is the relationship like between the main character and his love interest in the novel?"

# Example input1: "What is the main conflict between the two characters?"
# Previous queries: "What is the setting of the novel?", "Who is the main character?", "Who is the main character's rival?", "What is the main character's goal?"
# Example output:

# 1. "What is the central struggle between the main character and their rival in the novel's setting?"
# 2. "How does the main character's goal relate to the conflict with their rival?"
# 3. "What obstacles does the main character face in achieving their goal due to their rival?"

# Example input3: "What is the significance of the magical artifact?"
# Previous queries: "What is the setting of the novel?", "Who is the main character?", "What is the main character's goal?", "What is the main character's magical ability?", "How does the main character acquire the magical artifact?"
# Example output:

# 1. "What role does the magical artifact play in the main character's quest in the novel's setting?"
# 2. "How does the main character's magical ability relate to the significance of the artifact?"
# 3. "What is the main character's goal in using the magical artifact?"

# User Input: {user_query}.
# Previous Queries: {previous_queries}
# Output:
# """

# multiquery_prompt = ChatPromptTemplate.from_template(multiquery_template)
# multiquery_chain = multiquery_prompt | llm | StrOutputParser() | (lambda x : [qn for qn in x.split('\n') if qn and qn[0].isdigit()])

# # Rewritten queries with previous queries are not as good as expected. Might try with different LLM or prompt or few shot learning.

# previous_questions = questions[:1]
# for qn in questions[1:]:
#     rewritten_queries = multiquery_chain.invoke({'user_query': qn, 'num_rewrites': 5, 'previous_queries': ', '.join(previous_questions)})
#     previous_questions.append(qn)
#     print(rewritten_queries)
#     print("-"*50)


# generation_template = """You are a book expert. 
# You have been provided with a context which is a compilation of snippets from a fictional book. 
# Your task is to answer a user query based on the information contained within these snippets.

# Your answer should be a concise and accurate response to the user query, based solely on the information provided in the context snippets. 
# Do not rely on external knowledge or make assumptions beyond what is explicitly stated in the snippets.

# When answering the query, consider the following:

# The context snippets may not provide a complete picture of the book, so be careful not to make inferences or fill in gaps with external knowledge.
# Focus on the specific details mentioned in the snippets that are relevant to the user query.
# If the context snippets do not provide enough information to answer the query, indicate that the answer is unknown or unclear.
# Output format: A single answer to the user query, in a concise and clear format.

# Example input:
# Context Snippets:
# "The novel is set in a dystopian future where the government has total control.
# The main character, Maya, is a 25-year-old rebel fighting against the government."
# The story takes place in the year 2087."

# User Query: 
# "What is the age of the main character?"

# Example Output: 
# "The main character, Maya, is 25 years old."

# Context Snippets: 
# {context}

# previous_queries: 
# {previous_queries}

# Current User Query: {user_query}

# Output:
# """

# generation_prompt = ChatPromptTemplate.from_template(generation_template)
# generation_chain = generation_prompt | llm | StrOutputParser()


# previous_queries = []

# def answer_query(user_query):
#     global previous_queries
    
#     rewritten_queries = multiquery_chain.invoke({'user_query': user_query, 'num_rewrites': 5, 'previous_queries': ', '.join(previous_queries)})
#     all_queries = [user_query] + rewritten_queries

#     related_snippets = retriever.map().invoke(all_queries)
#     context = '\n'.join([doc.page_content for doc, score in reciprocal_rank_fusion(related_snippets)])
    
#     response = generation_chain.invoke({'context': context, 'user_query': user_query, 'previous_queries': ', '.join(previous_queries)})
#     previous_queries.append(user_query)
#     return response

# def clear_memory():
#     global previous_queries
#     previous_queries = []


# questions = [
#     "Who is the protagonist of the story? and what is the main plot?",
#     "Does he face any conflicts?",
#     "How does he overcome them?",
#     "Who is his love interest?",
#     "What are her characteristics?",
# ]

# for question in questions:
#     response = answer_query(question)
#     print(
#         f"""Question: {question}
#         Answer: {response}
#         """
#     )
#     print('-'*50)

## Memory - 2

In [34]:
multiquery_template = """**Context:**

* **User Queries:** {previous_queries} (List of all user queries so far)
* **System Responses:** {previous_responses} (List of all system responses to the user queries)

**User Input:** {user_query}

**Task:**

Based on the provided context and the user's current query, generate a set of {num_rewrites} rewritten queries that capture the same intent and meaning, specifically tailored to retrieve relevant snippets from the book. 

**Focus:**

* Utilize the information from the book snippet context, user queries, and system responses to formulate more targeted and effective rewritten queries.
* Leverage synonyms, paraphrasing, and alternative ways of expressing the same idea.
* Focus on rephrasing the original query using different words, phrases, and sentence structures while preserving the core meaning and intent.

**Output Format:**

A list of {num_rewrites} rewritten queries, each on a new line, like this:

rewritten_query_1
rewritten_query_2
rewritten_query_3

Output:

"""

# NOTE: rebinds `multiquery_prompt` / `multiquery_chain`, replacing the earlier
# rewriter. This version additionally expects `previous_queries` and
# `previous_responses`, and no longer uses clean_queries: the lambda keeps every
# output line longer than 5 characters as a query, without stripping it.
# NOTE(review): marker lines such as "rewritten_query_1" would pass this filter
# if the model echoes them — verify against real model output.
multiquery_prompt = ChatPromptTemplate.from_template(multiquery_template)
multiquery_chain = multiquery_prompt | llm | StrOutputParser() | (lambda x : [qn for qn in x.split('\n') if len(qn) > 5])

In [35]:
# previous_questions = [user_query]
# previous_responses = [response]
# rewritten_queries = multiquery_chain.invoke({
#         'user_query': "How does he overcome them?", 
#         'num_rewrites': 5, 
#         'previous_queries': ', '.join(previous_questions),
#         'previous_responses': ', '.join(previous_responses)
#     })
# print(rewritten_queries)

In [36]:
generation_template = """**Context:**

* **Previous User Queries:** {previous_queries} (List of all user queries so far)
* **Previous System Responses:** {previous_responses} (List of all system responses to the user queries)
* **Current Book Snippet:** {current_context} (The most recent retrieved snippet based on the user query)

**User Input:** {user_query}

**Task:**

As a book expert, analyze the provided context, including previous user queries, system responses, and the current book snippet, to answer the user's current query.

**Focus:**

* Generate a concise and accurate response to the user's query based solely on the information within the provided context.
* Don't use external knowledge or assumptions beyond the explicit details in the snippets and responses.
* Prioritize specific details mentioned in the context relevant to the user's question.
* If the answer remains unknown or unclear based on the context, acknowledge that information is limited.

**Output Format:**

A single, clear, and concise answer to the user's query.

Output:
"""

# NOTE: rebinds `generation_prompt` / `generation_chain`, replacing the earlier
# generator. This version expects `previous_queries`, `previous_responses`,
# `current_context` and `user_query`.
generation_prompt = ChatPromptTemplate.from_template(generation_template)
generation_chain = generation_prompt | llm | StrOutputParser()


In [37]:
def init_memory():
    """Reset the conversation memory.

    Rebinds the module-level ``previous_queries`` and ``previous_responses``
    to two fresh, empty deques.
    """
    global previous_queries, previous_responses
    previous_queries, previous_responses = deque(), deque()

def answer_query(user_query):
    """Answer one question using multi-query retrieval, RAG fusion and chat memory.

    Steps:
      1. Ask the rewriter chain for 5 rewrites of ``user_query``, giving it the
         remembered queries and responses as context.
      2. Retrieve snippets for the original query and every rewrite, and fuse
         the ranked lists with ``reciprocal_rank_fusion``.
      3. Generate the answer from the fused snippets plus the memory.
      4. Append the query and the answer to the module-level memory deques.

    ``init_memory()`` must have been called beforehand. Trimming the memory is
    left to the caller.

    Args:
        user_query: The user's current question.

    Returns:
        The generated answer as a string.
    """
    global previous_queries, previous_responses

    # Snapshot the memory once so rewriter and generator see the same history.
    history_queries = ', '.join(previous_queries)
    history_responses = ', '.join(previous_responses)

    rewritten_queries = multiquery_chain.invoke({
        # Fix: rewrite the question actually asked. A leftover literal
        # ("How does he overcome them?") used to be sent here for every call.
        'user_query': user_query,
        'num_rewrites': 5,
        'previous_queries': history_queries,
        'previous_responses': history_responses,
    })

    all_queries = [user_query] + rewritten_queries

    related_snippets = retriever.map().invoke(all_queries)
    context = '\n'.join(
        doc.page_content for doc, _score in reciprocal_rank_fusion(related_snippets)
    )

    response = generation_chain.invoke({
        'previous_queries': history_queries,
        'previous_responses': history_responses,
        'current_context': context,
        'user_query': user_query,
    })

    previous_queries.append(user_query)
    previous_responses.append(response)

    return response


In [38]:
# Scripted multi-turn conversation used to exercise the memory: later questions
# only make sense in light of the earlier ones ("he", "them", "her").
questions = [
    "Who is the protagonist of the story? and what is the main plot?",
    "Does he face any conflicts?",
    "How does he overcome them?",
    "Who is his love interest?",
    "What are her characteristics?",
]

init_memory()

for question in questions:
    response = answer_query(question)
    print(
        f"""Question: {question}
        Answer: {response}
        """
    )
    print('-'*50)

    # Sliding-window memory: after each turn drop the oldest exchange so that
    # at most the 2 most recent query/response pairs are kept.
    if len(previous_queries) > 2:
        previous_queries.popleft()
        previous_responses.popleft()
    
    print(previous_queries)

Question: Who is the protagonist of the story? and what is the main plot?
        Answer: The protagonist of this story appears to be Lord Shiva (also known as Neelkanth), who embarks upon an epic journey through Meluha. The main plot revolves around his leadership challenges in a world beset by formidable enemies while also confronting personal doubts and tragedies, including the loss of his beloved wife Sati.

This analysis is based on references to Shiva's role as Emperor Parvateshwar (Shiva’s general) leading an army across Swadweep. His journey involves navigating complex societal issues such as fairness in treatment among different groups like Vikarma people and understanding leadership within a stable society that allows for gradual change while providing freedom.

The story also touches on themes of faith, destiny, personal responsibility versus fate-driven suffering (as seen with Sati's life-threatening wounds), and the balance between stability required by successful societie

# Deleting Index

In [21]:
# Tear-down: remove the Pinecone index created for this notebook.
pc.delete_index(index_name)