In [1]:
!pip install langchain_community
!pip install faiss-gpu
!pip install rank_bm25 
!pip install ragas
!pip install datasets
!pip install pdfplumber

Collecting langchain_community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain_community)
  Downloading langchain_core-0.3.54-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.23 (from langchain_community)
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain<1.0.0,>=0.3.23->langchain_community)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting async-timeout<6.0,>=4.0 (from aiohttp<4.0.0,>=3.8.3->langchain_community)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting python-

In [2]:
import os
os.getcwd()

'/kaggle/working'

In [3]:
import pdfplumber

# Open the PDF file
with pdfplumber.open('/kaggle/input/star-wars/Star Wars - Brotherhood Mike Chen.pdf') as pdf:
    # Open the text file for writing
    with open('knowledge_base.txt', 'w', encoding='utf-8') as output:
        # Iterate over pages 10 to 349 (0-indexed, so subtract 1)
        for i in range(9, 349):  # Page 10 is index 9
            page = pdf.pages[i]
            text = page.extract_text() or ""  # Handle cases where text is None
            output.write(text + '\n')  # Write text to file with a newline

In [4]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
from rank_bm25 import BM25Okapi
import numpy as np
import warnings

from datasets import Dataset
warnings.filterwarnings("ignore")
import textwrap

In [5]:
# 1. Load your text files
file_paths = ["/kaggle/working/knowledge_base.txt"]
documents = []

for file_path in file_paths:
    loader = TextLoader(file_path)
    docs = loader.load()
    documents.extend(docs)

# 2. Define chunking parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=768,        # You can try 100, 250, 512, etc.
    chunk_overlap=128       # Try 0, 50, 100, etc.
)

# 3. Split the documents
chunks = text_splitter.split_documents(documents)


# 4. Output result
print(f"Total chunks: {len(chunks)}")
print(f"First chunk content:\n{chunks[0].page_content}")

# Optional: Save the chunks to a file
with open("chunked_output.txt", "w", encoding='utf-8') as f:
    for i, chunk in enumerate(chunks):
        f.write(f"--- Chunk {i + 1} ---\n")
        f.write(chunk.page_content + "\n\n")

Total chunks: 897
First chunk content:
A long time ago in a galaxy far, far away….
The CLONE WARS have erupted. Caught off guard by the quickly
expanding conflict, the overwhelmed Jedi Order has rushed the
advancement of Padawans to better integrate into the Grand Army of the
Republic and assist the war effort.
Newly promoted Jedi Knight Anakin Skywalker is increasingly torn
between his growing duties to the Republic and his secret marriage to
Senator Padmé Amidala of Naboo. With his Knighting, his mentor Obi-
Wan Kenobi has been elevated to the Jedi Council under the rank of Jedi
Master.
As dark forces push the Jedi further toward their transformation from
guardians to soldiers, Anakin and Obi-Wan find themselves on equal
footing yet opposing paths, each pondering the meaning of peace and


In [6]:
# Prepare documents and their metadata
texts = [chunk.page_content for chunk in chunks]
metadata = [chunk.metadata for chunk in chunks]
print(len(texts))

897


In [7]:
# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# Create FAISS vector database
vectordb = FAISS.from_documents(chunks, embedding_model)

# Save FAISS index to disk for later use
vectordb.save_local("faiss_index")

# Check the number of stored documents
print(f"Number of documents in the vector store: {vectordb.index.ntotal}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Number of documents in the vector store: 897


In [8]:
# BM25 Indexing
tokenized_texts = [text.split() for text in texts]
bm25 = BM25Okapi(tokenized_texts)

def reciprocal_rank_fusion(results_bm25, results_embedding, k=2):
    scores = {}

    # Use document content or metadata as the key
    for rank, (doc, score) in enumerate(results_bm25):
        doc_id = doc.page_content  # Or use doc.metadata.get("source", "unknown") if available
        scores[doc_id] = scores.get(doc_id, 0) + 1 / (rank+1) # (k + rank + 1)
        print("BM25", scores[doc_id])

    for rank, (doc, score) in enumerate(results_embedding):
        doc_id = doc.page_content  # Use the same identifier
        scores[doc_id] = scores.get(doc_id, 0) + 1 / (rank+1) # (k + rank + 1)
        print("Dense", scores[doc_id])

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)


# Extract page content and metadata properly
def format_response(doc):
    return f"Page {doc.metadata.get('page', 'Unknown')}: {doc.page_content.strip()}"

In [9]:
# Retrieve function
def retrieve(query, k=3):
    query_embedding = embedding_model.embed_query(query)
    results_embedding = vectordb.similarity_search_with_score_by_vector(query_embedding, k=k)
    results_embedding = sorted(results_embedding, key=lambda x: x[1], reverse=True)
    
    print("============Dense Embeddings=============")
    for doc, score in results_embedding:
        print(f"page {doc.metadata.get('page','Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")

    # Get BM25 scores for all documents and sort to get top-k results
    results_bm25 = [(idx, bm25.get_scores(query.split())[idx]) for idx in range(len(texts))]
    results_bm25 = sorted(results_bm25, key=lambda x: x[1], reverse=True)[:k]  # Keep only top-k results
    # Convert BM25 results to (Document, score) format
    results_bm25_docs = [(Document(page_content=texts[idx], metadata=metadata[idx]), score) for idx, score in results_bm25]
   
    print("************BM25 Results*************")
    for doc, score in results_bm25_docs:
        print(f"page {doc.metadata.get('page','Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")
    
    # Create a lookup dictionary {document content -> Document object}
    doc_lookup = {doc.page_content: doc for doc, _ in results_bm25_docs}
    doc_lookup.update({doc.page_content: doc for doc, _ in results_embedding})

    # Fuse results
    fused_results = reciprocal_rank_fusion(results_bm25_docs, results_embedding)
    
    # Format results, ensuring document IDs are mapped back to actual Documents
    return [format_response(doc_lookup[doc_id]) for doc_id, _ in fused_results if doc_id in doc_lookup]

    #fused_results = reciprocal_rank_fusion(results_bm25, results_embedding)
    #return [(texts[idx], metadata[idx]["page"] if "page" in metadata[idx] else "Unknown") for idx, _ in fused_results]

In [10]:
from langchain.schema import Document

def retrieve_dense(query, k=3):
    query_embedding = embedding_model.embed_query(query)
    results_embedding = vectordb.similarity_search_with_score_by_vector(query_embedding, k=k)
    
    # Optionally sort descending by score if needed
    results_embedding = sorted(results_embedding, key=lambda x: x[1], reverse=True)
    
    print("============Dense Embeddings=============")
    for doc, score in results_embedding:
        print(f"page {doc.metadata.get('page','Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")
    
    # Return just the documents (or both doc and score if you want)
    return [format_response(doc) for doc, _ in results_embedding]

In [11]:
# model_name = "tiiuae/Falcon3-3B-Instruct"
model_name = "Qwen/Qwen2.5-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto", #device_map='cuda'
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [12]:
import time
start_time = time.time()

In [13]:
# Query example
question = "What is the name of the Neimoidian guard who assists Obi-Wan?"
retrieved_responses = retrieve(question, k=15)

page Unknown - Score: 0.2394 - from the moment he emerged.
“Master Kenobi. Welcome, emissary,” Eyam said. “I’m pleased to make
your...
page Unknown - Score: 0.2391 - posture clear enough for others to see. “Yes, the datapad points to the
Republic. But we don’t know ...
page Unknown - Score: 0.2362 - And not exactly with the Jedi, either. If I recall, you were involved with
some of that.”
Obi-Wan’s ...
page Unknown - Score: 0.2320 - “Qui-Gon Jinn was an honorable man.”
Now Obi-Wan moved to the offensive, a momentum to his words. “I...
page Unknown - Score: 0.2316 - And that was all he needed to know.
Obi-Wan attached the armor to his forearm, then moved in precise...
page Unknown - Score: 0.2307 - But he had a different goal than simply defeating an opponent. His entire
purpose here was to clear ...
page Unknown - Score: 0.2306 - and Obi-Wan put his faith in it—and in Ruug.
Five. Four. Three.
Obi-Wan’s angle shifted enough to pr...
page Unknown - Score: 0.2305 - as she walked forward. “

In [14]:
# # Query processing
# question = "What was the cause of the bombing on Cato Neimoidia in Brotherhood?"
# retriever = vectordb.as_retriever(search_kwargs={"k": 10})
# docs = retriever.get_relevant_documents(question)

# # Print results
# for i, doc in enumerate(docs, 1):
#     page_number = doc.metadata.get('page', 'Unknown')
#     # print(f"Document {i} - Page {page_number} - Score: {doc.metadata.get('score', 'N/A')}")
#     print(doc.page_content[:])  # Print first 500 characters of each result
#     print("-" * 80)

In [15]:
# Create a pipeline
generator = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
return_full_text=False,
max_new_tokens=5000,
do_sample=False
)

Device set to use cuda:0


In [16]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hBuilding wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadabil

In [17]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import textwrap

def generate_lsa_summary(retrieved_responses, num_summary_sentence=50):
    # Combine the retrieved responses into one string
    text = " ".join(retrieved_responses)
    
    # Initialize LSA summarizer
    LANGUAGE = "english"
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    lsa_summarizer = LsaSummarizer()
    
    # Generate the summary
    summary = []
    for sentence in lsa_summarizer(parser.document, num_summary_sentence):
        summary.append(str(sentence))
    
    # Join the summarized sentences and wrap them for better readability
    summarized_text = " ".join(summary)
    return textwrap.fill(summarized_text, 100)

summarized_responses = generate_lsa_summary(retrieved_responses)

In [18]:
def reorder_sorted_responses(sorted_responses):
    # Alternate between most important (edges) and least important (center)
    most_important = sorted_responses[::2]  # Take every other response starting with the first
    least_important = sorted_responses[1::2]  # Take every other response starting with the second

    # Merge: Place least important in the center
    reordered_responses = []
    while most_important or least_important:
        if most_important:
            reordered_responses.append(most_important.pop(0))  # Add from most important
        if least_important:
            reordered_responses.append(least_important.pop())  # Add from least important
    
    return reordered_responses
reordered_responses = reorder_sorted_responses(retrieved_responses)

In [19]:

# ### **Summarized Retrieved Information**:
# {summarized_responses}

# Construct the RAG prompt
prompt = f"""
You are an AI assistant tasked with answering questions based on retrieved knowledge from the book Star Wars Brotherhood.

The summary of the book is this: **Star Wars: Brotherhood** by Mike Chen explores the early days of the Clone Wars, focusing on the evolving relationship between Anakin Skywalker and Obi-Wan Kenobi. Following Anakin's recent promotion to Jedi Knight, the two must navigate their new dynamic as equals while dealing with personal struggles and the pressures of war. When a bombing on Cato Neimoidia threatens to escalate the conflict, Obi-Wan is sent to investigate, uncovering a conspiracy to frame the Republic. Meanwhile, Anakin mentors a young Jedi, Mill Alibeth, forcing him to confront his own complexities and ideals. The novel delves into themes of loyalty, moral ambiguity, and the impact of war on individuals and relationships, highlighting the brotherhood between Anakin and Obi-Wan while deepening the lore of the Star Wars universe.

### **Retrieved Information**:
1. {reordered_responses[0]}
2. {reordered_responses[1]}
3. {reordered_responses[2]}
4. {reordered_responses[3]}
5. {reordered_responses[4]}

### **Question**:
{question}

### **Instructions**:
- Integrate the key points from all retrieved responses into a **cohesive, well-structured answer**.
- If the responses are **contradictory**, mention the different perspectives.
- If none of the retrieved responses contain relevant information, reply:
  **"I couldn't find a good response to your query in the database."**
"""

In [20]:
# for i in range(0,len(retrieved_responses)):
#     print(retrieved_responses[i])
#     print("-------")

In [21]:
# Use Qwen2.5 3B with the correct message format
messages = [
    {"role": "user", "content": prompt}
]

# Generate output using the model
output = generator(messages)

# Print formatted response
print(textwrap.fill(output[0]["generated_text"], width=80))

The name of the Neimoidian guard who assists Obi-Wan is Ruug. This is evident
from the following excerpt:  "‘Don’t move, Jedi!’ Ruug yelled, and though her
volume increased, her voice carried a level tone. She slowed her approach,
pistol trained on him as she got closer."  Additionally, Ruug is described as a
former special ops Neimoidian guard who helps Obi-Wan during their mission on
Cato Neimoidia.


In [22]:
end_time = time.time()
time_taken = end_time - start_time
print(time_taken)

26.798535346984863


**Evaluation Using RAGAS**

In [23]:
!pip install azure-openai

[31mERROR: Could not find a version that satisfies the requirement azure-openai (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for azure-openai[0m[31m
[0m

In [60]:
from langchain_openai.chat_models import AzureChatOpenAI
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

azure_configs = {
    "base_url": "https://sala-m9pmmei0-eastus2.cognitiveservices.azure.com/",
    "model_deployment": "my-gpt-deployment",
    "model_name": "gpt-4o-mini",
}


azure_llm = AzureChatOpenAI(
    openai_api_version="2023-05-15",
    azure_endpoint=azure_configs["base_url"],
    azure_deployment=azure_configs["model_deployment"],
    model=azure_configs["model_name"],
    openai_api_key="9q7KXIPJUv4MiY5ynSWq0IuBeUvaPWl4MGTnw1qP3MKORkCSmsPfJQQJ99BDACHYHv6XJ3w3AAAAACOGYVRV",
    validate_base_url=False,
)

In [61]:
from ragas import EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

# Initialize query, reference, and RAG model
query = question
reference = "The Neimoidian guard who assists Obi-Wan is Ruug Quarnom. She is a seasoned former commando in the Neimoidian Defense Legion, reassigned to the Royal Guard. Ruug becomes a key ally to Obi-Wan during his investigation on Cato Neimoidia, helping him uncover the truth behind the bombing and navigating the political complexities of the situation."

# Retrieve relevant documents and generate response
relevant_docs = reordered_responses
response = output[0]["generated_text"]

# Create the evaluation dataset for a single query
dataset = [
    {
        "user_input": query,
        "retrieved_contexts": relevant_docs,
        "response": response,
        "reference": reference
    }
]
evaluation_dataset = EvaluationDataset.from_list(dataset)

# Initialize evaluator and evaluate
evaluator_llm = LangchainLLMWrapper(azure_llm)
metrics = [LLMContextRecall(), Faithfulness(), FactualCorrectness()]
result = evaluate(dataset=evaluation_dataset, metrics=metrics, llm=evaluator_llm)

# Print the evaluation results
print(result)


Evaluating:   0%|          | 0/3 [00:00<?, ?it/s]

{'context_recall': 0.2500, 'faithfulness': 1.0000, 'factual_correctness(mode=f1)': 0.3600}
