In [None]:
!pip install langchain_community
!pip install faiss-gpu
!pip install rank_bm25 
!pip install ragas
!pip install datasets
!pip install pdfplumber
!pip install azure-openai
!pip install sumy

In [None]:
import os
os.getcwd()

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
from rank_bm25 import BM25Okapi
import numpy as np
import warnings

from datasets import Dataset
warnings.filterwarnings("ignore")
import textwrap

In [None]:
import pdfplumber

# Dump the book's body text from the PDF into a plain-text knowledge base.
# PDF page numbers are 1-based while `pdf.pages` is 0-based, so pages 10..349
# correspond to indices 9..348.
with pdfplumber.open('/kaggle/input/star-wars/Star Wars - Brotherhood Mike Chen.pdf') as pdf, \
        open('knowledge_base.txt', 'w', encoding='utf-8') as output:
    for page_index in range(9, 349):
        # extract_text() may yield None; fall back to an empty string
        page_text = pdf.pages[page_index].extract_text() or ""
        # One page per write, newline-terminated
        output.write(page_text + '\n')

In [None]:
# 1. Load your text files
file_paths = ["/kaggle/working/knowledge_base.txt"]
documents = []

for file_path in file_paths:
    # The knowledge base is written as UTF-8, so read it back as UTF-8
    # explicitly instead of relying on the platform's default encoding.
    loader = TextLoader(file_path, encoding="utf-8")
    docs = loader.load()
    documents.extend(docs)

# 2. Define chunking parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,        # You can try 100, 250, 512, etc.
    chunk_overlap=600       # Try 0, 50, 100, etc.
)

# 3. Split the documents
chunks = text_splitter.split_documents(documents)


# 4. Output result
print(f"Total chunks: {len(chunks)}")
print(f"First chunk content:\n{chunks[0].page_content}")

# Optional: Save the chunks to a file (numbered from 1) for manual inspection
with open("chunked_output.txt", "w", encoding='utf-8') as f:
    for i, chunk in enumerate(chunks):
        f.write(f"--- Chunk {i + 1} ---\n")
        f.write(chunk.page_content + "\n\n")

In [None]:
# Keep the chunk texts and their metadata dicts as two index-aligned lists;
# the BM25 retrievers look documents up by position in these lists.
texts = [piece.page_content for piece in chunks]
metadata = [piece.metadata for piece in chunks]
print(len(texts))

In [None]:
# Initialize embedding model
# (the same model embeds the chunks here and the queries in the retrieval functions)
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# Create FAISS vector database
# Embeds every chunk and builds the index in a single call.
# NOTE(review): no distance strategy is passed, so the library default applies —
# presumably L2 distance (lower score = closer); confirm before interpreting scores.
vectordb = FAISS.from_documents(chunks, embedding_model)

# Save FAISS index to disk for later use
vectordb.save_local("faiss_index")

# Check the number of stored documents
print(f"Number of documents in the vector store: {vectordb.index.ntotal}")

In [None]:
# BM25 Indexing
# Plain whitespace tokenisation: case and punctuation are left untouched, and
# queries are tokenised the same way (query.split()) at search time.
tokenized_texts = list(map(str.split, texts))
bm25 = BM25Okapi(tokenized_texts)

def reciprocal_rank_fusion(results_bm25, results_embedding, k=2):
    """Fuse two ranked result lists with Reciprocal Rank Fusion (RRF).

    Both inputs must be ordered best-first. Only the position of a document in
    each list matters; the raw retriever scores are ignored. A document at
    0-based position ``rank`` contributes ``1 / (k + rank + 1)``, and the
    contributions of the same document (matched by identical ``page_content``)
    are summed across the two lists.

    Args:
        results_bm25: Sequence of ``(doc, score)`` pairs from the sparse retriever.
        results_embedding: Sequence of ``(doc, score)`` pairs from the dense retriever.
        k: Non-negative smoothing constant added to every rank. Larger values
            flatten the advantage of top positions, so agreement between the
            two lists counts for more. ``k=0`` gives plain reciprocal rank.

    Returns:
        List of ``(page_content, fused_score)`` tuples sorted by fused score,
        highest first. Ties keep first-seen order (BM25 list before dense list).
    """
    scores = {}

    # Same accumulation for both rankings; the document text is the join key
    for ranking in (results_bm25, results_embedding):
        for rank, (doc, _) in enumerate(ranking):
            doc_id = doc.page_content
            scores[doc_id] = scores.get(doc_id, 0) + 1 / (k + rank + 1)

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)


# Render a retrieved document as a single "Page <n>: <text>" string
def format_response(doc):
    """Return ``doc`` as ``"Page <page>: <content>"``.

    The page label comes from ``doc.metadata['page']`` and falls back to
    ``'Unknown'`` when that key is absent; the content is stripped of leading
    and trailing whitespace.
    """
    page_label = doc.metadata.get('page', 'Unknown')
    body = doc.page_content.strip()
    return f"Page {page_label}: {body}"

In [None]:
# Retrieve function
def retrieve(query, k=3):
    """Hybrid retrieval: fuse the dense (FAISS) and sparse (BM25) top-k hits.

    Args:
        query: Natural-language question.
        k: Number of hits taken from each retriever before fusion.

    Returns:
        List of ``"Page <n>: <text>"`` strings ordered by fused rank, best
        first. It holds between ``k`` and ``2 * k`` entries, depending on how
        many documents the two retrievers have in common.
    """
    query_embedding = embedding_model.embed_query(query)
    # FAISS returns hits best-first. With the default index built in this
    # notebook the score is an L2 distance (lower = closer), so re-sorting by
    # descending score would put the worst hit first and invert the ranks fed
    # to the fusion step. Keep the order FAISS gives us.
    results_embedding = vectordb.similarity_search_with_score_by_vector(query_embedding, k=k)

    print("============Dense Embeddings=============")
    for doc, score in results_embedding:
        print(f"page {doc.metadata.get('page','Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")

    # Score the whole corpus once, then keep the k highest-scoring chunk indices
    bm25_scores = bm25.get_scores(query.split())
    top_indices = sorted(range(len(texts)), key=lambda idx: bm25_scores[idx], reverse=True)[:k]
    # Convert BM25 results to (Document, score) format
    results_bm25_docs = [
        (Document(page_content=texts[idx], metadata=metadata[idx]), bm25_scores[idx])
        for idx in top_indices
    ]

    print("************BM25 Results*************")
    for doc, score in results_bm25_docs:
        print(f"page {doc.metadata.get('page','Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")

    # Create a lookup dictionary {document content -> Document object}
    doc_lookup = {doc.page_content: doc for doc, _ in results_bm25_docs}
    doc_lookup.update({doc.page_content: doc for doc, _ in results_embedding})

    # Fuse results; the fusion step keys documents by their text content
    fused_results = reciprocal_rank_fusion(results_bm25_docs, results_embedding)

    # Format results, ensuring document IDs are mapped back to actual Documents
    return [format_response(doc_lookup[doc_id]) for doc_id, _ in fused_results if doc_id in doc_lookup]

In [None]:
from langchain.schema import Document

def retrieve_dense(query, k=3):
    """Dense-only retrieval: return the ``k`` nearest chunks from the FAISS index.

    Args:
        query: Natural-language question.
        k: Number of chunks to return.

    Returns:
        List of ``"Page <n>: <text>"`` strings, most similar chunk first.
    """
    query_embedding = embedding_model.embed_query(query)
    # FAISS returns hits best-first. With the default index built in this
    # notebook the score is an L2 distance (lower = closer), so sorting by
    # descending score would return the least similar chunk first. Keep the
    # order FAISS gives us.
    results_embedding = vectordb.similarity_search_with_score_by_vector(query_embedding, k=k)

    print("============Dense Embeddings=============")
    for doc, score in results_embedding:
        print(f"page {doc.metadata.get('page','Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")

    # Return just the formatted documents; scores are only printed above
    return [format_response(doc) for doc, _ in results_embedding]

In [None]:
def retrieve_BM25(query, k=3):
    """Sparse-only retrieval: return the ``k`` highest-scoring chunks under BM25.

    The query is tokenised with ``str.split()``, the same way the corpus was
    tokenised when the index was built.

    Args:
        query: Natural-language question.
        k: Number of chunks to return.

    Returns:
        List of ``Document`` objects, highest BM25 score first. Unlike the
        other retrievers in this notebook, the documents are NOT formatted
        into strings.
    """
    # Score the whole corpus once (calling get_scores per index would redo the
    # full scoring pass for every chunk)
    bm25_scores = bm25.get_scores(query.split())

    # Indices of the k best chunks; ties keep corpus order because the sort is stable
    top_indices = sorted(range(len(texts)), key=lambda idx: bm25_scores[idx], reverse=True)[:k]

    # Convert BM25 results to (Document, score) format
    results_bm25_docs = [
        (Document(page_content=texts[idx], metadata=metadata[idx]), bm25_scores[idx])
        for idx in top_indices
    ]

    # Print results
    print("*BM25 Results")
    for doc, score in results_bm25_docs:
        print(f"page {doc.metadata.get('page', 'Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")

    # Return only the Document objects
    return [doc for doc, _ in results_bm25_docs]

In [None]:
# model_name = "tiiuae/Falcon3-3B-Instruct"
# Hugging Face model id of the instruction-tuned LLM used for answer generation
model_name = "Qwen/Qwen2.5-7B-Instruct"
# Load the causal-LM weights; device placement and dtype are left to the library ("auto").
# NOTE(review): trust_remote_code=True permits running code shipped in the model
# repository — confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto", #device_map='cuda'
    torch_dtype="auto",
    trust_remote_code=True,
)
# Tokenizer loaded from the same model id
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
import time
# Wall-clock start for the elapsed-time figure printed near the end of the notebook
start_time = time.time()

In [None]:
# Query example
question = "What is the name of the Neimoidian guard who assists Obi-Wan?"
# Retrieve the top 6 chunks with the dense-only retriever; these become the
# context passages of the prompt.
retrieved_responses = retrieve_dense(question, k=6)

In [None]:
# # Query processing
# question = "What was the cause of the bombing on Cato Neimoidia in Brotherhood?"
# retriever = vectordb.as_retriever(search_kwargs={"k": 10})
# docs = retriever.get_relevant_documents(question)

# # Print results
# for i, doc in enumerate(docs, 1):
#     page_number = doc.metadata.get('page', 'Unknown')
#     # print(f"Document {i} - Page {page_number} - Score: {doc.metadata.get('score', 'N/A')}")
#     print(doc.page_content[:])  # Print first 500 characters of each result
#     print("-" * 80)

In [None]:
# Wrap the loaded model and tokenizer in a text-generation pipeline.
# Decoding is greedy (do_sample=False), only the newly generated text is
# returned (return_full_text=False), and output is capped at 5000 new tokens.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=False,
    max_new_tokens=5000,
    return_full_text=False,
)

In [None]:
# from sumy.parsers.plaintext import PlaintextParser
# from sumy.nlp.tokenizers import Tokenizer
# from sumy.summarizers.lsa import LsaSummarizer
# import textwrap

# def generate_lsa_summary(retrieved_responses, num_summary_sentence=50):
#     # Combine the retrieved responses into one string
#     text = " ".join(retrieved_responses)
    
#     # Initialize LSA summarizer
#     LANGUAGE = "english"
#     parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
#     lsa_summarizer = LsaSummarizer()
    
#     # Generate the summary
#     summary = []
#     for sentence in lsa_summarizer(parser.document, num_summary_sentence):
#         summary.append(str(sentence))
    
#     # Join the summarized sentences and wrap them for better readability
#     summarized_text = " ".join(summary)
#     return textwrap.fill(summarized_text, 100)

# summarized_responses = generate_lsa_summary(retrieved_responses)

In [None]:
def reorder_sorted_responses(sorted_responses):
    """Reorder ranked passages so the strongest sit at the edges of the list.

    Long prompts tend to be attended to best at their start and end, so the
    most relevant passages are placed first and last and the least relevant
    ones end up in the centre.

    Args:
        sorted_responses: Passages ordered from most to least relevant.

    Returns:
        A new list. For ranks ``r0..r5`` the result is
        ``[r0, r2, r4, r5, r3, r1]``. The input is not modified.
    """
    ranked = list(sorted_responses)

    # Even ranks fill the list from the front in descending relevance ...
    front = ranked[::2]
    # ... odd ranks fill it from the back, so relevance rises again towards the end.
    back = ranked[1::2]
    back.reverse()

    return front + back
# Rearrange the retrieved passages into the order used when building the prompt
reordered_responses = reorder_sorted_responses(retrieved_responses)

In [None]:
# ### **Summarized Retrieved Information**:
# {summarized_responses}

# Construct the RAG prompt
# Number whatever passages were retrieved instead of indexing a fixed six, so
# the cell does not raise IndexError when the retriever returns fewer results.
numbered_context = "\n".join(
    f"{position}. {response}"
    for position, response in enumerate(reordered_responses, start=1)
)

prompt = f"""
You are an AI assistant tasked with answering questions based on retrieved knowledge from the book Star Wars Brotherhood.

### **Retrieved Information**:
{numbered_context}

### **Question**:
{question}

### **Instructions**:
- Integrate the key points from all retrieved responses into a **cohesive, well-structured answer**.
- If the responses are **contradictory**, mention the different perspectives.
- If none of the retrieved responses contain relevant information, reply:
  **"I couldn't find a good response to your query in the database."**
"""

In [None]:
# Show each context passage followed by a separator line
for passage in reordered_responses:
    print(passage)
    print("-------")

In [None]:
# Wrap the prompt as a single user turn in chat-message format
# (the model loaded above is Qwen/Qwen2.5-7B-Instruct)
messages = [
    {"role": "user", "content": prompt}
]

# Generate output using the model
output = generator(messages)

# Print formatted response, re-wrapped to 80 columns
print(textwrap.fill(output[0]["generated_text"], width=80))

In [None]:
# Elapsed wall-clock seconds since `start_time` was recorded
end_time = time.time()
time_taken = end_time - start_time
print(time_taken)

**Evaluation Using RAGAS**

In [None]:
# from langchain_openai.chat_models import AzureChatOpenAI
# from langchain_openai.embeddings import AzureOpenAIEmbeddings
# from ragas.llms import LangchainLLMWrapper
# from ragas.embeddings import LangchainEmbeddingsWrapper

# azure_configs = {
#     "base_url": "https://sala-m9pmmei0-eastus2.cognitiveservices.azure.com/",
#     "model_deployment": "my-gpt-deployment",
#     "model_name": "gpt-4o-mini",
# }


# azure_llm = AzureChatOpenAI(
#     openai_api_version="2023-05-15",
#     azure_endpoint=azure_configs["base_url"],
#     azure_deployment=azure_configs["model_deployment"],
#     model=azure_configs["model_name"],
#     openai_api_key="",
#     validate_base_url=False,
# )

In [None]:
# from ragas import EvaluationDataset, evaluate
# from ragas.llms import LangchainLLMWrapper
# from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

# # Initialize query, reference, and RAG model
# query = question
# reference = "The Jedi Padawan accompanying Obi-Wan on this mission is Anakin Skywalker."


# # Retrieve relevant documents and generate response
# relevant_docs = reordered_responses
# response = output[0]["generated_text"]

# # Create the evaluation dataset for a single query
# dataset = [
#     {
#         "user_input": query,
#         "retrieved_contexts": relevant_docs,
#         "response": response,
#         "reference": reference
#     }
# ]
# evaluation_dataset = EvaluationDataset.from_list(dataset)

# # Initialize evaluator and evaluate
# evaluator_llm = LangchainLLMWrapper(azure_llm)
# metrics = [LLMContextRecall(), Faithfulness(), FactualCorrectness()]
# result = evaluate(dataset=evaluation_dataset, metrics=metrics, llm=evaluator_llm)

In [None]:
# print("Query: " ,question)
# print("")
# print("Response: " ,textwrap.fill(response, width=80))
# print("")
# print("Golden Answer: ",textwrap.fill(reference))
# print("")
# print("Result:",result)