In [1]:
# Import necessary libraries
from datasets import load_dataset
import faiss
import numpy as np
import pickle
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util
import openai
from openai import OpenAI
from tqdm import tqdm
from rank_bm25 import BM25Okapi
import torch.nn.functional as F
from eval_metrics import evaluate_single_input
import os 
# Credentials are taken from the environment when they are already set; the
# placeholder strings are only fallbacks and must be replaced (or the
# variables exported) before the notebook can talk to the remote services.
# setdefault keeps a real HF_TOKEN from being overwritten by the placeholder.
os.environ.setdefault('HF_TOKEN', "YOUR_HF_TOKEN")

# NOTE: this rebinds the name `openai` from the imported module to a client
# instance; the cells below call `openai.embeddings` / `openai.chat` on it.
openai = OpenAI(
    # DEEPINFRA_API_KEY is looked up first so the key need not live in the notebook.
    api_key=os.environ.get("DEEPINFRA_API_KEY", "YOUR_API_KEY"),
    base_url="https://api.deepinfra.com/v1/openai",
)
# Chat model used by every LLM call in this notebook
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"

In [3]:
def load_faiss_index(index_path, metadata_path):
    """
    Read a FAISS index and its companion metadata object from disk.

    Args:
        index_path: Path handed to ``faiss.read_index``.
        metadata_path: Path of the pickle file holding the metadata.

    Returns:
        A ``(index, metadata)`` tuple.
    """
    faiss_index = faiss.read_index(index_path)

    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # metadata files from a trusted source.
    with open(metadata_path, 'rb') as handle:
        chunk_metadata = pickle.load(handle)

    return faiss_index, chunk_metadata

In [4]:
def search_faiss(query, top_k, index_path, metadata_path):
    """
    Searches FAISS for the most relevant chunks to the query.

    Args:
        query: Text to embed and search for.
        top_k: Maximum number of neighbours requested from the index.
        index_path: Path of the FAISS index file.
        metadata_path: Path of the pickled metadata, indexed by FAISS row id.

    Returns:
        A list of ``(metadata_entry, distance)`` tuples in the order FAISS
        returned them. It may hold fewer than ``top_k`` items when the index
        cannot supply that many neighbours.
    """
    # Load index and metadata
    index, metadata = load_faiss_index(index_path, metadata_path)

    # Generate embedding for the query
    response = openai.embeddings.create(
        model="BAAI/bge-m3",
        input=query,
        encoding_format="float"
    )
    # FAISS works on float32 matrices of shape (n_queries, dim); building the
    # array from Python floats without a dtype would yield float64.
    query_embedding = np.array([response.data[0].embedding], dtype="float32")

    # Search FAISS index for the top_k results
    distances, indices = index.search(query_embedding, top_k)

    # FAISS pads the result with index -1 when fewer than top_k neighbours
    # exist; skip those slots so metadata[-1] is never returned as a hit.
    results = [
        (metadata[idx], dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]
    return results

In [5]:
# Fetch the Istanbul QA dataset through `datasets.load_dataset`
dataset = load_dataset(
    path='BlackFear/istanbul-qa-dataset',
    trust_remote_code=True,
)

# Only the 'test' split is used for evaluation
data = dataset['test']

# Parallel columns: the i-th question is graded against the i-th reference
queries, references = data['question'], data['reference']


In [6]:
def generate_answer(query, context):
    """
    Ask the chat model to answer ``query`` using ``context``.

    Args:
        query: The question to answer.
        context: Retrieved (and possibly summarized) text placed in the prompt.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    # User turn: context first, then the question, then an answer cue
    user_prompt = f"""
    Use the following contexts to answer the question:

    Context:
    {context}

    Question:
    {query}

    Answer:
    """

    conversation = [
        {"role": "system", "content": "You will be answering questions about Istanbul. Please provide the answer to the following question."},
        {"role": "user", "content": user_prompt},
    ]

    # Single chat completion against the model configured in `model_name`
    completion = openai.chat.completions.create(model=model_name, messages=conversation)
    reply = completion.choices[0].message.content
    return reply.strip()

In [7]:
def load_bge():
    """
    Build the SentenceTransformer used by the BGE reranking step.

    Returns:
        A ``SentenceTransformer`` created from the
        "BAAI/bge-small-en-v1.5" checkpoint name.
    """
    return SentenceTransformer("BAAI/bge-small-en-v1.5")

In [8]:
def load_tildev2():
    """
    Build the tokenizer/model pair used by the TildeV2 reranking step.

    Returns:
        A ``(tokenizer, model)`` tuple, both created from the
        "ielab/TILDEv2-TILDE128-exp" checkpoint name.
    """
    checkpoint = "ielab/TILDEv2-TILDE128-exp"
    return AutoTokenizer.from_pretrained(checkpoint), AutoModel.from_pretrained(checkpoint)

In [9]:
def hyde(query):
    """
    Ask the chat model to write a hypothetical paragraph for ``query``.

    Only the generated paragraph is returned; joining it with the original
    query (the Query2Doc variant) is left to the caller.

    Args:
        query: The question the paragraph should be written for.

    Returns:
        The generated paragraph with surrounding whitespace stripped.
    """
    # Instruction asking for a passage about the query rather than an answer
    prompt = f"""
    Generate a hypothetical paragraph based on the following query.
    Just give the created paragraph, no need to answer the question.
    Query: {query}

    Hypothetical Paragraph:
    """
    completion = openai.chat.completions.create(
        model=model_name,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return completion.choices[0].message.content.strip()

In [10]:
def hybrid_search(query, top_k, index_path, metadata_path):
    """
    Combines dense (FAISS) and sparse (BM25) retrieval for better results.

    The ``top_k`` candidates come from the FAISS index only; BM25 scores
    computed over the whole metadata are then used to re-order those
    candidates with a 50/50 weighting.

    Args:
        query: Text to search for.
        top_k: Maximum number of neighbours requested from the index.
        index_path: Path of the FAISS index file.
        metadata_path: Path of the pickled metadata. Each entry must be a
            string, because it is whitespace-tokenized for BM25.

    Returns:
        A list of ``(metadata_entry, combined_score)`` tuples sorted by
        descending combined score, holding at most ``top_k`` items.
    """
    # Load FAISS index and metadata
    index, metadata = load_faiss_index(index_path, metadata_path)

    # Dense Retrieval (FAISS)
    response = openai.embeddings.create(
        model="BAAI/bge-m3",  # Dense embedding model
        input=query,
        encoding_format="float"
    )
    # FAISS works on float32 matrices of shape (n_queries, dim).
    query_embedding = np.array([response.data[0].embedding], dtype="float32")
    distances, indices = index.search(query_embedding, top_k)

    # FAISS pads with index -1 when fewer than top_k neighbours exist; drop
    # those slots so they cannot alias the last metadata entry.
    valid = indices[0] >= 0
    hit_ids = indices[0][valid]
    hit_distances = distances[0][valid]

    # Sparse Retrieval (BM25) over the whole corpus
    tokenized_metadata = [doc.split() for doc in metadata]
    bm25 = BM25Okapi(tokenized_metadata)
    sparse_scores = np.asarray(bm25.get_scores(query.split()), dtype=float)

    # Convert FAISS distances to a similarity in (0, 1].
    # NOTE(review): this assumes non-negative (L2-style) distances; confirm
    # the index metric.
    dense_scores = 1 / (1 + hit_distances)

    # Normalize BM25 scores by their maximum. If no query token matches any
    # document the maximum is 0, which would turn every score into NaN, so
    # fall back to an all-zero sparse contribution (dense-only ranking).
    max_sparse = np.max(sparse_scores) if sparse_scores.size else 0.0
    if max_sparse > 0:
        sparse_scores = sparse_scores / max_sparse
    else:
        sparse_scores = np.zeros_like(sparse_scores)

    # Combine dense and sparse scores (50-50 weight)
    combined_scores = 0.5 * dense_scores + 0.5 * sparse_scores[hit_ids]

    # Sort results based on combined scores, best first
    sorted_indices = np.argsort(-combined_scores)
    return [(metadata[hit_ids[i]], combined_scores[i]) for i in sorted_indices]

In [11]:
def bge_rerank(query, results, model):
    """
    Re-order ``results`` by cosine similarity between query and document embeddings.

    Args:
        query: Query text.
        results: Sequence of tuples whose first element is the document text.
        model: Object exposing ``encode(..., convert_to_tensor=True)``.

    Returns:
        A list of ``(document, similarity)`` tuples, highest similarity first.
    """
    documents = [entry[0] for entry in results]

    # Embed the query and then all candidate documents
    query_vector = model.encode(query, convert_to_tensor=True)
    document_vectors = model.encode(documents, convert_to_tensor=True)

    # Row 0 holds the similarity of the query to each document
    similarities = util.pytorch_cos_sim(query_vector, document_vectors)[0].tolist()

    ranked = list(zip(documents, similarities))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [12]:
def tildev2_rerank(query, results, tokenizer, model):
    """
    Reranks results using TildeV2 embeddings and cosine similarity.

    The query and each document are encoded separately. Each embedding is the
    mean of ``last_hidden_state`` over dimension 1 (presumably the token
    axis), and documents are ordered by cosine similarity to the query.

    Args:
        query: Query text.
        results: Sequence of tuples whose first element is the document text.
        tokenizer: Callable returning keyword inputs for ``model``.
        model: Callable returning an object with ``last_hidden_state``.

    Returns:
        A list of ``(document, score)`` tuples, highest score first.
    """
    # Function-local import: only torch.nn.functional is imported at the top
    # of the notebook, so the bare `torch` name is brought in here.
    import torch

    reranked = []

    # This is pure inference. Without no_grad every forward pass would record
    # an autograd graph that is never used, costing memory and time.
    with torch.no_grad():
        # Process the query
        query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
        query_outputs = model(**query_inputs)
        query_embedding = query_outputs.last_hidden_state.mean(dim=1)

        for doc in results:
            # Process each document
            doc_inputs = tokenizer(doc[0], return_tensors="pt", padding=True, truncation=True)
            doc_outputs = model(**doc_inputs)
            doc_embedding = doc_outputs.last_hidden_state.mean(dim=1)

            # Calculate cosine similarity
            score = F.cosine_similarity(query_embedding, doc_embedding).item()
            reranked.append((doc[0], score))

    # Sort by scores in descending order
    reranked.sort(key=lambda x: x[1], reverse=True)
    return reranked

In [13]:
def repack_results(results, method='forward'):
    """
    Repack results based on specified method: 'forward', 'reverse', or 'side'.

    Args:
        results: Sequence of tuples whose first element is the chunk text,
            ordered from most to least relevant.
        method: 'forward' keeps the given order, 'reverse' flips it (most
            relevant chunk last), and 'side' interleaves the first half with
            the second half (1st, then first of the lower half, 2nd, ...).

    Returns:
        A single string with one "Chunk N: <text>" entry per result, with
        entries separated by a blank line. Empty input yields "".

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == 'forward':
        ordered = list(results)

    elif method == 'reverse':
        # Least relevant first, so the most relevant chunk ends up last
        ordered = list(results)[::-1]

    elif method == 'side':
        # Round the split point up so that, for an odd number of results, the
        # extra element stays in the high-rank half instead of being dropped.
        # Flooring lost the last chunk for odd lengths and produced an empty
        # context for a single result.
        mid = (len(results) + 1) // 2
        ordered = []
        for i in range(mid):
            ordered.append(results[i])  # High rank
            if i + mid < len(results):
                ordered.append(results[i + mid])  # Low rank

    else:
        raise ValueError("Invalid repack method. Choose 'forward', 'reverse', or 'side'.")

    return "\n\n".join(f"Chunk {i+1}: {item[0]}" for i, item in enumerate(ordered))

In [14]:
def summarize_context(query, context):
    """
    Condense ``context`` with the chat model while keeping it relevant to ``query``.

    Args:
        query: The query the summary should stay relevant to.
        context: The retrieved text to condense.

    Returns:
        The model's summary with surrounding whitespace stripped.
    """
    prompt = f"""
    Summarize the following context to make it concise yet representative of the main ideas, keeping it relevant to the query.
    Just give the summarized content.

    Query: {query}

    Context:
    {context}

    Summary:
    """
    # One single-turn chat completion against the model in `model_name`
    completion = openai.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}]
    )
    return completion.choices[0].message.content.strip()

In [15]:
# Rerankers already loaded in this session, keyed by rerank method, so that
# repeated rag_pipeline calls do not reload model weights for every query.
_RERANKER_CACHE = {}


def _get_reranker(rerank_method):
    """Return the loader output for ``rerank_method`` (loaded once), or None if unsupported."""
    if rerank_method not in _RERANKER_CACHE:
        if rerank_method == 'bge':
            _RERANKER_CACHE[rerank_method] = load_bge()
        elif rerank_method == 'tildev2':
            _RERANKER_CACHE[rerank_method] = load_tildev2()
        else:
            return None
    return _RERANKER_CACHE[rerank_method]


def rag_pipeline(query, 
                 top_k=5, 
                 method="hyde", 
                 retrive_method='hybrid', 
                 rerank_method='monot5', 
                 repack_method='forward',
                 summarize=True,
                 index_path='index.faiss', 
                 metadata_path='metadata.pkl'):
    """
    Executes the RAG pipeline with retrieval, reranking, repacking, and summarization options.

    Args:
        query: The user's question.
        top_k: Number of chunks requested from retrieval.
        method: Query expansion. 'hyde' retrieves with a generated paragraph,
            'query2doc' retrieves with the question followed by that
            paragraph, and any other value uses the question unchanged.
        retrive_method: 'hybrid' uses ``hybrid_search``; any other value uses
            ``search_faiss``.
        rerank_method: 'bge' or 'tildev2'. Any other value, including the
            default 'monot5', performs no reranking.
        repack_method: Passed to ``repack_results``.
        summarize: When true, the repacked context is summarized first.
        index_path: Path of the FAISS index file.
        metadata_path: Path of the pickled metadata file.

    Returns:
        An ``(answer, context)`` tuple, where ``context`` is the text that
        was given to the answering model.
    """
    reranker = _get_reranker(rerank_method)

    # The expansion is a retrieval aid only. The user's question is kept in
    # `query` so that summarization and answer generation address the
    # question itself rather than the generated hypothetical paragraph.
    retrieval_query = query
    if method == 'query2doc':
        retrieval_query = query + " " + hyde(query)
    elif method == 'hyde':
        retrieval_query = hyde(query)

    # Retrieve initial results
    if retrive_method == 'hybrid':
        results = hybrid_search(retrieval_query, top_k, index_path, metadata_path)
    else:
        results = search_faiss(retrieval_query, top_k, index_path, metadata_path)

    # Apply reranking.
    # NOTE(review): reranking still scores against the expanded query, as
    # before; confirm whether the original question is the intended signal.
    if rerank_method == 'bge':
        results = bge_rerank(retrieval_query, results, reranker)
    elif rerank_method == 'tildev2':
        tokenizer, model = reranker
        results = tildev2_rerank(retrieval_query, results, tokenizer, model)

    # Apply repacking after reranking
    context = repack_results(results, repack_method)

    # Summarization step, focused on the user's question
    if summarize:
        context = summarize_context(query, context)

    # Generate an answer to the user's question
    answer = generate_answer(query, context)

    return answer, context

In [None]:
# Run the pipeline on every test question and score each answer with
# evaluate_single_input; one score object is collected per question.
scores = []
for i, (query, reference) in enumerate(
    tqdm(zip(queries, references), desc="Processing queries", total=len(queries))
):
    answer, context = rag_pipeline(
        query,
        top_k=8,
        method='',
        retrive_method='',
        rerank_method='tildev2',
        repack_method='forward',
        summarize=True,
        index_path='/kaggle/input/database-recursive/faiss_index_semantic.index',
        metadata_path='/kaggle/input/database-recursive/metadata_semantic.pkl',
    )
    score = evaluate_single_input(query, reference, answer, context)
    scores.append(score)


In [None]:
# Mean of each metric over all evaluated queries: accumulate the per-query
# values first, then divide every total by the number of scored queries.
faithfullness = relevance = factual = overall = 0
for score in scores:
    faithfullness += score['Faithfulness']
    relevance += score['Relevancy']
    factual += score['Factual Correctness']
    overall += score['Overall Score']

n_scored = len(scores)
faithfullness, relevance, factual, overall = (
    total / n_scored for total in (faithfullness, relevance, factual, overall)
)

In [None]:
# Show the four averages on one line: faithfulness, relevancy, factual correctness, overall
print(faithfullness, relevance, factual, overall, sep=", ")