In [None]:
from langchain.docstore.document import Document
from datasets import load_dataset
from typing import List
from rank_bm25 import BM25Okapi
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS

In [2]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()


True

In [3]:
def encode_pdf_and_get_split_documents(chunk_size=1000, chunk_overlap=120):
    """
    Encodes a PDF book or dataset into a FAISS vector store.
    Returns: vectorstore, texts (list of Document)
    """
    DB_FAISS_PATH = "faiss_index_fusion"

    print("Veriseti indiriliyor: 'neural-bridge/rag-dataset-12000'...")
    dataset = load_dataset("neural-bridge/rag-dataset-12000", split="train")
    print("Veriseti başarıyla indirildi.")

    # Dokümanları oluştur
    documents = [Document(page_content=item["context"]) for item in dataset]

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    texts = text_splitter.split_documents(documents)

    # Embeddings
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # FAISS index varsa yükle
    if os.path.exists(DB_FAISS_PATH):
        print("Mevcut FAISS veritabanı yükleniyor...")
        vectorstore = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
        return vectorstore, texts

    # Yeni FAISS index oluştur
    vectorstore = FAISS.from_documents(texts, embeddings)
    vectorstore.save_local(DB_FAISS_PATH)
    return vectorstore, texts

In [4]:
def create_bm25_index(documents: List[Document]) -> BM25Okapi:
    tokenized_docs = [doc.page_content.split() for doc in documents]  # basit tokenize
    bm25 = BM25Okapi(tokenized_docs)
    bm25.corpus_docs = documents  # fusion sırasında alignment için
    return bm25

In [13]:
def fusion_retrieval(vectorstore, bm25, query: str, k: int = 5, alpha: float = 0.5, top_k_vector: int = 100):
    """
    Returns top-k documents along with BM25 and vector scores.
    """
    epsilon = 1e-8

    query_tokens = query.split()
    bm25_scores = bm25.get_scores(query_tokens)
    all_docs = list(bm25.corpus_docs)

    # Vector search
    vector_results = vectorstore.similarity_search_with_score(query, k=min(top_k_vector, len(all_docs)))
    vector_scores_dict = {doc.page_content: score for doc, score in vector_results}

    # Normalize BM25
    bm25_scores_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores) + epsilon)

    # Normalize vector (FAISS cosine similarity, yüksek = iyi)
    vector_scores = np.array([vector_scores_dict.get(doc.page_content, 0) for doc in all_docs])
    vector_scores_norm = (vector_scores - np.min(vector_scores)) / (np.max(vector_scores) - np.min(vector_scores) + epsilon)

    # Fusion
    combined_scores = alpha * vector_scores_norm + (1 - alpha) * bm25_scores_norm
    sorted_indices = np.argsort(combined_scores)[::-1]

    # Top k doküman ve skorları
    top_docs = []
    for i in sorted_indices[:k]:
        doc = all_docs[i]
        top_docs.append({
            "document": doc,
            "bm25_score": bm25_scores[i],
            "vector_score": vector_scores[i],
            "combined_score": combined_scores[i]
        })

    return top_docs


In [14]:
vectorstore, texts = encode_pdf_and_get_split_documents()
bm25 = create_bm25_index(texts)

Veriseti indiriliyor: 'neural-bridge/rag-dataset-12000'...
Veriseti başarıyla indirildi.
Mevcut FAISS veritabanı yükleniyor...


In [15]:
query = "What was the first job Edwin Koo landed in his professional career?"
top_docs = fusion_retrieval(vectorstore, bm25, query, k=5, alpha=0.5)

for i, d in enumerate(top_docs, 1):
    print(f"--- Document {i} ---")
    print(f"BM25 score: {d['bm25_score']:.4f}")
    print(f"Vector score: {d['vector_score']:.4f}")
    print(f"Combined score: {d['combined_score']:.4f}")
    print(f"Content: {d['document'].page_content[:200]}...\n")  # ilk 200 karakter


--- Document 1 ---
BM25 score: 38.7288
Vector score: 0.6416
Combined score: 0.8731
Content: Edwin became a professional photographer quite by accident in 2003. Trained in journalism, he always thought he would become a writer, but fate landed him his first job as a photojournalist in local n...

--- Document 2 ---
BM25 score: 26.9677
Vector score: 0.7760
Combined score: 0.7994
Content: Today, Edwin has a commercial portfolio but continues to pursue his documentary projects, especially long term ones that demonstrate an artistic vision on the world. Early this year, he published his ...

--- Document 3 ---
BM25 score: 22.2206
Vector score: 0.8554
Combined score: 0.7843
Content: GS: What was your role in your first job at Eastern Kentucky?
JS: I was a graduate assistant at Eastern Kentucky. I worked a lot in the weight room and then I also assisted with the defensive line ini...

--- Document 4 ---
BM25 score: 20.8394
Vector score: 0.8223
Combined score: 0.7473
Content: It was in 2008 th