In [1]:
import spacy
import faiss
import numpy as np
import pickle
import openai
from openai import OpenAI

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Initialize DeepInfra OpenAI Client
openai = OpenAI(
    api_key="YOUR_API_KEY",  # Replace with your key
    base_url="https://api.deepinfra.com/v1/openai",
)

In [2]:
def chunk_text(text, max_chunk_size=200, min_chunk_size=100):
    # Step 1: Use SpaCy for sentence splitting
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]

    # Step 2: Group sentences into chunks
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        # If adding the sentence exceeds max size, save the current chunk
        if len(current_chunk) + len(sentence) > max_chunk_size:
            # If current_chunk is too small, add more until it meets min size
            if len(current_chunk) < min_chunk_size:
                current_chunk += " " + sentence
            else:
                # Save the chunk and start a new one
                chunks.append(current_chunk.strip())
                current_chunk = sentence
        else:
            current_chunk += " " + sentence

    # Add the last chunk
    if current_chunk:
        chunks.append(current_chunk.strip())

    # Step 3: Handle very large sentences by word splitting
    final_chunks = []
    for chunk in chunks:
        if len(chunk) > max_chunk_size:
            words = chunk.split()  # Split by words
            temp_chunk = ""
            for word in words:
                if len(temp_chunk) + len(word) + 1 > max_chunk_size:
                    final_chunks.append(temp_chunk.strip())
                    temp_chunk = word
                else:
                    temp_chunk += " " + word
            if temp_chunk:
                final_chunks.append(temp_chunk.strip())
        else:
            final_chunks.append(chunk)

    return final_chunks

In [3]:
def save_faiss_index(index, index_path, metadata_path, metadata):
    # Save index and metadata
    faiss.write_index(index, index_path)
    with open(metadata_path, 'wb') as f:
        pickle.dump(metadata, f)
    print("FAISS index and metadata saved locally.")


def load_faiss_index(index_path, metadata_path):
    # Load index and metadata
    index = faiss.read_index(index_path)
    with open(metadata_path, 'rb') as f:
        metadata = pickle.load(f)
    return index, metadata

In [4]:
def generate_embeddings(chunks):
    # Generate embeddings for each chunk with BgeM3 model
    embeddings = []
    for chunk in chunks:
        response = openai.embeddings.create(
            model="BAAI/bge-m3",
            input=chunk,
            encoding_format="float"
        )
        embeddings.append(response.data[0].embedding)
    return np.array(embeddings)

In [5]:
def process_and_store_faiss(chunks, index_path='faiss_index.bin', metadata_path='metadata.pkl'):
    # Generate embeddings
    embeddings = generate_embeddings(chunks)

    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)  # Add embeddings to FAISS index

    # Save metadata and index
    save_faiss_index(index, index_path, metadata_path, chunks)

In [6]:
def search_faiss(query, top_k=3, index_path='faiss_index.bin', metadata_path='metadata.pkl'):
    # Load FAISS index and metadata
    index, metadata = load_faiss_index(index_path, metadata_path)

    # Generate embedding for query
    response = openai.embeddings.create(
        model="BAAI/bge-m3",
        input=query,
        encoding_format="float"
    )
    query_embedding = np.array([response.data[0].embedding])

    # Search FAISS index
    distances, indices = index.search(query_embedding, top_k)

    # Print results
    print("\nSearch Results:")
    for i, idx in enumerate(indices[0]):
        print(f"Rank {i+1}: {metadata[idx]} (Distance: {distances[0][i]})")

In [7]:
if __name__ == "__main__":
    # Load text file
    with open('knowledge_base/istanbul_places_content.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # Set chunk size
    max_chunk_size = 500
    min_chunk_size = 100

    # Perform recursive chunking
    chunks = chunk_text(text, max_chunk_size, min_chunk_size)
    print(f"Total Chunks Created: {len(chunks)}")

    # Process chunks and store in FAISS
    process_and_store_faiss(chunks)


Total Chunks Created: 2002
FAISS index and metadata saved locally.


In [11]:
 # Perform a test search
query = "Fethiye Mosque"
search_faiss(query, top_k=3, index_path='faiss_index_recursive.bin', metadata_path='metadata_recursive.pkl')


Search Results:
Rank 1: Ticket Price: 3€ (Euro) State: Closed Official Announcements: Fethiye (Pammakaristos Church) Mosque/Museum: The Monastery of the Theotokos Pammakaristos (Mother of God the All-Blessed), now Fethiye Mosque, was located on the fifth hill of Constantinople, in the modern neighborhood of Çarsamba. It is located southeast of Chora and Blachernai churches. (Distance: 0.7436721324920654)
Rank 2: A document of the second half of the 16th century describes a number of tombs and relics there, including Alexios Komnenos. In 1587, it was converted into a mosque, after which it was significantly altered. It was converted into a mosque around 1587 during the reign of Murad III. It was called Fethiye (“Conquest”) Mosque, in commemoration of the Ottoman conquest of Georgia. Sinan Pasha, then the Grand Vizier, established its madrasa. (Distance: 0.7661095857620239)
Rank 3: 124-125
Points from Turkey
The Fatih Mosque (Turkish: Fatih Camii, "Conqueror's Mosque" in English) is an 