# Semantic Chunking

In [1]:
import os
import pickle

import faiss
import numpy as np
import spacy
from openai import OpenAI
from tqdm import tqdm

# Initialize DeepInfra OpenAI Client.
# The API key is read from the DEEPINFRA_API_KEY environment variable so the
# secret is never stored in the notebook. Any key that was previously written
# into this cell should be treated as compromised and rotated.
_deepinfra_api_key = os.environ.get("DEEPINFRA_API_KEY")
if not _deepinfra_api_key:
    raise RuntimeError(
        "DEEPINFRA_API_KEY is not set; export it before running this notebook."
    )

openai = OpenAI(
    api_key=_deepinfra_api_key,
    base_url="https://api.deepinfra.com/v1/openai",
)


def generate_embeddings(chunks, fallback_dim=1024):
    """
    Generates embeddings using DeepInfra's BAAI/bge-m3 model with progress bar.

    One request is sent per chunk. A chunk whose request fails is reported on
    stdout and represented by an all-zero row, so row ``i`` of the result
    always corresponds to ``chunks[i]``.

    Args:
        chunks (list): List of text chunks.
        fallback_dim (int): Width used for the zero rows only when no request
            succeeded (so the real width could not be observed). Defaults to
            1024, the output size of BAAI/bge-m3.
    Returns:
        np.ndarray: float32 array of shape (len(chunks), embedding_dim).
    """
    embeddings = []
    failed_positions = []  # rows to backfill once the embedding width is known
    dimension = None
    for position, chunk in enumerate(tqdm(chunks, desc="Generating Embeddings")):
        try:
            # Generate embeddings
            response = openai.embeddings.create(
                model="BAAI/bge-m3",
                input=chunk,
                encoding_format="float"
            )
            vector = response.data[0].embedding
            if dimension is None:
                # Learn the width from the service instead of hard-coding it
                dimension = len(vector)
            embeddings.append(vector)
        except Exception as e:
            # Best effort: keep going so one bad chunk does not lose the run
            print(f"Error generating embedding for chunk: {chunk[:30]}... - {e}")
            embeddings.append(None)  # placeholder, replaced below
            failed_positions.append(position)

    if dimension is None:
        dimension = fallback_dim

    # Zero rows must match the real embedding width, otherwise the rows are
    # ragged and cannot be packed into a single float32 matrix.
    zero_vector = [0.0] * dimension
    for position in failed_positions:
        embeddings[position] = zero_vector

    # reshape keeps the result 2-D even when there are no chunks at all
    return np.array(embeddings, dtype=np.float32).reshape(len(embeddings), dimension)


def process_and_store_faiss(chunks, index_path, metadata_path):
    """
    Embeds the text chunks, indexes the vectors with FAISS, and saves the result.

    Args:
        chunks (list): Text chunks to embed; also stored as the metadata.
        index_path: Path the FAISS index is written to.
        metadata_path: Path the pickled chunks are written to.
    """
    vectors = generate_embeddings(chunks)

    # Flat L2-distance index sized to the width of the embedding matrix
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)

    # Persist the index together with the chunks it was built from
    save_faiss_index(flat_index, index_path, metadata_path, chunks)


def save_faiss_index(index, index_path, metadata_path, metadata):
    """
    Writes the FAISS index and its metadata to local files.

    Args:
        index: FAISS index to serialize.
        index_path: Destination path for the index file.
        metadata_path: Destination path for the pickled metadata.
        metadata: Object pickled alongside the index.
    """
    # Index first, then metadata, then the confirmation message
    faiss.write_index(index, index_path)

    with open(metadata_path, 'wb') as metadata_file:
        pickle.dump(metadata, metadata_file)

    print("FAISS index and metadata saved locally.")


def load_faiss_index(index_path, metadata_path):
    """
    Reads a FAISS index and its pickled metadata back from local files.

    Args:
        index_path: Path of the saved FAISS index.
        metadata_path: Path of the pickled metadata.
    Returns:
        tuple: (index, metadata)
    """
    stored_index = faiss.read_index(index_path)

    # NOTE: unpickling can execute arbitrary code; only load metadata files
    # that come from a trusted source.
    with open(metadata_path, 'rb') as metadata_file:
        return stored_index, pickle.load(metadata_file)

In [6]:

# Load the pre-trained spaCy pipeline; semantic_chunking uses it to split text into sentences.
# The model has to be installed once before this cell can run:
# !python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")  # Swap in 'en_core_web_lg' for a larger, more accurate model

# Read the entire source document into memory as one UTF-8 string
with open('istanbul_places_content.txt', 'r', encoding='utf-8') as file:
    text = file.read()

def semantic_chunking(text, max_length=500, overlap=50):
    """
    Splits text into sentence-aligned chunks with word-level overlap.

    Sentences (as segmented by the module-level spaCy pipeline ``nlp``) are
    appended to the current chunk while the chunk stays within ``max_length``
    characters. When the next sentence does not fit, the chunk is emitted and
    a new one is started from the last ``overlap`` words of the emitted chunk
    followed by that sentence.

    Args:
        text (str): Text to split.
        max_length (int): Character budget checked before adding a sentence.
            A chunk can still exceed it when a single sentence, or the
            overlap plus a sentence, is longer than the budget.
        overlap (int): Number of trailing words carried into the next chunk.
            Zero or a negative value disables the overlap.
    Returns:
        list: Non-empty chunk strings, stripped of surrounding whitespace.
    """
    # Process the text with spaCy
    doc = nlp(text)

    chunks = []
    current_chunk = ""
    for sent in doc.sents:  # Iterate through sentences
        # Add the sentence if it fits in the current chunk
        if len(current_chunk) + len(sent.text) <= max_length:
            current_chunk += sent.text + " "
            continue

        # The sentence does not fit: emit the chunk built so far. It is empty
        # when the very first sentence is already over budget, and an empty
        # chunk must not be emitted.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)

        # words[-0:] would select every word, so zero overlap needs its own branch
        tail_words = finished.split()[-overlap:] if overlap > 0 else []

        # Keep the trailing space so the next sentence is not glued to this one
        current_chunk = " ".join(tail_words + [sent.text]) + " "

    # Append the last chunk
    finished = current_chunk.strip()
    if finished:
        chunks.append(finished)

    return chunks

# Output locations for the FAISS index and the pickled chunk metadata
index_path = "faiss_index.index"
metadata_path = "metadata.pkl"

# Split the document into overlapping, sentence-aligned chunks
chunks = semantic_chunking(text)

# Embed every chunk, build the index, and write both files to disk
process_and_store_faiss(
    chunks=chunks,
    index_path=index_path,
    metadata_path=metadata_path,
)


Generating Embeddings: 100%|██████████| 4997/4997 [40:07<00:00,  2.08it/s]  


FAISS index and metadata saved locally.
