In [1]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import os
from pathlib import Path
from tqdm import tqdm

def load_files(summaries_path, metadata_path):
    """Load the summaries and metadata JSON files.

    Both files are read as UTF-8 explicitly so that non-ASCII text is decoded
    the same way regardless of the platform's default locale encoding.

    Args:
        summaries_path: Path (str or path-like) of the summaries JSON file.
        metadata_path: Path (str or path-like) of the metadata JSON file.

    Returns:
        A ``(summaries, metadata)`` tuple holding the parsed JSON documents.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    with open(summaries_path, 'r', encoding='utf-8') as f:
        summaries = json.load(f)
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    return summaries, metadata

def _extract_summary(summary_obj):
    """Return the summary text held by one entry of the summaries file.

    A plain string is returned unchanged (iterating it would yield only its
    first character). For any other iterable the first item is returned, and
    ``None`` or an empty container yields an empty string instead of raising.
    """
    if summary_obj is None:
        return ''
    if isinstance(summary_obj, str):
        return summary_obj
    return next(iter(summary_obj), '')


def prepare_texts(summaries, metadata):
    """Prepare texts for embedding by combining summaries with key metadata.

    Summaries and metadata records are paired by position with ``zip``, so if
    the two sequences differ in length the surplus entries of the longer one
    are ignored. Each text is the summary, followed by ``"Rule <no>"`` when
    the record has a truthy ``'Rule no.'``, followed by the record's
    ``'Description'``; empty parts are dropped. Pairs whose combined text is
    blank are skipped entirely.

    Args:
        summaries: Sized sequence of summary entries. Each entry is either a
            string or an iterable whose first item is the summary text.
        metadata: Iterable of dict records aligned with ``summaries``.

    Returns:
        A ``(texts, processed_metadata)`` tuple of equal-length lists.
        ``processed_metadata[i]`` is a shallow copy of the source record with
        an added ``'summary'`` key, and describes ``texts[i]``.
    """
    texts = []
    processed_metadata = []

    pairs = tqdm(zip(summaries, metadata),
                 total=len(summaries),
                 desc="Preparing texts")
    for summary_obj, meta in pairs:
        summary = _extract_summary(summary_obj)

        # Enrich the summary with the rule number and description so that
        # queries mentioning either can match the embedded text.
        rule_no = meta.get('Rule no.')
        text_parts = [
            summary,
            f"Rule {rule_no}" if rule_no else '',
            meta.get('Description', ''),
        ]
        text = ' '.join(filter(None, text_parts))

        if text.strip():
            texts.append(text)
            # Copy so the caller's record is not mutated, then attach the
            # summary text alongside the original fields.
            meta_entry = meta.copy()
            meta_entry['summary'] = summary
            processed_metadata.append(meta_entry)

    return texts, processed_metadata

def create_embeddings(texts, model_name='bert-base-multilingual-cased', batch_size=16, device='cpu'):
    """Create embeddings for ``texts`` with a SentenceTransformer model.

    Note: when ``model_name`` is not a packaged sentence-transformers model
    (as with the default mBERT checkpoint), the library reports that it is
    creating a new model with mean pooling over the token embeddings.

    Args:
        texts: List of strings to embed.
        model_name: Model name or path passed to ``SentenceTransformer``.
        batch_size: Number of texts encoded per batch. The default is kept
            small because mBERT is a comparatively large model.
        device: Device string passed to ``SentenceTransformer`` (for example
            ``'cpu'`` or ``'cuda'``). Defaults to ``'cpu'``.

    Returns:
        The value returned by ``model.encode`` for ``texts``.
    """
    print(f"Using model: {model_name}")
    model = SentenceTransformer(model_name, device=device)

    print("Generating embeddings...")
    embeddings = model.encode(
        texts,
        show_progress_bar=True,
        batch_size=batch_size,
    )
    return embeddings

def create_faiss_index(embeddings, save_dir):
    """Create a FAISS ``IndexFlatL2`` from ``embeddings`` and save it to disk.

    Args:
        embeddings: 2-D array-like of shape ``(n_vectors, dimension)``.
        save_dir: Directory in which ``embeddings.faiss`` is written; it is
            created if it does not exist.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example
            the result of encoding an empty list of texts).
    """
    # FAISS expects float32 data in C-contiguous layout; this converts only
    # when needed and also accepts plain nested lists.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension), "
            f"got shape {vectors.shape}"
        )

    os.makedirs(save_dir, exist_ok=True)

    dimension = vectors.shape[1]
    print(f"Creating FAISS index with dimension {dimension}")

    # Create FlatL2 index
    print("Creating FlatL2 index...")
    index = faiss.IndexFlatL2(dimension)

    # Add vectors in batches so the progress bar has something to report.
    print("Adding vectors to index...")
    batch_size = 1000
    for start in tqdm(range(0, len(vectors), batch_size), desc="Adding to FAISS"):
        index.add(vectors[start:start + batch_size])

    # Save the index
    index_path = os.path.join(save_dir, 'embeddings.faiss')
    faiss.write_index(index, index_path)
    print(f"Index saved to {index_path}")
    return index

def save_metadata(metadata, save_dir):
    """Save processed metadata as ``metadata.json`` inside ``save_dir``.

    Args:
        metadata: JSON-serializable object (here, the list of metadata
            records aligned with the embedded texts).
        save_dir: Target directory; it is created if it does not exist so
            this function does not depend on the index having been written
            first.

    Raises:
        OSError: If the directory or file cannot be created.
        TypeError: If ``metadata`` contains values ``json`` cannot serialize.
    """
    os.makedirs(save_dir, exist_ok=True)
    metadata_path = os.path.join(save_dir, 'metadata.json')
    with tqdm(total=1, desc="Saving metadata") as pbar:
        # json.dump escapes non-ASCII characters by default, so the bytes on
        # disk are the same as before; the encoding is pinned for clarity.
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)
        pbar.update(1)
    print(f"Metadata saved to {metadata_path}")

def main():
    """Run the full pipeline: load, prepare, embed, index, and save.

    Reads the combined summaries and metadata JSON files from a fixed input
    directory, builds the enriched texts, embeds them with the mBERT model,
    writes the FAISS index and the processed metadata to a fixed output
    directory, and prints a short summary.
    """
    # Input and output locations
    source_dir = Path('/workspace/Extracted/Structured/Summary')
    save_dir = '/workspace/Malayalam/vector_db'
    summaries_path = source_dir / 'Combined_Summaries.json'
    metadata_path = source_dir / 'Combined_Metadata.json'

    print("Loading files...")
    with tqdm(total=2, desc="Loading files") as load_bar:
        summaries, metadata = load_files(summaries_path, metadata_path)
        load_bar.update(2)

    texts, processed_metadata = prepare_texts(summaries, metadata)
    doc_count = len(texts)

    print(f"Creating embeddings for {doc_count} documents...")
    # mBERT is used here rather than all-MiniLM-L6-v2
    embeddings = create_embeddings(
        texts,
        model_name='bert-base-multilingual-cased',
    )

    print("Creating and saving FAISS index...")
    create_faiss_index(embeddings, save_dir)

    print("Saving metadata...")
    save_metadata(processed_metadata, save_dir)

    report_lines = (
        "\nSummary:",
        f"- Processed {doc_count} documents",
        f"- Created {embeddings.shape[1]}-dimensional embeddings",
        f"- Index and metadata saved in {save_dir}",
    )
    for line in report_lines:
        print(line)

# Script entry point: run the pipeline only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()

  from tqdm.autonotebook import tqdm, trange


Loading files...


Loading files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 60.71it/s]
Preparing texts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6293/6293 [00:00<00:00, 473507.98it/s]

Creating embeddings for 6293 documents...
Using model: bert-base-multilingual-cased



No sentence-transformers model found with name bert-base-multilingual-cased. Creating a new one with mean pooling.


Generating embeddings...


Batches:   0%|          | 0/394 [00:00<?, ?it/s]

Creating and saving FAISS index...
Creating FAISS index with dimension 768
Creating FlatL2 index...
Adding vectors to index...


Adding to FAISS: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 389.39it/s]


Index saved to /workspace/Malayalam/vector_db/embeddings.faiss
Saving metadata...


Saving metadata: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.61it/s]

Metadata saved to /workspace/Malayalam/vector_db/metadata.json

Summary:
- Processed 6293 documents
- Created 768-dimensional embeddings
- Index and metadata saved in /workspace/Malayalam/vector_db



