In [8]:
pip install langchain-community

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install faiss-cpu sentence-transformers numpy

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [10]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import os

def load_documents(file_path):
    """Load the rule documents stored in a JSON file.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        The deserialized JSON content. ``prepare_texts`` iterates over the
        result and calls ``.get`` on every item, so a list of dicts is expected.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Pass the encoding explicitly so non-ASCII rule text is never decoded with
    # a platform-specific default codec.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def prepare_texts(documents):
    """Turn raw document records into embeddable texts plus aligned metadata.

    For every record the non-empty values of 'Document', 'Description' and
    "Rule <Rule no.>" are joined with single spaces. Records whose joined text
    is blank are dropped, so ``texts[i]`` always corresponds to ``metadata[i]``.

    Args:
        documents: Iterable of dict-like records.

    Returns:
        A ``(texts, metadata)`` tuple of two equally long lists.
    """
    texts, metadata = [], []

    for record in documents:
        title = record.get('Document', '')
        rule_no = record.get('Rule no.', '')

        candidates = (
            title,
            record.get('Description', ''),
            f"Rule {rule_no}" if rule_no else '',
        )
        combined = ' '.join(piece for piece in candidates if piece)

        # Whitespace-only text carries nothing worth embedding.
        if not combined.strip():
            continue

        texts.append(combined)
        metadata.append({
            'Document': title,
            'Rule_no': rule_no,
            'Part': record.get('Part', ''),
            'Chapter': record.get('Chapter', ''),
        })

    return texts, metadata

def create_embeddings(texts, model_name='all-MiniLM-L6-v2', batch_size=32):
    """Encode ``texts`` with a SentenceTransformer model running on the CPU.

    Args:
        texts: The texts to embed.
        model_name: Name of the SentenceTransformer model to load.
        batch_size: Number of texts encoded per batch.

    Returns:
        Whatever ``model.encode`` returns for ``texts``; a progress bar is
        shown while encoding.
    """
    encoder = SentenceTransformer(model_name, device='cpu')
    return encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
    )

def create_faiss_index(embeddings, save_dir):
    """Build a flat L2 FAISS index from ``embeddings`` and write it to disk.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dimension). Lists of
            lists are accepted as well as NumPy arrays.
        save_dir: Directory that receives ``document_index.faiss``; it is
            created when missing.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.

    Raises:
        ValueError: If ``embeddings`` cannot be interpreted as a 2-D matrix
            (for example when it is empty or one-dimensional).
    """
    # Coerce once, up front: this accepts plain lists, avoids the double copy
    # of np.array(...).astype(...), and lets the shape be validated before any
    # directory or file is created.
    vectors = np.ascontiguousarray(embeddings, dtype='float32')
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D matrix of shape (n_vectors, dimension), "
            f"got shape {vectors.shape}"
        )

    os.makedirs(save_dir, exist_ok=True)

    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)

    faiss.write_index(index, os.path.join(save_dir, 'document_index.faiss'))
    return index

def save_metadata(metadata, save_dir):
    """Write ``metadata`` to ``<save_dir>/metadata.json`` as indented JSON.

    Args:
        metadata: JSON-serializable metadata (a list of dicts in this notebook).
        save_dir: Target directory; created when it does not exist yet.
    """
    # Create the directory here as well, so this function works even when it
    # is called without an earlier step having created save_dir.
    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

def main():
    """Build the FAISS index and metadata file from the extracted KSR rules.

    Reads the rules JSON, embeds the combined text of each record, then writes
    ``document_index.faiss`` and ``metadata.json`` into ``Vector_DB``.
    """
    # Paths
    input_file = '/workspace/rohith_llm/Full_KSR_extracted_rules.json'
    save_dir = 'Vector_DB'

    # Load and prepare documents
    documents = load_documents(input_file)
    texts, metadata = prepare_texts(documents)

    # Without any text there is no embedding matrix to build an index from;
    # stop here instead of failing further down the pipeline.
    if not texts:
        print("No documents found to process!")
        return

    # Create embeddings
    embeddings = create_embeddings(texts)

    # Create and save FAISS index
    create_faiss_index(embeddings, save_dir)

    # Save metadata
    save_metadata(metadata, save_dir)

    print(f"Created FAISS index with {len(texts)} documents")
    print(f"Index saved to {save_dir}/document_index.faiss")
    print(f"Metadata saved to {save_dir}/metadata.json")

# Run the build only when this cell/script is executed directly, not on import.
if __name__ == "__main__":
    main()

Batches:   0%|          | 0/69 [00:00<?, ?it/s]

Created FAISS index with 2191 documents
Index saved to Vector_DB/document_index.faiss
Metadata saved to Vector_DB/metadata.json


KSR Amendments: append government-order text files to the existing FAISS index

In [13]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import json
import os

def load_existing_index(index_path):
    """Read the serialized FAISS index stored at ``index_path`` and return it."""
    stored_index = faiss.read_index(index_path)
    return stored_index

def load_existing_metadata(metadata_path):
    """Return the parsed JSON content of the metadata file at ``metadata_path``."""
    with open(metadata_path, 'r') as handle:
        parsed = json.load(handle)
    return parsed

def process_go_document(content, index):
    """Pull the header fields out of a government order's text.

    The text is scanned line by line. A line containing
    "Government Order (P) No." becomes the document title; otherwise a line
    containing "Date:" or "Department:" supplies that field. Scanning stops at
    the first department line, so only fields that appear up to and including
    that line are picked up.

    Args:
        content: Full text of the government order.
        index: Index number to record for this document.

    Returns:
        A dict with the keys 'Document', 'Date', 'Department', 'Content' and
        'Index'. Fields that were not found are empty strings.
    """
    go_number = date = department = ""

    for raw_line in content.split('\n'):
        if "Government Order (P) No." in raw_line:
            go_number = raw_line.strip()
            continue

        if "Date:" in raw_line:
            date = raw_line.split("Date:")[1].strip()
            continue

        if "Department:" in raw_line:
            department = raw_line.split("Department:")[1].strip()
            # The department line is treated as the end of the header.
            break

    result = {}
    result['Document'] = go_number
    result['Date'] = date
    result['Department'] = department
    result['Content'] = content
    result['Index'] = index  # position assigned by the caller
    return result

def update_faiss_database(existing_index_path, existing_metadata_path, new_documents_folder, save_dir):
    """Append government-order ``.txt`` files to an existing FAISS database.

    Each new file is embedded with 'all-MiniLM-L6-v2', added to the index, and
    described by a metadata entry whose 'Index' equals the row the vector
    occupies in the FAISS index. Files whose name is already recorded in the
    metadata are skipped, so running the update again does not add duplicate
    vectors.

    Args:
        existing_index_path: Path of the FAISS index to extend.
        existing_metadata_path: Path of the metadata JSON belonging to it.
        new_documents_folder: Folder scanned (non-recursively) for ``.txt`` files.
        save_dir: Directory that receives the updated ``document_index.faiss``
            and ``metadata.json``.

    Returns:
        ``(index, metadata)`` after the update. When there is nothing new to
        add, the loaded index and metadata are returned and nothing is written.

    Raises:
        ValueError: If the embedding dimension does not match the index, or if
            the index size and metadata length disagree after the update.
    """
    # Load existing index and metadata
    index = load_existing_index(existing_index_path)
    metadata = load_existing_metadata(existing_metadata_path)

    # Legacy entries may predate the 'Index' field; back-fill it positionally.
    for position, item in enumerate(metadata):
        item.setdefault('Index', position)

    # File names already stored by an earlier update of this database.
    known_filenames = {item['Filename'] for item in metadata if item.get('Filename')}

    # New vectors are appended after the existing ones, so their row numbers
    # start at the current index size.
    first_new_index = index.ntotal

    new_texts = []
    new_metadata = []

    # Sorted for a reproducible order: os.listdir makes no ordering promise.
    for filename in sorted(os.listdir(new_documents_folder)):
        if not filename.endswith('.txt'):
            continue
        if filename in known_filenames:
            print(f"Skipping {filename}: already present in the database")
            continue

        file_path = os.path.join(new_documents_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Count only accepted documents. Deriving the number from the position
        # in the directory listing would leave gaps whenever the folder also
        # holds other files, and 'Index' would stop matching the FAISS row.
        new_index = first_new_index + len(new_texts)
        doc_info = process_go_document(content, new_index)

        new_texts.append(content)
        new_metadata.append({
            'Document': doc_info['Document'],
            'Date': doc_info['Date'],
            'Department': doc_info['Department'],
            'Type': 'Government Order',
            'Filename': filename,
            'Index': new_index  # row of this document's vector in the index
        })

    if not new_texts:
        # Nothing to embed; avoid loading the model and rewriting the files.
        print("No new documents to add; database left unchanged")
        return index, metadata

    # Initialize the transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

    # Generate embeddings for new documents
    print(f"Generating embeddings for {len(new_texts)} new documents...")
    new_embeddings = np.ascontiguousarray(
        model.encode(new_texts, show_progress_bar=True, batch_size=32),
        dtype='float32',
    )

    # The new vectors must have the dimension the index was built with.
    if new_embeddings.ndim != 2 or new_embeddings.shape[1] != index.d:
        raise ValueError(
            f"Embedding shape {new_embeddings.shape} does not match index dimension ({index.d})"
        )

    # Add new embeddings to the index
    print("Adding new embeddings to FAISS index...")
    index.add(new_embeddings)

    # Update metadata and keep it ordered by index number
    metadata.extend(new_metadata)
    metadata = sorted(metadata, key=lambda x: x['Index'])

    # Verify index consistency
    print("Verifying index consistency...")
    if index.ntotal != len(metadata):
        raise ValueError(f"Mismatch between index size ({index.ntotal}) and metadata length ({len(metadata)})")

    # Save updated index and metadata
    print("Saving updated index and metadata...")
    os.makedirs(save_dir, exist_ok=True)
    faiss.write_index(index, os.path.join(save_dir, 'document_index.faiss'))
    with open(os.path.join(save_dir, 'metadata.json'), 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"Successfully added {len(new_texts)} new documents to the database")
    print(f"Total documents in database: {index.ntotal}")
    return index, metadata

def verify_index_integrity(index_path, metadata_path):
    """Check that a saved FAISS index and its metadata file agree.

    The check passes only when all of the following hold:
      * the index holds as many vectors as there are metadata entries,
      * every metadata entry has an 'Index' field,
      * the 'Index' values are unique, and
      * they are exactly ``0 .. len(metadata) - 1``.

    Every problem found is printed as a warning.

    Args:
        index_path: Path of the FAISS index file.
        metadata_path: Path of the metadata JSON file.

    Returns:
        True when every check passes, False otherwise.
    """
    index = load_existing_index(index_path)
    metadata = load_existing_metadata(metadata_path)

    print("Performing index integrity check...")
    print(f"Number of vectors in FAISS index: {index.ntotal}")
    print(f"Number of entries in metadata: {len(metadata)}")

    is_valid = index.ntotal == len(metadata)

    # Metadata written without index numbers must be reported, not crash the
    # check with a KeyError.
    entries_without_index = sum(1 for item in metadata if 'Index' not in item)
    if entries_without_index:
        print(f"WARNING: {entries_without_index} metadata entries have no 'Index' field!")
        is_valid = False

    # Check if all metadata entries have unique indices
    indices = [item['Index'] for item in metadata if 'Index' in item]
    unique_indices = set(indices)
    if len(indices) != len(unique_indices):
        print("WARNING: Duplicate indices found in metadata!")
        is_valid = False

    # Check if indices are continuous
    expected_indices = set(range(len(metadata)))
    if unique_indices != expected_indices:
        print("WARNING: Non-continuous indices found!")
        print("Missing indices:", expected_indices - unique_indices)
        print("Extra indices:", unique_indices - expected_indices)
        is_valid = False

    # Every warning above affects the verdict, so callers are not told that
    # integrity holds right after a problem was printed.
    return is_valid

# Usage
if __name__ == "__main__":
    # Where the database lives and where the new amendment files are read from
    save_dir = 'Vector_DB'
    new_documents_folder = '/workspace/rohith_llm/Documents/Structured amendments/'
    existing_index_path = 'Vector_DB/document_index.faiss'
    existing_metadata_path = 'Vector_DB/metadata.json'

    # Extend the stored index with the new documents
    index, metadata = update_faiss_database(
        existing_index_path, existing_metadata_path, new_documents_folder, save_dir
    )

    # Re-read what was just written and check it for consistency
    is_valid = verify_index_integrity(
        os.path.join(save_dir, 'document_index.faiss'),
        os.path.join(save_dir, 'metadata.json'),
    )

    if not is_valid:
        print("WARNING: Database update completed but index integrity check failed")
    else:
        print("Database update completed successfully with index integrity maintained")

Generating embeddings for 12 new documents...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Adding new embeddings to FAISS index...
Verifying index consistency...
Saving updated index and metadata...
Successfully added 12 new documents to the database
Total documents in database: 2215
Performing index integrity check...
Number of vectors in FAISS index: 2215
Number of entries in metadata: 2215
Database update completed successfully with index integrity maintained


Complete rebuild: index all structured JSON documents (KSR, KSR Amendments, KTC, KFC)

In [2]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import os
from glob import glob

def load_documents_from_directory(directory_path):
    """Load documents from all JSON files in the specified directory.

    Files are read in sorted path order so the resulting document order (and
    therefore the vector order of an index built from it) is reproducible.
    Loading is best-effort: a file that cannot be read or parsed, or whose
    top-level value is not a list, is reported and skipped.

    Args:
        directory_path: Directory searched (non-recursively) for ``*.json``.

    Returns:
        One flat list with the items of every successfully loaded file.
    """
    all_documents = []

    # glob returns paths in arbitrary, filesystem-dependent order.
    json_files = sorted(glob(os.path.join(directory_path, '*.json')))

    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                documents = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"Error loading {file_path}: {str(e)}")
            continue

        if not isinstance(documents, list):
            # Do not report success for content that is being dropped.
            print(f"Skipped {file_path}: expected a JSON list, got {type(documents).__name__}")
            continue

        all_documents.extend(documents)
        print(f"Successfully loaded: {file_path}")

    return all_documents

def prepare_texts(documents):
    """Build the embeddable text and the metadata entry for every document.

    The text is the space-joined, non-empty values of 'Document',
    'Description' and "Rule <Rule no.>". Documents whose text is blank are
    left out, keeping ``texts`` and ``metadata`` aligned index by index.

    Args:
        documents: Iterable of dict-like records.

    Returns:
        A ``(texts, metadata)`` tuple of two equally long lists.
    """
    # Metadata key -> source field, in the order the keys are written.
    field_map = (
        ('Document', 'Document'),
        ('Rule_no', 'Rule no.'),
        ('Part', 'Part'),
        ('Chapter', 'Chapter'),
        ('Amendment_order', 'Amendment order no.'),
        ('Order_date', 'Order date'),
        ('Effective_date', 'Effective date'),
    )

    texts, metadata = [], []

    for record in documents:
        rule_no = record.get('Rule no.', '')
        pieces = (
            record.get('Document', ''),
            record.get('Description', ''),
            f"Rule {rule_no}" if rule_no else '',
        )
        combined = ' '.join(piece for piece in pieces if piece)

        # Nothing but whitespace: not worth an embedding.
        if not combined.strip():
            continue

        texts.append(combined)
        metadata.append({target: record.get(source, '') for target, source in field_map})

    return texts, metadata

def create_embeddings(texts, model_name='all-MiniLM-L6-v2', batch_size=32):
    """Embed ``texts`` with the named SentenceTransformer model on the CPU.

    Args:
        texts: The texts to embed.
        model_name: Name of the SentenceTransformer model to load.
        batch_size: Number of texts encoded per batch.

    Returns:
        The result of ``model.encode`` for ``texts`` (a progress bar is shown).
    """
    encoder = SentenceTransformer(model_name, device='cpu')
    return encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
    )

def create_faiss_index(embeddings, save_dir):
    """Create a flat L2 FAISS index from ``embeddings`` and save it.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dimension). Lists of
            lists are accepted as well as NumPy arrays.
        save_dir: Directory that receives ``document_index.faiss``; it is
            created when missing.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.

    Raises:
        ValueError: If ``embeddings`` cannot be interpreted as a 2-D matrix
            (for example when it is empty or one-dimensional).
    """
    # Convert once before touching the shape: plain lists become usable, the
    # extra copy of np.array(...).astype(...) is avoided, and bad input is
    # rejected before anything is written to disk.
    vectors = np.ascontiguousarray(embeddings, dtype='float32')
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D matrix of shape (n_vectors, dimension), "
            f"got shape {vectors.shape}"
        )

    os.makedirs(save_dir, exist_ok=True)

    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)

    faiss.write_index(index, os.path.join(save_dir, 'document_index.faiss'))
    return index

def save_metadata(metadata, save_dir):
    """Save metadata to ``<save_dir>/metadata.json`` as indented JSON.

    Args:
        metadata: JSON-serializable metadata (a list of dicts in this notebook).
        save_dir: Target directory; created when it does not exist yet.
    """
    # Do not rely on another step having created the directory beforehand.
    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

def main():
    """Rebuild the vector database from every JSON file in the input directory.

    Loads the structured documents, embeds their combined text, and writes
    ``document_index.faiss`` and ``metadata.json`` into ``Vector_DB``. Stops
    early with a message when no usable text was found.
    """
    save_dir = 'Vector_DB'
    input_directory = '/workspace/rohith_llm/Extracted/Structured'

    # Gather the records of all JSON files and derive text + metadata
    texts, metadata = prepare_texts(load_documents_from_directory(input_directory))

    if len(texts) == 0:
        print("No documents found to process!")
        return

    print(f"Creating embeddings for {len(texts)} documents...")
    embeddings = create_embeddings(texts)

    # Persist the index first, then the metadata describing its rows
    create_faiss_index(embeddings, save_dir)
    save_metadata(metadata, save_dir)

    print(f"Created FAISS index with {len(texts)} documents")
    print(f"Index saved to {save_dir}/document_index.faiss")
    print(f"Metadata saved to {save_dir}/metadata.json")

# Run the rebuild only when this cell/script is executed directly, not on import.
if __name__ == "__main__":
    main()

Successfully loaded: /workspace/rohith_llm/Extracted/Structured/KSR.json
Successfully loaded: /workspace/rohith_llm/Extracted/Structured/KSR_Amendments.json
Successfully loaded: /workspace/rohith_llm/Extracted/Structured/KTC.json
Successfully loaded: /workspace/rohith_llm/Extracted/Structured/KFC.json
Creating embeddings for 5858 documents...


Batches:   0%|          | 0/184 [00:00<?, ?it/s]

Created FAISS index with 5858 documents
Index saved to Vector_DB/document_index.faiss
Metadata saved to Vector_DB/metadata.json
