AMENDMENTS ONLY

In [19]:
import json
import requests
import time
from tqdm import tqdm
from pathlib import Path

def process_with_llama(json_obj, *, timeout=600):
    """Summarise a single JSON object with a Llama model served by Ollama.

    The object is pretty-printed into a fixed prompt and posted as one
    non-streaming request to the Ollama ``/api/generate`` endpoint on
    ``localhost:11434``.

    Args:
        json_obj: A JSON-serialisable object (one entry of the input file).
        timeout: Seconds ``requests`` waits for the server before raising
            ``requests.exceptions.Timeout``. The request is not streamed, so
            the reply only arrives once generation has finished; the default
            is therefore deliberately generous.

    Returns:
        The stripped ``response`` field of the reply when the status code is
        200; otherwise the string ``"Error processing amendment: <status>"``.

    Raises:
        requests.exceptions.RequestException: If the server cannot be reached
            or does not answer within ``timeout`` seconds.
    """
    prompt = f"""
Given this JSON object from a Kerala Service Rules amendment:
{json.dumps(json_obj, indent=2)}

Create a one-sentence summary that captures all the important information from this document.
Give only the summary text, no additional formatting or explanation.
"""

    # Without a timeout a stalled server would block this call (and the
    # whole batch run) indefinitely.
    response = requests.post(
        'http://localhost:11434/api/generate',
        json={
            'model': 'llama3.3:70b-instruct-q8_0',
            'prompt': prompt,
            'stream': False,
        },
        timeout=timeout,
    )

    if response.status_code == 200:
        return response.json()['response'].strip()
    return f"Error processing amendment: {response.status_code}"

def main():
    """Summarise every entry of ``KSR_Amendments.json`` and save the results.

    Reads the amendments list, asks ``process_with_llama`` for a summary of
    each entry, and writes two parallel JSON files into the ``Summary``
    directory: one with ``{"summary": ...}`` objects and one with a copy of
    each original entry plus its positional ``index``.

    Prints an error and returns early when the input file is missing or is
    not valid JSON.
    """
    # Setup paths
    base_path = Path('/workspace/rohith_llm/Extracted/Structured')
    summary_dir = base_path / 'Summary'
    input_path = base_path / 'KSR_Amendments.json'
    summaries_path = summary_dir / 'KSR_Amendments_Summaries.json'
    metadata_path = summary_dir / 'KSR_Amendments_Metadata.json'

    try:
        with open(input_path, 'r', encoding='utf-8') as file:
            amendments = json.load(file)
    except FileNotFoundError:
        print(f"Error: Could not find input file at {input_path}")
        return
    except json.JSONDecodeError:
        print("Error: Invalid JSON in input file")
        return

    # Create the output directory before the slow LLM loop so a missing or
    # unwritable directory fails now rather than after all the work is done.
    summary_dir.mkdir(parents=True, exist_ok=True)

    # Parallel lists: summaries[i] describes metadata[i]
    summaries = []
    metadata = []

    for idx, amendment in enumerate(tqdm(amendments, desc="Processing amendments")):
        summary = process_with_llama(amendment)
        summaries.append({"summary": summary})

        # Shallow copy keeps the original structure; only "index" is added.
        metadata_entry = amendment.copy()
        metadata_entry["index"] = idx
        metadata.append(metadata_entry)

    # Save summaries file
    with open(summaries_path, 'w', encoding='utf-8') as f:
        json.dump(summaries, f, indent=2)

    # Save metadata file
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"\nProcessed {len(amendments)} amendments")
    print(f"Summaries saved to: {summaries_path}")
    print(f"Metadata saved to: {metadata_path}")

# Entry point: run main() when this cell/script is executed directly
# (__name__ is "__main__"), but not when the code is imported as a module.
if __name__ == "__main__":
    main()

Processing amendments: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [02:17<00:00,  4.59s/it]


Processed 30 amendments
Summaries saved to: /workspace/rohith_llm/Extracted/Structured/Summary/KSR_Amendments_Summaries.json
Metadata saved to: /workspace/rohith_llm/Extracted/Structured/Summary/KSR_Amendments_Metadata.json





In [None]:
ENTIRE DOCUMENTS

In [None]:
import json
import requests
import time
from tqdm import tqdm
from pathlib import Path
import glob

def process_with_llama(json_obj, *, timeout=600):
    """Summarise a single JSON object with a Llama model served by Ollama.

    The object is pretty-printed into a fixed prompt and posted as one
    non-streaming request to the Ollama ``/api/generate`` endpoint on
    ``localhost:11434``.

    Args:
        json_obj: A JSON-serialisable object (one entry of an input file).
        timeout: Seconds ``requests`` waits for the server before raising
            ``requests.exceptions.Timeout``. The request is not streamed, so
            the reply only arrives once generation has finished; the default
            is therefore deliberately generous.

    Returns:
        The stripped ``response`` field of the reply when the status code is
        200; otherwise the string ``"Error processing amendment: <status>"``.

    Raises:
        requests.exceptions.RequestException: If the server cannot be reached
            or does not answer within ``timeout`` seconds.
    """
    prompt = f"""
Given this JSON object from a Kerala Service Rules amendment:
{json.dumps(json_obj, indent=2)}

Create a one-sentence summary that captures all the information from this document.
Give only the summary text, no additional formatting or explanation.
"""

    # Without a timeout a stalled server would block this call (and the
    # whole multi-hour batch run) indefinitely.
    response = requests.post(
        'http://localhost:11434/api/generate',
        json={
            'model': 'llama3.3:70b-instruct-q8_0',
            'prompt': prompt,
            'stream': False,
        },
        timeout=timeout,
    )

    if response.status_code == 200:
        return response.json()['response'].strip()
    return f"Error processing amendment: {response.status_code}"

def main():
    """Summarise every entry of every JSON file in ``Documents``.

    Each ``*.json`` file in the ``Documents`` directory is loaded and every
    entry is summarised with ``process_with_llama``. Results from all files
    are collected into two parallel lists and written to
    ``Combined_Summaries.json`` and ``Combined_Metadata.json`` in the
    ``Summary`` directory. Each metadata entry is a copy of the original
    entry plus ``source_file`` and a global ``index``.

    Files that cannot be found or parsed are reported and skipped. Returns
    early with an error message when no JSON files exist.
    """
    # Setup paths
    base_path = Path('/workspace/rohith_llm/Extracted/Structured')
    documents_path = base_path / 'Documents'
    summary_dir = base_path / 'Summary'
    # parents=True: do not fail merely because an ancestor directory is missing.
    summary_dir.mkdir(parents=True, exist_ok=True)

    # Combined output files
    combined_summaries_path = summary_dir / 'Combined_Summaries.json'
    combined_metadata_path = summary_dir / 'Combined_Metadata.json'

    # Parallel lists: all_summaries[i] describes all_metadata[i]
    all_summaries = []
    all_metadata = []

    # glob() yields files in no guaranteed order; sorting keeps the global
    # "index" assigned below reproducible from run to run.
    json_files = sorted(documents_path.glob('*.json'))

    if not json_files:
        print(f"Error: No JSON files found in {documents_path}")
        return

    # Process each JSON file
    for json_file in json_files:
        print(f"\nProcessing file: {json_file.name}")

        try:
            with open(json_file, 'r', encoding='utf-8') as file:
                amendments = json.load(file)
        except FileNotFoundError:
            print(f"Error: Could not find file {json_file}")
            continue
        except json.JSONDecodeError:
            print(f"Error: Invalid JSON in file {json_file}")
            continue

        for amendment in tqdm(amendments, desc=f"Processing {json_file.name}"):
            summary = process_with_llama(amendment)

            all_summaries.append({
                "source_file": json_file.name,
                "summary": summary
            })

            # Shallow copy keeps the original structure; only the two
            # bookkeeping fields are added.
            metadata_entry = amendment.copy()
            metadata_entry["source_file"] = json_file.name
            metadata_entry["index"] = len(all_metadata)  # Global index across all files
            all_metadata.append(metadata_entry)

    # Save combined summaries file
    with open(combined_summaries_path, 'w', encoding='utf-8') as f:
        json.dump(all_summaries, f, indent=2)

    # Save combined metadata file
    with open(combined_metadata_path, 'w', encoding='utf-8') as f:
        json.dump(all_metadata, f, indent=2)

    print(f"\nProcessed {len(all_summaries)} total amendments from {len(json_files)} files")
    print(f"Combined summaries saved to: {combined_summaries_path}")
    print(f"Combined metadata saved to: {combined_metadata_path}")

# Entry point: run main() when this cell/script is executed directly
# (__name__ is "__main__"), but not when the code is imported as a module.
if __name__ == "__main__":
    main()


Processing file: KSR.json


Processing KSR.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2453/2453 [2:14:20<00:00,  3.29s/it]



Processing file: KSSR.json


Processing KSSR.json:  16%|██████████████████▍                                                                                                  | 77/489 [03:33<17:55,  2.61s/it]

VECTOR DB

In [33]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import os
from pathlib import Path

def load_files(summaries_path, metadata_path):
    """Load the summaries and metadata JSON files.

    Args:
        summaries_path: Path of the summaries JSON file.
        metadata_path: Path of the metadata JSON file.

    Returns:
        A ``(summaries, metadata)`` tuple holding the parsed contents of the
        two files.

    Raises:
        FileNotFoundError: If either file does not exist.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    # JSON files are UTF-8; be explicit instead of relying on the platform
    # default encoding, which can mis-decode non-ASCII text.
    with open(summaries_path, 'r', encoding='utf-8') as f:
        summaries = json.load(f)
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    return summaries, metadata

def prepare_texts(summaries, metadata):
    """Build the texts to embed by combining each summary with key metadata.

    Summaries and metadata are paired positionally; pairing stops at the
    shorter of the two lists. For every pair the text is the summary,
    followed by ``"Rule <Rule no.>"`` and the ``Description`` when the
    metadata has them. Pairs whose text is empty are skipped.

    Args:
        summaries: List of summary objects. Normally dicts with a
            ``"summary"`` key; a plain string or a single-item sequence
            holding the text is also accepted.
        metadata: List of metadata dicts, parallel to ``summaries``.

    Returns:
        A ``(texts, processed_metadata)`` tuple of equal-length lists.
        Each processed metadata entry is a copy of the original dict with
        the summary text stored under ``"summary"``.
    """
    texts = []
    processed_metadata = []

    for summary_obj, meta in zip(summaries, metadata):
        # Iterating a dict yields its KEYS, so the text must be read from the
        # "summary" field; taking the first element would return a key name
        # such as "source_file" instead of the summary itself.
        if isinstance(summary_obj, dict):
            summary = summary_obj.get('summary') or ''
        elif isinstance(summary_obj, str):
            summary = summary_obj
        else:
            # Single-item container: take its only element ('' if empty).
            summary = next(iter(summary_obj), '')

        rule_no = meta.get('Rule no.')
        description = meta.get('Description')

        # Create enriched text for embedding; empty parts are dropped.
        text_parts = [
            summary,
            f"Rule {rule_no}" if rule_no else '',
            str(description) if description else '',
        ]
        text = ' '.join(filter(None, text_parts))

        if text.strip():
            texts.append(text)
            # Keep the original metadata structure and attach the summary.
            meta_entry = meta.copy()
            meta_entry['summary'] = summary
            processed_metadata.append(meta_entry)

    return texts, processed_metadata

def create_embeddings(texts, model_name='all-MiniLM-L6-v2', batch_size=32):
    """Encode ``texts`` with a SentenceTransformer model running on the CPU.

    Args:
        texts: The texts to embed.
        model_name: Name of the SentenceTransformer model to load.
        batch_size: Number of texts handed to the encoder per batch.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``texts``
        (callers in this file treat it as a 2-D array with ``.shape``).
    """
    print(f"Using model: {model_name}")
    encoder = SentenceTransformer(model_name, device='cpu')

    print("Generating embeddings...")
    return encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
    )

def create_faiss_index(embeddings, save_dir):
    """Build a flat L2 FAISS index from ``embeddings`` and write it to disk.

    Args:
        embeddings: 2-D array of vectors; the second axis is the dimension.
        save_dir: Directory for ``embeddings.faiss`` (created if missing).

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    os.makedirs(save_dir, exist_ok=True)

    n_dims = embeddings.shape[1]
    print(f"Creating FAISS index with dimension {n_dims}")

    # FAISS expects float32 input, so cast before adding.
    vectors = embeddings.astype('float32')

    print("Creating FlatL2 index...")
    flat_index = faiss.IndexFlatL2(n_dims)

    print("Adding vectors to index...")
    flat_index.add(vectors)

    # Persist the index next to the metadata file.
    target = os.path.join(save_dir, 'embeddings.faiss')
    faiss.write_index(flat_index, target)
    print(f"Index saved to {target}")
    return flat_index

def save_metadata(metadata, save_dir):
    """Write ``metadata`` as indented JSON to ``<save_dir>/metadata.json``.

    Args:
        metadata: JSON-serialisable metadata to store.
        save_dir: Existing directory that receives ``metadata.json``.
    """
    target = os.path.join(save_dir, 'metadata.json')
    with open(target, 'w') as out:
        json.dump(metadata, out, indent=2)
    print(f"Metadata saved to {target}")

def main():
    """Build the vector database from the combined summaries and metadata.

    Loads ``Combined_Summaries.json`` and ``Combined_Metadata.json``, turns
    them into texts, embeds the texts, stores the vectors in a FAISS index
    and saves the processed metadata, all under ``Summary/Vector_DB``.
    """
    # Input and output locations
    summary_root = Path('/workspace/rohith_llm/Extracted/Structured/Summary')
    summaries_path = summary_root / 'Combined_Summaries.json'
    metadata_path = summary_root / 'Combined_Metadata.json'
    save_dir = summary_root / 'Vector_DB'

    print("Loading files...")
    summaries, metadata = load_files(summaries_path, metadata_path)

    print("Preparing texts...")
    texts, processed_metadata = prepare_texts(summaries, metadata)

    print(f"Creating embeddings for {len(texts)} documents...")
    embeddings = create_embeddings(texts)

    # The returned index object is not needed here; it is saved to disk.
    print("Creating and saving FAISS index...")
    create_faiss_index(embeddings, save_dir)

    print("Saving metadata...")
    save_metadata(processed_metadata, save_dir)

    # Final report
    print("\nSummary:")
    print(f"- Processed {len(texts)} documents")
    print(f"- Created {embeddings.shape[1]}-dimensional embeddings")
    print(f"- Index and metadata saved in {save_dir}")

# Entry point: run main() when this cell/script is executed directly
# (__name__ is "__main__"), but not when the code is imported as a module.
if __name__ == "__main__":
    main()

Loading files...
Preparing texts...
Creating embeddings for 6448 documents...
Using model: all-MiniLM-L6-v2
Generating embeddings...


Batches:   0%|          | 0/202 [00:00<?, ?it/s]

Creating and saving FAISS index...
Creating FAISS index with dimension 384
Creating FlatL2 index...
Adding vectors to index...
Index saved to /workspace/rohith_llm/Extracted/Structured/Summary/Vector_DB/embeddings.faiss
Saving metadata...
Metadata saved to /workspace/rohith_llm/Extracted/Structured/Summary/Vector_DB/metadata.json

Summary:
- Processed 6448 documents
- Created 384-dimensional embeddings
- Index and metadata saved in /workspace/rohith_llm/Extracted/Structured/Summary/Vector_DB


In [2]:
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import os
from pathlib import Path
from tqdm import tqdm

def load_files(summaries_path, metadata_path):
    """Load the summaries and metadata JSON files.

    Args:
        summaries_path: Path of the summaries JSON file.
        metadata_path: Path of the metadata JSON file.

    Returns:
        A ``(summaries, metadata)`` tuple holding the parsed contents of the
        two files.

    Raises:
        FileNotFoundError: If either file does not exist.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    # JSON files are UTF-8; be explicit instead of relying on the platform
    # default encoding, which can mis-decode non-ASCII text.
    with open(summaries_path, 'r', encoding='utf-8') as f:
        summaries = json.load(f)
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    return summaries, metadata

def prepare_texts(summaries, metadata):
    """Build the texts to embed by combining each summary with key metadata.

    Summaries and metadata are paired positionally; pairing stops at the
    shorter of the two lists. For every pair the text is the summary,
    followed by ``"Rule <Rule no.>"`` and the ``Description`` when the
    metadata has them. Pairs whose text is empty are skipped. Progress is
    shown with a tqdm bar.

    Args:
        summaries: List of summary objects. Normally dicts with a
            ``"summary"`` key; a plain string or a single-item sequence
            holding the text is also accepted.
        metadata: List of metadata dicts, parallel to ``summaries``.

    Returns:
        A ``(texts, processed_metadata)`` tuple of equal-length lists.
        Each processed metadata entry is a copy of the original dict with
        the summary text stored under ``"summary"``.
    """
    texts = []
    processed_metadata = []

    # zip() has no length, so tqdm needs the total passed explicitly.
    pairs = tqdm(zip(summaries, metadata),
                 total=len(summaries),
                 desc="Preparing texts")
    for summary_obj, meta in pairs:
        # Iterating a dict yields its KEYS, so the text must be read from the
        # "summary" field; taking the first element would return a key name
        # such as "source_file" instead of the summary itself.
        if isinstance(summary_obj, dict):
            summary = summary_obj.get('summary') or ''
        elif isinstance(summary_obj, str):
            summary = summary_obj
        else:
            # Single-item container: take its only element ('' if empty).
            summary = next(iter(summary_obj), '')

        rule_no = meta.get('Rule no.')
        description = meta.get('Description')

        # Create enriched text for embedding; empty parts are dropped.
        text_parts = [
            summary,
            f"Rule {rule_no}" if rule_no else '',
            str(description) if description else '',
        ]
        text = ' '.join(filter(None, text_parts))

        if text.strip():
            texts.append(text)
            # Keep the original metadata structure and attach the summary.
            meta_entry = meta.copy()
            meta_entry['summary'] = summary
            processed_metadata.append(meta_entry)

    return texts, processed_metadata

def create_embeddings(texts, model_name='all-MiniLM-L6-v2', batch_size=32):
    """Encode ``texts`` with a SentenceTransformer model running on the CPU.

    Args:
        texts: The texts to embed.
        model_name: Name of the SentenceTransformer model to load.
        batch_size: Number of texts handed to the encoder per batch.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``texts``
        (callers in this file treat it as a 2-D array with ``.shape``).
    """
    print(f"Using model: {model_name}")
    encoder = SentenceTransformer(model_name, device='cpu')

    print("Generating embeddings...")
    # No extra tqdm wrapper: encode() draws its own bar when asked to.
    return encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
    )

def create_faiss_index(embeddings, save_dir):
    """Build a flat L2 FAISS index from ``embeddings`` and write it to disk.

    Vectors are added in chunks of 1000 so a progress bar can be shown.

    Args:
        embeddings: 2-D array of vectors; the second axis is the dimension.
        save_dir: Directory for ``embeddings.faiss`` (created if missing).

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    os.makedirs(save_dir, exist_ok=True)

    n_dims = embeddings.shape[1]
    print(f"Creating FAISS index with dimension {n_dims}")

    # FAISS expects float32 input, so cast before adding.
    vectors = embeddings.astype('float32')

    print("Creating FlatL2 index...")
    flat_index = faiss.IndexFlatL2(n_dims)

    print("Adding vectors to index...")
    chunk = 1000
    for start in tqdm(range(0, len(vectors), chunk), desc="Adding to FAISS"):
        flat_index.add(vectors[start:start + chunk])

    # Persist the index next to the metadata file.
    target = os.path.join(save_dir, 'embeddings.faiss')
    faiss.write_index(flat_index, target)
    print(f"Index saved to {target}")
    return flat_index

def save_metadata(metadata, save_dir):
    """Write ``metadata`` as indented JSON to ``<save_dir>/metadata.json``.

    A one-step tqdm bar is shown around the write.

    Args:
        metadata: JSON-serialisable metadata to store.
        save_dir: Existing directory that receives ``metadata.json``.
    """
    target = os.path.join(save_dir, 'metadata.json')
    with tqdm(total=1, desc="Saving metadata") as progress:
        with open(target, 'w') as out:
            json.dump(metadata, out, indent=2)
        # Tick only after the file has been written and closed.
        progress.update(1)
    print(f"Metadata saved to {target}")

def main():
    """Build the vector database from the combined summaries and metadata.

    Loads ``Combined_Summaries.json`` and ``Combined_Metadata.json``, turns
    them into texts, embeds the texts, stores the vectors in a FAISS index
    and saves the processed metadata, all under ``Summary/Vector_DB``.
    """
    # Input and output locations
    summary_root = Path('/workspace/rohith_llm/Extracted/Structured/Summary')
    summaries_path = summary_root / 'Combined_Summaries.json'
    metadata_path = summary_root / 'Combined_Metadata.json'
    save_dir = summary_root / 'Vector_DB'

    print("Loading files...")
    # Both files are read by one call, so the bar jumps straight to 2/2.
    with tqdm(total=2, desc="Loading files") as progress:
        summaries, metadata = load_files(summaries_path, metadata_path)
        progress.update(2)

    texts, processed_metadata = prepare_texts(summaries, metadata)

    print(f"Creating embeddings for {len(texts)} documents...")
    embeddings = create_embeddings(texts)

    # The returned index object is not needed here; it is saved to disk.
    print("Creating and saving FAISS index...")
    create_faiss_index(embeddings, save_dir)

    print("Saving metadata...")
    save_metadata(processed_metadata, save_dir)

    # Final report
    print("\nSummary:")
    print(f"- Processed {len(texts)} documents")
    print(f"- Created {embeddings.shape[1]}-dimensional embeddings")
    print(f"- Index and metadata saved in {save_dir}")

# Entry point: run main() when this cell/script is executed directly
# (__name__ is "__main__"), but not when the code is imported as a module.
if __name__ == "__main__":
    main()

Loading files...


Loading files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 56.50it/s]
Preparing texts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6293/6293 [00:00<00:00, 441679.30it/s]

Creating embeddings for 6293 documents...
Using model: all-MiniLM-L6-v2





Generating embeddings...


Batches:   0%|          | 0/197 [00:00<?, ?it/s]

Creating and saving FAISS index...
Creating FAISS index with dimension 384
Creating FlatL2 index...
Adding vectors to index...


Adding to FAISS: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 686.27it/s]


Index saved to /workspace/rohith_llm/Extracted/Structured/Summary/Vector_DB/embeddings.faiss
Saving metadata...


Saving metadata: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.88it/s]

Metadata saved to /workspace/rohith_llm/Extracted/Structured/Summary/Vector_DB/metadata.json

Summary:
- Processed 6293 documents
- Created 384-dimensional embeddings
- Index and metadata saved in /workspace/rohith_llm/Extracted/Structured/Summary/Vector_DB



