In [10]:
import os
import json
import requests
from typing import Dict, Optional
import time

class LlamaExtractor:
    """Extract key details from amendment notification text files with a local LLM.

    Every ``.txt`` file in ``input_dir`` is read, its leading characters are sent
    to the model behind the Ollama ``/api/generate`` endpoint, and a summary of
    the extracted fields followed by the original text is written to
    ``output_dir`` as ``<name>_extracted.txt``.
    """

    # JSON keys requested from the model, mapped to the label used in the
    # written summary. Also defines the all-None fallback result.
    FIELD_LABELS = {
        "go_number": "GO Number",
        "date": "Date",
        "sro_number": "SRO Number",
        "amendment_name": "Amendment Name",
        "amendment_year": "Amendment Year",
        "commencement_date": "Commencement Date",
    }

    # Only this many leading characters of a document are placed in the prompt.
    MAX_PROMPT_CHARS = 2000

    def __init__(self, input_dir: str, output_dir: str, model: str = "llama3.3:70b-instruct-q8_0",
                 timeout: Optional[float] = 600.0):
        """Store configuration and make sure the output directory exists.

        Args:
            input_dir: Directory containing the ``.txt`` files to process.
            output_dir: Directory that receives the ``*_extracted.txt`` files.
            model: Model name passed to the Ollama API.
            timeout: Seconds to wait for each LLM request before giving up;
                ``None`` waits indefinitely.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.model = model
        self.timeout = timeout
        self.ollama_url = "http://localhost:11434/api/generate"

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)

    def query_llm(self, prompt: str) -> str:
        """Query the local Llama model through Ollama API.

        Args:
            prompt: Full prompt text to send.

        Returns:
            The model's response text, or ``""`` if the request or the decoding
            of its reply failed (the error is printed, not raised).
        """
        headers = {"Content-Type": "application/json"}
        data = {
            "model": self.model,
            "prompt": prompt,
            "stream": False
        }

        try:
            # Without a timeout a stalled server would block the whole run forever.
            response = requests.post(self.ollama_url, headers=headers, json=data, timeout=self.timeout)
            response.raise_for_status()
            return response.json()["response"]
        except Exception as e:
            # Best-effort by design: callers treat "" as "no usable answer".
            print(f"Error querying LLM: {str(e)}")
            return ""

    @staticmethod
    def _parse_json_object(raw) -> Optional[Dict]:
        """Return the JSON object contained in an LLM reply, or None if there is none."""
        if not isinstance(raw, str):
            return None

        cleaned = raw.strip()
        # Drop a Markdown code fence, with or without a "json" language tag.
        if cleaned.startswith("```"):
            cleaned = cleaned[3:]
            if cleaned[:4].lower() == "json":
                cleaned = cleaned[4:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

        # Try the whole reply first, then the outermost {...} span in case the
        # model wrapped the object in explanatory prose.
        candidates = [cleaned]
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if 0 <= start < end:
            candidates.append(cleaned[start:end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed
        return None

    def extract_details(self, text: str) -> Dict:
        """Extract amendment details using LLM.

        Args:
            text: Full text of the notification; only the first
                ``MAX_PROMPT_CHARS`` characters are sent to the model.

        Returns:
            A dict that always contains every key of ``FIELD_LABELS`` (value
            ``None`` when the model did not supply it), plus any extra keys the
            model returned. If the reply holds no JSON object, all values are
            ``None``.
        """
        prompt = """
        Extract the following details from the given text of a Kerala Service Rules amendment notification. 
        Return the information in JSON format with these keys:
        - go_number (Government Order number)
        - date (date of the notification)
        - sro_number (SRO number)
        - amendment_name (name of the amendment)
        - amendment_year (year of the amendment)
        - commencement_date (when the amendment comes into force)

        If any information is not found, use null for that field.
        Only return the JSON object, no other text.

        Text:
        {text}
        """.format(text=text[:self.MAX_PROMPT_CHARS])  # Leading chars hold the key details

        details = dict.fromkeys(self.FIELD_LABELS)
        parsed = self._parse_json_object(self.query_llm(prompt))
        if parsed is None:
            print("Error parsing LLM response as JSON")
            return details

        # Overlay the model's answer on the defaults so every expected key exists.
        details.update(parsed)
        return details

    def process_file(self, filename: str) -> Dict:
        """Process a single file and extract information using LLM.

        Args:
            filename: Name of a file inside ``input_dir``.

        Returns:
            The extracted details with ``filename`` and ``original_text`` added.
        """
        with open(os.path.join(self.input_dir, filename), 'r', encoding='utf-8') as f:
            content = f.read()

        # Extract information using LLM
        extracted_data = self.extract_details(content)
        extracted_data['filename'] = filename
        extracted_data['original_text'] = content

        return extracted_data

    def save_extracted_info(self, data: Dict, original_filename: str):
        """Save extracted information to a new file.

        Writes ``<base>_extracted.txt`` into ``output_dir`` containing one line
        per field of ``FIELD_LABELS`` followed by the original text.

        Args:
            data: Extracted details; must contain ``original_text``.
            original_filename: Input file name used to derive the output name.
        """
        base_name = os.path.splitext(original_filename)[0]
        output_filename = f"{base_name}_extracted.txt"

        # A key that is present but None (the model's "null", or the parse
        # fallback) must read "Not found", so a dict.get default is not enough.
        summary_lines = []
        for key, label in self.FIELD_LABELS.items():
            value = data.get(key)
            summary_lines.append(f"{label}: {'Not found' if value is None else value}")

        output_text = (
            "Amendment Details:\n"
            "----------------\n"
            + "\n".join(summary_lines)
            + "\n\n"
            + "Original Text:\n"
            + "-------------\n"
            + f"{data['original_text']}"
        )

        output_path = os.path.join(self.output_dir, output_filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(output_text)

    def process_all_files(self):
        """Process all text files in the input directory.

        Files are handled in sorted name order. A failure on one file is
        printed and does not stop the remaining files.
        """
        for filename in sorted(os.listdir(self.input_dir)):
            if not filename.endswith('.txt'):
                continue
            # Skip directories that merely have a .txt-like name.
            if not os.path.isfile(os.path.join(self.input_dir, filename)):
                continue
            try:
                print(f"Processing {filename}...")
                data = self.process_file(filename)
                self.save_extracted_info(data, filename)
                print(f"Successfully processed {filename}")
                # Add a small delay to prevent overwhelming the LLM
                time.sleep(1)
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

# Example run: extract details from every amendment text file
if __name__ == "__main__":
    # Folder holding the notification texts, and the folder that receives the results
    input_directory = "rohith_llm/Documents/Amendments"
    output_directory = "rohith_llm/Documents/Amendments/Extracted"

    # Change `model` to whichever model name the extractor should request
    extractor = LlamaExtractor(
        input_dir=input_directory,
        output_dir=output_directory,
        model="llama3.3:70b-instruct-q8_0",
    )
    extractor.process_all_files()

In [6]:
pip install requests

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.
