In [None]:
!pip install pandas requests

In [None]:
import pandas as pd
import json
import requests

In [None]:
def get_llm_response(prompt):
    """Send `prompt` to the local generate endpoint and return the generated text.

    The request is non-streaming, so the whole completion arrives in a single
    JSON body whose 'response' field holds the text.
    (Port 11434 is presumably a local Ollama server - confirm.)

    Args:
        prompt: Prompt text forwarded verbatim to the model.

    Returns:
        The value of the 'response' field of the JSON reply.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If the JSON reply has no 'response' field.
    """
    url = "http://localhost:11434/api/generate"

    data = {
        "model": "llama3.3:70b-instruct-q8_0",
        "prompt": prompt,
        "stream": False
    }

    response = requests.post(url, json=data)
    # Surface HTTP failures (unknown model, server error, ...) directly instead
    # of letting them show up later as an opaque KeyError on 'response'.
    response.raise_for_status()
    return response.json()['response']

In [None]:
!pip install requests
!pip install typing-extensions==4.7.1
!pip install pydantic==2.4.2
!pip install ollama

KSR Vol 1 - Chunk 2000, Overlap 300

KSR Vol 1 - Chunk 2000, Overlap 300

In [None]:
import json
import requests
import re
from tqdm import tqdm

def extract_section_info(text):
    """Pull the `Field:"value"` markers for each structural field out of `text`.

    Args:
        text: Text that may contain markers such as `Part:"I"` or
            `Rule no.:"12"`.

    Returns:
        A dict with one entry per known field name. Each value is the first
        quoted value found for that field, or None when the marker is absent.
    """
    patterns = {
        'Document': r'Document:"([^"]+)"',
        'Part': r'Part:"([^"]+)"',
        'Chapter': r'Chapter:"([^"]+)"',
        'Appendix': r'Appendix:"([^"]+)"',
        'Annexure': r'Annexure:"([^"]+)"',
        # The lookbehind stops this pattern from matching the tail of a
        # `Sub Section:"..."` marker and reporting its value as the Section.
        'Section': r'(?<!Sub )Section:"([^"]+)"',
        'Sub Section': r'Sub Section:"([^"]+)"',
        'Sub division': r'Sub division:"([^"]+)"',
        # Escaped dot: match the literal "Rule no." only, not "Rule no" + any char.
        'Rule no.': r'Rule no\.:"([^"]+)"'
    }

    result = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        result[key] = match.group(1) if match else None

    return result

def process_text_chunk(text):
    """Extract rule records from `text` using regular expressions only.

    The text is cut immediately before every `Rule no.:"` marker (the '.' in
    the split pattern is an unescaped regex dot). Each non-blank piece is
    passed to `extract_section_info`, and a `Description:"..."` value is added
    when one is found. Pieces in which no field was found are dropped.

    Returns:
        A list of field dicts, one per piece that yielded at least one value.
    """
    extracted = []

    for piece in re.split(r'(?=Rule no.:")', text):
        # Whitespace-only pieces carry no information.
        if not piece.strip():
            continue

        fields = extract_section_info(piece)

        description = re.search(r'Description:"([^"]+)"', piece)
        if description:
            fields['Description'] = description.group(1)

        # Keep the record only when at least one field has a truthy value.
        if not any(fields.values()):
            continue
        extracted.append(fields)

    return extracted

def chunk_text(text, max_length=2000, overlap=300):
    """Split `text` into fixed-size windows that overlap their predecessor.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per chunk.
        overlap: Number of characters each chunk shares with the previous one.

    Returns:
        A list of chunks covering the whole text. An empty text gives [].

    Raises:
        ValueError: If `max_length` is not positive or `overlap` is not
            smaller than `max_length` (the window could never advance).
    """
    if max_length <= 0 or overlap >= max_length:
        raise ValueError("max_length must be positive and overlap must be smaller than max_length")

    chunks = []
    start = 0

    while start < len(text):
        end = start + max_length
        chunks.append(text[start:end])
        # Stop once the window reaches the end of the text; stepping back by
        # `overlap` here would only emit a tail already contained in this chunk.
        if end >= len(text):
            break
        start = end - overlap

    return chunks

def process_with_llama(text):
    """Ask the LLM to turn a raw text block into structured rule records.

    The text is embedded in an instruction prompt and posted (non-streaming)
    to the local generate endpoint. The reply's 'response' field is expected
    to hold JSON, optionally wrapped in a ```json fenced block.

    Args:
        text: The text chunk to parse.

    Returns:
        The parsed JSON value (whatever shape the model produced), or None
        when the HTTP status is not 200 or the reply cannot be parsed.
    """
    prompt = """You are tasked with extracting structured data from a text block that contains information about rules or sections from a document, likely the Kerala Service Rules (KSR). Your goal is to parse this information and format it as a JSON list.

The JSON format for each entry should be as follows:
{
  "Document": "KSR",
  "Part": "",
  "Chapter": "",
  "Appendix": "",
  "Annexure": "",
  "Section": "",
  "Sub Section": "",
  "Sub division": "",
  "Rule no.": "",
  "Description": ""
}

Here is the text block you need to parse:

<text_block>
{?text}
</text_block>

To extract the required information:

1. Identify the document name (usually "KSR" unless specified otherwise).
2. Look for indicators of Part, Chapter, Appendix, Annexure, Section, Sub Section, Sub division, and Rule number. These may be explicitly stated or implied by the structure of the text.
3. The remaining text should be considered as the Description.
4. If any field is not present in the text, leave it as an empty string in the JSON.
5. If multiple rules or sections are present in the text block, create separate JSON objects for each.
6. Ensure that the Rule number is a string, even if it's a number.
7. For the Description, include all relevant text, including notes, government decisions, and examples if present.

Your output should be a valid JSON list containing one or more objects in the specified format. Do not include any additional text or explanations outside of the JSON structure.

<output>
</output>
""".replace("{?text}", text)

    response = requests.post('http://localhost:11434/api/generate',
                           json={
                               'model': 'llama3.3:70b-instruct-q8_0',
                               'prompt': prompt,
                               'stream': False
                           })

    if response.status_code != 200:
        # Report the failure instead of dropping the chunk without a trace.
        print(f"LLM request failed with HTTP status {response.status_code}")
        return None

    # Captured up front so the error handler never has to decode the body
    # again (a second decode could raise inside the handler itself).
    raw_text = None
    try:
        raw_text = response.json()['response']
        # Extract JSON content from markdown code block if present
        if '```json' in raw_text:
            json_str = raw_text.split('```json')[1].split('```')[0].strip()
        else:
            json_str = raw_text.strip()

        # Parse the JSON string
        return json.loads(json_str)
    except Exception as e:
        print(f"Error processing LLM response: {e}")
        shown = raw_text if raw_text is not None else response.text
        print(f"Response was: {shown}")
        return None

def main():
    """Extract rule records from both KSR volumes and save them as JSON.

    Each chunk is first handled by the regex extractor; only when that finds
    nothing is the chunk sent to the LLM. All records are collected and, when
    there is at least one, written to 'Full_KSR_extracted_rules.json'.

    Returns:
        The list of collected records (possibly empty).
    """
    # Both volumes are opened together and their contents concatenated.
    with open('Extracted/KSR_Vol_1_pdfminer_extracted.txt', 'r', encoding='utf-8') as vol1, \
         open('Extracted/KSR_Vol_2_pdfminer_extracted.txt', 'r', encoding='utf-8') as vol2:
        combined = vol1.read()
        combined += vol2.read()

    collected = []

    for piece in tqdm(chunk_text(combined), desc="Processing text chunks"):
        matched = process_text_chunk(piece)
        if matched:
            collected.extend(matched)
            continue

        # Regex extraction found nothing: fall back to the LLM.
        fallback = process_with_llama(piece)
        if not fallback:
            continue
        if isinstance(fallback, list):
            collected.extend(fallback)
        else:
            collected.append(fallback)

    print(f"Total results found: {len(collected)}")

    if not collected:
        print("No results were found to write to file")
        return collected

    with open('Full_KSR_extracted_rules.json', 'w', encoding='utf-8') as f:
        json.dump(collected, f, indent=4, ensure_ascii=False)
    print("Results successfully written to Full_KSR_extracted_rules.json")

    return collected

# Run the pipeline when this code is executed as the main module and bind the
# returned list of records to `result`.
if __name__ == "__main__":
    result = main()

In [2]:
import json
import requests
import re
from tqdm import tqdm

def load_examples(filename='Data_extraction_format.json'):
    """Read the few-shot example data from a UTF-8 JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The parsed JSON content of the file.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        examples = json.load(handle)
    return examples

def create_few_shot_prompt(examples, text_to_process):
    """Build the few-shot extraction prompt.

    The prompt consists of an introduction stating how many examples follow,
    each example rendered as indented JSON under an "Example N:" heading, and
    the closing instructions with `text_to_process` substituted in.

    Args:
        examples: Iterable of JSON-serialisable example records.
        text_to_process: The raw text the model should extract from.

    Returns:
        The complete prompt string.
    """
    examples = list(examples)

    # State the real number of examples rather than a hard-coded "20", so the
    # prompt stays truthful when the examples file changes size.
    prompt = (
        "You are tasked with extracting structured data from text blocks containing information about rules from Kerala Service Rules (KSR). Your goal is to parse this information and format it as a JSON object.\n"
        "\n"
        f"Here are {len(examples)} examples of correctly formatted extractions:\n"
        "\n"
    )
    # Add examples
    for idx, example in enumerate(examples, 1):
        example_str = json.dumps(example, indent=2)
        prompt += f"Example {idx}:\n{example_str}\n\n"

    # The placeholder is replaced only in this closing template, so example
    # content that happens to contain "{text}" is left untouched.
    prompt += """Now, using the same format as the examples above, extract the information from this text:

<text_block>
{text}
</text_block>

Important patterns to look for:
1. Document is usually "KSR"
2. Look for Part (I, II, III, etc.)
3. Look for Chapter references
4. Look for Appendix references
5. Look for Annexure references
6. Look for Section references
7. Look for Sub Section references
8. Look for Sub division references
9. Look for Rule numbers
10. Everything else goes into Description

Your output should match the format of the examples exactly. Only output valid JSON, with no additional text or explanations.
""".replace("{text}", text_to_process)

    return prompt

def process_text_chunk(text, examples):
    """Process a chunk of text using the LLM with few-shot examples.

    Args:
        text: The text chunk to extract from.
        examples: Example records passed to `create_few_shot_prompt`.

    Returns:
        A list of parsed JSON values (a single value is wrapped in a list),
        or None when the HTTP status is not 200 or the reply cannot be parsed.
    """
    prompt = create_few_shot_prompt(examples, text)

    response = requests.post('http://localhost:11434/api/generate',
                           json={
                               'model': 'llama3.3:70b-instruct-q8_0',
                               'prompt': prompt,
                               'stream': False
                           })

    if response.status_code != 200:
        return None

    # Captured up front so the error handler never has to decode the body
    # again (a second decode could raise inside the handler itself).
    raw_text = None
    try:
        raw_text = response.json()['response']
        # Clean up the response text
        response_text = raw_text.strip()

        # Handle markdown code blocks
        if '```' in response_text:
            # Extract content between fences, with or without a "json" tag.
            # Whitespace around the payload is optional; when no fenced block
            # can be isolated the text is kept as-is rather than becoming None.
            fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', response_text, re.DOTALL)
            if fenced:
                response_text = fenced.group(1)

        # Remove any leading/trailing whitespace
        json_str = response_text.strip()

        try:
            # Parse the JSON string
            result = json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
            print(f"Attempted to parse: {json_str}")
            return None
        return result if isinstance(result, list) else [result]
    except Exception as e:
        print(f"Error processing LLM response: {e}")
        shown = raw_text if raw_text is not None else response.text
        print(f"Response was: {shown}")
        return None

def chunk_text(text, max_length=2000, overlap=300):
    """Split text into overlapping chunks, preferring to cut at a rule marker.

    Each chunk is at least `max_length` characters long (except the last) and
    is extended up to the next 'Rule no.:' marker when one exists after the
    nominal cut point. The following chunk starts `overlap` characters before
    the cut.

    Args:
        text: The string to split.
        max_length: Minimum window size before looking for a rule marker.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of chunks covering the whole text. An empty text gives [].

    Raises:
        ValueError: If `max_length` is not positive or `overlap` is not
            smaller than `max_length` (the window could never advance).
    """
    if max_length <= 0 or overlap >= max_length:
        raise ValueError("max_length must be positive and overlap must be smaller than max_length")

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        # Find the end of the current chunk
        end = start + max_length
        if end >= text_length:
            chunks.append(text[start:])
            break

        # Try to find a good breaking point (start of the next rule).
        # Searching from `end` avoids copying the rest of the text each time.
        next_rule = text.find('Rule no.:', end)
        if next_rule != -1:
            end = next_rule

        chunks.append(text[start:end])
        start = end - overlap

    return chunks

def main():
    """Run few-shot LLM extraction on the first five chunks of the KSR text.

    Loads the example records, concatenates both volume files, chunks the
    text, sends each selected chunk to the LLM, and writes the collected
    records to 'KSR_extracted_rules.json' when there is at least one.

    Returns:
        The list of collected records (possibly empty).
    """
    few_shot_examples = load_examples()
    print(f"Loaded {len(few_shot_examples)} examples for few-shot learning")

    # Both volumes are opened together and their contents concatenated.
    with open('Extracted/KSR_Vol_1_pdfminer_extracted.txt', 'r', encoding='utf-8') as vol1, \
         open('Extracted/KSR_Vol_2_pdfminer_extracted.txt', 'r', encoding='utf-8') as vol2:
        combined = vol1.read()
        combined += vol2.read()

    # Only the first five chunks are processed here.
    selected_chunks = chunk_text(combined)[:5]
    collected = []

    for piece in tqdm(selected_chunks, desc="Processing text chunks"):
        extracted = process_text_chunk(piece, few_shot_examples)
        if not extracted:
            continue
        collected.extend(extracted)

    print(f"Total results found: {len(collected)}")

    if not collected:
        print("No results were found to write to file")
        return collected

    with open('KSR_extracted_rules.json', 'w', encoding='utf-8') as f:
        json.dump(collected, f, indent=4, ensure_ascii=False)
    print("Results successfully written to KSR_extracted_rules.json")

    return collected

# Run the pipeline when this code is executed as the main module and bind the
# returned list of records to `result`.
if __name__ == "__main__":
    result = main()

Loaded 20 examples for few-shot learning


Processing text chunks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:48<00:00, 21.76s/it]

Total results found: 7
Results successfully written to KSR_extracted_rules.json





In [1]:
import json
import requests
import re
from tqdm import tqdm

def load_examples(filename='Data_extraction_format.json'):
    """Return the parsed contents of the few-shot examples JSON file.

    Args:
        filename: Path of the UTF-8 encoded JSON file to read.

    Returns:
        Whatever JSON value the file contains.
    """
    with open(filename, 'r', encoding='utf-8') as source:
        loaded = json.load(source)
    return loaded

def normalize_rule_number(rule_text):
    """Return the rule number as a string with surrounding whitespace trimmed.

    Any falsy input (None, "", 0, ...) yields an empty string. Whitespace
    inside the value is left untouched; only the ends are stripped.
    """
    return str(rule_text).strip() if rule_text else ""

def create_few_shot_prompt(examples, text_to_process):
    """Assemble the extraction prompt: instructions, numbered examples, then the text.

    Args:
        examples: Iterable of JSON-serialisable example records; each is
            rendered as indented JSON under an "Example N:" heading.
        text_to_process: Text substituted for the placeholder in the closing
            instructions.

    Returns:
        The complete prompt string.
    """
    instructions = """You are tasked with extracting structured information from Kerala Service Rules (KSR) text into a consistent JSON format.

Key requirements:
1. Always use these exact field names:
   - Document
   - Part
   - Chapter
   - Appendix
   - Annexure
   - Section
   - Sub Section
   - Sub division
   - Rule no.
   - Description

2. Always include all fields, using empty strings ("") for missing values, not null.
3. The "Document" field should always be "KSR" if present in the text.
4. Rule numbers should be strings, even if they're numbers.
5. Keep the format exactly consistent with these examples.

Here are examples of correctly formatted extractions:

"""
    closing_template = """Now, extract the information from this text using the exact same format as the examples:

<text>
{text}
</text>

Remember:
- Use empty strings ("") for missing fields
- Include ALL fields mentioned above
- Keep rule numbers as strings
- Follow the exact format of the examples
- Put all other text in the Description field

Output only the JSON object, no other text."""

    # Render every example as its own numbered block.
    example_blocks = [
        f"Example {number}:\n{json.dumps(record, indent=2)}\n\n"
        for number, record in enumerate(examples, 1)
    ]

    # The placeholder is substituted in the closing template only, so the
    # instructions and the examples are never rewritten.
    closing = closing_template.replace("{text}", text_to_process)

    return instructions + "".join(example_blocks) + closing

def _description_text(value):
    """Flatten a description-like value to text; unsupported or blank values give ""."""
    if isinstance(value, str):
        # Returned unmodified so plain-string input behaves exactly as before.
        return value if value.strip() else ""
    if isinstance(value, (list, tuple)):
        pieces = [_description_text(item) for item in value]
        return "\n".join(piece for piece in pieces if piece)
    return ""


def standardize_output(result):
    """Standardize the JSON output format.

    Maps a record produced by the LLM onto the fixed set of output fields:

    * known aliases of "Rule no." are renamed;
    * keys starting with "Description", plus "Provided" and "Model_Form",
      are merged into "Description" (paragraphs separated by a blank line,
      with "Provided that " / "Note: " prefixes for the latter two);
    * unknown keys are ignored;
    * None becomes "", list values are joined with ", ", and a dict given for
      "Rule no." contributes its "Number" entry.

    Args:
        result: One parsed JSON value from the model.

    Returns:
        A dict holding exactly the standard fields with string values, or
        None when `result` is not a dict.
    """
    if not isinstance(result, dict):
        return None

    # Standard fields that should always be present
    standard_fields = {
        "Document": "KSR",
        "Part": "",
        "Chapter": "",
        "Appendix": "",
        "Annexure": "",
        "Section": "",
        "Sub Section": "",
        "Sub division": "",
        "Rule no.": "",
        "Description": ""
    }

    # Create a new standardized dictionary
    standardized = standard_fields.copy()

    # Map common field variations
    field_mappings = {
        "Rule": "Rule no.",
        "rule_no": "Rule no.",
        "rule_number": "Rule no.",
    }

    # Collect all description-related fields
    description_parts = []

    for key, value in result.items():
        # Map the field name if it's a known variation
        standard_key = field_mappings.get(key, key)

        # Handle description-related fields
        if key.startswith('Description') or key == 'Provided' or key == 'Model_Form':
            # Lists of paragraphs are flattened instead of being discarded.
            text = _description_text(value)
            if text:
                if key == 'Provided':
                    description_parts.append(f"Provided that {text}")
                elif key == 'Model_Form':
                    description_parts.append(f"Note: {text}")
                else:
                    description_parts.append(text)
            continue

        # Skip unknown fields that aren't description-related
        if standard_key not in standard_fields:
            continue

        # Handle the value
        if value is None:
            standardized[standard_key] = ""
        elif isinstance(value, list):
            # A list is joined for every field, not only for "Rule no.",
            # so multi-valued fields are no longer silently dropped.
            standardized[standard_key] = ", ".join(str(v) for v in value)
        elif isinstance(value, dict):
            # Only a structured rule number has a known shape to unpack;
            # other dict values leave the field at its default.
            if standard_key == "Rule no.":
                standardized[standard_key] = str(value.get("Number", ""))
        else:
            standardized[standard_key] = str(value).strip()

    # "Description" is only ever filled from the collected parts, because every
    # description-like key is routed to `description_parts` above.
    if description_parts:
        standardized["Description"] = "\n\n".join(description_parts)

    return standardized

def _decode_json_values(text):
    """Decode every top-level JSON value in `text`, in order of appearance.

    Handles a single object, a JSON array, and several objects written one
    after another (optionally separated by whitespace or commas). A value
    that cannot be decoded is reported, and decoding resumes at the next
    `}{` object boundary when there is one.
    """
    decoder = json.JSONDecoder()
    boundary = re.compile(r'}\s*{')
    values = []
    position = 0

    while position < len(text):
        # raw_decode does not skip leading whitespace, so separators between
        # values are stepped over by hand.
        if text[position].isspace() or text[position] == ',':
            position += 1
            continue
        try:
            value, position = decoder.raw_decode(text, position)
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON object {len(values) + 1}: {e}")
            print(f"Object text: {text[position:]}")
            resume = boundary.search(text, position)
            if resume is None:
                break
            # Continue at the opening brace of the next object.
            position = resume.end() - 1
            continue
        values.append(value)

    return values


def process_text_chunk(text, examples):
    """Process a chunk of text using the LLM with few-shot examples.

    Args:
        text: The text chunk to extract from.
        examples: Example records passed to `create_few_shot_prompt`.

    Returns:
        A list of records normalised by `standardize_output` (possibly
        empty), or None when the HTTP status is not 200 or the reply cannot
        be processed.
    """
    prompt = create_few_shot_prompt(examples, text)

    response = requests.post('http://localhost:11434/api/generate',
                           json={
                               'model': 'qwen2.5:72b-instruct-q8_0',
                               'prompt': prompt,
                               'stream': False
                           })

    if response.status_code != 200:
        return None

    # Captured up front so the error handler always has something to print,
    # even when decoding the body is what failed.
    raw_text = None
    try:
        raw_text = response.json()['response']
        # Clean up the response text
        response_text = raw_text.strip()

        # Handle markdown code blocks
        if '```' in response_text:
            # Extract content between fences, with or without a "json" tag.
            # When no fenced block can be isolated the text is kept as-is
            # rather than becoming None.
            fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', response_text, re.DOTALL)
            if fenced:
                response_text = fenced.group(1)

        # Remove any leading/trailing whitespace
        response_text = response_text.strip()

        # The reply may hold one object, several objects back to back, or a
        # JSON array of objects; arrays are flattened so each element is
        # standardized as its own record.
        records = []
        for value in _decode_json_values(response_text):
            if isinstance(value, list):
                records.extend(value)
            else:
                records.append(value)

        # Standardize each result
        standardized_results = []
        for item in records:
            standardized = standardize_output(item)
            if standardized:
                standardized_results.append(standardized)

        return standardized_results

    except Exception as e:
        print(f"Error processing LLM response: {e}")
        shown = raw_text if raw_text is not None else response.text
        print(f"Response was: {shown}")
        return None

def chunk_text(text, max_length=2000, overlap=300):
    """Split text into overlapping chunks, preferring to cut at a rule marker.

    Each chunk is at least `max_length` characters long (except the last) and
    is extended up to the next 'Rule no.:' marker when one exists after the
    nominal cut point. The following chunk starts `overlap` characters before
    the cut.

    Args:
        text: The string to split.
        max_length: Minimum window size before looking for a rule marker.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of chunks covering the whole text. An empty text gives [].

    Raises:
        ValueError: If `max_length` is not positive or `overlap` is not
            smaller than `max_length` (the window could never advance).
    """
    if max_length <= 0 or overlap >= max_length:
        raise ValueError("max_length must be positive and overlap must be smaller than max_length")

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        # Find the end of the current chunk
        end = start + max_length
        if end >= text_length:
            chunks.append(text[start:])
            break

        # Try to find a good breaking point (start of the next rule).
        # Searching from `end` avoids copying the rest of the text each time.
        next_rule = text.find('Rule no.:', end)
        if next_rule != -1:
            end = next_rule

        chunks.append(text[start:end])
        start = end - overlap

    return chunks

def main():
    """Run few-shot LLM extraction on chunks 55-60 of the combined KSR text.

    Loads the example records, concatenates both volume files, chunks the
    text, sends each selected chunk to the LLM, and writes the collected
    records to 'KSR_extracted_rules.json' when there is at least one.

    Returns:
        The list of collected records (possibly empty).
    """
    few_shot_examples = load_examples()
    print(f"Loaded {len(few_shot_examples)} examples for few-shot learning")

    # Both volumes are opened together and their contents concatenated.
    with open('Extracted/KSR_Vol_1_pdfminer_extracted.txt', 'r', encoding='utf-8') as vol1, \
         open('Extracted/KSR_Vol_2_pdfminer_extracted.txt', 'r', encoding='utf-8') as vol2:
        combined = vol1.read()
        combined += vol2.read()

    # Only the slice [55:61] of the chunk list is processed here.
    selected_chunks = chunk_text(combined)[55:61]
    collected = []

    for piece in tqdm(selected_chunks, desc="Processing text chunks"):
        extracted = process_text_chunk(piece, few_shot_examples)
        if not extracted:
            continue
        collected.extend(extracted)

    print(f"Total results found: {len(collected)}")

    if not collected:
        print("No results were found to write to file")
        return collected

    with open('KSR_extracted_rules.json', 'w', encoding='utf-8') as f:
        json.dump(collected, f, indent=4, ensure_ascii=False)
    print("Results successfully written to KSR_extracted_rules.json")

    return collected

# Run the pipeline when this code is executed as the main module and bind the
# returned list of records to `result`.
if __name__ == "__main__":
    result = main()

Loaded 20 examples for few-shot learning


Processing text chunks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [05:44<00:00, 57.44s/it]

Total results found: 6
Results successfully written to KSR_extracted_rules.json



