In [None]:
!pip install pandas requests

In [None]:
import pandas as pd
import json
import requests

In [None]:
def get_llm_response(prompt, timeout=600):
    """Send ``prompt`` to the local Ollama generate endpoint and return the reply text.

    Args:
        prompt: Prompt string forwarded verbatim to the model.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled server blocks the notebook forever.

    Returns:
        The value stored under ``'response'`` in the JSON body of the reply.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    url = "http://localhost:11434/api/generate"

    data = {
        "model": "llama3.3:70b-instruct-q8_0",
        "prompt": prompt,
        "stream": False
    }

    response = requests.post(url, json=data, timeout=timeout)
    # Surface HTTP errors directly instead of a confusing KeyError on an
    # error payload that has no 'response' field.
    response.raise_for_status()
    return response.json()['response']

In [None]:
!pip install requests
!pip install typing-extensions==4.7.1
!pip install pydantic==2.4.2
!pip install ollama

KSR Vol 1 - Chunk 2000, Overlap 300

KSR Vol 1 - Chunk 2000, Overlap 300

KSR — Kerala Service Rules

In [None]:
import json
import requests
import re
from tqdm import tqdm

def extract_section_info(text):
    """Pull the labelled header fields (``Label:"value"``) out of ``text``.

    Each field is located with ``re.search``, so only the first occurrence of
    a label is used.

    Args:
        text: Text fragment to scan.

    Returns:
        dict with the keys 'Document', 'Part', 'Chapter', 'Appendix',
        'Annexure', 'Section', 'Sub Section', 'Sub division' and 'Rule no.'.
        A field that does not occur in ``text`` maps to ``None``.
    """
    patterns = {
        'Document': r'Document:"([^"]+)"',
        'Part': r'Part:"([^"]+)"',
        'Chapter': r'Chapter:"([^"]+)"',
        'Appendix': r'Appendix:"([^"]+)"',
        'Annexure': r'Annexure:"([^"]+)"',
        # The lookbehind stops 'Sub Section:"..."' from being reported as the
        # Section as well.
        'Section': r'(?<!Sub )Section:"([^"]+)"',
        'Sub Section': r'Sub Section:"([^"]+)"',
        'Sub division': r'Sub division:"([^"]+)"',
        # Escaped dot: match the literal label 'Rule no.' only.
        'Rule no.': r'Rule no\.:"([^"]+)"'
    }

    result = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        result[key] = match.group(1) if match else None

    return result

def process_text_chunk(text):
    """Split ``text`` at every ``Rule no.:"`` marker and extract the fields of each piece.

    Args:
        text: Chunk of text to parse.

    Returns:
        A list with one dict per piece in which at least one field was found.
        Each dict holds the keys produced by ``extract_section_info`` plus
        'Description'. The list is empty when nothing matched.
    """
    # Zero-width split: every piece after the first starts at a 'Rule no.:"'
    # marker. The dot is escaped so only the literal label splits the text.
    sections = re.split(r'(?=Rule no\.:")', text)

    results = []
    for section in sections:
        if not section.strip():
            continue

        section_info = extract_section_info(section)

        # The description is the quoted value after the 'Description:' label.
        # The key is always set so every record has the same shape.
        desc_match = re.search(r'Description:"([^"]+)"', section)
        section_info['Description'] = desc_match.group(1) if desc_match else None

        # Keep the piece only if at least one field was actually found.
        if any(section_info.values()):
            results.append(section_info)

    return results

def chunk_text(text, max_length=2000, overlap=300):
    """Cut ``text`` into windows of ``max_length`` characters that overlap.

    Consecutive chunks share ``overlap`` characters. Chunking stops as soon as
    a chunk reaches the end of the text, so no trailing chunk is emitted that
    only repeats the tail of the previous one.

    Args:
        text: String to split.
        max_length: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks.

    Returns:
        List of chunk strings in document order; empty for empty ``text``.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``max_length`` (the window would never advance).
    """
    if overlap < 0 or overlap >= max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    chunks = []
    step = max_length - overlap
    start = 0

    while start < len(text):
        end = start + max_length
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk already contains the end of the text.
            break
        start += step

    return chunks

def process_with_llama(text, timeout=600):
    """Ask the local Ollama model to turn a KSR text block into JSON records.

    Args:
        text: Text block substituted for ``{?text}`` in the prompt.
        timeout: Seconds to wait for the HTTP call before giving up.

    Returns:
        The object parsed from the model's JSON answer, or ``None`` when the
        request fails, the status is not 200, or the answer cannot be parsed.
        Failures are printed, never raised, so a long run is not aborted by a
        single bad chunk.
    """
    prompt = """You are tasked with extracting structured data from a text block that contains information about rules or sections from a document, likely the Kerala Service Rules (KSR). Your goal is to parse this information and format it as a JSON list.

The JSON format for each entry should be as follows:
{
  "Document": "KSR",
  "Part": "",
  "Chapter": "",
  "Appendix": "",
  "Annexure": "",
  "Section": "",
  "Sub Section": "",
  "Sub division": "",
  "Rule no.": "",
  "Description": ""
}

Here is the text block you need to parse:

<text_block>
{?text}
</text_block>

To extract the required information:

1. Identify the document name (usually "KSR" unless specified otherwise).
2. Look for indicators of Part, Chapter, Appendix, Annexure, Section, Sub Section, Sub division, and Rule number. These may be explicitly stated or implied by the structure of the text.
3. The remaining text should be considered as the Description.
4. If any field is not present in the text, leave it as an empty string in the JSON.
5. If multiple rules or sections are present in the text block, create separate JSON objects for each.
6. Ensure that the Rule number is a string, even if it's a number.
7. For the Description, include all relevant text, including notes, government decisions, and examples if present.
8. Do not discard any part of the text block. If you cannot classify any portion of the text according to the specified fields (Part, Chapter, Appendix, Annexure, Section, Sub Section, Sub division, Rule number), include it in the Description field and populate its remaining attributes (Part, Chapter, Appendix, Annexure, Section, Sub Section, Sub division, Rule number) with the corresponding attributes from the previous JSON object, because these fields are mandatory. Do not put random stuff in these field, always put the correct ones. 
9. Format the Rule no. field exactly as shown in the template above.

Your output should be a valid JSON list containing one or more objects in the specified format. Do not include any additional text or explanations outside of the JSON structure.

<output>
</output>
""".replace("{?text}", text)

    try:
        response = requests.post('http://localhost:11434/api/generate',
                                 json={
                                     'model': 'llama3.3:70b-instruct-fp16',
                                     'prompt': prompt,
                                     'stream': False
                                 },
                                 timeout=timeout)
    except requests.RequestException as e:
        print(f"LLM request failed: {e}")
        return None

    if response.status_code != 200:
        print(f"API request failed with status code: {response.status_code}")
        return None

    raw = None
    try:
        raw = response.json()['response']
        # The model often wraps its answer in a markdown code fence, with or
        # without the 'json' language tag.
        if '```json' in raw:
            json_str = raw.split('```json')[1].split('```')[0]
        elif '```' in raw:
            json_str = raw.split('```')[1]
        else:
            json_str = raw

        return json.loads(json_str.strip())
    except Exception as e:
        print(f"Error processing LLM response: {e}")
        # Do not call response.json() again here: if it was the call that
        # failed, it would raise a second time inside the handler.
        print(f"Response was: {raw if raw is not None else response.text}")
        return None

def main(input_paths=('Extracted/KSR_Vol_1_pdfminer_extracted.txt',
                      'Extracted/KSR_Vol_2_pdfminer_extracted.txt'),
         output_path='FP_Full_KSR_extracted_rules2.json'):
    """Extract rule records from the KSR text dumps and save them as JSON.

    The input files are read and concatenated in order, cut into chunks with
    ``chunk_text``, and each chunk is parsed with ``process_text_chunk``.
    Chunks for which pattern matching finds nothing are sent to
    ``process_with_llama`` instead.

    Args:
        input_paths: Paths of the extracted text files, read as UTF-8.
        output_path: JSON file written when at least one record was found.

    Returns:
        The list of all extracted records.
    """
    parts = []
    for path in input_paths:
        with open(path, 'r', encoding='utf-8') as file:
            parts.append(file.read())
    text = ''.join(parts)

    chunks = chunk_text(text)
    all_results = []

    for chunk in tqdm(chunks, desc="Processing text chunks"):
        # Try direct pattern matching first
        results = process_text_chunk(chunk)

        # If pattern matching fails, use LLM
        if not results:
            llm_result = process_with_llama(chunk)
            if llm_result:
                all_results.extend(llm_result if isinstance(llm_result, list) else [llm_result])
        else:
            all_results.extend(results)

    print(f"Total results found: {len(all_results)}")

    if all_results:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(all_results, f, indent=4, ensure_ascii=False)
        # Report the file that was actually written.
        print(f"Results successfully written to {output_path}")
    else:
        print("No results were found to write to file")

    return all_results

if __name__ == "__main__":
    result = main()

Processing text chunks:   0%|▏                                                                                                                 | 1/659 [00:44<8:05:20, 44.26s/it]

KFC — Kerala Financial Code, Vol. I

In [None]:
import json
import requests
import re
from tqdm import tqdm

def extract_section_info(text):
    """Pull the labelled header fields (``Label:"value"``) out of ``text``.

    Each field is located with ``re.search``, so only the first occurrence of
    a label is used.

    Args:
        text: Text fragment to scan.

    Returns:
        dict with the keys 'Document', 'Chapter', 'Appendix', 'Annexure' and
        'Rule no.'. A field that does not occur in ``text`` maps to ``None``.
    """
    patterns = {
        'Document': r'Document:"([^"]+)"',
        'Chapter': r'Chapter:"([^"]+)"',
        'Appendix': r'Appendix:"([^"]+)"',
        'Annexure': r'Annexure:"([^"]+)"',
        # Escaped dot: match the literal label 'Rule no.' only.
        'Rule no.': r'Rule no\.:"([^"]+)"'
    }

    result = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        result[key] = match.group(1) if match else None

    return result

def process_text_chunk(text):
    """Split ``text`` at every ``Rule no.:"`` marker and extract the fields of each piece.

    Args:
        text: Chunk of text to parse.

    Returns:
        A list with one dict per piece in which at least one field was found.
        Each dict holds the keys produced by ``extract_section_info`` plus
        'Description'. The list is empty when nothing matched.
    """
    # Zero-width split: every piece after the first starts at a 'Rule no.:"'
    # marker. The dot is escaped so only the literal label splits the text.
    sections = re.split(r'(?=Rule no\.:")', text)

    results = []
    for section in sections:
        if not section.strip():
            continue

        section_info = extract_section_info(section)

        # The description is the quoted value after the 'Description:' label.
        # The key is always set so every record has the same shape.
        desc_match = re.search(r'Description:"([^"]+)"', section)
        section_info['Description'] = desc_match.group(1) if desc_match else None

        # Keep the piece only if at least one field was actually found.
        if any(section_info.values()):
            results.append(section_info)

    return results

def chunk_text(text, max_length=2000, overlap=300):
    """Cut ``text`` into windows of ``max_length`` characters that overlap.

    Consecutive chunks share ``overlap`` characters. Chunking stops as soon as
    a chunk reaches the end of the text, so no trailing chunk is emitted that
    only repeats the tail of the previous one.

    Args:
        text: String to split.
        max_length: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks.

    Returns:
        List of chunk strings in document order; empty for empty ``text``.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``max_length`` (the window would never advance).
    """
    if overlap < 0 or overlap >= max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    chunks = []
    step = max_length - overlap
    start = 0

    while start < len(text):
        end = start + max_length
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk already contains the end of the text.
            break
        start += step

    return chunks

def process_with_llama(text, timeout=600):
    """Ask the local Ollama model to turn a KFC text block into JSON records.

    Args:
        text: Text block substituted for ``{?text}`` in the prompt.
        timeout: Seconds to wait for the HTTP call before giving up.

    Returns:
        The object parsed from the model's JSON answer, or ``None`` when the
        request fails, the status is not 200, or the answer cannot be parsed.
        Failures are printed, never raised, so a long run is not aborted by a
        single bad chunk.
    """
    prompt = """You are tasked with extracting structured data from a text block that contains information about rules from a document, which is Kerala Financial Code Vol I. Your goal is to parse this information and format it as a JSON list.

The JSON format for each entry should be as follows:
{
  "Document": "KFC",
  "Chapter": "",
  "Appendix": "",
  "Annexure": "",
  "Rule no.": "",
  "Description": ""
}

Here is the text block you need to parse:

<text_block>
{?text}
</text_block>

To extract the required information:

1. Identify the document name (usually "KFC" unless specified otherwise).
2. Look for indicators of Chapter, Appendix, Annexure, and Rule number. These may be explicitly stated or implied by the structure of the text.
3. The remaining text should be considered as the Description.
4. If any field is not present in the text, leave it as an empty string in the JSON.
5. If multiple rules or sections are present in the text block, create separate JSON objects for each.
6. Ensure that the Rule number is a string, even if it's a number.
7. For the Description, include all relevant text, including notes, government decisions, and examples if present.
8. Do not discard any part of the text block. If you cannot classify any portion of the text according to the specified fields (Chapter, Appendix, Annexure, Rule no.), include it in the Description field.
9. Format the Rule no. field exactly as shown in the template above.

Your output should be a valid JSON list containing one or more objects in the specified format. Do not include any additional text or explanations outside of the JSON structure.

<output>
</output>
""".replace("{?text}", text)

    try:
        response = requests.post('http://localhost:11434/api/generate',
                                 json={
                                     'model': 'llama3.3:70b-instruct-fp16',
                                     'prompt': prompt,
                                     'stream': False
                                 },
                                 timeout=timeout)
    except requests.RequestException as e:
        print(f"LLM request failed: {e}")
        return None

    if response.status_code != 200:
        print(f"API request failed with status code: {response.status_code}")
        return None

    raw = None
    try:
        raw = response.json()['response']
        # The model often wraps its answer in a markdown code fence, with or
        # without the 'json' language tag.
        if '```json' in raw:
            json_str = raw.split('```json')[1].split('```')[0]
        elif '```' in raw:
            json_str = raw.split('```')[1]
        else:
            json_str = raw

        return json.loads(json_str.strip())
    except Exception as e:
        print(f"Error processing LLM response: {e}")
        # Do not call response.json() again here: if it was the call that
        # failed, it would raise a second time inside the handler.
        print(f"Response was: {raw if raw is not None else response.text}")
        return None

def main(input_path='/workspace/rohith_llm/Extracted/KFC1_pdfminer_extracted.txt',
         output_path='FP_Full_KFC_extracted_rules.json'):
    """Extract rule records from the KFC text dump and save them as JSON.

    The input is cut into chunks with ``chunk_text`` and each chunk is parsed
    with ``process_text_chunk``. Chunks for which pattern matching finds
    nothing are sent to ``process_with_llama`` instead.

    Args:
        input_path: Path of the extracted text file, read as UTF-8.
        output_path: JSON file written when at least one record was found.

    Returns:
        The list of all extracted records.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        text = file.read()

    chunks = chunk_text(text)
    all_results = []

    for chunk in tqdm(chunks, desc="Processing text chunks"):
        # Try direct pattern matching first
        results = process_text_chunk(chunk)

        # If pattern matching fails, use LLM
        if not results:
            llm_result = process_with_llama(chunk)
            if llm_result:
                all_results.extend(llm_result if isinstance(llm_result, list) else [llm_result])
        else:
            all_results.extend(results)

    print(f"Total results found: {len(all_results)}")

    if all_results:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(all_results, f, indent=4, ensure_ascii=False)
        # Report the file that was actually written.
        print(f"Results successfully written to {output_path}")
    else:
        print("No results were found to write to file")

    return all_results

if __name__ == "__main__":
    result = main()

KTC — Kerala Treasury Code

In [None]:
import json
import requests
import re
from tqdm import tqdm

def extract_section_info(text):
    """Pull the labelled header fields (``Label:"value"``) out of ``text``.

    Each field is located with ``re.search``, so only the first occurrence of
    a label is used.

    Args:
        text: Text fragment to scan.

    Returns:
        dict with the keys 'Document', 'Part', 'Section' and 'Rule no.'.
        A field that does not occur in ``text`` maps to ``None``.
    """
    patterns = {
        'Document': r'Document:"([^"]+)"',
        'Part': r'Part:"([^"]+)"',
        'Section': r'Section:"([^"]+)"',
        # Escaped dot: match the literal label 'Rule no.' only.
        'Rule no.': r'Rule no\.:"([^"]+)"'
    }

    result = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        result[key] = match.group(1) if match else None

    return result

def process_text_chunk(text):
    """Split ``text`` at every ``Rule no.:"`` marker and extract the fields of each piece.

    Args:
        text: Chunk of text to parse.

    Returns:
        A list with one dict per piece in which at least one field was found.
        Each dict holds the keys produced by ``extract_section_info`` plus
        'Description'. The list is empty when nothing matched.
    """
    # Zero-width split: every piece after the first starts at a 'Rule no.:"'
    # marker. The dot is escaped so only the literal label splits the text.
    sections = re.split(r'(?=Rule no\.:")', text)

    results = []
    for section in sections:
        if not section.strip():
            continue

        section_info = extract_section_info(section)

        # The description is the quoted value after the 'Description:' label.
        # The key is always set so every record has the same shape.
        desc_match = re.search(r'Description:"([^"]+)"', section)
        section_info['Description'] = desc_match.group(1) if desc_match else None

        # Keep the piece only if at least one field was actually found.
        if any(section_info.values()):
            results.append(section_info)

    return results

def chunk_text(text, max_length=2000, overlap=300):
    """Cut ``text`` into windows of ``max_length`` characters that overlap.

    Consecutive chunks share ``overlap`` characters. Chunking stops as soon as
    a chunk reaches the end of the text, so no trailing chunk is emitted that
    only repeats the tail of the previous one.

    Args:
        text: String to split.
        max_length: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks.

    Returns:
        List of chunk strings in document order; empty for empty ``text``.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``max_length`` (the window would never advance).
    """
    if overlap < 0 or overlap >= max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    chunks = []
    step = max_length - overlap
    start = 0

    while start < len(text):
        end = start + max_length
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk already contains the end of the text.
            break
        start += step

    return chunks

def process_with_llama(text, timeout=600):
    """Ask the local Ollama model to turn a KTC text block into JSON records.

    Args:
        text: Text block substituted for ``{?text}`` in the prompt.
        timeout: Seconds to wait for the HTTP call before giving up.

    Returns:
        The object parsed from the model's JSON answer, or ``None`` when the
        request fails, the status is not 200, or the answer cannot be parsed.
        Failures are printed, never raised, so a long run is not aborted by a
        single bad chunk.
    """
    # The prompt names the Kerala Treasury Code; it previously carried the
    # KFC wording, which contradicted the "KTC" document field below.
    prompt = """You are tasked with extracting structured data from a text block that contains information about rules from a document, which is Kerala Treasury Code (KTC). Your goal is to parse this information and format it as a JSON list.

The JSON format for each entry should be as follows:
{
  "Document": "KTC",
  "Part": "", #always present
  "Section": "", #present most of the time
  "Rule no.": "", #always present
  "Description": ""
}

Here is the text block you need to parse:

<text_block>
{?text}
</text_block>

To extract the required information:

1. Identify the document name (usually "KTC" unless specified otherwise).
2. Look for indicators of Part, Section, and Rule number. These may be explicitly stated or implied by the structure of the text.
3. The remaining text should be considered as the Description.
4. If any field is not present in the text, leave it as an empty string in the JSON.
5. If multiple rules or sections are present in the text block, create separate JSON objects for each.
6. Ensure that the Rule number is a string, even if it's a number.
7. For the Description, include all relevant text, including notes, government decisions, and examples if present.
8. Do not discard any part of the text block. If you cannot classify any portion of the text according to the specified fields (Part, Section, Rule number), include it in the Description field and populate its remaining attributes (Part, Section, and Rule number) with the corresponding attributes from the previous JSON object, because these fields are mandatory. Do not put random stuff in these field, always put the correct ones. 
9. Format the Rule no. field exactly as shown in the template above.

Your output should be a valid JSON list containing one or more objects in the specified format. Do not include any additional text or explanations outside of the JSON structure.

<output>
</output>
""".replace("{?text}", text)

    try:
        response = requests.post('http://localhost:11434/api/generate',
                                 json={
                                     'model': 'llama3.3:70b-instruct-fp16',
                                     'prompt': prompt,
                                     'stream': False
                                 },
                                 timeout=timeout)
    except requests.RequestException as e:
        print(f"LLM request failed: {e}")
        return None

    if response.status_code != 200:
        print(f"API request failed with status code: {response.status_code}")
        return None

    raw = None
    try:
        raw = response.json()['response']
        # The model often wraps its answer in a markdown code fence, with or
        # without the 'json' language tag.
        if '```json' in raw:
            json_str = raw.split('```json')[1].split('```')[0]
        elif '```' in raw:
            json_str = raw.split('```')[1]
        else:
            json_str = raw

        return json.loads(json_str.strip())
    except Exception as e:
        print(f"Error processing LLM response: {e}")
        # Do not call response.json() again here: if it was the call that
        # failed, it would raise a second time inside the handler.
        print(f"Response was: {raw if raw is not None else response.text}")
        return None

def main(input_path='/workspace/rohith_llm/Extracted/KTC_pdfminer_extracted.txt',
         output_path='FP_Full_KTC_extracted_rules.json'):
    """Extract rule records from the KTC text dump and save them as JSON.

    The input is cut into chunks with ``chunk_text`` and each chunk is parsed
    with ``process_text_chunk``. Chunks for which pattern matching finds
    nothing are sent to ``process_with_llama`` instead.

    Args:
        input_path: Path of the extracted text file, read as UTF-8.
        output_path: JSON file written when at least one record was found.

    Returns:
        The list of all extracted records.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        text = file.read()

    chunks = chunk_text(text)
    all_results = []

    for chunk in tqdm(chunks, desc="Processing text chunks"):
        # Try direct pattern matching first
        results = process_text_chunk(chunk)

        # If pattern matching fails, use LLM
        if not results:
            llm_result = process_with_llama(chunk)
            if llm_result:
                all_results.extend(llm_result if isinstance(llm_result, list) else [llm_result])
        else:
            all_results.extend(results)

    print(f"Total results found: {len(all_results)}")

    if all_results:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(all_results, f, indent=4, ensure_ascii=False)
        # Report the file that was actually written.
        print(f"Results successfully written to {output_path}")
    else:
        print("No results were found to write to file")

    return all_results

if __name__ == "__main__":
    result = main()

KSSR — Kerala State and Subordinate Services Rules

In [2]:
import json
import requests
import re
from tqdm import tqdm

def extract_section_info(text):
    """Heuristically pick the first Part, rule, Section and Annexure out of ``text``.

    This is plain regex matching on free text, so it only reports the first
    hit of each kind and can be fooled by unusual layouts.

    Args:
        text: Text fragment to scan.

    Returns:
        dict with the keys 'Document' (always 'KSSR'), 'Part', 'Section',
        'Annexure', 'Rule no.' and 'Description'. Fields that were not found
        stay ``None``.
    """
    info = {
        'Document': 'KSSR',  # Default document name
        'Part': None,
        'Section': None,
        'Annexure': None,
        'Rule no.': None,
        'Description': None
    }

    # Roman numeral after the word PART. '[IVX]' replaces '[I|V|X]', which
    # also accepted a literal '|'; the word boundaries stop prose such as
    # "part in" from being read as Part "i" under IGNORECASE.
    part_match = re.search(r'\bPART\s+([IVX]+)\b', text, re.IGNORECASE)
    if part_match:
        info['Part'] = part_match.group(1)

    # "<number>[letters]. <text up to the next full stop>."
    rule_match = re.search(r'(\d+[A-Z]*)\.[\s_]*([^\.]+)\.', text)
    if rule_match:
        info['Rule no.'] = rule_match.group(1)
        info['Description'] = rule_match.group(2).strip()

    # SECTION/SCHEDULE heading, taken up to the next full stop.
    section_match = re.search(r'((?:SECTION|SCHEDULE)\s+[^\.]+)', text, re.IGNORECASE)
    if section_match:
        info['Section'] = section_match.group(1).strip()

    # ANNEXURE heading, taken to the end of its line.
    annexure_match = re.search(r'ANNEXURE[^\n]*', text, re.IGNORECASE)
    if annexure_match:
        info['Annexure'] = annexure_match.group(0).strip()

    return info

def process_rule_based(text):
    """Split ``text`` into numbered rules and tag each with Part/Section/Annexure.

    A rule starts at ``<number>[letters].`` and runs until the next line that
    starts another rule, or to the end of the text. The first PART, SECTION or
    SCHEDULE, and ANNEXURE heading found anywhere in ``text`` is applied to
    every rule of the chunk.

    Args:
        text: Chunk of text to parse.

    Returns:
        List of dicts with the keys 'Document', 'Part', 'Section', 'Annexure',
        'Rule no.' and 'Description'. Rules with an empty description are
        skipped. 'Part' falls back to "I" when no PART heading is found.
    """
    # Split by rule patterns - looking for pattern like "1. Title" or "10. Qualifications"
    rule_pattern = r'(\d+[A-Z]*)\.[\s_]*(.*?)(?=\n\s*\d+[A-Z]*\.[\s_]|$)'
    rules = re.findall(rule_pattern, text, re.DOTALL)

    current_part = "I"  # Default to Part I if not found
    current_section = ""
    current_annexure = ""

    # '[IVX]' replaces '[I|V|X]', which also accepted a literal '|'; the word
    # boundaries stop prose such as "part in" from being read as a heading.
    part_match = re.search(r'\bPART\s+([IVX]+)\b', text, re.IGNORECASE)
    if part_match:
        current_part = part_match.group(1)

    # Look for section or schedule
    section_match = re.search(r'((?:SECTION|SCHEDULE)\s+[^\.]+)', text, re.IGNORECASE)
    if section_match:
        current_section = section_match.group(1).strip()

    # Look for Annexure
    annexure_match = re.search(r'ANNEXURE[^\n]*', text, re.IGNORECASE)
    if annexure_match:
        current_annexure = annexure_match.group(0).strip()

    results = []
    for rule_no, description in rules:
        description = description.strip()

        # Skip empty descriptions
        if not description:
            continue

        results.append({
            "Document": "KSSR",
            "Part": current_part,
            "Section": current_section,
            "Annexure": current_annexure,
            "Rule no.": rule_no.strip(),
            "Description": description
        })

    return results

def chunk_text(text, max_length=5000, overlap=500):
    """Cut ``text`` into consecutive, non-overlapping chunks near rule boundaries.

    Each chunk nominally holds ``max_length`` characters. Unless the chunk
    already reaches the end of the text, the cut is pushed forward to the next
    rule start found within the following ``overlap`` characters, or failing
    that to the next blank line within that window.

    Args:
        text: String to split.
        max_length: Nominal chunk size in characters.
        overlap: Size of the look-ahead window searched for a clean break.
            Chunks never share characters.

    Returns:
        List of chunk strings which, joined together, reproduce ``text``.
    """
    total = len(text)
    rule_start = r'\n\s*\d+[A-Z]*\.[\s_]'
    pieces = []
    cursor = 0

    while cursor < total:
        stop = min(cursor + max_length, total)

        if stop < total:
            window = text[stop:stop + overlap]
            hit = re.search(rule_start, window)
            if hit is not None:
                # Cut just before the next rule begins.
                stop += hit.start()
            else:
                # No rule nearby: settle for a paragraph break.
                gap = window.find('\n\n')
                if gap != -1:
                    stop += gap

        pieces.append(text[cursor:stop])
        cursor = stop

    return pieces

def process_with_llama(text, timeout=600):
    """Ask the local Ollama model to turn a KSSR text block into JSON records.

    Args:
        text: Text block substituted for ``{?text}`` in the prompt.
        timeout: Seconds to wait for the HTTP call before giving up.

    Returns:
        A list of dicts parsed from the model's answer. A single JSON object
        is wrapped in a list and non-dict items are dropped. The list is
        empty when the request fails, the status is not 200, or the answer
        cannot be parsed. Failures are printed, never raised.
    """
    prompt = """You are tasked with extracting structured data from a text block that contains information about rules from the Kerala State and Subordinate Services Rules (KSSR). Your goal is to parse this information and format it as a JSON list.

The JSON format for each entry should be as follows:
{
  "Document": "KSSR",
  "Part": "", 
  "Section": "",
  "Annexure": "",
  "Rule no.": "",
  "Description": ""
}

Here is the text block you need to parse:

<text_block>
{?text}
</text_block>

Rules for extraction:
1. Document should always be "KSSR".
2. Part should be identified as "I", "II", etc. (look for "PART I", "PART II", etc.)
3. Section should be identified if present (not all rules have sections).
4. Annexure should be identified if present (mostly in supplementary information).
5. Rule no. should be the rule number, like "1", "2", "2A", "10" etc.
6. Description should be the title and content of the rule.
7. Create a separate JSON object for each rule.
8. Ensure all fields are present in each JSON object, even if empty.

Your output should be a valid JSON array containing one object per rule identified.
""".replace("{?text}", text)

    try:
        response = requests.post('http://localhost:11434/api/generate',
                                 json={
                                     'model': 'llama3.3:70b-instruct-q8_0',
                                     'prompt': prompt,
                                     'stream': False
                                 },
                                 timeout=timeout)
    except requests.RequestException as e:
        print(f"LLM request failed: {e}")
        return []

    if response.status_code != 200:
        print(f"API request failed with status code: {response.status_code}")
        return []

    raw = None
    try:
        raw = response.json()['response']
        # Extract JSON content from markdown code block if present
        if '```json' in raw:
            json_str = raw.split('```json')[1].split('```')[0].strip()
        elif '```' in raw:
            json_str = raw.split('```')[1].split('```')[0].strip()
        else:
            json_str = raw.strip()

        parsed_json = json.loads(json_str)
    except Exception as e:
        print(f"Error processing LLM response: {e}")
        # Do not call response.json() again here: if it was the call that
        # failed, it would raise a second time inside the handler.
        preview = raw if raw is not None else response.text
        print(f"Response was: {str(preview)[:500]}...")  # Print just first 500 chars
        return []

    # Callers index every item by key, so hand back dicts only.
    if isinstance(parsed_json, dict):
        return [parsed_json]
    if isinstance(parsed_json, list):
        return [item for item in parsed_json if isinstance(item, dict)]
    return []

def main(input_path='/workspace/rohith_llm/Extracted/KSSR_pdfminer_extracted.txt',
         output_path='Full_KSSR_extracted_rules.json'):
    """Extract rule records from the KSSR text dump and save them as JSON.

    Each chunk from ``chunk_text`` is parsed with ``process_rule_based``.
    When that yields fewer than two rules the chunk is sent to
    ``process_with_llama`` instead. The most recent Part, Section and Annexure
    are tracked across chunks and used to fill fields the LLM left empty.

    Args:
        input_path: Path of the extracted text file, read as UTF-8.
        output_path: JSON file written when at least one record was found.

    Returns:
        The list of all extracted records.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        text = file.read()

    chunks = chunk_text(text)
    all_results = []

    current_part = "I"
    current_section = ""
    current_annexure = ""

    for i, chunk in enumerate(tqdm(chunks, desc="Processing text chunks")):
        # Try rule-based processing first
        results = process_rule_based(chunk)

        # process_rule_based reports Part "I" whenever the chunk has no PART
        # heading. Treating that default as a real heading would reset the
        # tracked Part, so a chunk without a heading inherits the tracked one.
        chunk_names_part = re.search(r'\bPART\s+[IVX]+\b', chunk, re.IGNORECASE) is not None

        for result in results:
            if chunk_names_part and result["Part"]:
                current_part = result["Part"]
            else:
                result["Part"] = current_part
            if result["Section"]:
                current_section = result["Section"]
            if result["Annexure"]:
                current_annexure = result["Annexure"]

        # If rule-based processing fails or finds very little, use LLM
        if not results or len(results) < 2:
            print(f"Using LLM for chunk {i+1} of {len(chunks)}")
            # Keep dicts only; anything else cannot carry the tracked fields.
            llm_results = [r for r in process_with_llama(chunk) if isinstance(r, dict)]

            for result in llm_results:
                # .get(): the model does not always emit every key, and a
                # missing key must not abort the whole run.
                if result.get("Part"):
                    current_part = result["Part"]
                else:
                    result["Part"] = current_part

                if result.get("Section"):
                    current_section = result["Section"]
                else:
                    result["Section"] = current_section

                if result.get("Annexure"):
                    current_annexure = result["Annexure"]
                else:
                    result["Annexure"] = current_annexure

            all_results.extend(llm_results)
        else:
            all_results.extend(results)

    print(f"Total rules found: {len(all_results)}")

    if all_results:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(all_results, f, indent=4, ensure_ascii=False)
        print(f"Results successfully written to {output_path}")
    else:
        print("No rules were found to write to file")

    return all_results

if __name__ == "__main__":
    result = main()

Processing text chunks:   0%|                                                                                                                             | 0/36 [00:00<?, ?it/s]

Using LLM for chunk 6 of 36


Processing text chunks:  17%|███████████████████▌                                                                                                 | 6/36 [03:06<15:33, 31.11s/it]

Using LLM for chunk 9 of 36


Processing text chunks:  25%|█████████████████████████████▎                                                                                       | 9/36 [04:13<12:19, 27.38s/it]

Using LLM for chunk 10 of 36


Processing text chunks:  28%|████████████████████████████████▏                                                                                   | 10/36 [05:28<15:18, 35.34s/it]

Error processing LLM response: Expecting value: line 1 column 1 (char 0)
Response was: ### Rules Extraction from KSSR Text Block

Given the complexity and variability of the text block provided, the following Python script is designed to parse through the document and extract relevant information based on predefined rules. The script assumes that each new rule starts with a specific marker (like "(1)", "(2)", etc., or "c", "d", etc.) and attempts to identify parts, sections, annexures, and rule numbers within the text.

```python
import re
import json

# Sample text block for pars...
Using LLM for chunk 12 of 36


Processing text chunks:  33%|██████████████████████████████████████▋                                                                             | 12/36 [08:03<19:32, 48.86s/it]

Using LLM for chunk 21 of 36


Processing text chunks:  58%|███████████████████████████████████████████████████████████████████▋                                                | 21/36 [09:45<05:53, 23.55s/it]

Using LLM for chunk 22 of 36


Processing text chunks:  61%|██████████████████████████████████████████████████████████████████████▉                                             | 22/36 [10:23<05:49, 24.99s/it]

Using LLM for chunk 23 of 36


Processing text chunks:  64%|██████████████████████████████████████████████████████████████████████████                                          | 23/36 [11:10<05:59, 27.67s/it]

Using LLM for chunk 24 of 36


Processing text chunks:  67%|█████████████████████████████████████████████████████████████████████████████▎                                      | 24/36 [12:06<06:22, 31.87s/it]

Using LLM for chunk 25 of 36


Processing text chunks:  69%|████████████████████████████████████████████████████████████████████████████████▌                                   | 25/36 [12:54<06:20, 34.60s/it]

Using LLM for chunk 26 of 36


Processing text chunks:  72%|███████████████████████████████████████████████████████████████████████████████████▊                                | 26/36 [13:37<06:03, 36.34s/it]

Using LLM for chunk 27 of 36


Processing text chunks:  75%|███████████████████████████████████████████████████████████████████████████████████████                             | 27/36 [14:29<05:58, 39.84s/it]

Using LLM for chunk 28 of 36


Processing text chunks:  78%|██████████████████████████████████████████████████████████████████████████████████████████▏                         | 28/36 [15:02<05:05, 38.22s/it]

Using LLM for chunk 29 of 36


Processing text chunks: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [15:53<00:00, 26.49s/it]

Total rules found: 489
Results successfully written to Full_KSSR_extracted_rules.json



