In [3]:
pip install pypdf2


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: pypdf2
Successfully installed pypdf2-3.0.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import json
import ollama
from tqdm import tqdm
import re

# Read the structured rule metadata that the dataset is generated from
metadata_path = "/workspace/rohith_llm/Extracted/Structured/Summary/Combined_Metadata.json"
with open(metadata_path, mode="r", encoding="utf-8") as metadata_file:
    data = json.load(metadata_file)

# Accumulator for the generated instruction/input/output records (Alpaca layout)
alpaca_dataset = list()

# Function to generate contextually relevant questions using the LLM
def generate_contextual_question(rule):
    """Ask the LLM for one question about ``rule`` and return it as clean text.

    Args:
        rule: Mapping describing a single rule. Its 'Description' and
            'Content' values (missing keys default to '') are embedded in
            the prompt.

    Returns:
        The question text with a leading label such as "Question:" and one
        pair of wrapping quotes removed. When the model returns nothing, a
        generic question built from the rule's description is returned so the
        caller never stores an empty instruction.
    """
    # Prepare a prompt that asks the LLM to analyze the rule and generate a contextual question
    rule_content = f"Rule Description: {rule.get('Description', '')}\nRule Content: {rule.get('Content', '')}"

    # Ask the LLM to analyze the rule and generate an appropriate question
    analysis_response = ollama.chat(
        model="llama3.3:70b-instruct-q8_0",
        messages=[
            {
                "role": "system",
                "content": "You are an expert in Kerala government rules. Given a government rule, generate a single, specific question that would be relevant for someone wanting to learn about this rule. Your response should ONLY include the question itself, with no additional text or explanations."
            },
            {
                "role": "user",
                "content": f"Here is a Kerala government service rule. Generate only a specific, contextually relevant question about this rule:\n\n{rule_content}"
            }
        ]
    )["message"]["content"]

    # Clean up the response to ensure we only get the question
    question = analysis_response.strip()

    # Drop a lead-in label ("Question:", "Here is the question:") but leave
    # colons that belong to the question itself untouched. The text before the
    # colon is treated as a label only when:
    #   * something follows the colon (otherwise we would return ""),
    #   * it holds no '?' (otherwise the question is *before* the colon and
    #     what follows is a trailing note), and
    #   * it does not open a double quote (the colon sits inside a quotation).
    head, colon, tail = question.partition(":")
    if colon and tail.strip() and "?" not in head and head.count('"') % 2 == 0:
        question = tail.strip()

    # Remove quotes if the LLM added them
    question = re.sub(r'^["\'](.*)["\']$', r'\1', question)

    # An empty reply must not become an empty instruction in the dataset
    if not question:
        question = f"What are the key provisions in the rule about {rule.get('Description') or 'this Kerala government regulation'}?"

    return question

# Upper bound on the number of generated pairs (lower it, e.g. to 10, for a quick test run)
max_entries = 10000

# Process each rule to generate instruction-response pairs.
# The loop runs for many hours, so the save lives in `finally`: an interrupt or
# a failed LLM call still leaves everything generated so far on disk.
try:
    for rule in tqdm(data, desc="Generating dataset"):
        if rule.get("Description"):  # Ensure description is present
            # First, use the LLM to generate a contextually relevant question
            instruction = generate_contextual_question(rule)

            # Then, use the LLM to generate a detailed answer to that question
            # Pass the rule content along with the question to ensure the LLM has context
            rule_content = f"Rule Description: {rule.get('Description', '')}\nRule Content: {rule.get('Content', '')}"

            response = ollama.chat(
                model="llama3.3:70b-instruct-q8_0",
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert in Kerala government rules. Provide a detailed and structured response based on the given rule.  Include proper references (document, part, chapter, rule number, etc.) when available. Use simple, everyday language that anyone can understand "
                    },
                    {
                        "role": "user",
                        "content": f"Based on the following Kerala government rule:\n\n{rule_content}\n\nPlease answer this question: {instruction}"
                    }
                ]
            )["message"]["content"]

            alpaca_dataset.append({
                "instruction": instruction,
                "input": "",
                "output": response
            })

        # Stop once the configured number of entries has been generated
        if len(alpaca_dataset) >= max_entries:
            break
finally:
    # Save dataset to JSON (also reached on KeyboardInterrupt or an exception)
    with open("kerala_llm_instruction_dataset.json", "w", encoding="utf-8") as f:
        json.dump(alpaca_dataset, f, ensure_ascii=False, indent=4)

print("Dataset generation complete. Saved to kerala_llm_instruction_dataset.json")

Generating dataset:  34%|█████████████████████████████████████▍                                                                        | 2139/6293 [11:47:38<21:16:10, 18.43s/it]

In [None]:
import json
import ollama
from tqdm import tqdm
import re
import random
import torch
import os


# GPU index this process's torch work should be pinned to
cuda_device_index = 7

# Only select the device when it exists: an unconditional set_device() raises on
# hosts without CUDA or with fewer GPUs than the index implies.
# NOTE(review): nothing else in this cell uses torch - confirm the pin is still needed.
if torch.cuda.is_available() and torch.cuda.device_count() > cuda_device_index:
    torch.cuda.set_device(cuda_device_index)
else:
    print(f"CUDA device {cuda_device_index} is not available; leaving torch's default device unchanged")

# Load structured data from JSON
with open("/workspace/rohith_llm/Extracted/Structured/Summary/Combined_Metadata.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Define Alpaca dataset format
alpaca_dataset = []

# Function to determine how many questions to generate for a rule
def determine_question_count(rule):
    """Decide how many questions to request for ``rule``.

    Args:
        rule: Mapping with optional 'Content' and 'Description' values. A
            missing key and an explicit None (JSON null) are both treated as
            empty text instead of raising TypeError.

    Returns:
        A random integer from 2 to 4 (inclusive) when the content is longer
        than 1000 or the description is longer than 200; otherwise 1.
    """
    # `or ''` covers keys that are present but null, which .get()'s default does not
    content_length = len(rule.get('Content') or '')
    description_length = len(rule.get('Description') or '')

    # Rules with more content can generate more questions
    if content_length > 1000 or description_length > 200:
        return random.randint(2, 4)  # Generate 2-4 questions for substantial rules
    return 1  # Generate at least 1 question for each rule

# Function to generate multiple contextually relevant questions using the LLM
def generate_contextual_questions(rule, count):
    """Ask the LLM for ``count`` questions about ``rule`` and parse them into a list.

    Args:
        rule: Mapping describing a single rule. Its 'Description' and
            'Content' values (missing keys default to '') are embedded in
            the prompt.
        count: Number of questions to request.

    Returns:
        A list of at most ``count`` question strings. Numbered lines
        ("1. ..." / "1) ...") are used first. If there are too few, unnumbered
        lines containing a question mark are added. If the list is still
        short, one generic question about the rule is appended.
    """
    # Prepare a prompt that asks the LLM to analyze the rule and generate multiple contextual questions
    rule_content = f"Rule Description: {rule.get('Description', '')}\nRule Content: {rule.get('Content', '')}"

    # Ask the LLM to generate multiple different questions
    analysis_response = ollama.chat(
        model="llama3.3:70b-instruct-q8_0",
        messages=[
            {
                "role": "system",
                "content": "You are an expert in Kerala government rules. Given a government rule, generate multiple specific, diverse questions that would be relevant for someone wanting to learn about this rule. Generate exactly the number of questions requested. Format each question on a separate line with a number followed by a period (e.g., '1.', '2.'). Your questions should cover different aspects of the rule."
            },
            {
                "role": "user",
                "content": f"Here is a Kerala government service rule. Generate {count} specific, contextually relevant questions about this rule:\n\n{rule_content}"
            }
        ]
    )["message"]["content"]

    # Parse the numbered questions; every other non-empty line is kept aside for the fallback
    questions = []
    other_lines = []
    for raw_line in analysis_response.strip().split("\n"):
        line = raw_line.strip()  # tolerate indented numbering
        if not line:
            continue
        # Look for numbered lines like "1. Question" or "1) Question"
        match = re.match(r'^\d+[\.\)]\s+(.*)', line)
        if match:
            question = match.group(1).strip()
            # Remove quotes if the LLM added them
            question = re.sub(r'^["\'](.*)["\']$', r'\1', question)
            questions.append(question)
        else:
            other_lines.append(line)

    # Too few numbered lines: top up from the remaining lines, but only those
    # that read like a question. The parsed questions are kept, so neither a
    # preamble ("Here are 3 questions:") nor a raw "1. ..." line can end up in
    # the dataset as an instruction.
    if len(questions) < count:
        for line in other_lines:
            candidate = re.sub(r'^[-*\u2022]\s+', '', line)  # drop a list bullet
            candidate = re.sub(r'^["\'](.*)["\']$', r'\1', candidate)
            if "?" in candidate and candidate not in questions:
                questions.append(candidate)

    # If we still don't have enough questions, generate a default one
    if len(questions) < count:
        questions.append(f"What are the key provisions in the rule about {rule.get('Description', 'this Kerala government regulation')}?")

    return questions[:count]  # Return only the requested number of questions

# Function to generate a detailed answer to a question
def generate_response(rule, question):
    """Return the LLM's answer to ``question``, with the text of ``rule`` supplied as context.

    Args:
        rule: Mapping whose 'Description' and 'Content' values (missing keys
            default to '') are placed in the prompt.
        question: The question to answer.

    Returns:
        The content string of the chat reply.
    """
    description = rule.get('Description', '')
    content = rule.get('Content', '')
    rule_content = f"Rule Description: {description}\nRule Content: {content}"

    system_message = {
        "role": "system",
        "content": "You are an expert in Kerala government rules. Provide a detailed and structured response based on the given rule. Include proper references (document, part, chapter, rule number, etc.) when available. Use simple, everyday language that anyone can understand."
    }
    user_message = {
        "role": "user",
        "content": f"Based on the following Kerala government rule:\n\n{rule_content}\n\nPlease answer this question: {question}"
    }

    reply = ollama.chat(
        model="llama3.3:70b-instruct-q8_0",
        messages=[system_message, user_message],
    )
    return reply["message"]["content"]

# Simple function to check if two questions are similar.
# Defined ahead of the generation loops because the second pass calls it; with
# the definition placed after the loops a fresh kernel fails with NameError.
def similar(q1, q2):
    """Return True when the two questions share more than 3 distinct lowercase words."""
    # Very basic similarity check - can be improved
    common_words = set(q1.lower().split()) & set(q2.lower().split())
    return len(common_words) > 3  # If they share more than 3 words, consider them similar


# Process each rule to generate instruction-response pairs
target_count = 10000
pbar = tqdm(total=target_count, desc="Generating dataset")

# Questions already asked, keyed by the rule's position in `data`. The second
# pass compares a new question only with those of the same rule; comparing with
# every question in the dataset would reject almost anything, because most
# questions share a few common words.
questions_by_rule = {}

try:
    # First pass: process all rules at least once
    for rule_index, rule in enumerate(data):
        if not rule.get("Description"):
            continue  # Skip rules without description

        # Determine how many questions to generate for this rule
        question_count = determine_question_count(rule)

        # Generate multiple questions for this rule
        questions = generate_contextual_questions(rule, question_count)
        asked = questions_by_rule.setdefault(rule_index, [])

        # Process each question
        for question in questions:
            response = generate_response(rule, question)

            alpaca_dataset.append({
                "instruction": question,
                "input": "",
                "output": response
            })
            asked.append(question)

            pbar.update(1)

            # Check if we've reached our target
            if len(alpaca_dataset) >= target_count:
                break

        # Break the outer loop if we've reached our target
        if len(alpaca_dataset) >= target_count:
            break

    # If we still don't have enough entries, do a second pass on rules with substantial content
    if len(alpaca_dataset) < target_count:
        # Sort rule positions by content length to prioritize substantial rules
        # (positions rather than rules, so questions_by_rule can be looked up)
        sorted_indices = sorted(
            range(len(data)),
            key=lambda i: len(data[i].get('Content') or ''),
            reverse=True,
        )

        for rule_index in sorted_indices:
            rule = data[rule_index]
            if not rule.get("Description"):
                continue  # Skip rules without description

            # Generate additional questions beyond what we did in the first pass
            additional_questions = generate_contextual_questions(rule, 2)  # Generate 2 more questions
            asked = questions_by_rule.setdefault(rule_index, [])

            for question in additional_questions:
                # Check if this question is too similar to ones we already asked for this rule
                if any(similar(question, earlier) for earlier in asked):
                    continue  # Skip similar questions

                response = generate_response(rule, question)

                alpaca_dataset.append({
                    "instruction": question,
                    "input": "",
                    "output": response
                })
                asked.append(question)

                pbar.update(1)

                # Check if we've reached our target
                if len(alpaca_dataset) >= target_count:
                    break

            # Break the outer loop if we've reached our target
            if len(alpaca_dataset) >= target_count:
                break
finally:
    pbar.close()

    # Save dataset to JSON. Done in `finally` so an interrupt or a failed LLM
    # call does not discard hours of generated pairs.
    with open("kerala_llm_instruction_dataset.json", "w", encoding="utf-8") as f:
        json.dump(alpaca_dataset, f, ensure_ascii=False, indent=4)

print(f"Dataset generation complete. Generated {len(alpaca_dataset)} instruction-response pairs. Saved to kerala_llm_instruction_dataset.json")


Generating dataset:   0%|                                                                                                                         | 0/10000 [00:24<?, ?it/s][A

Generating dataset:   0%|                                                                                                            | 1/10000 [01:44<290:40:59, 104.66s/it][A
Generating dataset:   0%|                                                                                                             | 2/10000 [01:51<131:18:33, 47.28s/it][A
Generating dataset:   0%|                                                                                                             | 3/10000 [02:23<112:09:18, 40.39s/it][A
Generating dataset:   0%|                                                                                                              | 4/10000 [02:46<92:09:31, 33.19s/it][A
Generating dataset:   0%|                                                                                             

In [1]:
import json
import ollama
from tqdm import tqdm
import re
import random
import torch
import os
import time
from datetime import datetime


#torch.cuda.set_device(5)

# Folder that receives the periodic dataset snapshots; created up front when missing
checkpoint_dir = "checkpoints"
os.makedirs(name=checkpoint_dir, exist_ok=True)

# Function to load existing checkpoint if available
def load_latest_checkpoint():
    """Load the highest-numbered readable checkpoint from ``checkpoint_dir``.

    Only files named exactly ``kerala_dataset_checkpoint_<number>.json`` are
    considered, so a stray file such as ``kerala_dataset_checkpoint_final.json``
    is ignored instead of raising ValueError. If the newest checkpoint cannot
    be read or parsed (for example because it was truncated by an interrupt),
    the next older one is tried.

    Returns:
        A ``(dataset, count)`` tuple: the list stored in the checkpoint and the
        number taken from its file name, or ``([], 0)`` when no usable
        checkpoint exists.
    """
    name_pattern = re.compile(r"^kerala_dataset_checkpoint_(\d+)\.json$")

    numbered_files = []
    for file_name in os.listdir(checkpoint_dir):
        match = name_pattern.match(file_name)
        if match:
            numbered_files.append((int(match.group(1)), file_name))

    # Try checkpoints from the highest number down
    for count, file_name in sorted(numbered_files, reverse=True):
        print(f"Loading latest checkpoint: {file_name}")
        try:
            with open(os.path.join(checkpoint_dir, file_name), "r", encoding="utf-8") as f:
                dataset = json.load(f)
        except (OSError, ValueError) as exc:  # JSONDecodeError is a ValueError
            print(f"Skipping unreadable checkpoint {file_name}: {exc}")
            continue

        if not isinstance(dataset, list):
            print(f"Skipping checkpoint {file_name}: expected a JSON list")
            continue

        return dataset, count

    return [], 0  # Return empty dataset and count 0 if no checkpoints found

# Function to save checkpoint
def save_checkpoint(dataset, count):
    """Write ``dataset`` to a numbered checkpoint file and refresh the status file.

    Args:
        dataset: JSON-serialisable list of generated records.
        count: Number used in the checkpoint file name and in the progress line.

    Reads the module-level ``checkpoint_dir`` and ``target_count``.
    """
    checkpoint_path = os.path.join(checkpoint_dir, f"kerala_dataset_checkpoint_{count}.json")

    # Write to a temporary name and rename afterwards: an interrupt in the
    # middle of the dump must not leave a truncated file under the real
    # checkpoint name, because the loader picks the highest-numbered file.
    temp_path = checkpoint_path + ".tmp"
    with open(temp_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)
    os.replace(temp_path, checkpoint_path)

    # Guard the percentage against a zero target
    percent = count / target_count * 100 if target_count else 0.0

    # Also save a status file with timestamp for quick progress checks
    status_path = os.path.join(checkpoint_dir, "current_status.txt")
    with open(status_path, "w", encoding="utf-8") as f:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        f.write(f"Last update: {timestamp}\n")
        f.write(f"Progress: {count}/{target_count} ({percent:.2f}%)\n")

    print(f"Checkpoint saved: {count}/{target_count} ({percent:.2f}%)")

# Read the structured rule metadata that the dataset is generated from
metadata_path = "/workspace/rohith_llm/Extracted/Structured/Summary/Combined_Metadata.json"
with open(metadata_path, mode="r", encoding="utf-8") as metadata_file:
    data = json.load(metadata_file)

# Number of instruction-response pairs to aim for
target_count = 10000

# Start from whatever load_latest_checkpoint() hands back: (dataset, count)
checkpoint_state = load_latest_checkpoint()
alpaca_dataset = checkpoint_state[0]
current_count = checkpoint_state[1]

# Simple function to check if two questions are similar
def similar(q1, q2):
    """Return True when ``q1`` and ``q2`` share at least four distinct lowercase, whitespace-separated words."""
    first_words = set(q1.lower().split())
    second_words = set(q2.lower().split())
    shared = first_words.intersection(second_words)
    # Crude word-overlap heuristic: four or more shared words counts as similar
    return len(shared) >= 4

# Function to determine how many questions to generate for a rule
def determine_question_count(rule):
    """Pick the number of questions to request for ``rule``.

    Args:
        rule: Mapping with optional 'Content' and 'Description' values. Both a
            missing key and an explicit None (JSON null) count as empty text,
            so a null field no longer raises TypeError.

    Returns:
        1 for short rules; a random integer from 2 to 4 (inclusive) when the
        content is longer than 1000 or the description is longer than 200.
    """
    # .get()'s default only applies to missing keys; `or ''` also covers null values
    content_length = len(rule.get('Content') or '')
    description_length = len(rule.get('Description') or '')

    # Rules with more content can generate more questions
    if content_length > 1000 or description_length > 200:
        return random.randint(2, 4)  # Generate 2-4 questions for substantial rules
    return 1  # Generate at least 1 question for each rule

# Function to generate multiple contextually relevant questions using the LLM
def generate_contextual_questions(rule, count):
    """Request ``count`` questions about ``rule`` from the LLM and return them as a list.

    Args:
        rule: Mapping describing a single rule. Its 'Description' and
            'Content' values (missing keys default to '') go into the prompt.
        count: Number of questions to request.

    Returns:
        A list of at most ``count`` question strings: numbered lines
        ("1. ..." / "1) ...") first, then unnumbered lines that contain a
        question mark, then - if still short - one generic question about the
        rule.
    """
    # Prepare a prompt that asks the LLM to analyze the rule and generate multiple contextual questions
    rule_content = f"Rule Description: {rule.get('Description', '')}\nRule Content: {rule.get('Content', '')}"

    # Ask the LLM to generate multiple different questions
    analysis_response = ollama.chat(
        model="llama3.3:70b-instruct-q8_0",
        messages=[
            {
                "role": "system",
                "content": "You are an expert in Kerala government rules. Given a government rule, generate multiple specific, diverse questions that would be relevant for someone wanting to learn about this rule. Generate exactly the number of questions requested. Format each question on a separate line with a number followed by a period (e.g., '1.', '2.'). Your questions should cover different aspects of the rule."
            },
            {
                "role": "user",
                "content": f"Here is a Kerala government service rule. Generate {count} specific, contextually relevant questions about this rule:\n\n{rule_content}"
            }
        ]
    )["message"]["content"]

    # Split the reply into numbered questions and everything else
    questions = []
    unnumbered_lines = []
    for raw_line in analysis_response.strip().split("\n"):
        line = raw_line.strip()  # indented numbering is still numbering
        if not line:
            continue
        # Look for numbered lines like "1. Question" or "1) Question"
        match = re.match(r'^\d+[\.\)]\s+(.*)', line)
        if match:
            question = match.group(1).strip()
            # Remove quotes if the LLM added them
            question = re.sub(r'^["\'](.*)["\']$', r'\1', question)
            questions.append(question)
        else:
            unnumbered_lines.append(line)

    # Fewer numbered lines than requested: add unnumbered lines that look like
    # questions. The questions parsed so far are kept; replacing them with raw
    # lines would put the preamble and "1." prefixes into the dataset.
    if len(questions) < count:
        for line in unnumbered_lines:
            candidate = re.sub(r'^[-*\u2022]\s+', '', line)  # drop a list bullet
            candidate = re.sub(r'^["\'](.*)["\']$', r'\1', candidate)
            if "?" in candidate and candidate not in questions:
                questions.append(candidate)

    # If we still don't have enough questions, generate a default one
    if len(questions) < count:
        questions.append(f"What are the key provisions in the rule about {rule.get('Description', 'this Kerala government regulation')}?")

    return questions[:count]  # Return only the requested number of questions

# Function to generate a detailed answer to a question
def generate_response(rule, question):
    """Answer ``question`` with the LLM, supplying the text of ``rule`` as context.

    Args:
        rule: Mapping whose 'Description' and 'Content' values (missing keys
            default to '') are placed in the prompt.
        question: The question to answer.

    Returns:
        The content string of the chat reply.
    """
    rule_content = "Rule Description: {}\nRule Content: {}".format(
        rule.get('Description', ''),
        rule.get('Content', ''),
    )

    conversation = []
    conversation.append({
        "role": "system",
        "content": "You are an expert in Kerala government rules. Provide a detailed and structured response based on the given rule. Include proper references (document, part, chapter, rule number, etc.) when available. Use simple, everyday language that anyone can understand."
    })
    conversation.append({
        "role": "user",
        "content": f"Based on the following Kerala government rule:\n\n{rule_content}\n\nPlease answer this question: {question}"
    })

    chat_result = ollama.chat(model="llama3.3:70b-instruct-q8_0", messages=conversation)
    message = chat_result["message"]
    return message["content"]

# Set checkpoint frequency
checkpoint_frequency = 20  # Save every 20 entries

# Sidecar file recording WHERE in the rule list generation stopped. A checkpoint
# holds only the generated pairs, so without it a restarted run begins again at
# the first rule and produces a second set of pairs for rules already covered.
# The recorded positions are only meaningful while `data` keeps the same order.
resume_state_path = os.path.join(checkpoint_dir, "resume_state.json")


def _read_resume_state():
    """Return the saved resume position, or None when there is no usable one."""
    try:
        with open(resume_state_path, "r", encoding="utf-8") as f:
            saved = json.load(f)
    except (OSError, ValueError):  # missing file or invalid JSON
        return None
    if not isinstance(saved, dict):
        return None
    rule_start = saved.get("entries_at_rule_start")
    if (
        saved.get("pass") not in (1, 2)
        or not isinstance(saved.get("rule_index"), int)
        or not isinstance(rule_start, int)
        or not 0 <= rule_start <= len(alpaca_dataset)  # must fit the loaded checkpoint
        or not isinstance(saved.get("questions_by_rule"), dict)
    ):
        return None
    return saved


def _save_progress():
    """Write a dataset checkpoint together with the matching resume position."""
    save_checkpoint(alpaca_dataset, len(alpaca_dataset))
    temp_path = resume_state_path + ".tmp"
    with open(temp_path, "w", encoding="utf-8") as f:
        json.dump(resume_state, f, ensure_ascii=False)
    os.replace(temp_path, resume_state_path)  # never leave a half-written state file


def _add_entry(rule, question):
    """Answer ``question`` for ``rule``, store the pair and checkpoint periodically."""
    response = generate_response(rule, question)

    alpaca_dataset.append({
        "instruction": question,
        "input": "",
        "output": response
    })
    pbar.update(1)

    # Save checkpoint periodically
    if len(alpaca_dataset) % checkpoint_frequency == 0:
        _save_progress()


# resume_state fields:
#   pass                  - 1 (every rule once) or 2 (extra questions, longest rules first)
#   rule_index            - position of the next rule to process within that pass
#   entries_at_rule_start - dataset length when that rule was started
#   questions_by_rule     - questions of completed rules, keyed by str(position in `data`)
resume_state = _read_resume_state()
if resume_state is not None:
    # Drop the partial output of the rule that was in flight when the previous
    # run stopped; that rule is processed again from scratch below.
    del alpaca_dataset[resume_state["entries_at_rule_start"]:]
else:
    if alpaca_dataset:
        print("No resume state found for the loaded checkpoint; restarting at the first rule, so rules covered earlier may be asked about again.")
    resume_state = {
        "pass": 1,
        "rule_index": 0,
        "entries_at_rule_start": len(alpaca_dataset),
        "questions_by_rule": {},
    }

# Initialize progress bar from current count
pbar = tqdm(total=target_count, initial=len(alpaca_dataset), desc="Generating dataset")

# Continue processing if we haven't reached our target
if len(alpaca_dataset) < target_count:
    try:
        # First pass: process all rules at least once
        if resume_state["pass"] == 1:
            for rule_index in range(resume_state["rule_index"], len(data)):
                rule = data[rule_index]

                if rule.get("Description"):  # Skip rules without description
                    asked = []
                    # Number of questions depends on how substantial the rule is
                    for question in generate_contextual_questions(rule, determine_question_count(rule)):
                        _add_entry(rule, question)
                        asked.append(question)

                        # Check if we've reached our target
                        if len(alpaca_dataset) >= target_count:
                            break
                    resume_state["questions_by_rule"][str(rule_index)] = asked

                # The rule is finished: only now move the resume position past it
                resume_state["rule_index"] = rule_index + 1
                resume_state["entries_at_rule_start"] = len(alpaca_dataset)

                # Break the outer loop if we've reached our target
                if len(alpaca_dataset) >= target_count:
                    break

            # Every rule has been visited and the target is still not met
            if len(alpaca_dataset) < target_count:
                resume_state["pass"] = 2
                resume_state["rule_index"] = 0

        # If we still don't have enough entries, do a second pass on rules with substantial content
        if resume_state["pass"] == 2 and len(alpaca_dataset) < target_count:
            # Sort rule positions by content length to prioritize substantial rules
            sorted_indices = sorted(
                range(len(data)),
                key=lambda i: len(data[i].get('Content') or ''),
                reverse=True,
            )

            for position in range(resume_state["rule_index"], len(sorted_indices)):
                rule_index = sorted_indices[position]
                rule = data[rule_index]

                if rule.get("Description"):  # Skip rules without description
                    # Work on a copy so a mid-rule checkpoint never records
                    # questions whose entries would be dropped again on resume
                    asked = list(resume_state["questions_by_rule"].get(str(rule_index), []))

                    # Generate additional questions beyond what we did in the first pass
                    for question in generate_contextual_questions(rule, 2):  # Generate 2 more questions
                        # Check if this question is too similar to ones we already asked for this rule.
                        # Comparing with the whole dataset would reject nearly everything,
                        # because most questions share a few common words.
                        if any(similar(question, earlier) for earlier in asked):
                            continue  # Skip similar questions

                        _add_entry(rule, question)
                        asked.append(question)

                        # Check if we've reached our target
                        if len(alpaca_dataset) >= target_count:
                            break
                    resume_state["questions_by_rule"][str(rule_index)] = asked

                resume_state["rule_index"] = position + 1
                resume_state["entries_at_rule_start"] = len(alpaca_dataset)

                # Break the outer loop if we've reached our target
                if len(alpaca_dataset) >= target_count:
                    break

    except Exception as e:
        # Report the error; the `finally` clause below saves the checkpoint
        print(f"Error occurred: {str(e)}")
        raise

    finally:
        # Close progress bar
        pbar.close()

        # Save final checkpoint (also reached on KeyboardInterrupt)
        _save_progress()
        current_count = len(alpaca_dataset)
else:
    # Nothing to generate; still release the progress bar
    pbar.close()

# Save final dataset to JSON
with open("kerala_llm_instruction_dataset.json", "w", encoding="utf-8") as f:
    json.dump(alpaca_dataset, f, ensure_ascii=False, indent=4)

print(f"Dataset generation complete. Generated {len(alpaca_dataset)} instruction-response pairs. Saved to kerala_llm_instruction_dataset.json")

Loading latest checkpoint: kerala_dataset_checkpoint_8980.json


Generating dataset:  90%|██████████████████████████████████████████████████████████████████▌       | 9000/10000 [09:10<7:10:09, 25.81s/it]

Checkpoint saved: 9000/10000 (90.00%)


Generating dataset:  90%|██████████████████████████████████████████████████████████████████▋       | 9020/10000 [16:49<6:00:58, 22.10s/it]

Checkpoint saved: 9020/10000 (90.20%)


Generating dataset:  90%|██████████████████████████████████████████████████████████████████▉       | 9040/10000 [24:11<5:22:35, 20.16s/it]

Checkpoint saved: 9040/10000 (90.40%)


Generating dataset:  90%|██████████████████████████████████████████████████████████████████▉       | 9042/10000 [25:09<6:28:43, 24.35s/it]


Checkpoint saved: 9042/10000 (90.42%)


KeyboardInterrupt: 