# Generation of Expert-tuning Dataset

In [None]:
import os
import json
import subprocess
from pathlib import Path

# Paths
# Root of the image collection; Stage 1 walks it as <dataset>/<class>/<image>.
DATASET_DIR = Path("datasets")
# Per-class attribute text, looked up as <dataset>/<class>.txt by Stage 1.
ATTRIBUTES_DIR = Path("other_resources/attributes")
# Per-class external knowledge text, looked up as <dataset>/<class>.txt by Stage 2.
EXTERNAL_DIR = Path("other_resources/external_resource")
# Intermediate JSONL file: written by Stage 1, read back by Stage 2.
TEMP_DESCRIPTIONS_FILE = "paddy_disease_desc.jsonl"
# Final JSONL file written by Stage 3.
FINAL_OUTPUT_FILE = "paddy_disease.jsonl"

def read_txt_file(file_path):
    """Return the stripped text of *file_path*, or "" when it cannot be read.

    The file is decoded as UTF-8 first (a leading byte-order mark, if any, is
    dropped). If the bytes are not valid UTF-8 the file is re-read as latin-1.
    latin-1 maps every possible byte to a character, so that second attempt
    cannot fail with a decoding error and no further fallback is needed.

    Args:
        file_path: Location of the text file, as a ``str`` or ``Path``.

    Returns:
        The file content with leading/trailing whitespace removed. An empty
        string is returned (and a message printed) when the file does not
        exist or cannot be opened/read, e.g. because the path is a directory
        or access is denied.
    """
    path = Path(file_path)

    if not path.exists():
        print(f"File not found: {file_path}")
        return ""

    try:
        try:
            # "utf-8-sig" is plain UTF-8 that also swallows an optional BOM,
            # which would otherwise survive strip() and leak into prompts.
            text = path.read_text(encoding="utf-8-sig")
        except UnicodeDecodeError:
            text = path.read_text(encoding="latin-1")
    except OSError as e:
        # Best-effort reader: callers treat "" as "no extra context available".
        print(f"Failed to read file: {file_path} with all methods. Error: {e}")
        return ""

    return text.strip()

def call_ollama(model: str, prompt: str, image_path: str = None):
    """Run ``ollama run <model>`` and return everything it prints to stdout.

    The request is fed to the process through stdin. Without an image the
    stdin content is the prompt itself; with an image it is a JSON object
    holding the prompt under ``"prompt"`` and the absolute image path (as a
    one-element list) under ``"images"``.

    NOTE(review): this relies on the ``ollama`` CLI picking the image up from
    that stdin payload -- confirm against the installed ollama version.

    Args:
        model: Name of the ollama model to run (e.g. ``"mistral"``).
        prompt: Prompt text for the model.
        image_path: Optional path (``str`` or ``Path``) of an image to attach.

    Returns:
        The process's stdout decoded as UTF-8.

    Raises:
        RuntimeError: If ``ollama`` exits with a non-zero status. Previously
            such failures were ignored and an empty/partial string was
            returned, which callers then stored as if it were a valid answer.
        FileNotFoundError: If the ``ollama`` executable cannot be found
            (raised by ``subprocess.run``).
    """
    if image_path:
        payload = json.dumps(
            {"prompt": prompt, "images": [str(Path(image_path).resolve())]}
        )
    else:
        payload = prompt

    result = subprocess.run(
        ["ollama", "run", model],
        input=payload.encode("utf-8"),
        stdout=subprocess.PIPE,
        # Captured so the reason for a failure can be reported to the caller.
        stderr=subprocess.PIPE,
    )

    if result.returncode != 0:
        details = result.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"'ollama run {model}' exited with status {result.returncode}: {details}"
        )

    return result.stdout.decode("utf-8")

# Stage 1: Generate image descriptions using LLaVA
def _iter_labelled_images():
    """Yield ``(dataset_name, class_label, attributes, image_path)`` per JPEG image.

    Walks ``DATASET_DIR`` as ``<dataset>/<class>/<image>``, skipping anything
    that is not a directory at the first two levels and any file whose suffix
    is not ``.jpg``/``.jpeg`` (case-insensitive) at the third.
    """
    for dataset_folder in DATASET_DIR.iterdir():
        if not dataset_folder.is_dir():
            continue

        for class_folder in dataset_folder.iterdir():
            if not class_folder.is_dir():
                continue

            # The attribute text belongs to the class, so it is read once per
            # class folder and reused for each of its images.
            attribute_text = read_txt_file(
                ATTRIBUTES_DIR / dataset_folder.name / f"{class_folder.name}.txt"
            )

            for candidate in class_folder.iterdir():
                if candidate.suffix.lower() in [".jpg", ".jpeg"]:
                    yield dataset_folder.name, class_folder.name, attribute_text, candidate


def generate_descriptions():
    """Stage 1: describe every image with LLaVA and store the results as JSONL.

    For each image found under ``DATASET_DIR`` the ``llava:13b`` model is asked
    for a description, guided by the class attributes. One JSON object per
    image (keys ``image_path``, ``dataset``, ``class``, ``attributes``,
    ``description``) is written to ``TEMP_DESCRIPTIONS_FILE``, which is
    truncated at the start of the run. An error while handling one image is
    printed and that image is skipped.
    """
    with open(TEMP_DESCRIPTIONS_FILE, "w", encoding="utf-8") as sink:
        for dataset_name, class_label, attributes, image_path in _iter_labelled_images():
            print(f"[Stage 1] Processing image for description: {image_path}")
            try:
                prompt = (
                    f"You are an agricultural assistant. Describe this image of a {class_label} from the {dataset_name} dataset. "
                    f"Use the following attributes for a more detailed and contextual description:\n{attributes}"
                )
                description = call_ollama("llava:13b", prompt, image_path=image_path)
                record = {
                    # Stored relative to the dataset root, with forward slashes.
                    "image_path": image_path.relative_to(DATASET_DIR).as_posix(),
                    "dataset": dataset_name,
                    "class": class_label,
                    "attributes": attributes,
                    "description": description.strip(),
                }
                sink.write(json.dumps(record, ensure_ascii=False) + "\n")
            except Exception as e:
                print(f"Error generating description for {image_path}: {e}")
# Stage 2: Generate multi-turn Q&A using Mistral
def _multiturn_prompt(description, attributes, external_knowledge):
    """Build the Mistral prompt that requests 3-5 numbered Q&A pairs."""
    return f"""
You are an AI assistant specialized in agricultural topics. You are provided with the text
description of an image of a plant, attributes of the plant (such as name, disease),
and common information of the plant. Unfortunately, you don't have access to the actual
image.

You must generate exactly 3 to 5 pairs of question and answer (Q&A). Each question should begin with "Q:" and each answer with "A:". Do not include any narrative text outside the Q&A pairs.

Instructions:
- Focus on visual details that can be seen in the image (e.g., plant type, symptoms, disease, prevention).
- Do not refer to the 'text', 'context', or 'caption' — behave as if you are only seeing the image.
- Do not ask speculative or ambiguous questions.
- Avoid referencing numbers, scientific names, or datasets.
- Maintain consistent formatting as:
  Q1: ...
  A1: ...
  Q2: ...
  A2: ...
  (and so on)

Context:
Image Description: {description}
Attributes: {attributes}
External Knowledge: {external_knowledge}
"""


def generate_multiturn_qa():
    """Stage 2: add a multi-turn Q&A conversation to every Stage 1 record.

    Reads ``TEMP_DESCRIPTIONS_FILE`` line by line, loads the per-class
    external knowledge text from ``EXTERNAL_DIR``, and asks the ``mistral``
    model for Q&A pairs based on the description, attributes and that text.

    Returns:
        A list of the record dicts, each extended with the keys
        ``external_knowledge`` and ``multi_turn_conversation``. A record whose
        generation raises is reported on stdout and left out of the list.
    """
    enriched = []

    with open(TEMP_DESCRIPTIONS_FILE, "r", encoding="utf-8") as source:
        for raw_line in source:
            record = json.loads(raw_line)
            knowledge_file = EXTERNAL_DIR / record["dataset"] / f"{record['class']}.txt"
            external_knowledge = read_txt_file(knowledge_file)

            print(f"[Stage 2] Generating multi-turn QA for: {record['image_path']}")
            try:
                conversation = call_ollama(
                    "mistral",
                    _multiturn_prompt(
                        record["description"],
                        record["attributes"],
                        external_knowledge,
                    ),
                )
                record["external_knowledge"] = external_knowledge
                record["multi_turn_conversation"] = conversation.strip()
                enriched.append(record)
            except Exception as e:
                print(f"Error generating multi-turn QA for {record['image_path']}: {e}")

    return enriched

# Stage 3: Generate simple Q&A using Mistral and save final output
def _simple_qa_prompt(record):
    """Build the Mistral prompt that requests 3-5 short-answer Q&A pairs."""
    return f"""
You are an AI assistant specialized in agricultural topics. You are provided with the text
description of an image of a plant, attributes of the plant (such as name, disease),
and common information of the plant. Unfortunately, you don't have access to the actual
image.
You are a helpful tutor. Based on the image of a {record['class']} from the {record['dataset']} dataset, generate 3–5 basic question–answer pairs.

Instructions:
- Start each question with "Q:" and answer with "A:".
- Keep answers very short — just the name or label (like "Tomato", "Late blight", etc).
- Do not use full sentences or long explanations in answers.
- produce to the point answer do not produce any kind of explanation

Context:
Image Description: {record['description']}
Attributes: {record['attributes']}
External Knowledge: {record['external_knowledge']}
"""


def generate_simple_qa_and_save(results):
    """Stage 3: add short-answer Q&A to each record and write the final JSONL.

    Args:
        results: Iterable of record dicts as returned by
            ``generate_multiturn_qa``. Each dict is updated in place with a
            ``simple_qa`` key.

    ``FINAL_OUTPUT_FILE`` is truncated and then receives one JSON object per
    successfully processed record. A record whose generation raises is
    reported on stdout and not written.
    """
    with open(FINAL_OUTPUT_FILE, "w", encoding="utf-8") as sink:
        for record in results:
            print(f"[Stage 3] Generating simple QA for: {record['image_path']}")
            try:
                answer_text = call_ollama("mistral", _simple_qa_prompt(record))
                record["simple_qa"] = answer_text.strip()

                serialized = json.dumps(record, ensure_ascii=False)
                sink.write(serialized + "\n")
            except Exception as e:
                print(f"Error generating simple QA for {record['image_path']}: {e}")

# Script entry point: runs the three stages in order. Each stage consumes the
# output of the previous one, so the order must not change.
if __name__ == "__main__":
    # Stage 1 writes one description record per image to TEMP_DESCRIPTIONS_FILE.
    print("Stage 1: Generating image descriptions with LLaVA:13b...")
    generate_descriptions()

    # Stage 2 reads that file back and returns the enriched records in memory.
    print("\nStage 2: Generating multi-turn questions with Mistral...")
    multiturn_results = generate_multiturn_qa()

    # Stage 3 extends the in-memory records and writes them to FINAL_OUTPUT_FILE.
    print("\nStage 3: Generating simple QA with Mistral...")
    generate_simple_qa_and_save(multiturn_results)

    print("\nDataset generation complete. Results saved to:", FINAL_OUTPUT_FILE)
