In [None]:
# !/usr/bin/env python3
import os
import csv
import google.generativeai as genai
import re
import pandas as pd
import time
import glob


### Base model data generation

In [None]:
# Gemini API keys. All three are blank in this file; only API_KEY_3 is passed
# to genai.configure below, API_KEY_1 and API_KEY_2 are not used in this cell.
API_KEY_1 =""
API_KEY_2 = ""
API_KEY_3 = ""
genai.configure(api_key=API_KEY_3)

# System constants
# FILE_NAME: CSV file used as the default target of the base generation run.
FILE_NAME = "disease_data.csv"
# SYSTEMS: the body-system labels that the generation prompt lists as allowed.
SYSTEMS = [
    "CirculatorySystem", "RespiratorySystem", "DigestiveSystem", "NervousSystem",
    "MusculoskeletalSystem", "EndocrineSystem", "ImmuneSystem", "UrinarySystem",
    "ReproductiveSystem", "IntegumentarySystem"
]


# Functions for duplicated data handling
def get_existing_disease_names(filename):
    """Read the disease names already stored in a CSV file.

    A disease name is the part of each ``Disease_Information`` value before
    the first colon, stripped and lower-cased so that later comparisons are
    case-insensitive.

    Args:
        filename: Path to a CSV file with a ``Disease_Information`` column.

    Returns:
        A set of normalized disease names. The set is empty when the file is
        missing, has zero size, or cannot be read or parsed.
    """
    if not os.path.exists(filename) or os.path.getsize(filename) == 0:
        return set()

    try:
        df = pd.read_csv(filename)
        # Rows with a missing description are read as NaN. Drop them so NaN
        # is not counted as a disease name (it would inflate the count).
        names = (
            df['Disease_Information']
            .dropna()
            .astype(str)
            .str.split(':', n=1)
            .str[0]
            .str.strip()
            .str.lower()
        )
        return set(names)
    except Exception as e:
        # Best effort: an unreadable file is treated as containing no names.
        print(f"Lỗi khi đọc tên bệnh từ CSV: {e}")
        return set()

# Functions for data generation and processing
def filter_new_data(new_data, existing_names_set):
    """Drop generated entries whose disease name is already known.

    The disease name is the text before the first colon of the information
    string, stripped and lower-cased. An entry is rejected when its name is
    in ``existing_names_set`` or was already accepted earlier in the same
    batch, so a single batch cannot introduce the same disease twice.

    Args:
        new_data: Iterable of ``(disease_information, body_system)`` pairs.
        existing_names_set: Normalized names that are already stored. It is
            only read, never modified.

    Returns:
        A tuple ``(filtered_data, duplicates_count)``: ``filtered_data`` is a
        list of ``[disease_information, body_system]`` lists (the row shape
        that ``csv.writer.writerows`` accepts) and ``duplicates_count`` is
        the number of rejected entries.
    """
    filtered_data = []
    duplicates_count = 0
    # Names accepted so far in this batch; kept separate from the caller's
    # set so the argument is not mutated.
    batch_names = set()

    for info, system in new_data:
        disease_name = info.split(':', 1)[0].strip().lower()

        if disease_name in existing_names_set or disease_name in batch_names:
            duplicates_count += 1
            continue

        batch_names.add(disease_name)
        filtered_data.append([info, system])

    return filtered_data, duplicates_count

# Functions for prompt preparation
def prepare_simple_prompt(num_entries):
    """Build the generation prompt asking for ``num_entries`` disease tuples.

    The prompt states the required tuple format, lists the allowed body
    systems taken from ``SYSTEMS``, and spells out the output rules.
    """
    # Render the allowed systems once, as a comma-separated list
    allowed_systems = ', '.join(SYSTEMS)

    prompt = f"""
    You are an expert medical data generator. Your task is to generate {num_entries} new disease entries following the EXACT format: ('Disease Name: Characteristic Symptoms and bodily impact', 'Body System'),.

    The generated diseases must be randomly distributed among the following systems: {allowed_systems}.

    **OUTPUT REQUIREMENTS:**
    1. Generate exactly {num_entries} unique entries.
    2. Output ONLY the lines of data (Python tuples).
    3. Each line MUST start with '(' and end with '),' on a new line.
    4. Use ONLY English for the disease information.
    5. Use ONLY the system names provided in the list above.
    """
    return prompt
# Functions for Gemini API interaction and data parsing
# Matches one generated tuple: ('info', 'system') optionally followed by a
# comma. The comma is optional so the last tuple of a response is not lost
# when the model omits it; re.DOTALL lets one entry span several lines.
_DISEASE_ENTRY_PATTERN = re.compile(r"\('(.*?)',\s*'(.*?)'\),?", re.DOTALL)


def _parse_disease_entries(raw_text):
    """Extract [disease_information, body_system] pairs from raw model text."""
    return [
        [info.strip(), system.strip()]
        for info, system in _DISEASE_ENTRY_PATTERN.findall(raw_text)
    ]


def generate_and_parse_diseases(prompt):
    """Send ``prompt`` to Gemini and parse the reply into disease rows.

    Args:
        prompt: The full prompt text to send to the model.

    Returns:
        A list of ``[disease_information, body_system]`` lists, one per
        tuple found in the response text. An empty list is returned when
        nothing matches or when the API call or parsing raises.
    """
    try:
        model = genai.GenerativeModel('gemini-2.5-flash')

        print("Calling Gemini API...")
        response = model.generate_content(prompt)

        return _parse_disease_entries(response.text)

    except Exception as e:
        # Any failure is reported and mapped to "no data" for the caller.
        print(f"Error during Gemini API call or parsing: {e}")
        return []
def append_to_csv(data, filename=FILE_NAME):
    """Append disease rows to a CSV file, writing the header when needed.

    Nothing is written when ``data`` is empty. The header row is emitted only
    if the file did not exist before it was opened or currently has zero
    size. Write errors are printed rather than raised.
    """
    if not data:
        print("No new data to save.")
        return

    column_names = ['Disease_Information', 'Body_System']
    # Must be checked before open(): append mode creates a missing file
    existed_before_open = os.path.exists(filename)

    try:
        with open(filename, 'a', newline='', encoding='utf-8') as handle:
            csv_out = csv.writer(handle)

            needs_header = (not existed_before_open) or os.path.getsize(filename) == 0
            if needs_header:
                csv_out.writerow(column_names)

            csv_out.writerows(data)

        print(f"\n- Finished saving {len(data)} new entries to {filename}.")

    except Exception as err:
        print(f"Error writing to CSV: {err}")

In [None]:
# Just run the main loop
if __name__ == "__main__":

    # Settings that do not change between iterations
    TARGET_N = 1000  # total number of records to reach
    BATCH_SIZE = 20  # number of diseases requested per API call

    # Keep requesting batches until the file holds TARGET_N distinct names
    while True:
        # 1. Re-read the file so the count reflects what has been saved
        existing_names_set = get_existing_disease_names(FILE_NAME)
        current_count = len(existing_names_set)
        print("-" * 50)
        print(f"Number of existing records: {current_count}")

        if current_count >= TARGET_N:
            print(f"Reached target of {TARGET_N} records. Exiting.")
            break

        # The final request may be smaller than a full batch
        remaining_needed = TARGET_N - current_count
        entries_to_generate = min(BATCH_SIZE, remaining_needed)
        print(f"Needing {remaining_needed} more records. Requesting Gemini to generate {entries_to_generate} records.")

        # 2-3. Build the prompt, call Gemini and parse the reply
        prompt = prepare_simple_prompt(entries_to_generate)
        final_data_raw = generate_and_parse_diseases(prompt)

        # An empty result stops the run so the API problem can be inspected
        if not final_data_raw:
            print("Cannot get data from Gemini. Pausing to check API error.")
            break

        # 4. Remove entries whose name is already in the file
        unique_new_data, duplicates_removed = filter_new_data(final_data_raw, existing_names_set)
        if duplicates_removed > 0:
            print(f"Eliminated {duplicates_removed} duplicate entries from generated data.")

        # 5. Persist whatever survived the filter
        if not unique_new_data:
            print("- All generated data were duplicates or invalid. Retrying later.")
        else:
            append_to_csv(unique_new_data)
            print(f"Saved {len(unique_new_data)} new records to file: {FILE_NAME}")

        # Pause between API calls to avoid rate limiting
        time.sleep(5)

--------------------------------------------------
Tổng số bản ghi hiện có: 0
Cần thêm 1000 bản ghi. Đang yêu cầu Gemini sinh 20 bản ghi.
Đang gọi API Gemini để tạo dữ liệu...

✅ Đã lưu tiếp 20 bản ghi mới vào tệp: disease_data.csv
Đã lưu tiếp 20 bản ghi mới vào tệp: disease_data.csv
--------------------------------------------------
Tổng số bản ghi hiện có: 20
Cần thêm 980 bản ghi. Đang yêu cầu Gemini sinh 20 bản ghi.
Đang gọi API Gemini để tạo dữ liệu...

✅ Đã lưu tiếp 20 bản ghi mới vào tệp: disease_data.csv
Đã lưu tiếp 20 bản ghi mới vào tệp: disease_data.csv
--------------------------------------------------
Tổng số bản ghi hiện có: 40
Cần thêm 960 bản ghi. Đang yêu cầu Gemini sinh 20 bản ghi.
Đang gọi API Gemini để tạo dữ liệu...
🚨 Đã loại bỏ 1 bản ghi trùng lặp do Gemini không tuân thủ.

✅ Đã lưu tiếp 19 bản ghi mới vào tệp: disease_data.csv
Đã lưu tiếp 19 bản ghi mới vào tệp: disease_data.csv
--------------------------------------------------
Tổng số bản ghi hiện có: 59
Cần thê

### KG data handling

In [None]:
# Prompt for extracting unique symptoms from disease descriptions
def create_extraction_prompt(descriptions: list) -> str:
    """Build the symptom-extraction prompt for a list of disease descriptions.

    Each description becomes one "- " bullet line; the bullets are embedded
    between the INPUT DESCRIPTIONS / END OF INPUT markers of the prompt.
    """
    # One bullet per description, joined by newlines
    bullet_lines = (f"- {entry}" for entry in descriptions)
    bulleted_input = "\n".join(bullet_lines)

    return f"""
    Based on the following list of disease descriptions, your task is to extract all unique, distinct, and characteristic symptoms and clinical findings.

    Follow these strict rules:
    1. Extract ONLY the specific medical symptoms (e.g., 'nausea', 'fractures', 'cough').
    2. Do NOT include general impacts (e.g., 'impacting motor control', 'affecting digestion', 'impaired movement').
    3. Standardize and normalize related symptoms (e.g., use 'cough' instead of 'chronic cough' unless the severity is the unique finding).
    4. Ensure the final list contains ONLY unique entries (no duplicates).
    5. Present the output as a single, comma-separated list of items.
    6. Do NOT include any explanations, markdown code blocks, or headers in your final output.

--- INPUT DESCRIPTIONS ---
{bulleted_input}
--- END OF INPUT ---

OUTPUT LIST (Comma-separated, unique symptoms only):
"""

# Function to call Gemini API and process the output
def extract_and_clean_symptoms(descriptions: list) -> list:
    """Ask Gemini for the unique symptoms found in ``descriptions``.

    Args:
        descriptions: Disease description strings to analyse.

    Returns:
        A sorted list of unique, lower-cased symptom strings. The list is
        empty when ``descriptions`` is empty or when the API call or the
        post-processing raises.
    """
    if not descriptions:
        return []

    try:
        prompt = create_extraction_prompt(descriptions)
        model = genai.GenerativeModel('gemini-2.5-flash')

        print("\nCalling Gemini API for symptom extraction...")

        response = model.generate_content(prompt)
        raw_text = response.text.strip()

        # Split on commas AND line breaks. Deleting the line breaks instead
        # would glue together items the model placed on separate lines.
        symptoms = {
            item.strip().lower()
            for item in re.split(r'[,\r\n]+', raw_text)
            if item.strip()
        }

        # The set removes duplicates; sorting gives a stable order.
        return sorted(symptoms)

    except Exception as e:
        print(f"- Error during Gemini API call or processing: {e}")
        return []

# Function to load disease descriptions from CSV
def load_disease_descriptions(filename: str) -> list:
    """Load the ``Disease_Information`` column of a CSV file as strings.

    Args:
        filename: Path to the CSV file.

    Returns:
        A list with one string per non-missing ``Disease_Information`` value.
        The list is empty when the file does not exist, lacks the column, or
        cannot be read.
    """
    if not os.path.exists(filename):
        print(f"- Error: File {filename} not found.")
        return []

    try:
        df = pd.read_csv(filename)

        if 'Disease_Information' not in df.columns:
            print("- Error: CSV file does not contain 'Disease_Information' column.")
            return []

        # Drop missing cells first: astype(str) alone would turn them into
        # the literal description "nan".
        descriptions = df['Disease_Information'].dropna().astype(str).tolist()

        print(f"- Successfully loaded {len(descriptions)} disease descriptions from file.")
        return descriptions

    except Exception as e:
        print(f"- Error reading CSV file: {e}")
        return []

# --- Main Execution ---
try:
    all_descriptions = load_disease_descriptions(FILE_NAME)

    if not all_descriptions:
        # Nothing to send to the model. unique_traits_list is still defined
        # so that code running afterwards can rely on the name existing.
        print("- No input data to process.")
        unique_traits_list = []
    else:
        # 1. Input data is available
        INPUT_DATA = all_descriptions

        # 2. Call extraction function
        unique_traits_list = extract_and_clean_symptoms(INPUT_DATA)

        # 3. Print results
        if unique_traits_list:
            print("\n" + "="*60)
            print("RESULT: LIST OF UNIQUE, STANDARDIZED SYMPTOMS:")
            print("="*60)
            for position, trait in enumerate(unique_traits_list, 1):
                # Capitalize the first letter for readability
                print(f"{position}. {trait.capitalize()}")
            print(f"\nNumber of unique symptoms extracted: {len(unique_traits_list)}")
        else:
            print("\n- Cannot extract symptoms. Please check API connection and input data.")

except ValueError as e:
    print(f"- Error initializing: {e}")
except Exception as e:
    print(f"- Error: {e}")

✅ Đã tải thành công 1000 mô tả bệnh lý từ file.

Đang gọi API Gemini để phân tích ngữ nghĩa và rút trích các tính trạng duy nhất...

KẾT QUẢ DANH SÁCH CÁC TÍNH TRẠNG DUY NHẤT (Đã Chuẩn Hóa):
1. Abdominal distension
2. Abdominal pain
3. Abnormal bone growth
4. Abnormal cell growth in reproductive organs
5. Abnormal growth of glandular tissue
6. Abnormal keratin production
7. Abnormal nevus formation
8. Abnormal proliferation of epithelial cells in stomach
9. Abnormal stress response
10. Absence of germ cells in testes
11. Absence of menstrual cycles
12. Absent puberty
13. Accelerated metabolism
14. Acne
15. Acute respiratory distress
16. Airway constriction
17. Airway obstruction
18. Altered body contour
19. Altered sleep patterns
20. Alternating hypoglycemia and hyperglycemia
21. Anemia
22. Angina
23. Anxiety
24. Asphyxiation
25. Aspiration risk
26. Asynchronous hormone production
27. Asynchronous processing of sensory input
28. Atrophy
29. Atrophy of motor neurons
30. Attacks on brain

In [None]:
# Prompt for augmenting symptoms into OWL/RDF Turtle format
def create_augmentation_prompt(traits: list) -> str:
    """Build the prompt asking Gemini for OWL/RDF Turtle blocks for ``traits``.

    The prompt lists the organs a symptom may be linked to, the generation
    rules, the symptoms to process and one example block in the required
    format.

    Args:
        traits: The symptom labels for which Turtle blocks are requested.
            Only these symptoms are placed in the prompt, so a caller that
            works in chunks sends just the current chunk.

    Returns:
        The complete prompt text.
    """
    # Template showing the shape every generated entry must follow
    EXAMPLE_ENTRY = """
:NewSymptomName rdf:type :Symptom ;
    rdfs:label "New Symptom Label" ;
    :associatedWith :ExistingOrgan ;
    :confidenceScore "X.X"^^xsd:float ;
    :specificityScore "Y.Y"^^xsd:float ;
    :prevalenceScore "Z.Z"^^xsd:float ;
    :priorityRank N .
"""

    # The only organ identifiers the model is told it may link a symptom to
    VALID_ORGANS = ":Heart, :Arteries, :Veins, :Capillaries, :Lungs, :Trachea, :Bronchi, :Stomach, :Liver, :SmallIntestine, :LargeIntestine, :Brain, :SpinalCord, :Nerves, :Bones, :Muscles, :Joints, :PituitaryGland, :ThyroidGland, :AdrenalGlands, :LymphNodes, :Spleen, :BoneMarrow, :Kidneys, :Ureters, :Bladder, :Testes, :Prostate, :Ovaries, :Uterus, :Skin, :Hair, :Nails."

    # The symptoms list is rendered from the ``traits`` argument (as the
    # list's repr), not from any module-level variable.
    PROMPT = f"""
You are an expert in biomedical ontology development. Your task is to generate OWL/RDF Turtle entries for the following list of new symptoms, integrating them into the existing Knowledge Graph structure.

--- EXISTING CONTEXT ---
1. **Properties:** All required properties (e.g., :associatedWith, :confidenceScore) are already defined in the KG header.
2. **Valid Organs:** The only organs you can link to are: {VALID_ORGANS}
3. **Requirement:** The generated symptom must be logically associated with one of these existing organs.

--- INSTRUCTION ---
For each symptom in the list below, generate a complete OWL/RDF Turtle block.

RULES:
1. **URI Name (e.g., :AbdominalPain):** Convert the symptom label (e.g., 'abdominal pain') to CamelCase.
2. **rdfs:label:** Use the original, properly formatted label.
3. **:associatedWith:** Map the symptom to the SINGLE, MOST LIKELY valid :Organ from the list above.
4. **Scores:** Assign realistic float values between 0.1 and 1.0 (e.g., "0.8"^^xsd:float).
5. **:priorityRank (CRITICAL):** Assign a logical integer rank (1, 2, 3, etc.) representing the priority in a clinical differential diagnosis process. **Rank 1 must be assigned to the organ/system association that is the most acutely life-threatening or requires the most urgent rule-out (e.g., Myocardial Infarction for Chest Pain).** Use higher numbers for less urgent associations.
6. **OUTPUT FORMAT:** Output the entries grouped by the Body System the associated Organ belongs to, separated by comments (e.g., # Digestive System Augmentations).

--- NEW SYMPTOMS LIST ---
{traits}

--- EXAMPLE OF REQUIRED FORMAT ---
{EXAMPLE_ENTRY.strip()}

BEGIN GENERATION (Output ONLY the Turtle triples and grouping comments):
"""
    return PROMPT

# Function to call Gemini API and get OWL/RDF Turtle blocks
def generate_owl_augmentation(traits: list) -> str:
    """Request OWL/RDF Turtle blocks from Gemini for the given symptoms.

    Returns the stripped response text, or an empty string when the API call
    or reading the response raises.
    """
    augmentation_prompt = create_augmentation_prompt(traits)

    try:
        gemini = genai.GenerativeModel('gemini-2.5-flash')

        print(f"\nCalling Gemini API for OWL/RDF augmentation of {len(traits)} symptoms...")

        # The text is returned as received (only stripped); it is expected
        # to already be the formatted Turtle block.
        return gemini.generate_content(augmentation_prompt).text.strip()

    except Exception as e:
        print(f"- Error during Gemini API call or processing: {e}")
        return ""

# --- MAIN EXECUTION BLOCK --- 
try:
    # Split unique_traits_list into at most n_splits roughly equal parts.
    n_splits = 6
    # Ceiling division; the floor of 1 keeps the range() step non-zero so an
    # empty symptom list yields zero chunks instead of raising ValueError.
    chunk_size = max(1, -(-len(unique_traits_list) // n_splits))
    chunks = [
        unique_traits_list[start:start + chunk_size]
        for start in range(0, len(unique_traits_list), chunk_size)
    ]

    print("\n" + "="*80)
    print("ReSULT: OWL/RDF AUGMENTATION OUTPUT (Turtle/N3):")
    print("="*80)

    # Prefix declarations written once at the top of the output file
    PREFIXES = """
@prefix : <http://example.org/human_body#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
\n"""

    OUTPUT_FILE = "new_symptom_augmentations.ttl"

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.write(PREFIXES)

        # One API call per chunk; each non-empty reply is appended to the file
        for idx, chunk in enumerate(chunks, 1):
            print(f"\n--- PROCESSING CHUNK {idx}/{len(chunks)} (size = {len(chunk)}) ---")

            owl_triples_block = generate_owl_augmentation(chunk)

            if owl_triples_block:
                f.write(owl_triples_block + "\n")
                # NOTE(review): this dashed separator is not a Turtle comment
                # or statement; confirm it is removed before the file is
                # loaded by an RDF parser.
                f.write("----------------------------------------------------------------------------\n")
                print(f"- Finished chunk {idx}, written to file.")
            else:
                print(f"- Chunk {idx} has no OWL data.")
            time.sleep(5)  # Avoid rate limiting
    print(f"\n--- FINISHED, output file: {OUTPUT_FILE} ---\n")

except ValueError as e:
    print(f"- Error initializing: {e}")
except Exception as e:
    print(f"- Error: {e}")



KẾT QUẢ OWL/RDF AUGMENTATION (Turtle/N3):

--- ĐANG XỬ LÝ CHUNK 2/6 (số phần tử = 103) ---

Đang gọi Gemini để tạo 103 khối OWL/RDF...
✓ Hoàn thành chunk 2, đã ghi vào file.

--- ĐANG XỬ LÝ CHUNK 3/6 (số phần tử = 103) ---

Đang gọi Gemini để tạo 103 khối OWL/RDF...
✓ Hoàn thành chunk 3, đã ghi vào file.

--- ĐANG XỬ LÝ CHUNK 4/6 (số phần tử = 103) ---

Đang gọi Gemini để tạo 103 khối OWL/RDF...
✓ Hoàn thành chunk 4, đã ghi vào file.

--- ĐANG XỬ LÝ CHUNK 5/6 (số phần tử = 103) ---

Đang gọi Gemini để tạo 103 khối OWL/RDF...
✓ Hoàn thành chunk 5, đã ghi vào file.

--- ĐANG XỬ LÝ CHUNK 6/6 (số phần tử = 103) ---

Đang gọi Gemini để tạo 103 khối OWL/RDF...
✓ Hoàn thành chunk 6, đã ghi vào file.

--- ĐANG XỬ LÝ CHUNK 7/6 (số phần tử = 101) ---

Đang gọi Gemini để tạo 101 khối OWL/RDF...
✓ Hoàn thành chunk 7, đã ghi vào file.

--- HOÀN TẤT, file kết quả: new_symptom_augmentations.ttl ---



In [None]:
INPUT_FILE = "text.txt"  # new_symptom_augmentations.ttl but changed to txt for processing
OUTPUT_FILE = "text_dedup_blocks.txt" # Output file after deduplication and formatting

# Read input file
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    lines = f.readlines()

# 1. Split into blocks based on symptoms
# A block starts with a line that begins with ":" and contains
# "rdf:type :Symptom", and ends with the first following line ending in ".".
# Non-empty lines outside a block become one-line blocks (e.g. comments,
# separators); empty lines outside a block are ignored.
blocks = []
current_block = []
for line in lines:
    if line.strip().startswith(":") and "rdf:type :Symptom" in line:
        # A new header closes a block that never reached its "." line
        if current_block:
            blocks.append(current_block)
        current_block = [line]
    elif current_block:
        current_block.append(line)
        if line.strip().endswith("."):
            blocks.append(current_block)
            current_block = []
    else:
        if line.strip() != "":  # Ignore empty lines outside blocks
            blocks.append([line])

# A block still open at end of file (no terminating "." line, e.g. truncated
# output) would otherwise be dropped silently; keep it like any other block.
if current_block:
    blocks.append(current_block)
    current_block = []

# 2. Remove duplicate symptom blocks based on the symptom name
# Keep the first occurrence, remove subsequent ones
seen = set()
unique_blocks = []
for block in blocks:
    header = block[0].strip() if block else ""
    if header.startswith(":") and "rdf:type :Symptom" in header:
        name = header.split()[0]  # e.g. ":Angina"
        if name not in seen:
            unique_blocks.append(block)
            seen.add(name)
    else:
        unique_blocks.append(block)

# 3. Normalize formatting within each block
# - Remove empty lines within blocks
# - Follow every line that starts with "#" by a 100-dash separator line
cleaned_blocks = []
for block in unique_blocks:
    new_block = []
    for line in block:
        if line.strip() == "":
            continue
        if line.lstrip().startswith("#"):
            new_block.append(line.rstrip() + "\n")
            new_block.append("-" * 100 + "\n")
        else:
            new_block.append(line)
    cleaned_blocks.append(new_block)

# 4. Merge blocks back into final lines with exactly one blank line between blocks
final_lines = []
for i, block in enumerate(cleaned_blocks):
    final_lines.extend(block)
    if i < len(cleaned_blocks) - 1:
        final_lines.append("\n")  # 1 blank line between blocks

# Write to output file
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.writelines(final_lines)

print(f"Finished writing cleaned data to {OUTPUT_FILE}.")


✅ Đã lọc trùng, format lại file -> text_dedup_blocks.txt


### Client data generation

In [None]:
# Define the folder path containing CSV files
# NOTE: this is an absolute path on a Windows drive, written as a raw string
# so the backslashes are taken literally; adjust it when running elsewhere.
FOLDER_PATH = r"D:\Hoàng Phong\University stuff\Khoa luan\Graph Federated Learning\data\medical"

# Function to read all CSV files and extract existing disease names
def get_all_existing_disease_names(folder_path):
    """Collect the disease names found in every CSV file of a folder.

    For each ``*.csv`` file directly inside ``folder_path`` the text before
    the first colon of every ``Disease_Information`` value is taken, stripped
    and lower-cased. Names from all files are merged into one set.

    Args:
        folder_path: Directory that is searched for ``*.csv`` files.

    Returns:
        The union of normalized disease names over all readable files. A
        file that cannot be read or lacks the column is reported and
        skipped; it does not stop the scan. The set is empty when no file
        matches.
    """
    all_names = set()
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            # Missing descriptions are read as NaN; drop them so NaN is not
            # collected as a name.
            names = (
                df['Disease_Information']
                .dropna()
                .astype(str)
                .str.split(':', n=1)
                .str[0]
                .str.strip()
                .str.lower()
            )
        except Exception as e:
            # Skip only the failing file and keep scanning the others
            print(f"- Error reading disease names from CSV: {e}")
            continue
        all_names.update(names)
    return all_names

# Ensure the CSV file exists with the correct header
def ensure_file_exists(file_path):
    """Create a header-only CSV file at ``file_path`` if none exists yet.

    The header uses the ``Disease_Information`` / ``Body_System`` column
    names that the CSV readers and the append writer in this file use, so a
    file created here can be read back by them. An existing file is left
    untouched.

    Args:
        file_path: Path of the CSV file; missing parent directories are
            created. A bare file name (no directory part) is accepted.
    """
    if os.path.exists(file_path):
        return

    parent_dir = os.path.dirname(file_path)
    # dirname is '' for a bare file name, and os.makedirs('') would raise
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df = pd.DataFrame(columns=["Disease_Information", "Body_System"])
    df.to_csv(file_path, index=False, encoding="utf-8-sig")
    print(f"Created new file: {file_path}")

# Extract all existing disease names at the start
# The set is computed once here from FOLDER_PATH; nothing in this cell
# refreshes it afterwards.
all_existing_names_set = get_all_existing_disease_names(FOLDER_PATH)

In [20]:
# Bare expression: displays the collected set as this notebook cell's output.
all_existing_names_set

{'osteo-porosity acuta',
 'spermatic duct obstructive hyperplasia',
 'thyro-dysgenesis',
 'auto-lymphocytosis',
 'gonadal hormone resistance',
 'neuro-electrolyte imbalance disorder',
 'myofascial sheath syndrome',
 'osteo-myelodegeneration',
 'gastric lumen atrophy',
 'auto-crescendo',
 'vascular attenuata syndrome',
 'osteo-crystalline decay',
 'veno-arterial shunt anomaly',
 'bronchial arborization syndrome',
 'pulmonary fibroplasty',
 'spinal cord atrophy syndrome',
 'bronchial spasm persistentis',
 'cortical atrophy syndromis',
 'renal tubular dysfunction',
 'cerebral demyelination syndrome',
 'hepatic drift',
 'synaptic pruning hyperactivity',
 'pancreo-dysregulation syndrome',
 'osteo-rigidia',
 'lympho-stagnation disease',
 'dermal fibrosis hyperplasia',
 'enteric hyper-absorption',
 'adreno-crisis hyper',
 'pulmonic sclerosis',
 'pituitary growth factor dysregulation',
 'nephro-sclerosis progressive',
 'broncho-alveolar fusion',
 'dermal fibro-calcification',
 'dermato-collage

In [None]:
# --- Main Execution ---
if __name__ == "__main__":
    FILE_NAME_1 = "client_3_data.csv"
    # NOTE(review): file_path is built here but the loop below reads and
    # writes FILE_NAME_1 (relative to the working directory), not file_path.
    # Confirm which location is intended.
    file_path = os.path.join(FOLDER_PATH, FILE_NAME_1)

    # Settings that do not change between iterations
    TARGET_N = 300  # total number of records to reach
    BATCH_SIZE = 20  # number of diseases requested per API call

    # Keep requesting batches until the client file holds TARGET_N names
    while True:
        # 1. Re-read the client file so the count reflects what was saved
        existing_names_set = get_existing_disease_names(FILE_NAME_1)
        current_count = len(existing_names_set)
        print("-" * 50)
        print(f"Number of existing records: {current_count}")

        if current_count >= TARGET_N:
            print(f"Reached target of {TARGET_N} records. Exiting.")
            break

        # The final request may be smaller than a full batch
        remaining_needed = TARGET_N - current_count
        entries_to_generate = min(BATCH_SIZE, remaining_needed)
        print(f"Needing {remaining_needed} more records. Requesting Gemini to generate {entries_to_generate} records.")

        # 2. Build the prompt, call Gemini and parse the reply
        prompt = prepare_simple_prompt(entries_to_generate)
        final_data_raw = generate_and_parse_diseases(prompt)

        # An empty result stops the run so the API problem can be inspected
        if not final_data_raw:
            print("❌ Không nhận được dữ liệu từ Gemini. Tạm dừng để kiểm tra lỗi API.")
            break

        # 3. Two filter passes: first against this client's file, then
        #    against the names collected in all_existing_names_set
        unique_new_data, duplicates_removed = filter_new_data(final_data_raw, existing_names_set)
        unique_new_data, duplicates_removed_1 = filter_new_data(unique_new_data, all_existing_names_set)

        if duplicates_removed > 0:
            print(f"- Eliminated {duplicates_removed} duplicate entries from generated data.")
        if duplicates_removed_1 > 0:
            print(f"- Eliminated {duplicates_removed_1} duplicate entries against global data.")

        # 4. Persist whatever survived both filters
        if not unique_new_data:
            print("All generated data were duplicates or invalid. Retrying later.")
        else:
            append_to_csv(unique_new_data,FILE_NAME_1)
            print(f"Saved {len(unique_new_data)} new records to file: {FILE_NAME_1}")

        # Pause between API calls to avoid rate limiting
        time.sleep(5)

--------------------------------------------------
Tổng số bản ghi hiện có: 0
Cần thêm 300 bản ghi. Đang yêu cầu Gemini sinh 20 bản ghi.
Đang gọi API Gemini để tạo dữ liệu...
🚨 Đã loại bỏ 1 bản ghi trùng lặp do trùng với dữ liệu toàn cục.

✅ Đã lưu tiếp 19 bản ghi mới vào tệp: client_3_data.csv
Đã lưu tiếp 19 bản ghi mới vào tệp: client_3_data.csv
--------------------------------------------------
Tổng số bản ghi hiện có: 19
Cần thêm 281 bản ghi. Đang yêu cầu Gemini sinh 20 bản ghi.
Đang gọi API Gemini để tạo dữ liệu...
🚨 Đã loại bỏ 5 bản ghi trùng lặp do trùng với dữ liệu toàn cục.

✅ Đã lưu tiếp 15 bản ghi mới vào tệp: client_3_data.csv
Đã lưu tiếp 15 bản ghi mới vào tệp: client_3_data.csv
--------------------------------------------------
Tổng số bản ghi hiện có: 34
Cần thêm 266 bản ghi. Đang yêu cầu Gemini sinh 20 bản ghi.
Đang gọi API Gemini để tạo dữ liệu...
🚨 Đã loại bỏ 4 bản ghi trùng lặp do trùng với dữ liệu toàn cục.

✅ Đã lưu tiếp 16 bản ghi mới vào tệp: client_3_data.csv
Đã