# 1. UK Health Claims

In [None]:
import pandas as pd
# Load the GB nutrition and health claims register into `df`.
# Define the path and sheet name
file_path = "data/NHC/great-britain-nutrition-and-health-claims-spreadsheet-26-March-2024.ods"
sheet_name = "Health_claims"

# Read the ODS file using the 'odf' engine
# (pandas relies on the optional `odfpy` package for this engine)
df = pd.read_excel(file_path, engine="odf", sheet_name=sheet_name)

# Clean up column names (strip leading/trailing whitespace) so that later
# cells can address columns by their exact titles
df.columns = df.columns.str.strip()


In [None]:
# Display the loaded register for a quick visual check
df

In [None]:

# Aggregator: merge every distinct reference of a group into one "__"-separated string
def concat_refs(series):
    """Join the distinct non-null values of *series* with "__".

    Values are converted to ``str`` before de-duplication and keep their
    order of first appearance. A series without any non-null value yields
    the empty string.
    """
    present = series.dropna()
    as_text = present.astype(str)
    distinct = as_text.unique()
    return "__".join(distinct)

# Aggregator: pick the first value of a group that is not null
def first_non_null(series):
    """Return the first non-null value of *series*, or ``None`` if there is none."""
    remaining = series.dropna()
    if remaining.empty:
        return None
    return remaining.iloc[0]

# Collapse the register to one row per distinct 'Claim' text.
# List of columns to aggregate. We want to group by 'Claim' and
# for the 'Scientific Opinion Reference' column, concatenate values.
# For all other columns, we'll take the first non-null value.
# The keys must match the (whitespace-stripped) column titles of `df` exactly.
agg_dict = {
    "Claim type": first_non_null,
    "Nutrient substance, food or food category": first_non_null,
    # NOTE(review): 'Claim' is both the grouping key and an aggregated column;
    # confirm this combination works with as_index=False on the installed pandas.
    "Claim": "first",  # Grouping column, so just take one copy
    "Conditions of use of the claim / Restrictions of use / Reasons for non-authorisation": first_non_null,
    "Health relationship": first_non_null,
    "Scientific Opinion Reference": concat_refs,
    "Regulation (Note: All claims after the 1st January 2021 were GB approved. Claims prior to the 1st of January 2021 were approved by the EU, and retained by GB.)": first_non_null,
    "Status (Note: Asterisked claims (*) were authorised on the basis of proprietary data and are also listed in the Annex to GB Nutrition and Health Claims Register)": first_non_null,
    "Entry Id": first_non_null  # Adjust column name if needed (removed extra spaces)
}

# Before grouping, ensure the column names match exactly.
# If the "Entry Id" column in your file has extra spaces, you might need to adjust accordingly.

# Group the dataframe by 'Claim' and aggregate the values
df_grouped = df.groupby("Claim", as_index=False).agg(agg_dict)

# Create a unique claim_id column (starting at 1); it is later embedded in the
# custom_id of every translation request and used to join the results back
df_grouped.insert(0, "claim_id", range(1, len(df_grouped) + 1))

# (Optional) Save the processed dataframe to a new CSV file for verification
# NOTE: the int_data/ directory must already exist for this write to succeed.
df_grouped.to_csv("int_data/UK_processed_health_claims.csv", index=False)

# Print a sample of the dataframe to verify results
print(df_grouped.head())


In [None]:
# Reload the processed claims from the CSV written earlier, so the cells below
# can run without repeating the aggregation
df_grouped = pd.read_csv("int_data/UK_processed_health_claims.csv")


In [None]:
# Display the processed claims (one row per distinct claim text)
df_grouped

# prompt

In [None]:
import os
# Provide the OpenAI API key for the client created later in this notebook.
# setdefault() keeps a key that is already exported in the environment, so the
# "KEY" placeholder can never overwrite a real credential. Prefer exporting
# OPENAI_API_KEY outside the notebook, and never commit a real key in this cell.
os.environ.setdefault("OPENAI_API_KEY", "KEY")  # Enter your key

In [None]:
import os
import json
import random
import time
import pandas as pd
import logging
import pickle
from tqdm import tqdm

# =============================================================================
# Step 0: Define the translation prompt components
# =============================================================================

# Define the response format JSON schema for claim translation with three translation keys.
# The schema is marked strict, lists all three keys as required and sets
# additionalProperties to False, i.e. it describes an object with exactly these
# three string fields. It is attached to every request body as "response_format"
# by create_jsonl_entry_from_claim().
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "claim_translation_v1",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "translation_claim": {
                    "type": "string",
                    "description": (
                        "The translated health claim text in the target language. "
                        "If a specialized medical term is not available in the target language, retain the original English term."
                    )
                },
                "translation_nutrient_substance": {
                    "type": "string",
                    "description": (
                        "The translated text for the nutrient substance, food or food category in the target language."
                    )
                },
                "translation_health_relationship": {
                    "type": "string",
                    "description": (
                        "The translated text for the health relationship in the target language."
                    )
                }
            },
            "required": ["translation_claim", "translation_nutrient_substance", "translation_health_relationship"],
            "additionalProperties": False
        }
    }
}

# Define the system instruction for the LLM as a translation expert.
# This text is sent as the "system" message of every request; the key names it
# mentions must stay in sync with the keys declared in `response_format`.
system_instruction = (
    "You are an expert translator specializing in medical claims. Your task is to accurately translate the provided text from English into the target language specified in the user prompt. "
    "This text includes three parts: the health claim, the nutrient substance (or food/food category), and the health relationship. "
    "Please provide your translations using the keys 'translation_claim', 'translation_nutrient_substance', and 'translation_health_relationship'. "
    "Be precise and maintain the original meaning. If a specialized medical term does not exist in the target language, keep the original English term. "
    "Your response must strictly adhere to the JSON schema provided in the response format."
)

# =============================================================================
# Function to create a JSONL entry for a given health claim translation prompt
# =============================================================================
def create_jsonl_entry_from_claim(claim, claim_id, nutrient, health_relationship, target_language):
    """
    Build one batch-request record asking the LLM to translate a health claim.

    The record targets the chat-completions endpoint and combines the
    module-level ``system_instruction`` and ``response_format`` with a user
    prompt assembled from the arguments. Arguments are interpolated with plain
    string formatting, so a missing value (``None``/NaN) shows up literally as
    "None"/"nan" in the prompt.

    Parameters:
        claim (str): The health claim text in English.
        claim_id (int): Unique identifier for the claim.
        nutrient (str): The nutrient substance, food or food category.
        health_relationship (str): The health relationship.
        target_language (str): The target language to translate the text into.

    Returns:
        dict: The request record; its ``custom_id`` is
        ``claim_<claim_id>_<target_language>``.
    """
    prompt_lines = [
        f"Translate the following text into '{target_language}':",
        f"Claim: {claim}",
        f"Nutrient substance, food or food category: {nutrient}",
        f"Health relationship: {health_relationship}",
    ]
    conversation = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": "\n".join(prompt_lines)},
    ]
    request_body = {
        "model": "gpt-4o-mini-2024-07-18",
        "messages": conversation,
        "temperature": 0,
        "max_tokens": 10000,
        "response_format": response_format,
    }
    return {
        "custom_id": f"claim_{claim_id}_{target_language}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }


The target languages below are the languages most used on internet websites, taken from https://en.wikipedia.org/wiki/Languages_used_on_the_Internet

In [None]:
# =============================================================================
# Step 1: Create the JSONL file for batch processing from health claims DataFrame
# =============================================================================

# Languages every claim is translated into (usage percentages from the source list are not needed)
target_languages = [
    "English", "Spanish", "Russian", "German", "French",
    "Japanese", "Portuguese", "Turkish", "Italian", "Persian",
    "Dutch", "Polish", "Chinese", "Vietnamese", "Indonesian",
    "Czech", "Korean", "Ukrainian", "Arabic", "Greek",
    "Hindi",
]

# Write one request line per (claim, language) pair
jsonl_filename = "batch_input_hc_multilingual.jsonl"
with open(jsonl_filename, 'w', encoding='utf-8') as jsonl_file:
    for _, row in df_grouped.iterrows():
        for lang in target_languages:
            entry = create_jsonl_entry_from_claim(
                claim=row['Claim'],
                claim_id=row['claim_id'],
                nutrient=row['Nutrient substance, food or food category'],
                health_relationship=row['Health relationship'],
                target_language=lang,
            )
            jsonl_file.write(f"{json.dumps(entry)}\n")

print("JSONL file created successfully.")


In [None]:

# =============================================================================
# Step 2: Split the JSONL file into smaller chunks (<100mb each)
# =============================================================================

# Create the input_batches_hc_multilingual directory if it doesn't exist.
# exist_ok=True does the check and the creation in one call, so there is no
# window between "does it exist?" and "create it".
os.makedirs("input_batches_hc_multilingual", exist_ok=True)

def split_jsonl_file(file_path, num_batches, output_dir="input_batches_hc_multilingual"):
    """
    Split a JSONL file into ``num_batches`` consecutive part files.

    Every part receives ``total_lines // num_batches`` lines and the last part
    additionally takes the remainder. If the file has fewer lines than
    ``num_batches``, all parts except the last are therefore created empty.
    Parts are written to
    ``<output_dir>/batch_input_hc_multilingual_part<k>.jsonl`` with k from 1.

    Parameters:
        file_path (str): Path of the JSONL file to split.
        num_batches (int): Number of part files to create (at least 1).
        output_dir (str): Directory that receives the part files; it is
            created if missing.

    Raises:
        ValueError: If ``num_batches`` is smaller than 1.
    """
    if num_batches < 1:
        raise ValueError(f"num_batches must be at least 1, got {num_batches}")

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Create the destination here so the function also works when called on its own
    os.makedirs(output_dir, exist_ok=True)

    lines_per_batch = len(lines) // num_batches

    for i in range(num_batches):
        start = i * lines_per_batch
        # The last part runs to the end of the list so it absorbs the remainder
        # in the same write instead of re-opening the file in append mode
        end = None if i == num_batches - 1 else start + lines_per_batch
        batch_file_path = os.path.join(output_dir, f"batch_input_hc_multilingual_part{i + 1}.jsonl")
        with open(batch_file_path, 'w', encoding='utf-8') as batch_file:
            batch_file.writelines(lines[start:end])

    print("Batches created successfully.")

# Split the combined JSONL file into 2 batch files (adjust the number as needed)
split_jsonl_file(jsonl_filename, 2)


In [None]:

# =============================================================================
# Step 3: Upload the JSONL files and create batch jobs using the OpenAI client
# =============================================================================

# Import the OpenAI client (assuming it's defined similarly to your existing code)
from openai import OpenAI

# os.environ["OPENAI_API_KEY"] = ""  # Set your API key here
# Initialize OpenAI client. No key is passed explicitly, so the client is
# expected to pick it up from the OPENAI_API_KEY environment variable.
client = OpenAI()

# Helper that writes a small random sample of a JSONL file for a pilot run
def create_sample_jsonl(file_path, sample_size=1):
    """
    Copy ``sample_size`` randomly chosen lines of a JSONL file into a sample file.

    The sample is always written to ``sample_input_hc_multilingual.jsonl`` in
    the current working directory, replacing any previous sample.

    Parameters:
        file_path (str): Path of the full JSONL file.
        sample_size (int): Number of lines to draw (without replacement).

    Returns:
        str: Path of the sample file that was written.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        all_lines = list(source)

    # Draw without replacement from the complete set of lines
    chosen = random.sample(all_lines, sample_size)

    sample_file_path = "sample_input_hc_multilingual.jsonl"
    with open(sample_file_path, 'w', encoding='utf-8') as target:
        target.write("".join(chosen))

    print(f"Sample of {sample_size} lines created successfully in {sample_file_path}.")
    return sample_file_path

# Function to upload the sample file and create a batch job
def upload_and_create_sample_batch(sample_file_path):
    """
    Upload a sample JSONL file and start a pilot batch job for it.

    Uses the module-level ``client``.

    Parameters:
        sample_file_path (str): Path of the JSONL file to upload.

    Returns:
        The ``id`` of the created batch job.
    """
    # Open the file in a context manager so the handle is closed as soon as
    # the upload call returns, even if the upload raises
    with open(sample_file_path, "rb") as sample_file:
        sample_input_file = client.files.create(
            file=sample_file,
            purpose="batch"
        )

    # Create a batch job with the uploaded sample file
    batch = client.batches.create(
        input_file_id=sample_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "Pilot health claim translation job with sample line(s)"
        }
    )

    print(f"Sample batch job created successfully with Batch ID: {batch.id}")
    return batch.id

# Pilot run: draw one random request from the full input file and submit it as
# its own batch job before the full batches are launched below
sample_file = create_sample_jsonl("batch_input_hc_multilingual.jsonl", 1)
sample_batch_id = upload_and_create_sample_batch(sample_file)

# Function to upload and create batches for all parts
def upload_and_create_batches(num_batches):
    """
    Upload every split part file and start one batch job per part.

    Reads ``input_batches_hc_multilingual/batch_input_hc_multilingual_part<i>.jsonl``
    for i in 1..num_batches and uses the module-level ``client``.

    Parameters:
        num_batches (int): Number of part files to upload.

    Returns:
        list: The batch job ids, in part order.
    """
    batch_ids = []
    for i in range(1, num_batches + 1):
        batch_file_path = f"input_batches_hc_multilingual/batch_input_hc_multilingual_part{i}.jsonl"

        # Upload the JSONL file for batch processing; the context manager
        # closes the handle after each upload instead of leaking one per part
        with open(batch_file_path, "rb") as part_file:
            batch_input_file = client.files.create(
                file=part_file,
                purpose="batch"
            )

        # Create the batch job
        batch = client.batches.create(
            input_file_id=batch_input_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": f"Health claim translation job part {i}"
            }
        )

        batch_ids.append(batch.id)
        print(f"Batch {i} job created successfully with Batch ID: {batch.id}")

        # Pause between uploads to avoid hitting rate limits
        time.sleep(1)

    return batch_ids

# Run the function to create all batch jobs.
# num_batches must equal the number of part files produced by split_jsonl_file (2 here).
num_batches = 2
batch_ids = upload_and_create_batches(num_batches)

# (Optional) Save the batch IDs for later retrieval
with open("batch_ids_hc_multilingual.pkl", "wb") as file:
    pickle.dump(batch_ids, file)


In [None]:

# =============================================================================
# Step 4: Check the status of each batch and retrieve the results
# =============================================================================

# Create the output_batches_hc_multilingual directory if it doesn't exist.
# exist_ok=True does the check and the creation in one call, so there is no
# window between "does it exist?" and "create it".
os.makedirs("output_batches_hc_multilingual", exist_ok=True)

def check_batch_status(batch_ids):
    """
    Print the status of each batch job and download the output of completed ones.

    Uses the module-level ``client``. The output of a completed batch is saved as
    ``output_batches_hc_multilingual/batch_output_hc_multilingual_<batch_id>.jsonl``.
    Jobs that ended as failed, expired or cancelled are reported as such
    instead of being described as still pending.

    Parameters:
        batch_ids (iterable): Ids of the batch jobs to check.
    """
    output_dir = "output_batches_hc_multilingual"
    # Terminal statuses after which a job never becomes 'completed'
    ended_statuses = {'failed', 'expired', 'cancelled'}

    for batch_id in batch_ids:
        batch_status = client.batches.retrieve(batch_id)
        status = batch_status.status
        print(f"Status for Batch ID {batch_id}: {status}")

        if status == 'completed':
            output_file_id = batch_status.output_file_id
            if not output_file_id:
                # Guard against a completed batch without an output file
                # (presumably when no request succeeded) so that
                # files.content() is never called with None
                print(f"Batch {batch_id} completed without an output file; inspect its error file.")
                continue

            # Retrieve the output file
            file_response = client.files.content(output_file_id)
            os.makedirs(output_dir, exist_ok=True)
            output_file_path = f"{output_dir}/batch_output_hc_multilingual_{batch_id}.jsonl"
            with open(output_file_path, "w", encoding='utf-8') as output_file:
                output_file.write(file_response.text)
            print(f"Batch output saved to {output_file_path}")
        elif status in ended_statuses:
            print(f"Batch {batch_id} ended with status '{status}' and will not complete; resubmit its input if the results are still needed.")
        else:
            print(f"Batch {batch_id} not completed yet. Please check again later.")

# Example usage: Check the status of the batches.
# Re-run this cell until every batch reports 'completed' and has been saved.
check_batch_status(batch_ids)


In [None]:

# =============================================================================
# Step 5: Combine the results from all batches into a single DataFrame and save as CSV
# =============================================================================

def load_jsonl(file_path):
    """Load a JSONL file into a list of dictionaries.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising ``json.JSONDecodeError``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the handle directly; there is no need to hold every raw line in memory first
        return [json.loads(line) for line in file if line.strip()]

def normalize_claim_responses(data):
    """
    Process and normalize the response format for health claim translation.

    For every choice of every entry, the message content is parsed as JSON and
    the three translation fields are extracted ("NA" for a missing key).
    Entries without a usable response, and contents that are not a JSON
    object, are written to the error log and skipped.

    Parameters:
        data (list): Parsed lines of a batch output file (dictionaries).

    Returns:
        pandas.DataFrame: Columns ``custom_id`` plus the three translation
        keys; the columns are present even when no record could be extracted.
    """
    translation_keys = (
        "translation_claim",
        "translation_nutrient_substance",
        "translation_health_relationship",
    )
    records = []

    for entry in data:
        custom_id = entry.get('custom_id', 'unknown_id')
        # `or {}` lets a null/missing "response" or "body" fall through to the
        # error log below instead of raising TypeError on the membership test
        response = entry.get('response') or {}
        body = response.get('body') or {}
        choices = body.get('choices')
        if choices is None:
            logging.error("No valid response found in entry with custom_id %s", custom_id)
            continue

        for choice in choices:
            message_content = choice['message']['content']
            try:
                content_json = json.loads(message_content)
                record = {"custom_id": custom_id}
                for key in translation_keys:
                    record[key] = content_json.get(key, "NA")
            except (json.JSONDecodeError, TypeError, AttributeError) as e:
                # TypeError: content is None; AttributeError: valid JSON that is not an object
                logging.error("Failed to parse content for custom_id %s. Error: %s", custom_id, e)
                logging.error("Content: %s", message_content)
            else:
                records.append(record)

    return pd.DataFrame(records, columns=["custom_id", *translation_keys])


def load_and_parse_batches(batch_files_dir):
    """
    Load all batch files from the specified directory,
    process them, and combine the results into a single DataFrame.

    Only files ending in ``.jsonl`` are read, in sorted name order. Parsing
    problems are logged to ``parsing_errors_hc_multilingual.log``.

    Parameters:
        batch_files_dir (str): Directory containing the batch output files.

    Returns:
        pandas.DataFrame: The concatenated records of all files, or an empty
        DataFrame if no file produced any record.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the combined DataFrame the same on every run and platform
    batch_files = sorted(
        os.path.join(batch_files_dir, f)
        for f in os.listdir(batch_files_dir)
        if f.endswith('.jsonl')
    )
    all_records = []

    # Configure logging for errors (basicConfig does nothing if the root
    # logger already has handlers configured)
    logging.basicConfig(
        filename='parsing_errors_hc_multilingual.log',
        level=logging.ERROR,
        format='%(asctime)s:%(levelname)s:%(message)s'
    )

    for batch_file in tqdm(batch_files, desc="Processing batch files"):
        print(f"Processing batch file: {batch_file}")
        batch_data = load_jsonl(batch_file)
        df_batch = normalize_claim_responses(batch_data)
        # Skip files that yielded no records so concat only sees real data
        if not df_batch.empty:
            all_records.append(df_batch)

    combined_df = pd.concat(all_records, ignore_index=True) if all_records else pd.DataFrame()
    return combined_df

# Update the directory as needed where your batch JSONL files are located
batch_files_dir = 'output_batches_hc_multilingual/'

# Load and parse the batches into a single DataFrame
df_translations = load_and_parse_batches(batch_files_dir)

# Replace any NaNs with 'NA' for consistency (the parser already uses 'NA'
# for translation keys that are missing from a response)
df_translations.fillna('NA', inplace=True)


In [None]:
# Split the 'custom_id' column into three parts: prefix, claim_id, and language.
# This relies on every custom_id having the form claim_<claim_id>_<language>
# with exactly two underscores, i.e. language names must not contain '_'.
df_translations[['prefix', 'claim_id', 'language']] = df_translations['custom_id'].str.split('_', expand=True)

# Drop the prefix column (which contains the literal 'claim')
df_translations.drop(columns='prefix', inplace=True)

# Convert claim_id to an integer so it matches the dtype of df_grouped['claim_id'] for the merge
df_translations['claim_id'] = df_translations['claim_id'].astype(int)

# Perform a left join on the claim_id column to attach the original English claim text
df_translations = df_translations.merge(df_grouped[['claim_id', 'Claim']], on='claim_id', how='left')


In [None]:
# Display the translations joined with their source claims
df_translations

In [None]:

# Save the combined DataFrame (translations joined with the English claim) to a CSV file
translations_output_csv = 'int_data/UK_extracted_hc_multilingual.csv'
df_translations.to_csv(translations_output_csv, index=False)

print(f"Claim translations have been successfully flattened and saved to {translations_output_csv}.")


# 2. EU Health Claims

In [None]:
import pandas as pd
# Load the EU register of nutrition and health claims into `df`.
# Define the path and sheet name
file_path = "data/EUHC/EU_Register_on_nutrition_and_health_claims.xlsx"
sheet_name = "Worksheet"

# Read the Excel file. header=None means no row is used as the header, so the
# sheet's title row arrives as ordinary data in row 0 and the columns get
# integer labels; a later cell derives the column names from row 0.
df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)


In [None]:

# The sheet was read with header=None, so row 0 still holds the column titles.
# Promote that row to the header (stripping leading/trailing whitespace) and
# remove it from the data; otherwise the title row is later grouped and sent
# for translation as if it were a real claim.
# The integer-label check keeps this cell safe to re-run: once the titles are
# in place nothing is promoted or dropped a second time.
if list(df.columns) == list(range(df.shape[1])):
    header = df.iloc[0].str.strip()
    df = df.iloc[1:].reset_index(drop=True)
    df.columns = header


In [None]:
# Display the loaded EU register for a quick visual check
df

In [None]:

# Aggregator: merge every distinct reference of a group into one "__"-separated string
def concat_refs(series):
    """Join the distinct non-null values of *series* with "__".

    Values are converted to ``str`` before de-duplication and keep their
    order of first appearance. A series without any non-null value yields
    the empty string.
    """
    present = series.dropna()
    as_text = present.astype(str)
    distinct = as_text.unique()
    return "__".join(distinct)

# Aggregator: pick the first value of a group that is not null
def first_non_null(series):
    """Return the first non-null value of *series*, or ``None`` if there is none."""
    remaining = series.dropna()
    if remaining.empty:
        return None
    return remaining.iloc[0]

# Collapse the EU register to one row per distinct 'Claim' text.
# List of columns to aggregate. We want to group by 'Claim' and
# for the 'EFSA opinion reference' column, concatenate values.
# For all other columns, we'll take the first non-null value.
# The keys must match the (whitespace-stripped) column titles of `df` exactly.
agg_dict = {
    "Claim type": first_non_null,
    "Nutrient substance, food or food category": first_non_null,
    # NOTE(review): 'Claim' is both the grouping key and an aggregated column;
    # confirm this combination works with as_index=False on the installed pandas.
    "Claim": "first",  # Grouping column, so just take one copy
    "Conditions of use of the claim / Restrictions of use / Reasons for non-authorisation": first_non_null,
    "Health relationship": first_non_null,
    "EFSA opinion reference": concat_refs,
    "Commission Regulation": first_non_null,
    "Status": first_non_null,
    "Entry Id": first_non_null  # Adjust column name if needed (removed extra spaces)
}

# Before grouping, ensure the column names match exactly.
# If the "Entry Id" column in your file has extra spaces, you might need to adjust accordingly.

# Group the dataframe by 'Claim' and aggregate the values
df_grouped = df.groupby("Claim", as_index=False).agg(agg_dict)

# Create a unique claim_id column (starting at 1); it is later embedded in the
# custom_id of every translation request
df_grouped.insert(0, "claim_id", range(1, len(df_grouped) + 1))

# (Optional) Save the processed dataframe to a new CSV file for verification
# NOTE: the int_data/ directory must already exist for this write to succeed.
df_grouped.to_csv("int_data/EU_processed_health_claims.csv", index=False)

# Print a sample of the dataframe to verify results
print(df_grouped.head())


In [None]:
# Reload the processed EU claims from the CSV written earlier, so the cells
# below can run without repeating the aggregation
df_grouped = pd.read_csv("int_data/EU_processed_health_claims.csv")


In [None]:
# Display the processed EU claims (one row per distinct claim text)
df_grouped

# prompt

In [None]:
import os
# Provide the OpenAI API key for the client created later in this notebook.
# setdefault() keeps a key that is already exported in the environment, so the
# "Key" placeholder can never overwrite a real credential. Prefer exporting
# OPENAI_API_KEY outside the notebook, and never commit a real key in this cell.
os.environ.setdefault("OPENAI_API_KEY", "Key")

In [None]:
import os
import json
import random
import time
import pandas as pd
import logging
import pickle
from tqdm import tqdm

# =============================================================================
# Step 0: Define the translation prompt components
# =============================================================================

# Define the response format JSON schema for claim translation with three translation keys.
# The schema is marked strict, lists all three keys as required and sets
# additionalProperties to False, i.e. it describes an object with exactly these
# three string fields. It is attached to every request body as "response_format"
# by create_jsonl_entry_from_claim().
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "claim_translation_v1",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "translation_claim": {
                    "type": "string",
                    "description": (
                        "The translated health claim text in the target language. "
                        "If a specialized medical term is not available in the target language, retain the original English term."
                    )
                },
                "translation_nutrient_substance": {
                    "type": "string",
                    "description": (
                        "The translated text for the nutrient substance, food or food category in the target language."
                    )
                },
                "translation_health_relationship": {
                    "type": "string",
                    "description": (
                        "The translated text for the health relationship in the target language."
                    )
                }
            },
            "required": ["translation_claim", "translation_nutrient_substance", "translation_health_relationship"],
            "additionalProperties": False
        }
    }
}

# Define the system instruction for the LLM as a translation expert.
# This text is sent as the "system" message of every request; the key names it
# mentions must stay in sync with the keys declared in `response_format`.
system_instruction = (
    "You are an expert translator specializing in medical claims. Your task is to accurately translate the provided text from English into the target language specified in the user prompt. "
    "This text includes three parts: the health claim, the nutrient substance (or food/food category), and the health relationship. "
    "Please provide your translations using the keys 'translation_claim', 'translation_nutrient_substance', and 'translation_health_relationship'. "
    "Be precise and maintain the original meaning. If a specialized medical term does not exist in the target language, keep the original English term. "
    "Your response must strictly adhere to the JSON schema provided in the response format."
)

# =============================================================================
# Function to create a JSONL entry for a given health claim translation prompt
# =============================================================================
def create_jsonl_entry_from_claim(claim, claim_id, nutrient, health_relationship, target_language):
    """
    Build one batch-request record asking the LLM to translate a health claim.

    The record targets the chat-completions endpoint and combines the
    module-level ``system_instruction`` and ``response_format`` with a user
    prompt assembled from the arguments. Arguments are interpolated with plain
    string formatting, so a missing value (``None``/NaN) shows up literally as
    "None"/"nan" in the prompt.

    Parameters:
        claim (str): The health claim text in English.
        claim_id (int): Unique identifier for the claim.
        nutrient (str): The nutrient substance, food or food category.
        health_relationship (str): The health relationship.
        target_language (str): The target language to translate the text into.

    Returns:
        dict: The request record; its ``custom_id`` is
        ``claim_<claim_id>_<target_language>``.
    """
    prompt_lines = [
        f"Translate the following text into '{target_language}':",
        f"Claim: {claim}",
        f"Nutrient substance, food or food category: {nutrient}",
        f"Health relationship: {health_relationship}",
    ]
    conversation = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": "\n".join(prompt_lines)},
    ]
    request_body = {
        "model": "gpt-4o-mini-2024-07-18",
        "messages": conversation,
        "temperature": 0,
        "max_tokens": 10000,
        "response_format": response_format,
    }
    return {
        "custom_id": f"claim_{claim_id}_{target_language}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }


The target languages below are the languages most used on internet websites, taken from https://en.wikipedia.org/wiki/Languages_used_on_the_Internet

In [None]:
# =============================================================================
# Step 1: Create the JSONL file for batch processing from health claims DataFrame
# =============================================================================

# Define the target languages (ignoring the provided percentages)
target_languages = ["English", "Spanish", "Russian", "German", "French",
                    "Japanese", "Portuguese", "Turkish", "Italian", "Persian",
                    "Dutch", "Polish", "Chinese", "Vietnamese", "Indonesian",
                    "Czech", "Korean", "Ukrainian", "Arabic", "Greek",
                    "Hindi"
                    ]

jsonl_filename = "int_data/EUHC/batch_input_euhc_multilingual.jsonl"
# Create int_data/EUHC/ up front: open(..., 'w') does not create missing parent
# directories, and the cells that create sub-folders there only run later.
os.makedirs(os.path.dirname(jsonl_filename), exist_ok=True)

# Write one request line per (claim, language) pair
with open(jsonl_filename, 'w', encoding='utf-8') as jsonl_file:
    for _, row in df_grouped.iterrows():
        for lang in target_languages:
            entry = create_jsonl_entry_from_claim(
                row['Claim'],
                row['claim_id'],
                row['Nutrient substance, food or food category'],
                row['Health relationship'],
                lang
            )
            jsonl_file.write(json.dumps(entry) + '\n')

print("JSONL file created successfully.")


In [None]:

# =============================================================================
# Step 2: Split the JSONL file into smaller chunks (<100mb each)
# =============================================================================

# Create the input_batches_hc_multilingual directory if it doesn't exist.
# exist_ok=True does the check and the creation in one call, so there is no
# window between "does it exist?" and "create it".
os.makedirs("int_data/EUHC/input_batches_hc_multilingual", exist_ok=True)

def split_jsonl_file(file_path, num_batches, output_dir="int_data/EUHC/input_batches_hc_multilingual"):
    """
    Split a JSONL file into ``num_batches`` consecutive part files.

    Every part receives ``total_lines // num_batches`` lines and the last part
    additionally takes the remainder. If the file has fewer lines than
    ``num_batches``, all parts except the last are therefore created empty.
    Parts are written to
    ``<output_dir>/batch_input_hc_multilingual_part<k>.jsonl`` with k from 1.

    Parameters:
        file_path (str): Path of the JSONL file to split.
        num_batches (int): Number of part files to create (at least 1).
        output_dir (str): Directory that receives the part files; it is
            created if missing.

    Raises:
        ValueError: If ``num_batches`` is smaller than 1.
    """
    if num_batches < 1:
        raise ValueError(f"num_batches must be at least 1, got {num_batches}")

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Create the destination here so the function also works when called on its own
    os.makedirs(output_dir, exist_ok=True)

    lines_per_batch = len(lines) // num_batches

    for i in range(num_batches):
        start = i * lines_per_batch
        # The last part runs to the end of the list so it absorbs the remainder
        # in the same write instead of re-opening the file in append mode
        end = None if i == num_batches - 1 else start + lines_per_batch
        batch_file_path = os.path.join(output_dir, f"batch_input_hc_multilingual_part{i + 1}.jsonl")
        with open(batch_file_path, 'w', encoding='utf-8') as batch_file:
            batch_file.writelines(lines[start:end])

    print("Batches created successfully.")

# Split the combined JSONL file into 2 batch files (adjust the number as needed)
split_jsonl_file(jsonl_filename, 2)


In [None]:

# =============================================================================
# Step 3: Upload the JSONL files and create batch jobs using the OpenAI client
# =============================================================================

# Import the OpenAI client (assuming it's defined similarly to your existing code)
from openai import OpenAI

# os.environ["OPENAI_API_KEY"] = ""  # Set your API key here
# Initialize OpenAI client. No key is passed explicitly, so the client is
# expected to pick it up from the OPENAI_API_KEY environment variable.
client = OpenAI()

# Helper that writes a small random sample of a JSONL file for a pilot run
def create_sample_jsonl(file_path, sample_size=1):
    """
    Copy ``sample_size`` randomly chosen lines of a JSONL file into a sample file.

    The sample is always written to
    ``int_data/EUHC/sample_input_hc_multilingual.jsonl``, replacing any
    previous sample.

    Parameters:
        file_path (str): Path of the full JSONL file.
        sample_size (int): Number of lines to draw (without replacement).

    Returns:
        str: Path of the sample file that was written.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        all_lines = list(source)

    # Draw without replacement from the complete set of lines
    chosen = random.sample(all_lines, sample_size)

    sample_file_path = "int_data/EUHC/sample_input_hc_multilingual.jsonl"
    with open(sample_file_path, 'w', encoding='utf-8') as target:
        target.write("".join(chosen))

    print(f"Sample of {sample_size} lines created successfully in {sample_file_path}.")
    return sample_file_path

# Function to upload the sample file and create a batch job
def upload_and_create_sample_batch(sample_file_path):
    """
    Upload a sample JSONL file and start a pilot batch job for it.

    Uses the module-level ``client``.

    Parameters:
        sample_file_path (str): Path of the JSONL file to upload.

    Returns:
        The ``id`` of the created batch job.
    """
    # Open the file in a context manager so the handle is closed as soon as
    # the upload call returns, even if the upload raises
    with open(sample_file_path, "rb") as sample_file:
        sample_input_file = client.files.create(
            file=sample_file,
            purpose="batch"
        )

    # Create a batch job with the uploaded sample file
    batch = client.batches.create(
        input_file_id=sample_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "Pilot health claim translation job with sample line(s)"
        }
    )

    print(f"Sample batch job created successfully with Batch ID: {batch.id}")
    return batch.id

# Pilot run: draw one random request from the full EU input file and submit it
# as its own batch job
sample_file = create_sample_jsonl("int_data/EUHC/batch_input_euhc_multilingual.jsonl", 1)
sample_batch_id = upload_and_create_sample_batch(sample_file)


In [None]:

# Function to upload and create batches for all parts
def upload_and_create_batches(num_batches):
    """
    Upload every split part file and start one batch job per part.

    Reads ``int_data/EUHC/input_batches_hc_multilingual/batch_input_hc_multilingual_part<i>.jsonl``
    for i in 1..num_batches and uses the module-level ``client``.

    Parameters:
        num_batches (int): Number of part files to upload.

    Returns:
        list: The batch job ids, in part order.
    """
    batch_ids = []
    for i in range(1, num_batches + 1):
        batch_file_path = f"int_data/EUHC/input_batches_hc_multilingual/batch_input_hc_multilingual_part{i}.jsonl"

        # Upload the JSONL file for batch processing; the context manager
        # closes the handle after each upload instead of leaking one per part
        with open(batch_file_path, "rb") as part_file:
            batch_input_file = client.files.create(
                file=part_file,
                purpose="batch"
            )

        # Create the batch job
        batch = client.batches.create(
            input_file_id=batch_input_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": f"Health claim translation job part {i}"
            }
        )

        batch_ids.append(batch.id)
        print(f"Batch {i} job created successfully with Batch ID: {batch.id}")

        # Pause between uploads to avoid hitting rate limits
        time.sleep(1)

    return batch_ids

# Run the function to create all batch jobs.
# num_batches must equal the number of part files produced by split_jsonl_file (2 here).
num_batches = 2
batch_ids = upload_and_create_batches(num_batches)

# (Optional) Save the batch IDs for later retrieval
with open("int_data/EUHC/batch_ids_hc_multilingual.pkl", "wb") as file:
    pickle.dump(batch_ids, file)


In [None]:

# =============================================================================
# Step 4: Check the status of each batch and retrieve the results
# =============================================================================

# Create the output_batches_hc_multilingual directory if it doesn't exist.
# exist_ok=True does the check and the creation in one call, so there is no
# window between "does it exist?" and "create it".
os.makedirs("int_data/EUHC/output_batches_hc_multilingual", exist_ok=True)

def check_batch_status(batch_ids):
    """
    Print the status of each batch and download the output of completed ones.

    For every completed batch that has an output file, the file content is
    written to
    int_data/EUHC/output_batches_hc_multilingual/batch_output_hc_multilingual_{batch_id}.jsonl.

    Args:
        batch_ids: Iterable of batch IDs to look up via ``client.batches.retrieve``.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Statuses after which a batch will never reach 'completed'.
    # NOTE(review): values taken from the OpenAI Batch API status list; this
    # assumes `client` is an OpenAI client — confirm.
    terminal_failures = {"failed", "expired", "cancelled"}

    for batch_id in batch_ids:
        batch_status = client.batches.retrieve(batch_id)
        status = batch_status.status
        print(f"Status for Batch ID {batch_id}: {status}")

        if status != 'completed':
            if status in terminal_failures:
                # Telling the user to "check again later" would be misleading here.
                print(f"Batch {batch_id} ended with status '{status}' and will not produce output.")
            else:
                print(f"Batch {batch_id} not completed yet. Please check again later.")
            continue

        # Retrieve the output file. A completed batch can lack an output file
        # (e.g. when every request errored), so guard before downloading.
        output_file_id = batch_status.output_file_id
        if output_file_id is None:
            print(f"Batch {batch_id} completed but has no output file; nothing to download.")
            continue

        file_response = client.files.content(output_file_id)
        output_file_path = f"int_data/EUHC/output_batches_hc_multilingual/batch_output_hc_multilingual_{batch_id}.jsonl"
        with open(output_file_path, "w", encoding='utf-8') as output_file:
            output_file.write(file_response.text)
        print(f"Batch output saved to {output_file_path}")

# Example usage: Check the status of the batches and download any completed
# outputs. Re-run this cell until every batch reports 'completed'.
check_batch_status(batch_ids)


In [None]:

# =============================================================================
# Step 5: Combine the results from all batches into a single DataFrame and save as CSV
# =============================================================================

def load_jsonl(file_path):
    """Load a JSONL file into a list of parsed JSON values (one per line).

    Blank or whitespace-only lines (such as a trailing empty line) are
    skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        List with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the file object directly so lines are streamed rather than
        # materialised with readlines().
        return [json.loads(line) for line in file if line.strip()]

def normalize_claim_responses(data):
    """
    Process and normalize the response format for health claim translation.

    For every choice in ``entry['response']['body']['choices']`` the message
    content is parsed as JSON and the three translation fields are extracted;
    a field missing from the parsed object is filled with ``"NA"``.

    Entries or choices that cannot be used are logged with ``logging.error``
    and skipped rather than aborting the whole run:
      * entries whose ``response`` / ``body`` / ``choices`` is missing or null,
      * message content that is not valid JSON (or is not a string),
      * message content that parses to something other than a JSON object.

    Args:
        data: Iterable of dicts, one per line of a batch output file.

    Returns:
        pandas.DataFrame with columns ``custom_id``, ``translation_claim``,
        ``translation_nutrient_substance`` and ``translation_health_relationship``
        (an empty DataFrame without columns when nothing could be parsed).
    """
    translation_fields = (
        "translation_claim",
        "translation_nutrient_substance",
        "translation_health_relationship",
    )
    records = []

    for entry in data:
        custom_id = entry.get('custom_id', 'unknown_id')

        # Walk down response -> body -> choices defensively: any level may be
        # absent or null, in which case a plain `in` test would raise TypeError.
        response = entry.get('response')
        body = response.get('body') if isinstance(response, dict) else None
        choices = body.get('choices') if isinstance(body, dict) else None
        if choices is None:
            logging.error("No valid response found in entry with custom_id %s", custom_id)
            continue

        for choice in choices:
            message_content = choice['message']['content']
            try:
                content_json = json.loads(message_content)
            except (json.JSONDecodeError, TypeError) as e:
                logging.error("Failed to parse content for custom_id %s. Error: %s", custom_id, e)
                logging.error("Content: %s", message_content)
                continue

            # Valid JSON that is not an object (list, string, number, ...)
            # has no .get(); treat it as a parse failure.
            if not isinstance(content_json, dict):
                logging.error(
                    "Failed to parse content for custom_id %s. Error: expected a JSON object, got %s",
                    custom_id,
                    type(content_json).__name__,
                )
                logging.error("Content: %s", message_content)
                continue

            record = {"custom_id": custom_id}
            for field in translation_fields:
                record[field] = content_json.get(field, "NA")
            records.append(record)

    return pd.DataFrame(records)


def load_and_parse_batches(batch_files_dir):
    """
    Load all batch files from the specified directory,
    process them, and combine the results into a single DataFrame.

    Files are processed in sorted name order so that the row order of the
    combined DataFrame is reproducible between runs (``os.listdir`` returns
    entries in arbitrary order).

    Args:
        batch_files_dir: Directory containing the batch output ``.jsonl`` files.

    Returns:
        pandas.DataFrame concatenating the parsed records of every batch file;
        an empty DataFrame when no file yields any record.
    """
    batch_files = sorted(
        os.path.join(batch_files_dir, f)
        for f in os.listdir(batch_files_dir)
        if f.endswith('.jsonl')
    )
    all_records = []

    # Configure logging for errors (parse failures are written to this file)
    logging.basicConfig(
        filename='int_data/EUHC/parsing_errors_hc_multilingual.log',
        level=logging.ERROR,
        format='%(asctime)s:%(levelname)s:%(message)s'
    )

    for batch_file in tqdm(batch_files, desc="Processing batch files"):
        print(f"Processing batch file: {batch_file}")
        batch_data = load_jsonl(batch_file)
        df_batch = normalize_claim_responses(batch_data)
        # Skip batches that produced no usable records.
        if not df_batch.empty:
            all_records.append(df_batch)

    combined_df = pd.concat(all_records, ignore_index=True) if all_records else pd.DataFrame()
    return combined_df

# Directory holding the downloaded batch output JSONL files
# (adjust if your files live somewhere else)
batch_files_dir = 'int_data/EUHC/output_batches_hc_multilingual/'

# Parse every batch output file into one DataFrame, then substitute the
# literal string 'NA' for any missing values so the table is consistent
df_translations = load_and_parse_batches(batch_files_dir)
df_translations = df_translations.fillna('NA')


In [None]:
# Split the 'custom_id' column into three parts: prefix, claim_id, and language.
# n=2 caps the split at the first two underscores, so any further underscore
# stays inside the language part instead of producing a fourth column, which
# would make the three-column assignment fail.
df_translations[['prefix', 'claim_id', 'language']] = df_translations['custom_id'].str.split('_', n=2, expand=True)

# Drop the prefix column (which contains the literal 'claim')
df_translations.drop(columns='prefix', inplace=True)

# Convert claim_id to an integer so it can be joined on below
# NOTE(review): assumes df_grouped['claim_id'] is an integer column — confirm.
df_translations['claim_id'] = df_translations['claim_id'].astype(int)

# Perform a left join on the claim_id column to attach the original 'Claim' text
df_translations = df_translations.merge(df_grouped[['claim_id', 'Claim']], on='claim_id', how='left')


In [None]:
# Display the translations DataFrame for inspection
df_translations

In [None]:

# Save the combined DataFrame to a CSV file (index=False omits the row index column)
translations_output_csv = 'int_data/EU_extracted_hc_multilingual.csv'
df_translations.to_csv(translations_output_csv, index=False)

# Confirm where the file was written
print(f"Claim translations have been successfully flattened and saved to {translations_output_csv}.")


In [None]:
# Display the translations DataFrame once more as a final check
df_translations