In [None]:
# Import pandas, used below to load and combine the PUBHEALTH TSV files
import pandas as pd

# Locations of the three PUBHEALTH splits (train / dev / test), stored as TSV
files = [f"data/PUBHEALTH/{split}.tsv" for split in ("train", "dev", "test")]

# Load every split as its own tab-separated DataFrame
dfs = list(map(lambda path: pd.read_csv(path, sep='\t'), files))

# Stack the splits into one frame; ignore_index=True renumbers the rows
df = pd.concat(dfs, ignore_index=True)

# Peek at the first rows of the combined data
print(df.head())



In [None]:
# Number of distinct claim_id values before de-duplication
print(df['claim_id'].nunique())

# Keep a single row per claim_id and renumber the rows from 0 in one chained step
df = df.drop_duplicates(subset='claim_id').reset_index(drop=True)

# Same count again, now measured on the de-duplicated frame
print(df['claim_id'].nunique())



In [None]:
# Maximum claim_id length (in characters) kept by the cleanup filter below
MAX_CLAIM_ID_LENGTH = 512

# Drop rows whose claim_id is missing
df = df[df['claim_id'].notna()]

# Drop rows whose claim_id is longer than MAX_CLAIM_ID_LENGTH characters (cleanup).
# The length is measured on the string form of each id: `.str.len()` applied to a
# non-string id evaluates to NaN, which fails the comparison and silently discards
# the row, and the `.str` accessor raises outright on an all-numeric column.
df = df[df['claim_id'].astype(str).str.len() <= MAX_CLAIM_ID_LENGTH]

# Renumber the surviving rows from 0
df = df.reset_index(drop=True)

In [None]:

# Write the cleaned claims to disk as JSON Lines: one record (row) per line
df.to_json(
    path_or_buf="int_data/PUBHEALTH_processed_health_claims.json",
    lines=True,
    orient="records",
)


In [None]:
# Reload the saved JSON Lines file into its own DataFrame
df_grouped = pd.read_json(
    path_or_buf="int_data/PUBHEALTH_processed_health_claims.json",
    lines=True,
)

In [None]:
# Show the frame reloaded from the JSON Lines file as this cell's output
df_grouped

In [None]:
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
# Make sure df is already defined (e.g. via pandas.read_csv or similar)
# The OpenAI client is imported and created in a later cell, not here

# ------------------------------------------------------------------------------
# English-specific text
# ------------------------------------------------------------------------------
# Task description placed at the top of every prompt: asks for '1' when the
# claim is supported by evidence and '0' when it is contradicted.
# The last fragment ends with a trailing space, which is part of the prompt text.
system_instructions = (
    "You are an expert in evaluating medical claims. Your task is to assess the truthfulness of the provided health claim "
    "using established medical research, guidelines, and evidence. Determine if the claim is true. If the claim is supported by evidence, "
    "respond with '1'. If the claim is contradicted by evidence, respond with '0'. If you are unsure, choose the option that is closest to the available evidence. "
)

# Closing line added after the claim to restrict the reply to a bare '1' or '0'.
strict_response_line = "Your answer must be exactly one of the following: '1' or '0'. Do not include any additional text."

# Labels that prefix the variable parts of the prompt; only the claim label is defined here.
prompt_labels = {
    "claim": "Claim:"
}

# ------------------------------------------------------------------------------
# Function to create a combined prompt entry for a given claim
# ------------------------------------------------------------------------------
def create_prompt_entry(claim_id, claim, model, iteration):
    """Build one prompt record for a (claim, model, iteration) combination.

    The prompt text is the shared system instructions, a blank line, the
    labelled claim, and finally the strict-response line.

    Args:
        claim_id: Identifier of the claim; embedded in the custom id.
        claim: Text of the claim to be judged.
        model: Model name; embedded in the custom id.
        iteration: Repetition index; embedded in the custom id.

    Returns:
        dict with "custom_id" ("claim_<claim_id>__<model>__<iteration>") and
        "prompt" (the assembled prompt text).
    """
    # Each section becomes one newline-joined chunk; the first chunk carries an
    # extra "\n" so a blank line separates the instructions from the claim.
    sections = (
        f"{system_instructions}\n",
        f"{prompt_labels['claim']} {claim}",
        f"{strict_response_line}",
    )
    id_fields = (f"claim_{claim_id}", f"{model}", f"{iteration}")
    return {
        "custom_id": "__".join(id_fields),
        "prompt": "\n".join(sections),
    }

# ------------------------------------------------------------------------------
# Build the list of prompt entries across models and iterations
# ------------------------------------------------------------------------------
# Models to query and the repetition indices generated for every claim.
models = [
    "gpt-4o-mini-2024-07-18"
]
iterations = range(3)  # e.g., 0, 1, 2

# One entry per (model, claim, iteration), ordered model -> claim -> iteration.
# The two needed columns are zipped directly instead of going through
# df.iterrows(), which builds a Series for every row (slow) and may coerce
# the values' dtypes along the way.
prompts = [
    create_prompt_entry(claim_id, claim, model, iteration)
    for model in models
    for claim_id, claim in zip(df['claim_id'], df['claim'])
    for iteration in iterations
]


In [None]:
import os
import json
from tqdm import tqdm
from openai import OpenAI
# Take the API key from the environment instead of writing it into the notebook:
# assigning a literal here would overwrite a correctly exported key and would
# store the secret in the saved file.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    # Fail before any request is sent rather than collecting one
    # authentication error per prompt later on.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )

client = OpenAI(api_key=api_key)


In [None]:
# Show the total count and a small sample only; echoing every entry would
# flood the cell output and bloat the saved notebook.
print(len(prompts))
prompts[:3]

In [None]:
import os, json
from tqdm import tqdm
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed


def call_openai(entry):
    """Send one prompt entry to the chat completions API and return a result record.

    Args:
        entry: dict with "custom_id" (formatted
            "claim_<claim_id>__<model>__<iteration>") and "prompt" (the text
            sent as a single user message).

    Returns:
        dict with "custom_id" and "model", plus either "response" (the reply
        text with surrounding whitespace stripped) on success or "error"
        (the exception message) when the request or response handling fails.
    """
    custom_id = entry["custom_id"]
    prompt = entry["prompt"]
    # The model name is the second-to-last "__"-separated field. Splitting from
    # the right keeps it correct even when the claim_id itself contains "__"
    # (a left-to-right split would return a fragment of the claim_id instead).
    model = custom_id.rsplit("__", 2)[1]
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=50,
        )
        return {
            "custom_id": custom_id,
            "model": model,
            "response": resp.choices[0].message.content.strip(),
        }
    except Exception as e:
        # Deliberately broad: a failed call is recorded as an error entry so the
        # remaining prompts in the batch are still processed.
        return {
            "custom_id": custom_id,
            "model": model,
            "error": str(e),
        }

responses = []
# Pick the pool size according to available bandwidth and rate-limit headroom
with ThreadPoolExecutor(max_workers=50) as pool:
    # Submit every prompt up front, then gather results in completion order
    pending = [pool.submit(call_openai, entry) for entry in prompts]
    progress = tqdm(as_completed(pending), total=len(pending), desc="Parallelized")
    responses.extend(finished.result() for finished in progress)

# Persist all result records as a single pretty-printed JSON array
with open("int_data/PUBHEALTH_all_responses.json", "w", encoding="utf-8") as handle:
    json.dump(responses, fp=handle, indent=2, ensure_ascii=False)


In [None]:
# Show the total count and a small sample only; echoing every record would
# flood the cell output and bloat the saved notebook.
print(len(responses))
responses[:3]