In [None]:
# ---------------------------------------------------------------------------
# 0. Imports & paths
# ---------------------------------------------------------------------------
import os, json, random, time, pickle, logging
import pandas as pd
from tqdm import tqdm
from openai import OpenAI          # pip install openai >=1.14
# ---------------------------------------------------------------------------
# 1. Load forward-translation dataframe
# ---------------------------------------------------------------------------
# Forward-translation table: claim_id | language | translation_*
forward_csv_path = "int_data/UK_extracted_hc_multilingual.csv"
key_dtypes = {"claim_id": int, "language": str}

df_trans = pd.read_csv(forward_csv_path, dtype=key_dtypes)

n_loaded = len(df_trans)
print(f"Loaded {n_loaded:,} translated rows.")


In [None]:
# Display the forward-translation frame for a quick visual check.
df_trans

In [None]:

# ---------------------------------------------------------------------------
# 2.  GPT‑4o‑mini back‑translation template  (multilingual → English)
# ---------------------------------------------------------------------------

# --- strict JSON schema for the model’s answer ------------------------------
# Every answer field is a plain string; only the key and its description vary,
# so the schema is generated from this single mapping (order is preserved).
_field_descriptions = {
    "english_claim": "Health‑claim text translated into UK English.",
    "english_nutrient_substance": "Nutrient / food item translated into UK English.",
    "english_health_relationship": "Health relationship translated into UK English.",
}

response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "claim_backtranslation_v1",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                field: {"type": "string", "description": text}
                for field, text in _field_descriptions.items()
            },
            # All three fields are mandatory and no extra keys are allowed.
            "required": list(_field_descriptions),
            "additionalProperties": False,
        },
    },
}

# --- system instruction (mirrors forward‑translation wording) ---------------
# One entry per sentence of the system prompt; the sentences are joined with a
# single space to form the final instruction string.
_instruction_sentences = [
    "You are an expert translator specializing in medical claims.",
    "Your task is to accurately translate the provided text from the source "
    "language specified in the user prompt into English.",
    "This text includes three parts: the health claim, the nutrient substance "
    "(or food/food category), and the health relationship.",
    "Please provide your translations using the keys "
    "'english_claim', 'english_nutrient_substance', and "
    "'english_health_relationship'.",
    "Be precise and maintain the original meaning.",
    "If a specialized medical term does not have an established English "
    "equivalent, keep the original term from the source language.",
    "Your response must strictly adhere to the JSON schema provided in the "
    "response format.",
]
system_instruction = " ".join(_instruction_sentences)


# ---------------------------------------------------------------------------
# Helper to build one JSONL entry per row
# ---------------------------------------------------------------------------
def make_jsonl_entry(row: pd.Series) -> str:
    """
    Serialise one dataframe row as a single request line for the OpenAI
    batch endpoint.

    The row must expose ``claim_id``, ``language`` and the three
    ``translation_*`` attributes. The request is identified by a custom_id
    of the form 'backtrans_<claim_id>_<lang>'.

    Returns the JSON-encoded request (non-ASCII characters kept as-is).
    """
    # User message: source language header followed by the three fields,
    # one per line.
    prompt_lines = [
        f"Source language: '{row.language}'. Translate the following into English:",
        f"translation_claim: {row.translation_claim}",
        f"translation_nutrient_substance: {row.translation_nutrient_substance}",
        f"translation_health_relationship: {row.translation_health_relationship}",
    ]

    chat_messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user",   "content": "\n".join(prompt_lines)},
    ]

    request_body = {
        "model": "gpt-4o-mini-2024-07-18",
        "messages": chat_messages,
        "temperature": 0,
        "max_tokens": 10000,
        "response_format": response_format,
    }

    batch_request = {
        "custom_id": f"backtrans_{row.claim_id}_{row.language}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }
    return json.dumps(batch_request, ensure_ascii=False)


In [None]:

# ---------------------------------------------------------------------------
# 3. Write master JSONL file
# ---------------------------------------------------------------------------
master_jsonl = "int_data/back_translate/openai/batch_input_hc_backtranslation.jsonl"

# Create the target folder first: open(..., "w") does not create missing
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(master_jsonl), exist_ok=True)

# One request line per forward-translation row.
with open(master_jsonl, "w", encoding="utf-8") as fh:
    for _, r in df_trans.iterrows():
        fh.write(make_jsonl_entry(r) + "\n")
print(f"Master JSONL written: {master_jsonl}")


In [None]:

# ---------------------------------------------------------------------------
# 4. Split into ≤N shards (keeps each file comfortably <100 MB)
# ---------------------------------------------------------------------------
def split_jsonl(source, out_dir, n_parts=4):
    """
    Split a JSONL file into at most ``n_parts`` consecutive shard files.

    Shards are written to ``out_dir`` as ``backtranslation_part<k>.jsonl``
    (k starting at 1). Each shard holds ceil(len(lines) / n_parts) lines,
    except the last one, which may be shorter. Fewer than ``n_parts`` files
    are written when the source has too few lines.

    Parameters
    ----------
    source : str
        Path of the JSONL file to split.
    out_dir : str
        Directory receiving the shards; created if it does not exist.
    n_parts : int, default 4
        Maximum number of shards.

    Raises
    ------
    ValueError
        If ``n_parts`` is smaller than 1.
    """
    if n_parts < 1:
        # Previously 0 raised ZeroDivisionError and negative values silently
        # wrote nothing.
        raise ValueError(f"n_parts must be >= 1, got {n_parts}")

    os.makedirs(out_dir, exist_ok=True)
    with open(source, "r", encoding="utf-8") as fh:
        lines = fh.readlines()

    chunk = (len(lines) + n_parts - 1) // n_parts   # ceiling division
    written = set()
    for i in range(n_parts):
        part_lines = lines[i*chunk : (i+1)*chunk]
        if not part_lines:
            break
        name = f"backtranslation_part{i+1}.jsonl"
        tgt = f"{out_dir}/{name}"
        with open(tgt, "w", encoding="utf-8") as out:
            out.writelines(part_lines)
        written.add(name)
        print(f"  wrote {len(part_lines):,} lines → {tgt}")

    # Remove shards left over from an earlier run that produced more parts.
    # The launch cell uploads every .jsonl found in this directory, so a
    # stale shard would be submitted (and billed) a second time.
    for name in os.listdir(out_dir):
        is_shard = name.startswith("backtranslation_part") and name.endswith(".jsonl")
        if is_shard and name not in written:
            os.remove(f"{out_dir}/{name}")
            print(f"  removed stale shard → {out_dir}/{name}")

# Shard the master request file into four upload-sized pieces.
split_jsonl(
    source=master_jsonl,
    out_dir="int_data/back_translate/openai/input_batches_hc_backtranslation",
    n_parts=4,
)


In [None]:
import os
from getpass import getpass

# Never store the real key in the notebook (it would end up in version
# control and shared copies). Use the key already present in the environment,
# otherwise ask for it interactively without echoing it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:

# ---------------------------------------------------------------------------
# 5. Helper: upload & create batch jobs (live code, not commented out – run only when ready)
# ---------------------------------------------------------------------------

# Shared API client used by the upload helper and the polling loop below.
client = OpenAI()                        # no explicit key passed: taken from OPENAI_API_KEY in the environment

def upload_and_launch(jsonl_path, desc):
    """
    Upload one JSONL request file and start a batch job for it.

    Parameters
    ----------
    jsonl_path : str
        Path of the JSONL shard to upload.
    desc : str
        Free-text description stored in the batch metadata.

    Returns
    -------
    str
        The id of the created batch.
    """
    # Context manager so the file handle is closed once the upload finishes
    # (the previous bare open() leaked one handle per shard).
    with open(jsonl_path, "rb") as payload:
        infile = client.files.create(file=payload, purpose="batch")

    batch  = client.batches.create(
        input_file_id=infile.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": desc}
    )
    print(f"Launched batch {batch.id} for {jsonl_path}")
    return batch.id

# Launch one batch per shard, in a stable (sorted) order.
batch_ids = []
for fp in sorted(os.listdir("int_data/back_translate/openai/input_batches_hc_backtranslation")):
    if fp.endswith(".jsonl"):
        full = f"int_data/back_translate/openai/input_batches_hc_backtranslation/{fp}"
        batch_ids.append(upload_and_launch(full, f"Back-translation {fp}"))

# Save for later polling. The with-block guarantees the pickle is flushed and
# the handle closed (the previous bare open() never closed it).
with open("int_data/back_translate/openai/batch_ids_hc_backtranslation.pkl", "wb") as fh:
    pickle.dump(batch_ids, fh)



In [None]:

# ---------------------------------------------------------------------------
# 6. Helper: poll batches & collect outputs (run after 24 h window)
# ---------------------------------------------------------------------------

out_dir = "int_data/back_translate/openai/output_batches_hc_backtranslation"
os.makedirs(out_dir, exist_ok=True)

# This cell is meant to run long after the launch cell, possibly in a fresh
# kernel where `batch_ids` no longer exists. In that case restore the ids from
# the pickle the launch cell saved (a file written by this notebook itself).
if "batch_ids" not in globals():
    with open("int_data/back_translate/openai/batch_ids_hc_backtranslation.pkl", "rb") as fh:
        batch_ids = pickle.load(fh)

for bid in batch_ids:
    b = client.batches.retrieve(bid)
    if b.status != "completed":
        # Not finished (or failed/expired): report and move on.
        print(f"{bid} → {b.status}")
        continue
    out_id = b.output_file_id
    if not out_id:
        # A completed batch can lack an output file; requesting a missing id
        # would raise, so report it (with the error file id, if any) instead.
        print(f"{bid} → completed without output file "
              f"(error file: {getattr(b, 'error_file_id', None)})")
        continue
    outfile_resp = client.files.content(out_id)
    out_path = f"{out_dir}/backtrans_output_{bid}.jsonl"
    with open(out_path,"w",encoding="utf-8") as fh:
        fh.write(outfile_resp.text)
    print(f"Saved {out_path}")


In [None]:


# ---------------------------------------------------------------------------
# 7. Parse outputs into a tidy DataFrame (optional)
# ---------------------------------------------------------------------------

def load_jsonl(fp):
    """
    Read a JSON Lines file and return its records as a list.

    Parameters
    ----------
    fp : str
        Path of a UTF-8 encoded JSONL file.

    Returns
    -------
    list
        One decoded JSON value per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(fp,"r",encoding="utf-8") as fh:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line),
        # which json.loads would reject.
        return [json.loads(l) for l in fh if l.strip()]

records = []
for fp in sorted(os.listdir(out_dir)):        # sorted → reproducible row order
    if not fp.endswith(".jsonl"):
        continue
    for entry in load_jsonl(f"{out_dir}/{fp}"):
        cid = entry["custom_id"]

        # "response" / "body" may be missing or null for failed requests.
        body = (entry.get("response") or {}).get("body") or {}
        choices = body.get("choices")
        if not choices:
            logging.warning(f"No completion for {cid}")
            continue

        raw = choices[0]["message"]["content"]
        try:
            js = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a null content field.
            logging.warning(f"Bad JSON for {cid}")
            continue
        if not isinstance(js, dict):
            logging.warning(f"Bad JSON for {cid}")
            continue

        # custom_id is 'backtrans_<claim_id>_<lang>'. Split at most twice so
        # a language label containing '_' is kept whole.
        _, claim_id, lang = cid.split("_", 2)
        records.append({
            "claim_id": int(claim_id),
            "language": lang,
            **js
        })

df_back = pd.DataFrame(records)
# File name matches what the merge step (section 8) reads back.
df_back.to_csv("int_data/back_translate/openai/UK_backtranslations_en.csv", index=False)
print("✅ back-translation dataframe saved.")

## Merge back-translations with the forward translations

In [None]:
# ---------------------------------------------------------------------------
# 8. Merge forward‑ and back‑translations
# ---------------------------------------------------------------------------

import pandas as pd

# --- reload the two CSVs -----------------------------------------------------
# NOTE(review): section 1 loads the forward translations from
# "int_data/UK_extracted_hc_multilingual.csv" (no "multilingual/" folder) —
# confirm which of the two locations is the current one.
forward_csv = "int_data/multilingual/UK_extracted_hc_multilingual.csv"   # forward translations
back_csv = "int_data/back_translate/openai/UK_backtranslations_en.csv"   # back-translations
merge_key_dtypes = {"claim_id": int, "language": str}

df_forward = pd.read_csv(forward_csv, dtype=merge_key_dtypes)
df_back = pd.read_csv(back_csv, dtype=merge_key_dtypes)

n_forward, n_back = len(df_forward), len(df_back)
print(f"Forward rows : {n_forward:,}")
print(f"Back rows    : {n_back:,}")


In [None]:

# --- sanity‑check keys -------------------------------------------------------
# The merge key is (claim_id, language), so the check must compare pairs:
# comparing claim_id alone misses a back-translation whose language has no
# forward row.
forward_keys = set(zip(df_forward["claim_id"], df_forward["language"]))
back_keys    = set(zip(df_back["claim_id"],    df_back["language"]))

missing = back_keys - forward_keys
if missing:
    print(f"⚠️  Warning: {len(missing)} (claim_id, language) pair(s) in back‑translations not found in forward table.")

# Forward rows without a partner survive the left merge with NaN in every
# back-translation column; report them so they are not mistaken for data.
untranslated = forward_keys - back_keys
if untranslated:
    print(f"⚠️  Warning: {len(untranslated)} forward row(s) have no back‑translation (NaN after merge).")

# --- merge ------------------------------------------------------------------
# validate="one_to_one" makes pandas raise if either table has duplicate keys.
df_merged = (
    df_forward
    .merge(
        df_back,
        on=["claim_id", "language"],
        how="left",
        validate="one_to_one"
    )
)

print(f"Merged rows  : {len(df_merged):,}")


In [None]:

# --- save -------------------------------------------------------------------
out_path = "int_data/back_translate/openai/UK_merged_translations.csv"
df_merged.to_csv(out_path, index=False)   # row index is not written
print(f"✅ Merged file written → {out_path}")

# --- optional peek ----------------------------------------------------------
# First three rows, transposed so each column becomes a line of output.
preview = df_merged.head(3).T
print(preview)


# Evaluation of translation

In [None]:
import os
from getpass import getpass

# pip install langchain_openai
from langchain_openai import OpenAIEmbeddings

# Never store the real key in the notebook. Use the key already present in the
# environment, otherwise ask for it interactively without echoing it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embedding client used for the similarity evaluation below.
embeddings_3072 = OpenAIEmbeddings(model="text-embedding-3-large")


In [None]:
# ------------------------------------------------------------------
# 0️⃣  Imports & configs
# ------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# ------------------------------------------------------------------
# 1️⃣  Load merged translations
# ------------------------------------------------------------------
df = pd.read_csv("int_data/back_translate/openai/UK_merged_translations.csv",
                 dtype={"claim_id": int, "language": str})

# ------------------------------------------------------------------
# 2️⃣  Embed the original English claims
# ------------------------------------------------------------------
# Only registers pandas' progress_apply helpers; kept for later cells.
tqdm.pandas(desc="Embedding original EN claims")

# The merged table is keyed by (claim_id, language), so the same English claim
# text presumably repeats once per language. Embed each distinct text once and
# fan the vectors back out to the rows, instead of paying for every duplicate.
claim_codes, unique_claims = pd.factorize(df["Claim"].astype(str))
unique_claim_embeddings = np.vstack(
    embeddings_3072.embed_documents(unique_claims.tolist())
)
claim_embeddings = unique_claim_embeddings[claim_codes]   # one row per df row


In [None]:

# Only registers pandas' progress_apply helpers; kept for later cells.
tqdm.pandas(desc="Embedding back‑translations")
back_embeddings  = np.vstack(
    embeddings_3072.embed_documents(df["english_claim"].astype(str).tolist())
)

# ------------------------------------------------------------------
# 3️⃣  Cosine similarity row‑wise (vectorised, no loops)
# ------------------------------------------------------------------
numerators = np.einsum("ij,ij->i", claim_embeddings, back_embeddings)   # row-wise dot products
denom      = (
    np.linalg.norm(claim_embeddings, axis=1) *
    np.linalg.norm(back_embeddings,  axis=1)
)

# Rows with a missing text were embedded as the literal string "nan" (via
# astype(str)); their similarity is meaningless, so store NaN instead. The
# pandas mean/std used afterwards skip NaN by default.
has_both_texts = (df["Claim"].notna() & df["english_claim"].notna()).to_numpy()
df["sem_sim"] = np.where(has_both_texts, numerators / denom, np.nan)

# ------------------------------------------------------------------
# 4️⃣  Aggregate stats
# ------------------------------------------------------------------
# filter out rows where language is English
is_non_english = df["language"] != "English"
df_noneng = df[is_non_english].reset_index(drop=True)

sim_scores = df_noneng["sem_sim"]

# Mean similarity over all non-English rows, and per language (best first).
overall_sim_noneng = sim_scores.mean()
per_language_mean  = df_noneng.groupby("language")["sem_sim"].mean()
by_lang_noneng     = per_language_mean.sort_values(ascending=False)

print(f"\n=== Overall semantic similarity (cosine) === {overall_sim_noneng:.3f}\n")

# Spread of the similarity scores across the same rows.
overall_std = sim_scores.std()
print(f"\n=== Overall standard deviation === {overall_std:.3f}\n")