In [12]:
import json
import re
import pandas as pd
from datetime import datetime
# pip install reductoai
from pathlib import Path
from reducto import AsyncReducto
import asyncio
import os 
from dotenv import load_dotenv

# Load variables from .env into environment
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
REDUCTO_API_KEY = os.getenv("REDUCTO_API_KEY")
import time
import matplotlib.pyplot as plt
%matplotlib inline


# Async Reducto client used by run_reducto (client.upload / client.pipeline.run).
# NOTE(review): presumably picks up REDUCTO_API_KEY from the environment populated by
# load_dotenv() above — confirm against the reducto SDK.
# NOTE(review): a later cell rebinds `client` to an OpenAI client, so run_reducto only
# works if it is executed before that cell (or after re-running this one).
client = AsyncReducto()



In [183]:
async def run_reducto(
    file,
    pipeline_id="k97e5ryq6b73xmqjbjxaedy8rn7vw541",
    output_dir="trial_outputs",
):
    """Upload one document to Reducto, run an extraction pipeline on it and cache the result.

    Progress lines (START / UPLOADED / DONE with elapsed seconds) are printed so that
    concurrent runs can be followed in the cell output.

    Args:
        file: Path (str or Path) of the document to process.
        pipeline_id: Identifier of the Reducto pipeline to run. Defaults to the
            pipeline this notebook was written against.
        output_dir: Directory that receives ``trial_<file stem>.json``; created if missing.

    Returns:
        The value of ``result.result.extract.result`` from the pipeline response,
        which is also what gets written to the JSON file.

    Note:
        Uses the module-level ``client``, which must be the AsyncReducto client at call time.
    """
    source = Path(file)
    start = time.perf_counter()
    print(f"[{datetime.now():%H:%M:%S}] START {file}")
    upload = await client.upload(file=source)
    print(f"[{datetime.now():%H:%M:%S}] UPLOADED {file}")

    result = await client.pipeline.run(
        input=upload,
        pipeline_id=pipeline_id,
    )
    elapsed = time.perf_counter() - start

    print(f"[{datetime.now():%H:%M:%S}] DONE {file} in {elapsed:.1f}s")

    final_result = result.result.extract.result

    # Cache the extraction on disk, keyed by the input file name without its extension,
    # so later cells can rebuild the tables without re-running the pipeline.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"trial_{source.stem}.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_result, f)
    return final_result

In [184]:
# Collect all PDF files from the trials folder.
# glob() yields entries in filesystem order, so sort to keep the processing order
# (and therefore the logs and outputs) reproducible across machines and runs.
trials_dir = Path("trials/pt2/")
trial_paths = sorted(str(pdf_file) for pdf_file in trials_dir.glob("*.pdf"))
print(f"Found {len(trial_paths)} PDF files:")
for path in trial_paths:
    print(f"  - {path}")

Found 2 PDF files:
  - trials/pt2/125_2015_Article_3795.pdf
  - trials/pt2/zdc1255.pdf


In [185]:

async def run_reducto_batch(trials):
    """Process every path in `trials` with run_reducto concurrently.

    Returns the list produced by asyncio.gather: one run_reducto result per
    input path, in the same order as `trials`.
    """
    return await asyncio.gather(*(run_reducto(trial_path) for trial_path in trials))

In [186]:
# Run the Reducto pipeline for every collected PDF concurrently; each result is also
# written to trial_outputs/ by run_reducto. Top-level `await` relies on the notebook kernel.
final_result = await run_reducto_batch(trial_paths)


[13:14:42] START trials/pt2/125_2015_Article_3795.pdf
[13:14:42] START trials/pt2/zdc1255.pdf
[13:14:43] UPLOADED trials/pt2/125_2015_Article_3795.pdf
[13:14:43] UPLOADED trials/pt2/zdc1255.pdf
[13:16:31] DONE trials/pt2/zdc1255.pdf in 109.4s
[13:19:18] DONE trials/pt2/125_2015_Article_3795.pdf in 276.2s


In [20]:


def load_reducto_results(input):
    """Flatten Reducto extraction payloads into trial, baseline and adverse-event tables.

    Args:
        input: Iterable of extraction payloads. Each payload is indexed with ``[0]`` to
            obtain a dict holding ``trial_metadata``, ``baseline_table`` and ``ae_events``.

    Returns:
        Tuple ``(df_trials, df_baseline, df_ae)`` of pandas DataFrames. The column sets
        are fixed, so the frames keep their schema even when no rows were produced.

    Notes:
        - ``trial_id`` is the NCT number; when that is missing (None, empty or "N/A") it
          falls back to ``"<drug_name>_<1-based position in input>"`` and ``nct_number``
          is stored as None.
        - Rows whose ``arm_result`` is missing are kept with empty arm fields instead
          of raising ``KeyError``.
        - Each trial_id is printed as it is processed.
    """
    trial_columns = [
        "trial_id", "citation", "drug_name", "phase", "indication",
        "trial_duration", "trial_duration_weeks", "safety_population_definition",
        "nct_number", "sponsor",
    ]
    baseline_columns = [
        "trial_id", "variable_label", "variable_label_raw", "categorical_label",
        "variable_type", "units",
        "arm_label", "arm_role", "arm_total_n",
        "n", "percent", "mean", "sd", "raw_value",
    ]
    ae_columns = [
        "trial_id", "AE_label_raw", "event_category", "is_any_ae", "is_serious_ae",
        "is_fatal", "is_ae_leading_to_discontinuation", "is_aesi",
        "arm_label", "arm_role", "arm_total_n",
        "n_with_event", "percent_with_event",
    ]

    trials = []
    baseline_rows = []
    ae_rows = []

    for i, trial_obj in enumerate(input):
        trial_obj = trial_obj[0]
        md = trial_obj["trial_metadata"]

        # ---- trial_id / NCT handling ----
        nct = md["NCT Number"]
        # Treat None / "" / "N/A" alike so a missing NCT never yields a null trial_id.
        nct_missing = not nct or str(nct).strip().upper() == "N/A"
        trial_id = f"{md['drug_name']}_{i+1}" if nct_missing else nct
        print(trial_id)

        # ---- trials table row ----
        trials.append({
            "trial_id": trial_id,
            "citation": md["citation"],
            "drug_name": md["drug_name"],
            "phase": md["phase"],
            "indication": md["indication"],
            "trial_duration": md["trial_duration"],
            # Numeric duration comes straight from the extraction; no string parsing here.
            "trial_duration_weeks": md["numeric_trial_duration"],
            "safety_population_definition": md["safety_population_definition"],
            "nct_number": None if nct_missing else nct,
            "sponsor": md["Sponsor"],
        })

        # ---- baseline table rows ----
        for row in trial_obj["baseline_table"]:
            # .get() keeps the row (with empty arm fields) when arm_result is missing.
            arm = row["arm_result"] or {}
            baseline_rows.append({
                "trial_id": trial_id,
                "variable_label": row["variable_label"],
                "variable_label_raw": row["variable_label_raw"],
                "categorical_label": row["categorical_label"],
                "variable_type": row["variable_type"],
                "units": row["units"],

                "arm_label": arm.get("arm_label"),
                "arm_role": arm.get("arm_role"),
                "arm_total_n": arm.get("total_n"),

                "n": arm.get("n"),
                "percent": arm.get("percent"),
                "mean": arm.get("mean"),
                "sd": arm.get("sd"),
                "raw_value": arm.get("raw_value"),
            })

        # ---- AE table rows ----
        for row in trial_obj["ae_events"]:
            arm = row["arm_result"] or {}
            ae_rows.append({
                "trial_id": trial_id,
                "AE_label_raw": row["AE_label_raw"],
                "event_category": row["event_category"],
                "is_any_ae": row["is_any_ae"],
                "is_serious_ae": row["is_serious_ae"],
                "is_fatal": row["is_fatal"],
                "is_ae_leading_to_discontinuation": row["is_ae_leading_to_discontinuation"],
                "is_aesi": row["is_aesi"],

                "arm_label": arm.get("arm_label"),
                "arm_role": arm.get("arm_role"),
                "arm_total_n": arm.get("total_n"),
                "n_with_event": arm.get("n_with_event"),
                "percent_with_event": arm.get("percent_with_event"),
            })

    # Passing columns= keeps the schema stable even for empty inputs.
    df_trials = pd.DataFrame(trials, columns=trial_columns)
    df_baseline = pd.DataFrame(baseline_rows, columns=baseline_columns)
    df_ae = pd.DataFrame(ae_rows, columns=ae_columns)

    return df_trials, df_baseline, df_ae


In [21]:
# Load the cached JSON files from trial_outputs/ into a list for load_reducto_results.
# os.listdir() returns entries in arbitrary order, and the fallback trial_id built by
# load_reducto_results depends on list position, so sort for reproducible ids.
trials_json_files = sorted(f for f in os.listdir("trial_outputs") if f.endswith(".json"))
final_result = []
for file in trials_json_files:
    with open(os.path.join("trial_outputs", file), "r", encoding="utf-8") as f:
        trial_obj = json.load(f)
    final_result.append(trial_obj)

In [22]:
# Flatten the cached extractions into the trial / baseline / adverse-event tables
df_trials, df_baseline, df_ae = load_reducto_results(final_result)


NCT01272219
NCT03495102
NCT03548935
NCT00849017
NCT05567796
NCT04707469
Exenatide (Exendin-4)_7
NCT04944992
NCT00688701
NCT00308139
NCT05669599
NCT00381342
NCT02465515
NCT04184622
Exenatide (Byetta)_15
NCT00707031
NCT05035095
NCT04881760


In [23]:
# Distinct, non-null raw AE labels in sorted order (original row index is kept)
ae_labels = df_ae["AE_label_raw"].dropna().drop_duplicates().sort_values()

# Show the label count together with the first 20 labels (positional slice)
len(ae_labels), ae_labels.iloc[:20]

(172,
 908                         AE leading to discontinuation
 960                                  Abdominal distension
 14                                         Abdominal pain
 323                                  Abdominal pain upper
 880                                   Acute cholecystitis
 682                            Acute gallbladder diseases
 335                                   Acute kidney injury
 83                                     Acute pancreatitis
 92                                     Acute renal events
 151                                   Acute renal failure
 235     Adverse event leading to permanent trial-produ...
 164                   Adverse event leading to withdrawal
 395                                        Adverse events
 0                       Adverse events in ≥5% of patients
 105     Adverse events leading to discontinuation of d...
 1031    Adverse events leading to discontinuation of r...
 772     Adverse events leading to discontinuation

In [12]:
# Write the deduplicated, sorted raw AE labels to CSV without the index column
ae_labels.to_csv("ae_labels_raw.csv", index=False)


In [14]:
from openai import OpenAI
import os
# OpenAI client used by the harmonization cells below (client.chat.completions.create).
# NOTE(review): this rebinds the module-level `client` that was the AsyncReducto client,
# and run_reducto reads that global — re-running run_reducto after this cell will use the
# OpenAI client instead. Consider distinct names for the two clients.
client = OpenAI()

In [15]:

def build_prompt_for_labels(labels):
    """Build the LLM prompt that harmonizes raw adverse-event labels.

    Args:
        labels: Iterable of raw AE label strings; each becomes one ``- <label>`` line
            appended after the instructions.

    Returns:
        The full prompt text: fixed instructions followed by the bulleted labels.

    The prompt asks for a JSON object wrapping the list under the ``"result"`` key.
    The request is made in JSON-object response mode and the consuming cell reads
    ``['result']``, so the wrapper key has to be pinned here.
    """
    instructions = """
You are harmonizing adverse event labels from clinical trial safety tables.

Goal:
- Assign a canonical name for each label (ae_canonical) that merges variants of the same concept.
- Also assign a slightly broader "family" grouping (ae_family) that merges clearly related variants that may be slighlty clinically that someone would reasonably collapse for analysis when evaluating the AE profile of a drug.
- Do NOT merge clinically distinct events that are routinely reported separately (for example,
  "Nausea" vs "Vomiting" vs "Diarrhea" should each remain their own family) Remember these should be actually clinically distinct rathan a severe vs serious event. Imagine you are using these labels
  as a doctor to make treatment decisions. 

Definitions:
- ae_canonical:
  - A preferred term for the adverse event concept.
  - Examples:
    - "Cholecystitis acute" -> ae_canonical = "Acute cholecystitis"
    - "Chronic cholecystitis" -> ae_canonical = "Chronic cholecystitis"
    - "Cholelithiasis" -> ae_canonical = "Cholelithiasis"
    - "Nausea" -> ae_canonical = "Nausea"
    - "Renal event" -> ae_canonical = "Renal impairment"
    - "Renal disorder" -> ae_canonical = "Renal impairment"
    - "Hyperglycemic episode" -> ae_canonical = "Hyperglycemia"
    - "Hyperglycemia > 54 mg/dL" -> ae_canonical = "Hyperglycemia"

- ae_family:
  - A slightly broader grouping that clusters very closely related variants and subtypes.
  - Use the same ae_family for labels that are essentially the same clinical topic and would often be
    grouped together in a safety analysis (e.g., different subtypes of cholecystitis, or different
    phrasings of the same AE).
  - Examples:
    - If clinically reasonable, "Cholelithiasis" and "Cholecystitis" may share ae_family = "Gallbladder disease".
    - "Nausea" and "Vomiting" should NOT be merged into the same ae_family; each stays in its own family.
  - If no broader grouping is natural, set ae_family equal to ae_canonical.

- Summary rows:
  - Some labels are global summaries such as "Any adverse event", "Serious adverse events",
    "Adverse events leading to discontinuation".
  - These should be marked as is_summary = true and given an appropriate summary_type.
  - For summary rows, ae_canonical and ae_family may reflect the summary (e.g. "Any adverse event",
    "Serious adverse events") rather than a specific PT.

For EACH label I give you, output a JSON object with:
- "label_raw": the original label string.
- "ae_canonical": a short, specific canonical name as defined above.
- "ae_family": the broader family name as defined above. If no broader grouping is natural,
  set ae_family equal to ae_canonical.
- "is_summary": true if this label describes a global summary row
  (e.g. "Any adverse event", "Serious adverse events", "Adverse events leading to discontinuation"),
  otherwise false.
- "summary_type": if is_summary is true, one of:
  - "any_ae"
  - "serious_ae"
  - "fatal_ae"
  - "ae_leading_to_discontinuation"
  - "other_summary"
  Otherwise null.
- "notes": (optional) short explanation of your reasoning if needed.
- "confidence": "high", "medium", or "low".

Return ONLY a JSON object of the form {"result": [...]}, where "result" is a list (array) of these objects, with one object per label, and no extra commentary.

Here are the labels:
"""
    body = "\n".join(f"- {lab}" for lab in labels)
    return instructions + "\n" + body
 

In [24]:
mapping = []
canonical_set = []

# Ask the model to harmonize every distinct raw AE label in a single request.
# JSON-object mode is requested; the next cell reads the list from the "result" key.
prompt = build_prompt_for_labels(ae_labels)
ae_response = client.chat.completions.create(
    model="gpt-5",
    response_format={"type": "json_object"},
    messages=[{"role": "user", "content": prompt}],
)

In [195]:
ae_result = json.loads(ae_response.choices[0].message.content)

# JSON-object mode wraps the list in an object. Prefer the "result" key, but tolerate
# "results" or any other single list-valued wrapper instead of failing with a KeyError.
ae_records = ae_result
if isinstance(ae_records, dict):
    for wrapper_key in ("result", "results"):
        if isinstance(ae_records.get(wrapper_key), list):
            ae_records = ae_records[wrapper_key]
            break
    else:
        ae_records = next(
            (value for value in ae_records.values() if isinstance(value, list)),
            ae_records,
        )
if not isinstance(ae_records, list):
    raise ValueError(f"Expected a list of AE mappings but got {type(ae_records)}: {ae_records}")

ae_mapping = pd.DataFrame(ae_records)

ae_mapping_cols = ["label_raw", "ae_canonical", "ae_family", "is_summary", "summary_type"]
missing_ae_cols = [col for col in ae_mapping_cols if col not in ae_mapping.columns]
if missing_ae_cols:
    raise KeyError(f"AE mapping is missing required columns: {missing_ae_cols}")

# Both lookups are reduced to one row per key so the left joins cannot multiply AE rows
# (e.g. when the model repeats a label or a trial appears twice in df_trials).
new_df_ae = df_ae.merge(
    ae_mapping[ae_mapping_cols].drop_duplicates(subset="label_raw"),
    left_on="AE_label_raw",
    right_on="label_raw",
    how="left",
).merge(
    df_trials[["trial_id", "drug_name"]].drop_duplicates(subset="trial_id"),
    on="trial_id",
    how="left",
)

# Surface labels the model dropped or rewrote, which would otherwise be silently unmapped.
unmatched_ae = new_df_ae["AE_label_raw"].notna() & new_df_ae["label_raw"].isna()
if unmatched_ae.any():
    print(f"Warning: {int(unmatched_ae.sum())} AE rows have no harmonized label")

In [196]:
# nausea = new_df_ae[new_df_ae["ae_canonical"] == "Nausea"].copy()

# # use drug name instead of trial_id in the label
# nausea["arm_display"] = (
#     nausea["drug_name"].fillna(nausea["trial_id"])  # fallback if drug_name missing
#     + " | " +
#     nausea["arm_label"]
# )

# # sort by % nausea
# nausea = nausea.sort_values("percent_with_event", ascending=True)

# plt.figure(figsize=(10, 6))
# plt.barh(nausea["arm_display"], nausea["percent_with_event"])
# plt.xlabel("Nausea (% participants with ≥1 event)")
# plt.ylabel("Drug | Arm")
# plt.title("Nausea incidence by arm")
# plt.tight_layout()
# plt.show()


In [197]:
# One row per distinct (trial, arm) seen in the AE table, enriched with the trial's
# phase and indication for use as context in the arm-classification prompt.
arm_identity_cols = ["trial_id", "arm_label", "arm_role", "drug_name"]
trial_context = df_trials[["trial_id", "phase", "indication"]]
df_arms = (
    new_df_ae.loc[:, arm_identity_cols]
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(trial_context, on="trial_id", how="left")
)

def build_arm_prompt(arms_batch):
    """Build the LLM prompt that classifies trial arms by clinical relevance of the dose.

    Args:
        arms_batch: list of dicts with keys trial_id, arm_label, drug_name (required)
            and arm_role, phase, indication (optional; sent as null when absent).

    Returns:
        The prompt text: fixed instructions followed by one JSON-encoded context
        object per arm, one per line.

    The prompt asks for a JSON object wrapping the list under the ``"results"`` key.
    The request is made in JSON-object response mode and the consuming cell reads
    ``["results"]``, so the wrapper key has to be pinned here.
    """
    instructions = """
You are helping analyze clinical trial arms for GLP-1 and related drugs.

For each trial arm, decide if it represents a clinically relevant dose that would
likely be used in real-world practice (approved dose or something very close to it),
as opposed to:
- clear sub-therapeutic exploratory doses
- run-in/titration-only schedules
- placebo or non-active comparator arms.

Guidelines:
- Placebo arms are NEVER clinically relevant.
- Arms with obviously tiny doses (e.g. early phase 0.25 mg where the drug is normally used at 1–2+ mg)
  are usually NOT clinically relevant unless that is an approved maintenance dose.
- If multiple doses are approved (e.g. 1.0 mg and 2.4 mg), they can all be clinically relevant.
- If you are unsure, be conservative and set is_clinical_dose = true.
- If it is not approved yet, typically select the dose with the best balance of efficacy and safety that would be a fair comparison to other approved drugs

For EACH arm I give you, output a JSON object with:
- "trial_id"
- "arm_label"
- "drug_name"
- "dose_mg": numeric dose if you can parse it from the arm_label (e.g. 1.0, 2.4, 50).
  If unclear, use null.
- "route": short route if obvious from context (e.g. "SC", "oral", "IV"), else null.
- "frequency": short dosing frequency (e.g. "QD", "BID", "QW", "Q2W", "Q4W"), else null.
- "is_placebo": true if this is clearly a placebo arm.
- "is_clinical_dose": true if this arm is a clinically relevant, real-world dose as defined above.
- "include_in_plot": true if you think this arm should be included when plotting
  dose vs adverse events; typically true for active clinically relevant doses, false for placebo and
  non-relevant exploratory doses.
- "notes": optional, brief explanation if needed.
- "confidence": "high", "medium", or "low".

Return ONLY a JSON object of the form {"results": [...]}, where "results" is a list (array) of these objects, no extra commentary.

Here are the trial arms:
"""
    lines = []
    for arm in arms_batch:
        # Send only the fields the model needs, as one JSON object per line.
        context = {
            "trial_id": arm["trial_id"],
            "drug_name": arm["drug_name"],
            "arm_label": arm["arm_label"],
            "arm_role": arm.get("arm_role"),
            "phase": arm.get("phase"),
            "indication": arm.get("indication"),
        }
        lines.append(json.dumps(context))
    return instructions + "\n" + "\n".join(lines)
# Classify every distinct arm in a single request; each df_arms row becomes one
# JSON context line in the prompt.
arm_records_for_prompt = df_arms.to_dict(orient="records")
prompt = build_arm_prompt(arm_records_for_prompt)
arm_response = client.chat.completions.create(
    model="gpt-5",
    response_format={"type": "json_object"},
    messages=[{"role": "user", "content": prompt}],
)


In [198]:
arm_text = arm_response.choices[0].message.content

# JSON-object mode wraps the list in an object. Prefer the "results" key, but tolerate
# "result" or any other single list-valued wrapper instead of failing with a KeyError.
arm_records = json.loads(arm_text)
if isinstance(arm_records, dict):
    for wrapper_key in ("results", "result"):
        if isinstance(arm_records.get(wrapper_key), list):
            arm_records = arm_records[wrapper_key]
            break
    else:
        arm_records = next(
            (value for value in arm_records.values() if isinstance(value, list)),
            arm_records,
        )
if not isinstance(arm_records, list):
    raise ValueError(f"Expected a list of arm mappings but got {type(arm_records)}: {arm_records}")

arm_mapping = pd.DataFrame(arm_records)

arm_key_cols = ["trial_id", "arm_label"]
arm_value_cols = ["is_clinical_dose", "include_in_plot", "dose_mg", "route", "frequency"]

# Drop any arm columns left by a previous run of this cell so re-running it does not
# create *_x / *_y duplicates, and keep one mapping row per arm so the left join
# cannot multiply AE rows.
new_df_ae = new_df_ae.drop(columns=arm_value_cols, errors="ignore").merge(
    arm_mapping[arm_key_cols + arm_value_cols].drop_duplicates(subset=arm_key_cols),
    on=arm_key_cols,
    how="left",
)


In [199]:

# # restrict to nausea + arms the model says are worth plotting
# mask = (
#     (new_df_ae["ae_canonical"] == "Nausea") &
#     (new_df_ae["include_in_plot"] == True)
# )

# nausea = new_df_ae[mask].copy()

# # make a nice display label: drug + dose + freq
# def make_label(row):
#     bits = [row["drug_name"]]
#     if pd.notna(row.get("dose_mg")):
#         bits.append(f'{row["dose_mg"]} mg')
#     if pd.notna(row.get("frequency")):
#         bits.append(row["frequency"])
#     # fallback if we didn't parse anything sensible
#     if len(bits) == 1:
#         bits.append(row["arm_label"])
#     return " ".join(str(b) for b in bits if b)

# nausea["arm_display"] = nausea.apply(make_label, axis=1)

# # sort by nausea %
# nausea = nausea.sort_values("percent_with_event", ascending=True)

# plt.figure(figsize=(10, 6))
# plt.barh(nausea["arm_display"], nausea["percent_with_event"])
# plt.xlabel("Nausea (% participants with ≥1 event)")
# plt.ylabel("Drug / dose")
# plt.title("Nausea incidence by clinically relevant arm")
# plt.tight_layout()
# plt.show()

In [200]:
# Export the trial, enriched adverse-event, baseline and arm tables to CSV (no index column)
df_trials.to_csv("trials.csv", index=False)
new_df_ae.to_csv("ae.csv", index=False)
df_baseline.to_csv("baseline.csv", index=False)
df_arms.to_csv("arms.csv", index=False)


In [10]:
# Distinct, non-null indication strings as a sorted Python list
indications = sorted(
    df_trials["indication"].dropna().astype(str).drop_duplicates()
)
print(f"Found {len(indications)} unique indication strings")


Found 9 unique indication strings


In [13]:
def build_indication_prompt(labels):
    """Build the LLM prompt that maps raw indication strings to normalized groups.

    Args:
        labels: list of raw indication strings; each becomes one ``- <label>`` line
            appended after the instructions.

    Returns:
        The full prompt text: fixed instructions followed by the bulleted labels.

    The prompt asks for a JSON object wrapping the list under the ``"results"`` key,
    because the request is made in JSON-object response mode, which cannot return a
    bare array.
    """
    instructions = """
You are harmonizing clinical trial indication labels.

Goal:
- Map raw indication strings from trial publications into a single, normalized
  indication_group that can be used for cross-trial analysis.

Rules:
- If two raw labels describe essentially the same patient population, they
  should share the SAME indication_group string.
- Keep important distinctions that change the biology or safety context, but collapse distinctions that a reasonable biotech analyst wouldn't care about (obesity vs overweight are the same thing and would all be obesity):

  e.g., "Type 2 diabetes" vs "Obesity without diabetes" vs
  "Obesity with type 2 diabetes".
- Normalize wording and spelling but keep groups intuitive and human-readable:
  e.g. "Type 2 diabetes", "Obesity", "Obesity + T2D", "NAFLD/NASH".
- If a label is very vague (e.g. "Overweight or obese with comorbidities"),
  choose a concise but reasonable indication_group such as
  "Obesity with cardiometabolic risk".

For EACH label I give you, output a JSON object with:
- "indication_raw": the original label string.
- "indication_group": a concise, normalized indication group as defined above.
- "notes": (optional) short explanation if needed.
- "confidence": "high", "medium", or "low".

Return ONLY a JSON object of the form {"results": [...]}, where "results" is a list (array)
of these objects, with one object per input label, and no extra commentary.

Here are the raw indication labels:
"""
    body = "\n".join(f"- {lab}" for lab in labels)
    return instructions + "\n" + body


prompt = build_indication_prompt(indications)

# Same client as the AE-mapping step; adjust if your call pattern differs
indication_response = client.chat.completions.create(
    model="gpt-5",
    response_format={"type": "json_object"},
    messages=[{"role": "user", "content": prompt}],
)

indication_text = indication_response.choices[0].message.content
result = json.loads(indication_text)

if isinstance(result, dict):
    # Unwrap the first known wrapper key, preferring "results" over "result"
    for wrapper_key in ("results", "result"):
        if wrapper_key in result:
            result = result[wrapper_key]
            break
    # Still not a list: fall back to the first list-valued entry, if there is one
    if not isinstance(result, list):
        result = next(
            (value for value in result.values() if isinstance(value, list)),
            result,
        )

# A DataFrame of mappings can only be built from a list of records
if not isinstance(result, list):
    raise ValueError(f"Expected a list but got {type(result)}: {result}")

indication_mapping = pd.DataFrame(result)
indication_mapping.head()

Unnamed: 0,indication_raw,indication_group,notes,confidence
0,Non-alcoholic fatty liver disease (NAFLD),NAFLD/NASH,Grouped NAFLD with NASH for cross-trial analysis,high
1,Obesity,Obesity,,high
2,Overweight or Obesity,Obesity,Collapsed overweight into obesity,high
3,Overweight or obesity,Obesity,Collapsed overweight into obesity,high
4,Type 2 Diabetes,Type 2 diabetes,,high


In [14]:
# Verify that indication_mapping has the required columns
required_cols = ["indication_raw", "indication_group"]
missing_cols = [col for col in required_cols if col not in indication_mapping.columns]
if missing_cols:
    print(f"Warning: indication_mapping is missing columns: {missing_cols}")
    print(f"Available columns: {list(indication_mapping.columns)}")
    raise KeyError(f"Missing required columns: {missing_cols}")

# One row per raw label so the left join cannot duplicate trials
indication_lookup = indication_mapping[required_cols].drop_duplicates(subset="indication_raw")

# Drop any indication_group left by a previous run of this cell so that re-running it
# does not produce indication_group_x / indication_group_y columns.
df_trials = (
    df_trials
    .drop(columns=["indication_group"], errors="ignore")
    .merge(
        indication_lookup,
        left_on="indication",
        right_on="indication_raw",
        how="left",
    )
    .drop(columns=["indication_raw"])
)

# Save the enriched trials table and the raw-to-group mapping
df_trials.to_csv("trials_with_indication_groups.csv", index=False)
indication_mapping.to_csv("indication_mapping.csv", index=False)

# Inspect the result (kept as the last expression so the notebook actually renders it)
df_trials[["trial_id", "indication", "indication_group"]].drop_duplicates().head()

In [25]:
# Add drug half-life metadata to the trials table.
# NOTE(review): the units are not stated anywhere in this notebook — presumably days; confirm.
drug_half_life = {
    "Semaglutide": 7,
    "Exenatide": .1,
    "Liraglutide": 0.5,
    "Dulaglutide": 4.85,
    "albiglutide": 7,
    "Retatrutide": 6,
    "Tirzepatide": 5,
    "Lixisenatide": 0.125,
    "Exenatide (Exendin-4)": .1,
    "Albiglutide":7,
    "Exenatide (Byetta)": .1,
}

# Match drug names ignoring case and surrounding whitespace, so capitalization variants
# from the extraction (e.g. "albiglutide" vs "Albiglutide") do not each need their own key.
# Names that still have no entry get NaN.
half_life_by_normalized_name = {
    name.strip().casefold(): half_life for name, half_life in drug_half_life.items()
}
normalized_drug_names = df_trials["drug_name"].map(
    lambda name: name.strip().casefold() if isinstance(name, str) else name
)

trials_with_drug_half_life = df_trials.copy()
trials_with_drug_half_life["drug_half_life"] = normalized_drug_names.map(half_life_by_normalized_name)

# This overwrites the file written by the indication-merge cell with the enriched table.
trials_with_drug_half_life.to_csv("trials_with_indication_groups.csv", index=False)


In [24]:
# Display the trials table including the added drug_half_life column
trials_with_drug_half_life

Unnamed: 0,trial_id,citation,drug_name,phase,indication,trial_duration,trial_duration_weeks,safety_population_definition,nct_number,sponsor,indication_group,drug_half_life
0,NCT01272219,N Engl J Med 2015;373:11-22. DOI: 10.1056/NEJM...,Liraglutide,Not specified,Weight management in overweight or obese adult...,56 weeks,56.0,All randomized participants who received at le...,NCT01272219,Novo Nordisk,Obesity without diabetes,0.5
1,NCT03495102,Diabetes Care 2021;44:765–773 | https://doi.or...,Dulaglutide,Phase 3,Type 2 Diabetes,52 weeks,52.0,All randomly assigned patients who received at...,NCT03495102,Eli Lilly and Company,Type 2 diabetes,4.85
2,NCT03548935,N Engl J Med 2021;384:989-1002. DOI: 10.1056/N...,Semaglutide,Phase 3,Overweight or Obesity,68 weeks,68.0,All randomized participants exposed to at leas...,NCT03548935,Novo Nordisk,Obesity,7.0
3,NCT00849017,Diabetologia (2016) 59:266-274 DOI 10.1007/s00...,albiglutide,Phase 3,Type 2 Diabetes Mellitus,52 weeks,52.0,All randomized participants who received at le...,NCT00849017,GlaxoSmithKline,Type 2 diabetes,7.0
4,NCT05567796,N Engl J Med 2025;393:635-47. DOI: 10.1056/NEJ...,Cagrilintide-Semaglutide,3a,Overweight or Obesity,68-week,68.0,all the participants who underwent randomizati...,NCT05567796,Novo Nordisk,Obesity,
5,NCT04707469,"Vanita R Aroda, Jens Aberle, Lars Bardtrum, Er...",oral semaglutide,3b,type 2 diabetes,68 weeks,68.0,all participants who received at least one dos...,NCT04707469,Novo Nordisk,Type 2 diabetes,
6,Exenatide (Exendin-4)_7,"Diabetes Care 28:1092-1100, 2005",Exenatide (Exendin-4),Phase III,Type 2 Diabetes,30 weeks,30.0,All randomized participants who received at le...,,"Amylin Pharmaceuticals, Eli Lilly",Type 2 diabetes,0.1
7,NCT04944992,"Journal of Hepatology, October 2023, vol. 79, ...",Efinopegdutide,IIa,Non-alcoholic fatty liver disease (NAFLD),24 weeks,24.0,All randomized participants who received at le...,NCT04944992,"Merck Sharp & Dohme LLC, a subsidiary of Merck...",NAFLD/NASH,
8,NCT00688701,"Fonseca VA, Alvarado-Ruiz R, Raccah D, Boka G,...",Lixisenatide,Phase III,Type 2 Diabetes,12-week,12.0,All randomized patients exposed to at least on...,NCT00688701,Sanofi-aventis,Type 2 diabetes,0.125
9,NCT00308139,"Buse JB, Drucker DJ, Taylor KL, et al. DURATIO...",Exenatide,Not specified,Type 2 Diabetes,52 weeks,52.0,All randomized participants who received at le...,NCT00308139,Amylin Pharmaceuticals and Eli Lilly & Company,Type 2 diabetes,0.1
