In [1]:
import table_utils
import process_paper_utils as utils
import os
from bs4 import BeautifulSoup
import html
from datetime import datetime
import pandas as pd

In [2]:
#Enter PMC here:
PMC = "PMC6449948"

Imports paper from PMC

In [3]:
os.makedirs("papers", exist_ok=True)
if os.path.isfile(f"papers/{PMC}.xml"):
    with open(f"papers/{PMC}.xml", 'r', encoding='utf-8') as f:
        paper_xml = f.read()
else:
    paper_xml = utils.fetch_xml(PMC).replace("|", "/")
    with open(f"papers/{PMC}.xml", "w", encoding="utf-8") as f:
        f.write(paper_xml)
print(paper_xml)

<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE collection SYSTEM "BioC.dtd"><collection><source>BioC-API</source><date>20250603</date><key>collection.key</key><document><id>PMC6449948</id><infon key="license">CC BY</infon><passage><infon key="article-id_doi">10.1186/s12885-019-5516-5</infon><infon key="article-id_pmc">PMC6449948</infon><infon key="article-id_pmid">30947706</infon><infon key="article-id_publisher-id">5516</infon><infon key="elocation-id">315</infon><infon key="kwd">Newcastle disease virus AF2240 Breast cancer 4 T1 cells Cytokines</infon><infon key="license">Open AccessThis article is distributed under the terms of the Creative Commons Attribution 4.0 International License (http://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons license, and indicate if changes were made. The Creative Co

Extracts abstract and tables from paper

In [4]:
from collections import defaultdict

soup = BeautifulSoup(paper_xml, "lxml-xml")

#abstract
abstract_passages = [p for p in soup.find_all("passage") if p.find("infon", {"key": "section_type"}) and 
        p.find("infon", {"key": "section_type"}).text.strip().upper() == "ABSTRACT"]
abstract_text_list = [p.find("text").get_text() for p in abstract_passages if p.find("text")]
abstract = "\n".join(abstract_text_list)

#methods
methods_passages = [p for p in soup.find_all("passage") if p.find("infon", {"key": "section_type"}) and 
        p.find("infon", {"key": "section_type"}).text.strip().upper() == "METHODS"]
methods_text_list = [p.find("text").get_text() for p in methods_passages if p.find("text")]
methods = "\n".join(methods_text_list)

#results
results_passages = [p for p in soup.find_all("passage") if p.find("infon", {"key": "section_type"}) and 
        p.find("infon", {"key": "section_type"}).text.strip().upper() == "RESULTS"]
results_text_list = [p.find("text").get_text() for p in results_passages if p.find("text")]
results = "\n".join(results_text_list)

#tables
table_passages = [p for p in soup.find_all("passage") if (sect := p.find("infon", {"key": "section_type"})) and sect.text.strip() == "TABLE"]
tables = defaultdict(lambda: {"label": "o", "caption": None, "markdown": None, "footnotes": [], "col_labels": None, "group_cols": [], "biomarker_cols": []})
for p in table_passages:
    table_id = p.find("infon", {"key": "id"}).text.strip()
    passage_type = p.find("infon", {"key": "type"}).text.strip()
    if passage_type == "table_caption":
        tables[table_id]["caption"] = p.find("text").text.strip()
    elif passage_type == "table":
        table_xml = p.find("infon", {"key": "xml"}).text
        if "<table" not in table_xml:
            continue
        table = table_utils.single_html_table_to_markdown(html.unescape(table_xml))
        tables[table_id]["markdown"] = table_utils.transpose_markdown_table(table_utils.transpose_markdown_table(table))
    elif passage_type in ("table_foot", "table_footnote"):
        tables[table_id]["footnotes"].append(p.find("text").text.strip())


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_content, "html.parser")

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html, "html.parser")


Generate prompt for identifying each table

In [5]:
id_tables_prompt = utils.id_tables_prompt(methods, tables)
print(id_tables_prompt)

# Instructions
You will be given the methods of a research paper which includes at least one animal toxicity study then a list of all the tables included in that paper, each one labeled with its title. Your goal is to identify each table as either a biomarker table (B), or other (O). Biomarker tables may contain treatment group dosage information, but the main difference between is that biomarker tables should contain biomarker data observed from the treatment groups. This can frequency data (such as survival or number occurences of a condition) or metric data (such as ALT levels or compound concentration), and the table must describe the results of specifically an animal toxicity study within the paper to be labeled B. Otherwise, label it O. Any tables from other irrelevant studies should be labeled O, such as results of in vitro assays or gene expression tables. Additionally, tables where the data is qualitative observations should also be labeled O, as should any tables that give in

API call to identify tables

In [6]:
id_tables_output = utils.llama_request(id_tables_prompt, 100, 0)

time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
with open(f"responses/table_id/{time}.txt", "w", encoding="utf-8") as file:
    file.write(f"{PMC}\n---\n{id_tables_prompt}\n---\n{id_tables_output}")

Assigning labels to table dict

In [7]:
for label in id_tables_output.split("\n"):
    label = label.strip()
    if not label: continue

    content = label[1:-1]
    tid, label = content.split("|")
    tables[tid.strip()]["label"] = label.strip().lower()

print(tables)

defaultdict(<function <lambda> at 0x000002854AF13CE0>, {'Tab1': {'label': 'o', 'caption': 'Showed preparation of virus titres from 108 to get 8, 16, 32 and 64 HA units', 'markdown': '| Virus Titre | Dilution |\n| --- | --- |\n| 8 HA virus titre | 1 part of NDV (10) or 100 ul + 900 mul PBS |\n| 16 HA virus titre | 1 part of NDV (10) or 100 ul + 800 mul PBS |\n| 32 HA virus titre | 1 part of NDV (10) or 100 ul + 700 mul PBS |\n| 64 HA virus titre | 1 part of NDV (10) or 100 ul + 600 mul PBS |', 'footnotes': [], 'col_labels': None, 'group_cols': [], 'biomarker_cols': []}, 'Tab2': {'label': 'b', 'caption': 'Effect of velogenic viscerotropic (VVNDV) AF2240 and tamoxifen in mortality rate, body and tumour weight of mice', 'markdown': '| Group | Animal number.Beginning | Animal number.End | Animal number.Mortality rate (%) | Body weight +- SD.Beginning | Body weight +- SD.End | Tumour weight (g) | Inhibition(%) |\n| --- | --- | --- | --- | --- | --- | --- | --- |\n| CC | 6 | 6 | 0 | 16.66 +- 

Groups from Text

In [8]:
text_groups = []
group_names = []
for row in utils.groups_from_text(PMC, abstract, methods, results):
    if len(row) not in [4, 6, 7, 9] and len(row) < 10:
        print("ERROR: Invalid text_groups label")
        continue
    text_group = {}
    text_group["group"] = row[0]
    text_group["animal_model"] = row[1] if row[1] else None
    text_group["sample_size"] = row[2] if row[2] else None
    if len(row) == 4:
        text_group["treatment1"] = None
        text_group["dose1"] = None
        text_group["units1"] = None
        text_group["treatment2"] = None
        text_group["dose2"] = None
        text_group["units2"] = None
    elif len(row) == 6:
        text_group["treatment1"] = row[3]
        text_group["dose1"] = row[4]
        text_group["units1"] = row[5]
        text_group["treatment2"] = None
        text_group["dose2"] = None
        text_group["units2"] = None
    else:
        text_group["treatment1"] = row[3]
        text_group["dose1"] = row[4]
        text_group["units1"] = row[5]
        text_group["treatment2"] = row[6]
        text_group["dose2"] = row[7]
        text_group["units2"] = row[8]
    text_groups.append(text_group)
    group_names.append(row[0].strip().lower())
text_groups_df = pd.DataFrame(data=text_groups, index=group_names)
    

<control|mouse|6|||||||>
<NC|mouse|6|||||||>
<NDV8|mouse|6|NDV AF2240|8|HA||||>
<NDV16|mouse|6|NDV AF2240|16|HA||||>
<NDV32|mouse|6|NDV AF2240|32|HA||||>
<NDV64|mouse|6|NDV AF2240|64|HA||||>
<CC|mouse|6|||||||>
<CT|mouse|6|tamoxifen|0.5|mug/ml||||>
<CNDV8|mouse|6|NDV AF2240|8|HA||||>
<CNDV16|mouse|6|NDV AF2240|16|HA||||>
<CNDV32|mouse|6|NDV AF2240|32|HA||||>
<CNDV64|mouse|6|NDV AF2240|64|HA||||>
<CNDV8+T|mouse|6|NDV AF2240|8|HA|tamoxifen|0.5|mug/ml>
<CNDV16+T|mouse|6|NDV AF2240|16|HA|tamoxifen|0.5|mug/ml>
<CNDV32+T|mouse|6|NDV AF2240|32|HA|tamoxifen|0.5|mug/ml>
<CNDV64+T|mouse|6|NDV AF2240|64|HA|tamoxifen|0.5|mug/ml>
[['control', 'mouse', '6', '', '', '', '', '', '', ''], ['NC', 'mouse', '6', '', '', '', '', '', '', ''], ['NDV8', 'mouse', '6', 'NDV AF2240', '8', 'HA', '', '', '', ''], ['NDV16', 'mouse', '6', 'NDV AF2240', '16', 'HA', '', '', '', ''], ['NDV32', 'mouse', '6', 'NDV AF2240', '32', 'HA', '', '', '', ''], ['NDV64', 'mouse', '6', 'NDV AF2240', '64', 'HA', '', '', '', ''], ['C

Formatting Tables and converting to df

In [9]:
#Note: 1 LLM API call per table
formatted_tables = utils.format_tables(PMC, text_groups_df["group"].tolist(), methods, tables)

for tid, info in formatted_tables.items():
    print(f"###{tid}###")
    if info["markdown"]: print(info["markdown"])
    info["df"] = table_utils.markdown_to_dataframe(info["markdown"])

START:
| Group | Animal number.Beginning | Animal number.End | Animal number.Mortality rate (%) | Body weight +- SD.Beginning | Body weight +- SD.End | Tumour weight (g) | Inhibition(%) |
| --- | --- | --- | --- | --- | --- | --- | --- |
| CC | 6 | 6 | 0 | 16.66 +- 0.70 | 21.26 +- 1.24 * | 1.45 +- 0.43 | - |
| CT | 6 | 6 | 0 | 16.48 +- 0.09 | 20.88 +- 0.79 * | 1.21 +- 0.41 | 16.55 |
| CNDV | CNDV | CNDV | CNDV | CNDV | CNDV | CNDV | CNDV |
| 8HA NDV | 6 | 6 | 0 | 16.77 +- 0.11 | 19.49 +- 0.30 * | NG | 100 |
| 16HA NDV | 6 | 6 | 0 | 16.64 +- 0.32 | 19.30 +- 0.06 * | NG | 100 |
| 32HA NDV | 6 | 6 | 0 | 16.86 +- 0.10 | 19.45 +- 0.10 * | NG | 100 |
| 64HA NDV | 6 | 6 | 0 | 16.54 +- 0.31 | 19.66 +- 0.13 * | NG | 100 |
| CNDV + T | CNDV + T | CNDV + T | CNDV + T | CNDV + T | CNDV + T | CNDV + T | CNDV + T |
| 8HA NDV | 6 | 6 | 0 | 16.93 +- 0.41 | 19.33 +- 0.16 * | NG | 100 |
| 16HA NDV | 6 | 6 | 0 | 16.86 +- 0.14 | 19.50 +- 0.30 * | NG | 100 |
| 32HA NDV | 6 | 6 | 0 | 16.87 +- 0.06 | 21.00 +

IDing columns via LLM

In [21]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    info_combined = ""
    if not info["markdown"]: continue
    if info["caption"]:
        info_combined += f"Caption: {info["caption"]}\n"
    info_combined += info["markdown"] + "\n"
    if len(info["footnotes"]) > 0:
        info_combined += "Footnotes:\n"
        for fn in info["footnotes"]:
            info_combined += fn + "\n"
    prompt = utils.id_columns_prompt(info_combined)
    answer = utils.llama_request(prompt, 5000, 0)
    info["col_labels"] = [[elt.strip() for elt in label.strip().lstrip('<').rstrip('>').split("|")] for label in answer.split("\n")]
    time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    with open(f"responses/column_id/{time}.txt", "w", encoding="utf-8") as file:
        file.write(f"{PMC}\n---\n{tid}\n---\n{prompt}\n---\n{answer}")

Processing column IDs

In [22]:
for tid, info in formatted_tables.items():
    if info["label"] != "b":
        continue
    info["biomarker_cols"] = []
    info["group_cols"] = []
    column_info = {}
    for label in info["col_labels"]:
        print(label)
        if len(label) < 2 or label[1].lower() not in ["biomarker", "group", "dose", "size", "animal"]: continue
        print(label[1].lower())
        if label[1].lower() == "biomarker":
            if label[3].lower() not in ["mean", "variation", "frequency", "severity"]: continue
            biomarker_label = {"column": label[0], "label": "biomarker", "type": label[3].lower(), "units": None, "name": label[2]}
            if label[3].lower() == "mean" and len(label) > 4:
                biomarker_label["units"] = label[4]
            elif label[3].lower() == "variation" and len(label) > 4:
                biomarker_label["units"] = label[4]
            elif label[3].lower() == "frequency":
                if len(label) > 4 and label[4].lower() in ["percent", "percents", "percentage", "percentages", "%", "count", "counts", "decimal", "decimals"]:
                    biomarker_label["units"] = label[4]
                else:
                    print("ERROR: Invalid type for count label -> skipping")
                    continue
            info["biomarker_cols"].append(biomarker_label)
        else:
            group_label = {"column": label[0], "label": label[1].strip().lower(), "units": [], "link": None, "name": None}
            if label[1].lower() == "dose":
                for i in range(2,len(label)):
                    if label[i].strip()[:5].lower() == "name:":
                        group_label["name"] = label[i].strip()[5:].strip()
                    elif label[i].strip()[:6].lower() == "units:":
                        group_label["units"].append(label[i].strip()[6:].strip())
                    elif label[i].strip()[:5].lower() == "link:":
                        group_label["link"] = label[i].strip()[5:].strip()
            elif label[1].lower() == "animal":
                if len(label) > 2:
                    group_label["name"] = label[2]
            info["group_cols"].append(group_label)

['Category Header', 'group']
group
['Group', 'group']
group
['Animal number.Beginning', 'size']
size
['Animal number.End', 'size']
size
['Animal number.Mortality rate (%)', 'biomarker', 'Mortality rate', 'frequency', 'percent']
biomarker
['Body weight +- SD.Beginning.mean', 'biomarker', 'Body weight', 'mean', 'g']
biomarker
['Body weight +- SD.Beginning.SD', 'biomarker', 'Body weight', 'variation', 'g']
biomarker
['Body weight +- SD.End.mean', 'biomarker', 'Body weight', 'mean', 'g']
biomarker
['Body weight +- SD.End.SD', 'biomarker', 'Body weight', 'variation', 'g']
biomarker
['Tumour weight (g).mean', 'biomarker', 'Tumour weight', 'mean', 'g']
biomarker
['Tumour weight (g).SD', 'biomarker', 'Tumour weight', 'variation', 'g']
biomarker
['Inhibition(%)', 'biomarker', 'Inhibition rate', 'mean', 'percent']
biomarker
['Liver Function Tests. Groups', 'group']
group
['Liver Function Tests.Total Bilirubin [mg/dl].mean', 'biomarker', 'Total Bilirubin', 'mean', 'mg/dl']
biomarker
['Liver Funct

In [23]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    print(info["col_labels"])
    print(f"GROUP COLUMNS:\n{info["group_cols"]}")
    print(f"BIOMARKER COLUMNS:\n{info["biomarker_cols"]}")

[['Category Header', 'group'], ['Group', 'group'], ['Animal number.Beginning', 'size'], ['Animal number.End', 'size'], ['Animal number.Mortality rate (%)', 'biomarker', 'Mortality rate', 'frequency', 'percent'], ['Body weight +- SD.Beginning.mean', 'biomarker', 'Body weight', 'mean', 'g'], ['Body weight +- SD.Beginning.SD', 'biomarker', 'Body weight', 'variation', 'g'], ['Body weight +- SD.End.mean', 'biomarker', 'Body weight', 'mean', 'g'], ['Body weight +- SD.End.SD', 'biomarker', 'Body weight', 'variation', 'g'], ['Tumour weight (g).mean', 'biomarker', 'Tumour weight', 'mean', 'g'], ['Tumour weight (g).SD', 'biomarker', 'Tumour weight', 'variation', 'g'], ['Inhibition(%)', 'biomarker', 'Inhibition rate', 'mean', 'percent']]
GROUP COLUMNS:
[{'column': 'Category Header', 'label': 'group', 'units': [], 'link': None, 'name': None}, {'column': 'Group', 'label': 'group', 'units': [], 'link': None, 'name': None}, {'column': 'Animal number.Beginning', 'label': 'size', 'units': [], 'link': N

Deriving Groups via Tables

In [24]:
from rapidfuzz import fuzz, process

for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    table_groups = []
    group_names = []
    info["df"].reset_index(drop=True, inplace=True)
    for idx, row in info["df"].iterrows():
        group_row = {"group": None, "animal_model": None, "sample_size": None,"treatment1": None, "dose1": None, "units1": None,"treatment2": None, "dose2": None, "units2": None}
        group_name = ""
        for label in info["group_cols"]:
            print(label)
            if label["column"] not in info["df"].columns:
                max_ratio = 0
                max_col = info["df"].columns[0]
                for df_col in info["df"].columns:
                    ratio = fuzz.ratio(label["column"], df_col)
                    if ratio > max_ratio:
                        max_ratio = ratio
                        max_col = df_col
                label["column"] = max_col
            col_val = info["df"].loc[idx, label["column"]]
            if label["label"] == "group":
                group_name += f".{col_val}"
            elif label["label"] == "dose":
                if not group_row["dose1"]:
                    group_row["dose1"] = col_val
                    if label["name"]:
                        group_row["treatment1"] = label["name"]
                    elif label["link"]:
                        group_row["treatment1"] = info["df"].loc[idx, label["link"]]
                elif not group_row["dose2"]:
                    group_row["dose2"] = col_val
                    if label["name"]:
                        group_row["treatment2"] = label["name"]
                    elif label["link"]:
                        group_row["treatment2"] = info["df"].loc[idx, label["link"]]
                else:
                    print("ERROR: >2 treatments")
            elif label["label"] == "size":
                group_row["sample_size"] = col_val
            elif label["label"] == "animal":
                group_row["animal_model"] = col_val
        table_groups.append(group_row)
        group_name = group_name.lstrip(".") if group_name else f"Group {idx}"
        group_names.append(group_name.lower() if group_name else group_name)
        group_row["group"] = group_name
        print(group_names)
    info["df"].index = [name.lower().strip() for name in group_names]
    info["table_groups"] = pd.DataFrame(data=table_groups, index=group_names)
    print(tid)
    print(info["table_groups"])

# group_label = {"column": label[0], "label": label[1].strip().lower(), "type": None, "units": [], "link": None, "name": None}

{'column': 'Category Header', 'label': 'group', 'units': [], 'link': None, 'name': None}
{'column': 'Group', 'label': 'group', 'units': [], 'link': None, 'name': None}
{'column': 'Animal number.Beginning', 'label': 'size', 'units': [], 'link': None, 'name': None}
{'column': 'Animal number.End', 'label': 'size', 'units': [], 'link': None, 'name': None}
['none.cc']
{'column': 'Category Header', 'label': 'group', 'units': [], 'link': None, 'name': None}
{'column': 'Group', 'label': 'group', 'units': [], 'link': None, 'name': None}
{'column': 'Animal number.Beginning', 'label': 'size', 'units': [], 'link': None, 'name': None}
{'column': 'Animal number.End', 'label': 'size', 'units': [], 'link': None, 'name': None}
['none.cc', 'none.ct']
{'column': 'Category Header', 'label': 'group', 'units': [], 'link': None, 'name': None}
{'column': 'Group', 'label': 'group', 'units': [], 'link': None, 'name': None}
{'column': 'Animal number.Beginning', 'label': 'size', 'units': [], 'link': None, 'name':

In [25]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    print(tid)
    print(info["markdown"])

Tab2
| Category Header | Group | Animal number.Beginning | Animal number.End | Animal number.Mortality rate (%) | Body weight +- SD.Beginning.mean | Body weight +- SD.Beginning.SD | Body weight +- SD.End.mean | Body weight +- SD.End.SD | Tumour weight (g).mean | Tumour weight (g).SD | Inhibition(%) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| None | CC | 6 | 6 | 0 | 16.66 | 0.70 | 21.26 | 1.24 * | 1.45 | 0.43 | - |
| None | CT | 6 | 6 | 0 | 16.48 | 0.09 | 20.88 | 0.79 * | 1.21 | 0.41 | 16.55 |
| CNDV | 8HA NDV | 6 | 6 | 0 | 16.77 | 0.11 | 19.49 | 0.30 * | NG | nan | 100 |
| CNDV | 16HA NDV | 6 | 6 | 0 | 16.64 | 0.32 | 19.30 | 0.06 * | NG | nan | 100 |
| CNDV | 32HA NDV | 6 | 6 | 0 | 16.86 | 0.10 | 19.45 | 0.10 * | NG | nan | 100 |
| CNDV | 64HA NDV | 6 | 6 | 0 | 16.54 | 0.31 | 19.66 | 0.13 * | NG | nan | 100 |
| CNDV + T | 8HA NDV | 6 | 6 | 0 | 16.93 | 0.41 | 19.33 | 0.16 * | NG | nan | 100 |
| CNDV + T | 16HA NDV | 6 | 6 | 0 | 16.86 | 0.14 | 19.50 | 0.

Linking table and text treatment groups

In [26]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    print(info["table_groups"])
    print(text_groups_df.index)
    utils.link_groups(PMC, tid, methods, info["table_groups"], text_groups_df)
    print(table_utils.dataframe_to_markdown(info["table_groups"]))

                               group animal_model sample_size treatment1  \
none.cc                      None.CC         None           6       None   
none.ct                      None.CT         None           6       None   
cndv.8ha ndv            CNDV.8HA NDV         None           6       None   
cndv.16ha ndv          CNDV.16HA NDV         None           6       None   
cndv.32ha ndv          CNDV.32HA NDV         None           6       None   
cndv.64ha ndv          CNDV.64HA NDV         None           6       None   
cndv + t.8ha ndv    CNDV + T.8HA NDV         None           6       None   
cndv + t.16ha ndv  CNDV + T.16HA NDV         None           6       None   
cndv + t.32ha ndv  CNDV + T.32HA NDV         None           6       None   
cndv + t.64ha ndv  CNDV + T.64HA NDV         None           6       None   

                  dose1 units1 treatment2 dose2 units2  
none.cc            None   None       None  None   None  
none.ct            None   None       None  None  

Process Biomarkers

In [27]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    mean_columns = defaultdict(lambda: {"mean_col": None, "variation_col": None, "units": None})
    freq_columns = defaultdict(lambda: {"freq_col": None, "severity_col": None, "units": None})
    for label in info["biomarker_cols"]:
        if label["type"] == "mean":
            mean_columns[label["name"]]["mean_col"] = label["column"]
            if label["units"]: mean_columns[label["name"]]["units"] = label["units"]
        elif label["type"] == "variation":
            mean_columns[label["name"]]["variation_col"] = label["column"]
            if label["units"]: mean_columns[label["name"]]["units"] = label["units"]
        elif label["type"] == "frequency":
            freq_columns[label["name"]]["freq_col"] = label["column"]
            if label["units"]: freq_columns[label["name"]]["units"] = label["units"]
        elif label["type"] == "severity":
            freq_columns[label["name"]]["severity_col"] = label["column"]
    info["mean_cols"] = mean_columns
    info["freq_cols"] = freq_columns

In [28]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    table_df = table_utils.markdown_to_dataframe(info["markdown"])
    table_df.index = info["table_groups"].index
    output_mean = []
    output_freq = []
    for row_index in table_df.index:
        for col_index, col_params in info["mean_cols"].items():
            print(f"row_index = {row_index}, col_params = {col_params}")
            if not table_df.at[row_index.lower(), col_params["mean_col"]]: continue
            output_mean.append({
                "Animal Model": info["table_groups"].at[row_index, "animal_model"],
                "Sample Size": info["table_groups"].at[row_index, "sample_size"],
                "Treatment 1": info["table_groups"].at[row_index, "treatment1"],
                "Dose 1": info["table_groups"].at[row_index, "dose1"],
                "Units 1": info["table_groups"].at[row_index, "units1"],
                "Treatment 2": info["table_groups"].at[row_index, "treatment2"],
                "Dose 2": info["table_groups"].at[row_index, "dose2"],
                "Units 2": info["table_groups"].at[row_index, "units2"],
                "Biomarker": col_index,
                "Value": table_df.at[row_index, col_params["mean_col"]],
                "Units": col_params["units"],
                "Variation": table_df.at[row_index, col_params["variation_col"]] if col_params["variation_col"] else None,
            })
        for col_index, col_params in info["freq_cols"].items():
            print(table_df)
            if not table_df.at[row_index, col_params["freq_col"]] and not table_df.at[row_index, col_params["severity_col"]]: continue
            if table_df.at[row_index, col_params["freq_col"]]:
                if col_params["units"] in ["percent", "percents", "percentage", "percentages", "%"]:
                    frequency = float(table_df.at[row_index, col_params["freq_col"]])/float(info["table_groups"].at[row_index, "sample_size"]) * 100
                elif col_params["units"] in ["count", "counts"]:
                    frequency = float(table_df.at[row_index, col_params["freq_col"]])/float(info["table_groups"].at[row_index, "sample_size"])
                else:
                    frequency = table_df.at[row_index, col_params["freq_col"]]
            output_freq.append({
                "Animal Model": info["table_groups"].at[row_index, "animal_model"],
                "Sample Size": info["table_groups"].at[row_index, "sample_size"],
                "Treatment 1": info["table_groups"].at[row_index, "treatment1"],
                "Dose 1": info["table_groups"].at[row_index, "dose1"],
                "Units 1": info["table_groups"].at[row_index, "units1"],
                "Treatment 2": info["table_groups"].at[row_index, "treatment2"],
                "Dose 2": info["table_groups"].at[row_index, "dose2"],
                "Units 2": info["table_groups"].at[row_index, "units2"],
                "Biomarker": col_index,
                "Frequency": frequency,
                "Severity": table_df.at[row_index, col_params["severity_col"]] if col_params["severity_col"] else None,
            })
    info["output_mean"] = pd.DataFrame(output_mean)
    info["output_freq"] = pd.DataFrame(output_freq)

row_index = none.cc, col_params = {'mean_col': 'Body weight +- SD.End.mean', 'variation_col': 'Body weight +- SD.End.SD', 'units': 'g'}
row_index = none.cc, col_params = {'mean_col': 'Tumour weight (g).mean', 'variation_col': 'Tumour weight (g).SD', 'units': 'g'}
row_index = none.cc, col_params = {'mean_col': 'Inhibition(%)', 'variation_col': None, 'units': 'percent'}
                  Category Header     Group Animal number.Beginning  \
none.cc                      None        CC                       6   
none.ct                      None        CT                       6   
cndv.8ha ndv                 CNDV   8HA NDV                       6   
cndv.16ha ndv                CNDV  16HA NDV                       6   
cndv.32ha ndv                CNDV  32HA NDV                       6   
cndv.64ha ndv                CNDV  64HA NDV                       6   
cndv + t.8ha ndv         CNDV + T   8HA NDV                       6   
cndv + t.16ha ndv        CNDV + T  16HA NDV                  

In [39]:
from openpyxl import load_workbook
all_mean_data = []
all_freq_data = []
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    all_mean_data.append(info["output_mean"])
    all_freq_data.append(info["output_freq"])
mean_df = pd.concat(all_mean_data, axis=0, ignore_index=True)
freq_df = pd.concat(all_freq_data, axis=0, ignore_index=True)
output_file_path = "outputs/benchmark_combined.xlsx"
if not os.path.exists(output_file_path):
    pd.DataFrame().to_excel(output_file_path, index=False)
with pd.ExcelWriter(output_file_path, engine='openpyxl', mode="a", if_sheet_exists="replace") as writer:
    mean_df.to_excel(writer, sheet_name=f"{PMC}_mean", index=False)
    freq_df.to_excel(writer, sheet_name=f"{PMC}_freq", index=False)