In [10]:
import pandas as pd
df = pd.read_csv('Main_data_final_cols.csv')

In [11]:
import pandas as pd
import IPython.display as disp

# Build coverage table
def show_coverage(df):
    coverage = []
    total_rows = len(df)

    for i, col in enumerate(df.columns):
        non_null = df[col].notna().sum()
        unique_count = df[col].nunique(dropna=True)
        coverage.append({
            "Col #": i,
            "Column": col,
            "Non-null count": non_null,
            "Coverage %": round((non_null / total_rows) * 100, 2),
            "Unique count": unique_count
        })

    coverage_df = pd.DataFrame(coverage)

    # Sort by coverage % descending (optional)
    coverage_df = coverage_df.sort_values("Coverage %", ascending=False)

    # Show as a nice table

    disp.display(coverage_df)

In [12]:
show_coverage(df)

Unnamed: 0,Col #,Column,Non-null count,Coverage %,Unique count
0,0,Entry,256690,100.0,256690
1,1,name,256690,100.0,256690
2,2,sequence,256690,100.0,256678
3,3,function,256690,100.0,52194
4,4,Organism,256690,100.0,7524
5,5,Length,256690,100.0,2305
21,21,GO_MF_json,234925,91.52,24613
19,19,GO_BP_json,222047,86.5,36885
20,20,GO_CC_json,215299,83.88,21760
15,15,Subcellular_location_json,167764,65.36,30831


In [14]:
import pandas as pd, json, re, random, hashlib
from typing import Any, Dict, List, Optional

# ========= CONFIG =========
path = "Main_data_final_cols.csv"  # change to your full dataset
out_path = "evifuncqa.jsonl"

# ========= LOAD =========
df = pd.read_csv(path)

# Prefer canonical Length (ignore lowercase duplicate if both present)
LEN_COL = "Length"

# ========= HELPERS =========
def safe_json_load(x):
    if pd.isna(x): return None
    if isinstance(x, (list, dict)): return x
    if isinstance(x, str):
        s = x.strip()
        if not s: return None
        try: return json.loads(s)
        except Exception: return None
    return None

ECO_RX   = re.compile(r'ECO:\d+')
PMID_RX  = re.compile(r'PubMed:\d+')
RHEA_RX  = re.compile(r'RHEA:\d+')
CHEBI_RX = re.compile(r'CHEBI:\d+')
GO_RX    = re.compile(r'GO:\d{7}')

def sha1_seq(seq: Optional[str]) -> Optional[str]:
    if not isinstance(seq, str) or not seq: return None
    return hashlib.sha1(seq.encode("utf-8")).hexdigest()

def collect_evidence_tokens(obj) -> List[str]:
    tokens = []
    def _walk(v):
        if isinstance(v, dict):
            for vv in v.values(): _walk(vv)
        elif isinstance(v, list):
            for it in v: _walk(it)
        elif isinstance(v, str):
            tokens.extend(ECO_RX.findall(v))
            tokens.extend(PMID_RX.findall(v))
    if obj is not None: _walk(obj)
    # dedupe preserving order
    seen, out = set(), []
    for t in tokens:
        if t not in seen:
            seen.add(t); out.append(t)
    return out

def extract_from_catalysis(cat_json):
    rhea_ids, ec_nums, chebis, evid = set(), set(), set(), set()
    if isinstance(cat_json, list):
        for item in cat_json:
            if not isinstance(item, dict): continue
            rid = item.get("rhea_id")
            if isinstance(rid, str) and RHEA_RX.fullmatch(rid): rhea_ids.add(rid)
            en = item.get("ec_number")
            if isinstance(en, str) and en.strip(): ec_nums.add(en.strip())
            for c in (item.get("chebi_ids") or []):
                if isinstance(c, str) and CHEBI_RX.fullmatch(c): chebis.add(c)
            for ev in (item.get("evidence") or []):
                if isinstance(ev, str):
                    for t in ECO_RX.findall(ev): evid.add(t)
                    for t in PMID_RX.findall(ev): evid.add(t)
    return rhea_ids, ec_nums, chebis, evid

def extract_go_ids(go_json):
    ids = set()
    if isinstance(go_json, list):
        for g in go_json:
            if isinstance(g, dict):
                gid = g.get("go_id")
                if isinstance(gid, str) and GO_RX.fullmatch(gid):
                    ids.add(gid)
    return ids

def extract_pathway_levels(pw_json):
    out = []
    if isinstance(pw_json, list):
        for seg in pw_json:
            if isinstance(seg, dict):
                levels = seg.get("levels")
                if isinstance(levels, list) and all(isinstance(x, str) for x in levels):
                    out.append(levels)
    return out

def extract_chebi_from_cofactor(cof_json):
    out = set()
    if isinstance(cof_json, list):
        for c in cof_json:
            if isinstance(c, dict):
                for t in (c.get("chebi_ids") or []):
                    if isinstance(t, str) and CHEBI_RX.fullmatch(t): out.add(t)
    return out

def aggregate_evidence_codes(evidence_obj: Dict[str, Any]) -> List[str]:
    tokens = []
    for k, v in evidence_obj.items():
        tokens.extend(collect_evidence_tokens(v))
    seen, out = set(), []
    for t in tokens:
        if t not in seen:
            seen.add(t); out.append(t)
    return out

# ========= BUILD EXAMPLES =========
examples = []
for idx, row in df.iterrows():
    entry     = str(row["Entry"]) if "Entry" in df.columns else None
    organism  = str(row["Organism"]) if "Organism" in df.columns and pd.notna(row["Organism"]) else None
    length    = int(row[LEN_COL]) if LEN_COL and pd.notna(row[LEN_COL]) else None
    name      = str(row["name"]) if "name" in df.columns and pd.notna(row["name"]) else None
    sequence  = str(row["sequence"]) if "sequence" in df.columns and pd.notna(row["sequence"]) else None
    function  = str(row["function"]) if "function" in df.columns and pd.notna(row["function"]) else ""

    # Evidence JSON fields
    ev = {}
    for col in [
        "GO_MF_json","GO_BP_json","GO_CC_json",
        "Catalytic_Activity_json","Binding_site_json","Cofactor_json","Active_site_json",
        "DNA_binding_json","Pathway_json","Subcellular_location_json","DomainFT_json",
        "Motif_json","Topological_domain_json"
    ]:
        if col in df.columns and pd.notna(row[col]):
            ev[col] = safe_json_load(row[col])

    # Non-JSON evidence-like fields
    if "EC number" in df.columns and pd.notna(row["EC number"]):
        ev["EC number"] = [t.strip() for t in str(row["EC number"]).split(";") if t.strip()]
    if "UniPathway" in df.columns and pd.notna(row["UniPathway"]):
        ev["UniPathway_raw"] = str(row["UniPathway"])
    if "Reactome" in df.columns and pd.notna(row["Reactome"]):
        ev["Reactome_raw"] = str(row["Reactome"])

    evidence_mask = list(ev.keys())

    # Gold supports (aggregated)
    go_mf_ids = extract_go_ids(ev.get("GO_MF_json"))
    go_bp_ids = extract_go_ids(ev.get("GO_BP_json"))
    go_cc_ids = extract_go_ids(ev.get("GO_CC_json"))
    rhea_ids, ec_from_cat, chebi_catalytic, evid_cat = extract_from_catalysis(ev.get("Catalytic_Activity_json"))
    ec_numbers = set(ev.get("EC number", [])) | ec_from_cat
    cof_chebi  = extract_chebi_from_cofactor(ev.get("Cofactor_json"))
    pathway_lv = extract_pathway_levels(ev.get("Pathway_json"))
    evidence_codes = aggregate_evidence_codes({k:v for k,v in ev.items() if k not in ["EC number","UniPathway_raw","Reactome_raw"]})
    for t in evid_cat:
        if t not in evidence_codes: evidence_codes.append(t)

    targets = {
        "function_canonical": function.strip(),
        "EC_set": sorted(ec_numbers),
        "RHEA_set": sorted(rhea_ids),
        "GO_MF_set": sorted(go_mf_ids),
        "GO_BP_set": sorted(go_bp_ids),
        "GO_CC_set": sorted(go_cc_ids),
        "Pathway_levels_set": [[lvl.lower() for lvl in arr] for arr in pathway_lv]
    }

    qa_obj = {
        "id": f"EVIQ-{entry}-{idx}",
        "provenance": {
            "entry": entry, "name": name, "organism": organism, "length": length,
            "sequence": sequence, "sequence_sha1": sha1_seq(sequence), "source_version": None
        },
        "split": {"name": "train", "fold": None, "identity_ceiling": None, "holdout_taxa": None},
        "question": {
            "prompt_style": "curation",
            "inputs": {"core": {"entry": entry, "organism": organism, "length": length, "sequence": sequence},
                       "evidence": ev},
            "evidence_mask": evidence_mask,
            "question_text": None
        },
        "answer": {
            "function_text": function,
            "gold_supports": {
                "EC_number": sorted(ec_numbers),
                "RHEA_ids": sorted(rhea_ids),
                "GO_MF_ids": sorted(go_mf_ids),
                "GO_BP_ids": sorted(go_bp_ids),
                "GO_CC_ids": sorted(go_cc_ids),
                "Pathway_levels": pathway_lv,
                "Cofactor_chebi": sorted(cof_chebi),
                "Catalytic_chebi": sorted(chebi_catalytic),
                "Evidence_codes": evidence_codes
            },
            "rationale": None
        },
        "targets_for_eval": targets
    }
    examples.append(qa_obj)

# ========= SAVE =========
with open(out_path, "w", encoding="utf-8") as f:
    for obj in examples:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print(f"Saved {len(examples)} examples to {out_path}")

# ========= QUICK SHAPE VALIDATION =========
def shape_check(obj: Dict[str, Any]) -> bool:
    for k in ["id","provenance","question","answer"]:
        if k not in obj: return False
    for k in ["entry","organism","length"]:
        if k not in obj["provenance"]: return False
    if "function_text" not in obj["answer"]: return False
    return True

ok_flags = [shape_check(o) for o in examples]
print("Basic shape valid for all:", all(ok_flags))

# ========= PRINT 5 EXAMPLES FOR MANUAL INSPECTION =========
for i in random.sample(range(len(examples)), 5):
    ex = examples[i]
    print("\nID:", ex["id"])
    print("Provenance:", {k: ex['provenance'][k] for k in ['entry','organism','length','name']})
    print("Evidence keys:", sorted(ex["question"]["evidence_mask"]))
    print("Function (trunc):", ex["answer"]["function_text"][:140].replace("\n"," "))
    print("GO_MF_ids:", ex["answer"]["gold_supports"]["GO_MF_ids"][:5])
    print("EC:", ex["answer"]["gold_supports"]["EC_number"][:5], "RHEA:", ex["answer"]["gold_supports"]["RHEA_ids"][:5])


Saved 256690 examples to evifuncqa.jsonl
Basic shape valid for all: True

ID: EVIQ-Q8STF0-234645
Provenance: {'entry': 'Q8STF0', 'organism': 'Strongylocentrotus intermedius', 'length': 156, 'name': 'CALM_STRIE'}
Evidence keys: ['Binding_site_json', 'DomainFT_json', 'GO_CC_json', 'GO_MF_json']
Function (trunc): Calmodulin mediates the control of a large number of enzymes, ion channels and other proteins by Ca(2+). Among the enzymes to be stimulated 
GO_MF_ids: ['GO:0005509']
EC: [] RHEA: []

ID: EVIQ-Q03UL2-140328
Provenance: {'entry': 'Q03UL2', 'organism': 'Leuconostoc mesenteroides subsp. mesenteroides', 'length': 580, 'name': 'ILVD_LEUMM'}
Evidence keys: ['Active_site_json', 'Binding_site_json', 'Catalytic_Activity_json', 'Cofactor_json', 'EC number', 'GO_BP_json', 'GO_CC_json', 'GO_MF_json', 'Pathway_json', 'UniPathway_raw']
Function (trunc): Functions in the biosynthesis of branched-chain amino acids. Catalyzes the dehydration of (2R,3R)-2,3-dihydroxy-3-methylpentanoate (2,3-dihy
G

In [16]:
import os, json, re, hashlib, random
from typing import Any, Dict, List, Tuple

CANDIDATES = ["evifuncqa.jsonl"]  # <-- change to your full dataset path

GO_RX    = re.compile(r'^GO:\d{7}$')
RHEA_RX  = re.compile(r'^RHEA:\d+$')
EC_RX    = re.compile(r'^\d+(?:\.(?:\d+|-)){3}$')  # tolerant to dashes: 1.1.-.-
CHEBI_RX = re.compile(r'^CHEBI:\d+$')
ECO_RX   = re.compile(r'^ECO:\d+$')
PMID_RX  = re.compile(r'^PubMed:\d+$')

def load_jsonl(path: str) -> List[Dict[str, Any]]:
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def sha1_seq(seq: str) -> str:
    return hashlib.sha1(seq.encode("utf-8")).hexdigest()

def extract_go_from_evidence(ev: Dict[str, Any]) -> Tuple[List[str], List[str], List[str]]:
    def grab(go_arr):
        out = []
        if isinstance(go_arr, list):
            for g in go_arr:
                if isinstance(g, dict) and isinstance(g.get("go_id"), str):
                    out.append(g["go_id"])
        return out
    return (grab(ev.get("GO_MF_json")), grab(ev.get("GO_BP_json")), grab(ev.get("GO_CC_json")))

def extract_ids_from_catalysis(cat_json):
    rhea, ec, chebi, eco, pmid = [], [], [], [], []
    if isinstance(cat_json, list):
        for it in cat_json:
            if not isinstance(it, dict): continue
            if isinstance(it.get("rhea_id"), str): rhea.append(it["rhea_id"])
            if isinstance(it.get("ec_number"), str): ec.append(it["ec_number"])
            for c in (it.get("chebi_ids") or []):
                if isinstance(c, str): chebi.append(c)
            for e in (it.get("evidence") or []):
                if isinstance(e, str):
                    eco.extend(re.findall(r'ECO:\d+', e))
                    pmid.extend(re.findall(r'PubMed:\d+', e))
    return rhea, ec, chebi, eco, pmid

def pass_fail(name: str, ok: bool, detail: str = ""):
    print(f"[{'PASS' if ok else 'FAIL'}] {name}")
    if detail:
        print(detail)

def ok_binding_item(it: Dict[str, Any]) -> bool:
    # Accept either a single position or a (start,end) range (rare but observed)
    if not isinstance(it, dict): return False
    if "position" in it and isinstance(it["position"], int): return True
    if "start" in it and "end" in it and isinstance(it["start"], int) and isinstance(it["end"], int): return True
    return False

def run_tests(path: str, sample_for_schema_checks: int = 2000):
    print(f"\n=== Running tests on: {path} ===")
    data = load_jsonl(path)
    n = len(data)
    print(f"Loaded {n} examples")

    # T1: structure
    miss = [i for i, ex in enumerate(data) if not all(k in ex for k in ("id","provenance","question","answer"))]
    pass_fail("T1: required top-level blocks", len(miss) == 0)

    # T2: provenance core
    miss = [i for i, ex in enumerate(data) if not all(k in ex["provenance"] for k in ("entry","organism","length"))]
    pass_fail("T2: provenance has entry/organism/length", len(miss) == 0)

    # T3: unique ids
    ids = [ex["id"] for ex in data]
    pass_fail("T3: unique example IDs", len(set(ids)) == len(ids))

    # T4: evidence_mask equals keys of evidence
    bad = [i for i, ex in enumerate(data) if set(ex["question"]["evidence_mask"]) != set(ex["question"]["inputs"].get("evidence",{}).keys())]
    pass_fail("T4: evidence_mask matches evidence keys", len(bad) == 0)

    # T5: length vs sequence
    bad = []
    for i, ex in enumerate(data):
        seq = ex["provenance"].get("sequence"); L = ex["provenance"].get("length")
        if isinstance(seq, str) and isinstance(L, int) and L > 0 and len(seq) != L:
            bad.append(i)
    pass_fail("T5: length matches sequence length when both present", len(bad) == 0)

    # T6: sha1 correctness
    bad = []
    for i, ex in enumerate(data):
        seq = ex["provenance"].get("sequence"); sha = ex["provenance"].get("sequence_sha1")
        if isinstance(seq, str) and seq and sha != sha1_seq(seq):
            bad.append(i)
    pass_fail("T6: sha1 correct", len(bad) == 0)

    # T7: patterns for IDs
    bad_go = []; bad_ec = []; bad_rh = []; bad_ch = []
    for i, ex in enumerate(data):
        gs = ex["answer"]["gold_supports"]
        if any(not GO_RX.match(x) for x in gs.get("GO_MF_ids", []) + gs.get("GO_BP_ids", []) + gs.get("GO_CC_ids", [])): bad_go.append(i)
        if any(not EC_RX.match(x) for x in gs.get("EC_number", [])): bad_ec.append(i)
        if any(not RHEA_RX.match(x) for x in gs.get("RHEA_ids", [])): bad_rh.append(i)
        if any(not CHEBI_RX.match(x) for x in gs.get("Cofactor_chebi", []) + gs.get("Catalytic_chebi", [])): bad_ch.append(i)
    pass_fail("T7a: GO IDs valid", len(bad_go) == 0)
    pass_fail("T7b: EC valid pattern", len(bad_ec) == 0)
    pass_fail("T7c: RHEA valid pattern", len(bad_rh) == 0)
    pass_fail("T7d: CHEBI valid pattern", len(bad_ch) == 0)

    # T8: targets_for_eval == gold_supports
    bad = []
    for i, ex in enumerate(data):
        gs, tg = ex["answer"]["gold_supports"], ex.get("targets_for_eval", {})
        if set(gs.get("EC_number", [])) != set(tg.get("EC_set", [])): bad.append(("EC", i))
        if set(gs.get("RHEA_ids", [])) != set(tg.get("RHEA_set", [])): bad.append(("RHEA", i))
        if set(gs.get("GO_MF_ids", [])) != set(tg.get("GO_MF_set", [])): bad.append(("GO_MF", i))
        if set(gs.get("GO_BP_ids", [])) != set(tg.get("GO_BP_set", [])): bad.append(("GO_BP", i))
        if set(gs.get("GO_CC_ids", [])) != set(tg.get("GO_CC_set", [])): bad.append(("GO_CC", i))
        gs_pw = [[s.lower() for s in seg] for seg in gs.get("Pathway_levels", [])]
        if gs_pw != tg.get("Pathway_levels_set", []): bad.append(("Pathway", i))
    pass_fail("T8: targets_for_eval equals gold_supports", len(bad) == 0)

    # T9: GO shown ⊆ GO in gold
    bad = []
    for i, ex in enumerate(data):
        ev = ex["question"]["inputs"]["evidence"]
        mf, bp, cc = extract_go_from_evidence(ev)
        gs = ex["answer"]["gold_supports"]
        if mf and not set(mf).issubset(set(gs.get("GO_MF_ids", []))): bad.append(("MF", i))
        if bp and not set(bp).issubset(set(gs.get("GO_BP_ids", []))): bad.append(("BP", i))
        if cc and not set(cc).issubset(set(gs.get("GO_CC_ids", []))): bad.append(("CC", i))
    pass_fail("T9: GO in evidence ⊆ gold supports", len(bad) == 0)

    # T10: Catalysis IDs shown ⊆ gold
    bad = []
    for i, ex in enumerate(data):
        ev = ex["question"]["inputs"]["evidence"]
        rhea, ec, chebi, eco, pmid = extract_ids_from_catalysis(ev.get("Catalytic_Activity_json"))
        gs = ex["answer"]["gold_supports"]
        if rhea and not set(rhea).issubset(set(gs.get("RHEA_ids", []))): bad.append(("RHEA", i))
        if ec and not set(ec).issubset(set(gs.get("EC_number", []))): bad.append(("EC", i))
        if chebi and not set(chebi).issubset(set(gs.get("Catalytic_chebi", []))): bad.append(("CHEBI", i))
    pass_fail("T10: Catalysis IDs in evidence ⊆ gold supports", len(bad) == 0)

    # T11: Evidence codes well-formed
    bad = []
    for i, ex in enumerate(data):
        codes = ex["answer"]["gold_supports"].get("Evidence_codes", [])
        for c in codes:
            if not (ECO_RX.match(c) or PMID_RX.match(c)):
                bad.append((i, c)); break
    pass_fail("T11: Evidence codes well-formed (ECO or PubMed)", len(bad) == 0)

    # T12: Evidence array item shapes (spot-check)
    probs = []
    indices = random.sample(range(n), min(n, sample_for_schema_checks))
    for i in indices:
        ev = data[i]["question"]["inputs"]["evidence"]
        bs = ev.get("Binding_site_json")
        if isinstance(bs, list):
            for it in bs[:10]:
                if not ok_binding_item(it):
                    probs.append(("BindingSite", i)); break
        ac = ev.get("Active_site_json")
        if isinstance(ac, list):
            for it in ac[:10]:
                if not (isinstance(it, dict) and isinstance(it.get("position"), int)):
                    probs.append(("ActiveSite", i)); break
        dft = ev.get("DomainFT_json")
        if isinstance(dft, list):
            for it in dft[:10]:
                if not (isinstance(it, dict) and isinstance(it.get("start"), int) and isinstance(it.get("end"), int)):
                    probs.append(("DomainFT", i)); break
        mtf = ev.get("Motif_json")
        if isinstance(mtf, list):
            for it in mtf[:10]:
                if not (isinstance(it, dict) and isinstance(it.get("start"), int) and isinstance(it.get("end"), int)):
                    probs.append(("Motif", i)); break
        topo = ev.get("Topological_domain_json")
        if isinstance(topo, list):
            for it in topo[:10]:
                if not (isinstance(it, dict) and isinstance(it.get("start"), int) and isinstance(it.get("end"), int)):
                    probs.append(("Topological", i)); break
        dna = ev.get("DNA_binding_json")
        if isinstance(dna, list):
            for it in dna[:10]:
                if not (isinstance(it, dict) and isinstance(it.get("start"), int) and isinstance(it.get("end"), int)):
                    probs.append(("DNA", i)); break
        sub = ev.get("Subcellular_location_json")
        if isinstance(sub, list):
            for it in sub[:10]:
                if not (isinstance(it, dict) and isinstance(it.get("location"), str)):
                    probs.append(("Subcellular", i)); break
        for key in ("GO_MF_json","GO_BP_json","GO_CC_json"):
            arr = ev.get(key)
            if isinstance(arr, list):
                for it in arr[:10]:
                    if not (isinstance(it, dict) and isinstance(it.get("go_id"), str) and GO_RX.match(it["go_id"])):
                        probs.append((key, i)); break
    pass_fail("T12: Evidence array item shapes OK (spot-check)", len(probs) == 0)

# Run on first available candidate
any_run = False
for cand in CANDIDATES:
    if os.path.exists(cand):
        run_tests(cand)
        any_run = True

if not any_run:
    print("No JSONL dataset found. Update CANDIDATES with your path and rerun.")



=== Running tests on: evifuncqa.jsonl ===
Loaded 256690 examples
[PASS] T1: required top-level blocks
[PASS] T2: provenance has entry/organism/length
[PASS] T3: unique example IDs
[PASS] T4: evidence_mask matches evidence keys
[PASS] T5: length matches sequence length when both present
[PASS] T6: sha1 correct
[PASS] T7a: GO IDs valid
[FAIL] T7b: EC valid pattern
[PASS] T7c: RHEA valid pattern
[PASS] T7d: CHEBI valid pattern
[PASS] T8: targets_for_eval equals gold_supports
[PASS] T9: GO in evidence ⊆ gold supports
[PASS] T10: Catalysis IDs in evidence ⊆ gold supports
[PASS] T11: Evidence codes well-formed (ECO or PubMed)
[PASS] T12: Evidence array item shapes OK (spot-check)


In [17]:
df['EC number'].unique().tolist()

[nan,
 '2.1.3.3',
 '3.4.22.-',
 '6.1.1.2',
 '2.4.2.9',
 '3.1.1.73',
 '3.5.1.2; 6.3.5.3',
 '4.1.1.49',
 '2.3.2.34',
 '5.4.2.7',
 '3.1.21.10',
 '3.1.27.-',
 '3.5.4.2',
 '2.8.1.-',
 '1.-.-.-',
 '2.3.1.157; 2.7.7.23',
 '2.1.1.223',
 '4.1.1.65',
 '2.4.1.117',
 '2.5.1.7',
 '4.2.1.59',
 '3.1.1.29',
 '2.7.7.42; 2.7.7.89',
 '1.1.1.-',
 '1.18.6.1',
 '6.3.2.1',
 '2.4.1.221',
 '1.2.1.70',
 '3.6.5.-',
 '7.6.2.-',
 '5.1.1.7',
 '2.1.2.9',
 '4.3.2.10',
 '1.1.1.100',
 '4.1.3.1',
 '1.1.1.86',
 '3.1.21.2',
 '2.1.1.228',
 '3.1.1.11',
 '2.7.7.6',
 '1.14.14.1; 1.14.14.130',
 '4.2.1.9',
 '3.2.1.-',
 '3.1.26.11',
 '2.1.1.177',
 '7.1.1.-',
 '1.13.11.12',
 '6.3.3.3',
 '4.3.3.7',
 '2.3.1.274',
 '3.6.5.n1',
 '6.3.5.-',
 '3.6.1.9',
 '2.3.1.48',
 '1.13.11.56',
 '3.5.4.19',
 '4.1.99.17',
 '2.4.1.18',
 '3.1.26.4',
 '3.2.2.27',
 '2.4.2.7',
 '3.5.1.96',
 '5.3.1.9',
 '1.1.1.105; 1.1.1.209; 1.1.1.315; 1.1.1.53',
 '3.6.1.23',
 '3.6.5.3',
 '1.11.1.6',
 '6.3.2.-',
 '2.7.1.23',
 '1.17.7.4',
 '7.1.1.2',
 '3.1.1.96',
 '6.3.2.4

In [None]:
!python jsonl_to_csv.py --in evifuncqa.jsonl --out qa_evifunc_tight.csv

In [19]:
qa_df = pd.read_csv("qa_evifunc_tight.csv")

In [20]:
qa_df

Unnamed: 0,id,entry,split,sequence,question,answer,gold_supports_json,coverage_json
0,EVIQ-B7LNJ1-0,B7LNJ1,train,MSESVHTNTSLWSKGMKAVIVAQFLSAFGDNALLFATLALLKAQFY...,Protein from Escherichia fergusonii (397 aa). ...,Catalyzes the facilitated diffusion of 2-acyl-...,"{""EC_number"": [], ""RHEA_ids"": [], ""GO_MF_ids"":...","{""evidence_counts"": {""GO_MF_json"": 1, ""GO_CC_j..."
1,EVIQ-Q6LG09-1,Q6LG09,train,MSFNLRNRNFLKLLDFTGKEIEHLIALAQDLKHAKYAGTEQQKLKG...,Protein from Photobacterium profundum (333 aa)...,Reversibly catalyzes the transfer of the carba...,"{""EC_number"": [""2.1.3.3""], ""RHEA_ids"": [], ""GO...","{""evidence_counts"": {""GO_MF_json"": 2, ""GO_BP_j..."
2,EVIQ-Q01345-2,Q01345,train,MPAFSCAFPGCRRDLLVIVLVVFVGIGLPIEASAPAYQSHGTEGSH...,Protein from Oncorhynchus mykiss (759 aa). Evi...,Involved in pH regulation to eliminate acids g...,"{""EC_number"": [], ""RHEA_ids"": [], ""GO_MF_ids"":...","{""evidence_counts"": {""GO_MF_json"": 2, ""GO_BP_j..."
3,EVIQ-Q54ME1-3,Q54ME1,train,MKLILVLLCLISTLFVVKGGLSPTEQQIIVSYHNKWRSSPIGPTPS...,Protein from Dictyostelium discoideum (448 aa)...,Thiol protease that seems to be involved in th...,"{""EC_number"": [""3.4.22.-""], ""RHEA_ids"": [], ""G...","{""evidence_counts"": {""GO_MF_json"": 2, ""GO_BP_j..."
4,EVIQ-Q8UIE8-4,Q8UIE8,train,MNAFKPLVFSGVQPTGNLHLGNYLGAIRKFVALQEDNDCIYCVVDM...,Protein from Agrobacterium fabrum (354 aa). Ev...,Catalyzes the attachment of tryptophan to tRNA...,"{""EC_number"": [""6.1.1.2""], ""RHEA_ids"": [], ""GO...","{""evidence_counts"": {""GO_MF_json"": 2, ""GO_BP_j..."
...,...,...,...,...,...,...,...,...
256685,EVIQ-Q99373-256685,Q99373,train,MLKLARPFIPPLSRNNAISSGIVLTSRRFQSSFTFLSNQSLLSKNQ...,Protein from Saccharomyces cerevisiae (253 aa)...,"Required for respiratory growth, stability of ...","{""EC_number"": [], ""RHEA_ids"": [], ""GO_MF_ids"":...","{""evidence_counts"": {""GO_BP_json"": 1, ""GO_CC_j..."
256686,EVIQ-P9WQL7-256686,P9WQL7,train,MTALNRAVASARVGTEVIRVRGLTFRYPKAAEPAVRGMEFTVGRGE...,Protein from Mycobacterium tuberculosis (301 a...,Part of the ABC transporter complex Rv2686c/Rv...,"{""EC_number"": [""7.6.2.-""], ""RHEA_ids"": [], ""GO...","{""evidence_counts"": {""GO_MF_json"": 4, ""GO_BP_j..."
256687,EVIQ-Q3AAW3-256687,Q3AAW3,train,MRYEEINLGYEKNLVRYYLTGYDRVAKFYHYNPWNTRSFYDRADYL...,Protein from Carboxydothermus hydrogenoformans...,Involved in bacillithiol (BSH) biosynthesis. M...,"{""EC_number"": [""6.-.-.-""], ""RHEA_ids"": [], ""GO...","{""evidence_counts"": {""GO_MF_json"": 1, ""EC numb..."
256688,EVIQ-Q5BJD2-256688,Q5BJD2,train,MFKRKTAWFSDSVEKEVISFWVSEGGDISSWKTAGYLFSDDASSED...,Protein from Danio rerio (212 aa). Evidence: (...,Meiosis-specific telomere-associated protein i...,"{""EC_number"": [], ""RHEA_ids"": [], ""GO_MF_ids"":...","{""evidence_counts"": {""GO_BP_json"": 3, ""GO_CC_j..."


In [21]:
# Print dataframe info
qa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256690 entries, 0 to 256689
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  256690 non-null  object
 1   entry               256690 non-null  object
 2   split               256690 non-null  object
 3   sequence            256690 non-null  object
 4   question            256690 non-null  object
 5   answer              256690 non-null  object
 6   gold_supports_json  256690 non-null  object
 7   coverage_json       256690 non-null  object
dtypes: object(8)
memory usage: 15.7+ MB


In [22]:
# Save three rows from every column to a text file for inspection
with open("sample_rows_per_column.txt", "w") as f:
    for col in qa_df.columns:
        f.write(f"=== Column: {col} ===\n")
        samples = qa_df[col].dropna().unique()[:3]
        for s in samples:
            f.write(f"{s}\n")
        f.write("\n")

In [25]:
!pip install ete3

Collecting ete3
  Downloading ete3-3.1.3.tar.gz (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: ete3
  Building wheel for ete3 (setup.py) ... [?25ldone
[?25h  Created wheel for ete3: filename=ete3-3.1.3-py3-none-any.whl size=2273900 sha256=d86571cdda161f15d47855854a0d5acc8a95b6e1374cb835056ac0d4396fd8ce
  Stored in directory: /Users/ayo_on/Library/Caches/pip/wheels/4f/18/8d/3800b8b1dc7a8c1954eaa48424f639b2cfc760922cc3cee479
Successfully built ete3
Installing collected packages: ete3
Successfully installed ete3-3.1.3


In [26]:
#!/usr/bin/env python3
import pandas as pd
import subprocess, os, tempfile, random
from ete3 import NCBITaxa

# === Parameters ===
INPUT_CSV  = "qa_evifunc_tight.csv"
OUTPUT_CSV = "qa_evifunc_tight_splits.csv"
IDENTITY_THRESHOLD = 0.3   # <30% identity for remote homology split
VAL_FRACTION = 0.1         # 10% dev
IID_TEST_FRACTION = 0.1    # 10% test-iid

# === Load data ===
df = pd.read_csv(INPUT_CSV)

from ete3 import NCBITaxa
ncbi = NCBITaxa()
ncbi.update_taxonomy_database()

# === Step 1: cluster sequences with MMseqs2 for identity-based splits ===
with tempfile.TemporaryDirectory() as tmp:
    fasta_in = os.path.join(tmp, "input.fasta")
    with open(fasta_in, "w") as f:
        for i, row in df.iterrows():
            f.write(f">{row['entry']}\n{row['sequence']}\n")
    db = os.path.join(tmp, "db")
    clust = os.path.join(tmp, "clust")
    subprocess.run(["mmseqs", "createdb", fasta_in, db], check=True)
    subprocess.run(["mmseqs", "cluster", db, clust, tmp, 
                    "--min-seq-id", str(IDENTITY_THRESHOLD)], check=True)
    subprocess.run(["mmseqs", "createtsv", db, db, clust, os.path.join(tmp, "clust.tsv")], check=True)
    clusters = pd.read_csv(os.path.join(tmp, "clust.tsv"), sep="\t", header=None)

# Map entry -> cluster representative
rep_map = {}
for rep, mem in clusters.values:
    rep_map[mem] = rep

df["cluster_rep"] = df["entry"].map(rep_map).fillna(df["entry"])

# === Step 2: Assign splits ===
splits = {}

# Pick a random 10% of clusters as validation clusters
all_clusters = list(set(df["cluster_rep"]))
random.shuffle(all_clusters)
val_clusters = set(all_clusters[:int(len(all_clusters)*VAL_FRACTION)])
iid_test_clusters = set(all_clusters[int(len(all_clusters)*VAL_FRACTION):
                                     int(len(all_clusters)*(VAL_FRACTION+IID_TEST_FRACTION))])


df["taxid"] = df["organism"].apply(lambda org: ncbi.get_name_translator([org]).get(org, [None])[0])
fungi_taxid = 4751  # example: fungi
fungi_taxa = set(ncbi.get_descendant_taxa(fungi_taxid, collapse_subspecies=True))

for i, row in df.iterrows():
    cl = row["cluster_rep"]
    if row["taxid"] in fungi_taxa:
        split = "test-taxonomy"
    elif cl in val_clusters:
        split = "val"
    elif cl in iid_test_clusters:
        split = "test-iid"
    else:
        split = "train"
    splits[row["id"]] = split

df["split"] = df["id"].map(splits)

# === Step 3: Dark proteins (missing EC/GO) ===
mask_dark = df["gold_supports_json"].str.contains('"EC_number": []') & \
            df["gold_supports_json"].str.contains('"GO_MF_ids": []')
df.loc[mask_dark, "split"] = "dark"

# === Save ===
df.to_csv(OUTPUT_CSV, index=False)
print("Saved with CAFA-style splits:", OUTPUT_CSV)


NCBI database not present yet (first time used?)
Downloading taxdump.tar.gz from NCBI FTP site (via HTTP)...
Done. Parsing...


Loading node names...
2698383 names loaded.
421770 synonyms loaded.
Loading nodes...
2698383 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /Users/ayo_on/.etetoolkit/taxa.sqlite ...
 2698000 generating entries... 

Inserting synonyms:      20000 


Uploading to /Users/ayo_on/.etetoolkit/taxa.sqlite



Inserting taxid merges:  50000  




Inserting taxids:       20000  




Inserting taxids:       2695000 




Local taxdump.tar.gz seems up-to-date


Loading node names...
2698383 names loaded.
421770 synonyms loaded.
Loading nodes...
2698383 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /Users/ayo_on/.etetoolkit/taxa.sqlite ...
 2698000 generating entries... 
Uploading to /Users/ayo_on/.etetoolkit/taxa.sqlite


Inserting synonyms:       5000 




Inserting taxid merges:  30000  




Inserting taxids:        5000  




Inserting taxids:       2695000 




FileNotFoundError: [Errno 2] No such file or directory: 'mmseqs'