# BioAssay Screening: Enough Actives

In [1]:
import requests

def get_assay_outcome_counts(aid, count_basis="CID"):
    """
    Retrieve Active/Inactive/Inconclusive/Total counts for a PubChem BioAssay AID.

    Parameters
    ----------
    aid : int
        PubChem Assay ID.
    count_basis : str
        "CID" (default) counts unique compounds (recommended for cheminformatics/ML),
        or "SID" counts submitted substances. Matching is case-insensitive.

    Returns
    -------
    dict
        Always contains "Basis", "Total", "Active", "Inactive" and
        "Inconclusive". When the summary exposes the ``<basis>CountAll``
        schema, "Unspecified" and "Probe" are included as well. Counts that
        are absent from the summary are reported as 0.

    Raises
    ------
    ValueError
        If ``count_basis`` is neither "CID" nor "SID". This is checked before
        any network request is made.
    requests.HTTPError
        Propagated from ``raise_for_status()`` when PubChem answers with an
        error status.
    """
    # Validate first so a typo in count_basis fails fast, without a network call.
    basis = count_basis.upper()
    if basis not in {"CID", "SID"}:
        raise ValueError("count_basis must be 'CID' or 'SID'")

    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/summary/JSON"
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    summary = r.json()["AssaySummaries"]["AssaySummary"][0]

    # Preferred schema: keys look like "CIDCountActive" / "SIDCountActive".
    if f"{basis}CountAll" in summary:
        suffix_by_label = {
            "Total": "All",
            "Active": "Active",
            "Inactive": "Inactive",
            "Inconclusive": "Inconclusive",
            "Unspecified": "Unspecified",
            "Probe": "Probe",
        }
        counts = {"Basis": basis}
        for label, suffix in suffix_by_label.items():
            counts[label] = summary.get(f"{basis}Count{suffix}", 0)
        return counts

    # Fallback schema (less common, but we handle it).
    # Try a couple of plausible alternatives if PubChem changes formatting.
    # The requested basis is tried before the other one, so an SID request is
    # not answered with a CID total when both keys are present.
    other_basis = "SID" if basis == "CID" else "CID"
    alt_map = {
        "Total": ["TotalCount", f"{basis}Count", f"{other_basis}Count"],
        "Active": ["ActiveCount"],
        "Inactive": ["InactiveCount"],
        "Inconclusive": ["InconclusiveCount"],
    }

    out = {"Basis": basis}
    for k, candidates in alt_map.items():
        # First candidate key present in the summary wins; default to 0.
        out[k] = next((summary[c] for c in candidates if c in summary), 0)

    return out


In [2]:
# Fetch the same assay on both count bases so the two tallies can be compared.
counts_cid, counts_sid = (
    get_assay_outcome_counts(743139, basis) for basis in ("CID", "SID")
)

print("CID counts:", counts_cid)
print("SID counts:", counts_sid)


CID counts: {'Basis': 'CID', 'Total': 8099, 'Active': 326, 'Inactive': 6114, 'Inconclusive': 2092, 'Unspecified': 0, 'Probe': 0}
SID counts: {'Basis': 'SID', 'Total': 10486, 'Active': 379, 'Inactive': 7562, 'Inconclusive': 2545, 'Unspecified': 0, 'Probe': 0}


In [3]:
import requests

def get_assay_summary(aid):
    """
    Fetch the PubChem summary for one BioAssay AID and flatten it to a record.

    The returned dict holds the AID, the assay name ("" when absent), and the
    Total/Active/Inactive/Inconclusive counts on both the CID and SID basis
    (0 for any count missing from the summary). Keys are emitted in a fixed
    order: AID, AssayName, the four CID_* counts, then the four SID_* counts.
    """
    endpoint = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/summary/JSON"
    response = requests.get(endpoint, timeout=30)
    response.raise_for_status()

    # Only the first summary entry of the response is used.
    record = response.json()["AssaySummaries"]["AssaySummary"][0]

    # (output label, suffix used in the PubChem key) — order fixes column order.
    outcome_fields = (
        ("Total", "All"),
        ("Active", "Active"),
        ("Inactive", "Inactive"),
        ("Inconclusive", "Inconclusive"),
    )

    flattened = {"AID": aid, "AssayName": record.get("Name", "")}
    # CID counts first (preferred for ML), SID counts after (for completeness).
    for basis in ("CID", "SID"):
        for label, key_suffix in outcome_fields:
            flattened[f"{basis}_{label}"] = record.get(f"{basis}Count{key_suffix}", 0)

    return flattened


In [4]:
import pandas as pd

# NOTE(review): the trailing labels are the *intended* targets, but the output
# recorded for this cell shows they do not all match what PubChem returns.
# For example, AID 720551 comes back as
# "qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS", not an AR agonist assay,
# and several agonist/antagonist labels disagree with the returned names.
# Verify each AID against its AssayName before relying on these labels.
candidate_aids = [
    743040,  # ERα agonist
    743077,  # ERα antagonist
    720552,  # AR antagonist
    720551,  # AR agonist
    743122,  # AhR activation
    743065,  # PPARγ
    743067,  # PPARδ
    743053,  # NRF2 antioxidant response
]

rows = []
for aid in candidate_aids:
    try:
        rows.append(get_assay_summary(aid))
    except Exception as e:
        # Deliberate best-effort: one failing AID must not abort the survey.
        # The exception class is recorded too, because str() of e.g. a
        # KeyError is only the bare key and is unreadable on its own.
        # Such rows have no count columns, so they show up as NaN in the table.
        rows.append({
            "AID": aid,
            "AssayName": "ERROR",
            "Error": f"{type(e).__name__}: {e}",
        })

# Build the frame once from the list of records, then display it.
df = pd.DataFrame(rows)
df


Unnamed: 0,AID,AssayName,CID_Total,CID_Active,CID_Inactive,CID_Inconclusive,SID_Total,SID_Active,SID_Inactive,SID_Inconclusive
0,743040,qHTS assay to identify small molecule agonists...,8099,341,7706,201,10486,461,9812,213
1,743077,qHTS assay to identify small molecule agonists...,8099,438,7303,736,10486,589,9089,808
2,720552,qHTS assay for small molecule agonists of the ...,8099,527,6960,816,10488,659,8888,941
3,720551,qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS,343666,1267,341674,730,343909,1267,341912,730
4,743122,qHTS assay to identify small molecule that act...,8099,875,6397,1357,10486,1063,7945,1478
5,743065,qHTS assay to identify small molecule antagoni...,8099,1536,5937,912,10486,1880,7568,1038
6,743067,qHTS assay to identify small molecule antagoni...,8099,378,5631,2491,10486,426,7088,2972
7,743053,qHTS assay to identify small molecule agonists...,8099,260,7168,918,10486,372,9070,1044


In [5]:
# Express active and inconclusive counts as a share of all tested compounds.
cid_total = df["CID_Total"]
df["CID_ActiveFraction"] = df["CID_Active"].div(cid_total)
df["CID_InconclusiveFraction"] = df["CID_Inconclusive"].div(cid_total)

# Rank assays from the highest to the lowest active fraction.
ranking_columns = [
    "AID",
    "AssayName",
    "CID_Total",
    "CID_Active",
    "CID_ActiveFraction",
    "CID_InconclusiveFraction",
]
df[ranking_columns].sort_values(by="CID_ActiveFraction", ascending=False)


Unnamed: 0,AID,AssayName,CID_Total,CID_Active,CID_ActiveFraction,CID_InconclusiveFraction
5,743065,qHTS assay to identify small molecule antagoni...,8099,1536,0.189653,0.112606
4,743122,qHTS assay to identify small molecule that act...,8099,875,0.108038,0.167552
2,720552,qHTS assay for small molecule agonists of the ...,8099,527,0.06507,0.100753
1,743077,qHTS assay to identify small molecule agonists...,8099,438,0.054081,0.090875
6,743067,qHTS assay to identify small molecule antagoni...,8099,378,0.046672,0.307569
0,743040,qHTS assay to identify small molecule agonists...,8099,341,0.042104,0.024818
7,743053,qHTS assay to identify small molecule agonists...,8099,260,0.032103,0.113347
3,720551,qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS,343666,1267,0.003687,0.002124


In [6]:
def suitability(row, min_total=5000, min_actives=200,
                imbalanced_below=0.01, intermediate_below=0.05):
    """
    Classify one assay row by how suitable it is as a teaching/ML dataset.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or dict)
        Must provide "CID_Total" and, when the earlier checks pass,
        "CID_Active" and "CID_ActiveFraction".
    min_total : int
        Assays with fewer tested compounds than this are "Too Small".
    min_actives : int
        Assays with fewer actives than this are "Too Few Actives".
    imbalanced_below : float
        Active fractions below this are "Advanced / Highly Imbalanced".
    intermediate_below : float
        Active fractions below this (but not below ``imbalanced_below``)
        are "Intermediate".

    Returns
    -------
    str
        One of "No Data", "Too Small", "Too Few Actives",
        "Advanced / Highly Imbalanced", "Intermediate" or "Intro-Friendly".
        Checks run in that order and the first match wins.
    """
    # A missing value makes every "<" comparison below False, which would let
    # a row with no data fall through to "Intro-Friendly". Each field is
    # therefore checked for missingness right before it is compared.
    total = row["CID_Total"]
    if pd.isna(total):
        return "No Data"
    if total < min_total:
        return "Too Small"

    actives = row["CID_Active"]
    if pd.isna(actives):
        return "No Data"
    if actives < min_actives:
        return "Too Few Actives"

    fraction = row["CID_ActiveFraction"]
    if pd.isna(fraction):
        return "No Data"
    if fraction < imbalanced_below:
        return "Advanced / Highly Imbalanced"
    if fraction < intermediate_below:
        return "Intermediate"
    return "Intro-Friendly"

# Label every assay row with its suggested use, then show the annotated table.
df["SuggestedUse"] = df.apply(lambda assay_row: suitability(assay_row), axis="columns")
df

Unnamed: 0,AID,AssayName,CID_Total,CID_Active,CID_Inactive,CID_Inconclusive,SID_Total,SID_Active,SID_Inactive,SID_Inconclusive,CID_ActiveFraction,CID_InconclusiveFraction,SuggestedUse
0,743040,qHTS assay to identify small molecule agonists...,8099,341,7706,201,10486,461,9812,213,0.042104,0.024818,Intermediate
1,743077,qHTS assay to identify small molecule agonists...,8099,438,7303,736,10486,589,9089,808,0.054081,0.090875,Intro-Friendly
2,720552,qHTS assay for small molecule agonists of the ...,8099,527,6960,816,10488,659,8888,941,0.06507,0.100753,Intro-Friendly
3,720551,qHTS for Inhibitors of KCHN2 3.1: Wildtype qHTS,343666,1267,341674,730,343909,1267,341912,730,0.003687,0.002124,Advanced / Highly Imbalanced
4,743122,qHTS assay to identify small molecule that act...,8099,875,6397,1357,10486,1063,7945,1478,0.108038,0.167552,Intro-Friendly
5,743065,qHTS assay to identify small molecule antagoni...,8099,1536,5937,912,10486,1880,7568,1038,0.189653,0.112606,Intro-Friendly
6,743067,qHTS assay to identify small molecule antagoni...,8099,378,5631,2491,10486,426,7088,2972,0.046672,0.307569,Intermediate
7,743053,qHTS assay to identify small molecule agonists...,8099,260,7168,918,10486,372,9070,1044,0.032103,0.113347,Intermediate
