In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

df = pd.read_csv('lit-pcba_all_data.csv')
print(df['receptor'].unique())

['ADRB2' 'ALDH1' 'ESR1_ago' 'ESR1_ant' 'FEN1' 'GBA' 'IDH1' 'KAT2A' 'MAPK1'
 'MTORC1' 'OPRK1' 'PKM2' 'PPARG' 'TP53' 'VDR']


  df = pd.read_csv('lit-pcba_all_data.csv')


In [18]:
import numpy as np
from rdkit.Chem import rdFingerprintGenerator

fp_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=4096)

unique_cansmiles = df['canonical_smiles'].dropna().unique()
fp_cache = {}

def get_fp(cansmiles):
    if pd.isnull(cansmiles):
        return None
    if cansmiles in fp_cache:
        return fp_cache[cansmiles]
    mol = Chem.MolFromSmiles(cansmiles)
    if mol is None:
        fp_cache[cansmiles] = None
        return None
    fp = fp_generator.GetFingerprint(mol)
    fp_cache[cansmiles] = fp
    return fp

for smi in unique_cansmiles:
    get_fp(smi)

df['fp'] = df['canonical_smiles'].apply(lambda smi: fp_cache.get(smi, None) if pd.notnull(smi) else None)

In [None]:
from rdkit import DataStructs

receptors = [
    'ADRB2', 'ALDH1', 'ESR1_ago', 'ESR1_ant', 'FEN1', 'GBA', 'IDH1', 'KAT2A', 'MAPK1',
    'MTORC1', 'OPRK1', 'PKM2', 'PPARG', 'TP53', 'VDR'
]

fp_col = 'fp'
strategy = "avg_of_maxpools"
all_results = {}

print("Running EF1% calculation for fingerprint, strategy: avg_of_maxpools\n", flush=True)

for receptor in receptors:
    # Select data for this receptor and type
    queries = df[(df['receptor'] == receptor) & (df['type'] == 'query')]
    active_T = df[(df['receptor'] == receptor) & (df['type'] == 'active_T')]
    inactive_T = df[(df['receptor'] == receptor) & (df['type'] == 'inactive_T')]
    active_V = df[(df['receptor'] == receptor) & (df['type'] == 'active_V')]
    inactive_V = df[(df['receptor'] == receptor) & (df['type'] == 'inactive_V')]

    # Get valid fingerprints
    query_fps = [fp for fp in queries[fp_col] if fp is not None]
    active_T_fps = [fp for fp in active_T[fp_col] if fp is not None]

    # For fast lookup: sets of canonical_smiles for active_T and inactive_T
    active_T_smiles_set = set(active_T['canonical_smiles'])
    inactive_T_smiles_set = set(inactive_T['canonical_smiles'])

    # Prepare validation entries
    val_entries = []
    for fp, cansmiles in zip(active_V[fp_col], active_V['canonical_smiles']):
        val_entries.append({'fp': fp, 'label': 'active', 'canonical_smiles': cansmiles})
    for fp, cansmiles in zip(inactive_V[fp_col], inactive_V['canonical_smiles']):
        val_entries.append({'fp': fp, 'label': 'inactive', 'canonical_smiles': cansmiles})

    val_fps = [entry['fp'] for entry in val_entries]
    val_smiles = [entry['canonical_smiles'] for entry in val_entries]

    if not query_fps or not active_T_fps:
        all_results[receptor] = []
        print(f"{receptor:10s}: No valid queries or actives, skipping.", flush=True)
        continue

    # Precompute Tanimoto similarities
    sims_val_vs_query = np.array([
        DataStructs.BulkTanimotoSimilarity(vfp, query_fps) if vfp is not None else [0.0] * len(query_fps)
        for vfp in val_fps
    ]) if query_fps else np.zeros((len(val_fps), 0))

    sims_val_vs_activeT = np.array([
        DataStructs.BulkTanimotoSimilarity(vfp, active_T_fps) if vfp is not None else [0.0] * len(active_T_fps)
        for vfp in val_fps
    ]) if active_T_fps else np.zeros((len(val_fps), 0))

    sim_results = []

    for idx, entry in enumerate(val_entries):
        fp = entry['fp']
        label = entry['label']
        cansmiles = entry['canonical_smiles']

        # If this molecule is in inactive_T, force to bottom (score = -1.0)
        if cansmiles in inactive_T_smiles_set:
            sim_results.append({'label': label, 'score': -1.0})
            continue

        # If this molecule is in active_T, force to top (score = 2.0)
        if cansmiles in active_T_smiles_set:
            sim_results.append({'label': label, 'score': 2.0})
            continue

        if fp is None:
            sim_results.append({'label': label, 'score': 0.0})
            continue

        # avg_of_maxpools: average of max-pool to queries and max-pool to active_T (Tanimoto)
        max_q = np.max(sims_val_vs_query[idx]) if sims_val_vs_query.shape[1] > 0 else 0.0
        max_t = np.max(sims_val_vs_activeT[idx]) if sims_val_vs_activeT.shape[1] > 0 else 0.0
        avg_max = (max_q + max_t) / 2.0
        sim_results.append({'label': label, 'score': avg_max})

    # Compute EF1%
    results_sorted = sorted(sim_results, key=lambda x: x['score'], reverse=True)
    N_total = len(results_sorted)
    N_actives = sum(1 for r in results_sorted if r['label'] == 'active')
    top_1pct = max(1, round(0.01 * N_total))
    n_actives_top1pct = sum(1 for r in results_sorted[:top_1pct] if r['label'] == 'active')
    expected_random = N_actives * (top_1pct / N_total) if N_total > 0 else 0
    ef1 = (n_actives_top1pct / expected_random) if expected_random > 0 else 0
    all_results[receptor] = [ef1]

    print(f"{receptor:10s} | avg_of_maxpools EF1%: {ef1:.2f}", flush=True)

ef1s_all = []
for ef1s in all_results.values():
    ef1s_all.extend(ef1s)

mean_ef1 = np.mean(ef1s_all)
median_ef1 = np.median(ef1s_all)
print(f"\n--- Summary for {fp_col} (avg_of_maxpools) ---", flush=True)
print(f"  Avg EF1%: {mean_ef1:.2f} | Median: {median_ef1:.2f} | n={len(ef1s_all)}", flush=True)

Running EF1% calculation for fingerprint, strategy: avg_of_maxpools

ADRB2      | avg_of_maxpools EF1%: 0.00
ALDH1      | avg_of_maxpools EF1%: 4.25
ESR1_ago   | avg_of_maxpools EF1%: 0.00
ESR1_ant   | avg_of_maxpools EF1%: 4.09
FEN1       | avg_of_maxpools EF1%: 4.40
GBA        | avg_of_maxpools EF1%: 17.08
IDH1       | avg_of_maxpools EF1%: 11.11
KAT2A      | avg_of_maxpools EF1%: 4.17
MAPK1      | avg_of_maxpools EF1%: 6.50
MTORC1     | avg_of_maxpools EF1%: 8.30
OPRK1      | avg_of_maxpools EF1%: 0.00
PKM2       | avg_of_maxpools EF1%: 2.94
PPARG      | avg_of_maxpools EF1%: 16.06
TP53       | avg_of_maxpools EF1%: 0.00
VDR        | avg_of_maxpools EF1%: 7.27

--- Summary for fp (avg_of_maxpools) ---
  Avg EF1%: 5.75 | Median: 4.25 | n=15
