#### This notebook demonstrates how the LIT-PCBA benchmark can be exploited by a simple algorithm that takes advantage of data leakage between the query, training, and validation sets.

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

# Input table: 'lit-pcba_all_data.csv' must first be generated by running lit-pcba.ipynb,
# which lives in the same repo (https://github.com/sievestack/lit-pcba-audit).
# NOTE(review): the saved cell output echoes this read_csv line, which suggests pandas
# emitted a warning here (possibly a mixed-dtype DtypeWarning) — confirm, and consider
# passing explicit dtypes if so.
df = pd.read_csv('lit-pcba_all_data.csv')
# Distinct values of the 'receptor' column, sorted; later cells iterate over this list.
receptors = sorted(df['receptor'].unique())
print(receptors)

['ADRB2', 'ALDH1', 'ESR1_ago', 'ESR1_ant', 'FEN1', 'GBA', 'IDH1', 'KAT2A', 'MAPK1', 'MTORC1', 'OPRK1', 'PKM2', 'PPARG', 'TP53', 'VDR']


  df = pd.read_csv('lit-pcba_all_data.csv')


<br>
Calculate Morgan fingerprints for every unique SMILES string (the original SMILES and the canonical SMILES with and without stereochemistry) and add them to `df`.

In [2]:
import numpy as np
from rdkit.Chem import rdFingerprintGenerator

# Morgan fingerprint generator shared by the whole notebook: radius 2, 4096-bit vectors.
fp_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=4096)

# Distinct, non-null strings from each of the three SMILES columns of the dataframe.
unique_smiles = set(df['smiles'].dropna())
unique_cansmiles_stereo = set(df['canonical_smiles_with_stereo'].dropna())
unique_cansmiles_no_stereo = set(df['canonical_smiles_no_stereo'].dropna())
# Every string that will need a fingerprint, regardless of which column it came from.
all_unique_smiles = unique_smiles.union(unique_cansmiles_stereo, unique_cansmiles_no_stereo)

fp_cache = {}
def get_fp(cansmiles):
    """Return the fingerprint for a SMILES string, memoised in ``fp_cache``.

    Null input (as judged by ``pd.isnull``) yields ``None`` and is not cached.
    For any other input the result is stored in ``fp_cache`` under the SMILES
    string: the fingerprint produced by ``fp_generator`` on success, or ``None``
    when parsing or fingerprinting fails (an ``[ERROR]`` message is printed in
    that case). Failures are cached too, so each bad string is reported once.
    """
    if pd.isnull(cansmiles):
        return None
    if cansmiles in fp_cache:
        return fp_cache[cansmiles]

    # Stays None unless every step below succeeds.
    fingerprint = None
    try:
        parsed = Chem.MolFromSmiles(cansmiles)
        if parsed is None:
            print(f"[ERROR] MolFromSmiles failed for: {cansmiles!r}")
        else:
            try:
                fingerprint = fp_generator.GetFingerprint(parsed)
            except Exception as fp_error:
                print(f"[ERROR] Fingerprint generation failed for: {cansmiles!r}")
                print(f"  Exception: {fp_error}")
    except Exception as error:
        # Anything unexpected (e.g. raised while parsing) lands here.
        print(f"[ERROR] Exception in get_fp for: {cansmiles!r}")
        print(f"  Exception: {error}")
        fingerprint = None

    fp_cache[cansmiles] = fingerprint
    return fingerprint

# Populate fp_cache up front: each distinct SMILES string is parsed and fingerprinted
# once here, so the per-row lookups that follow are cache hits.
for smi in all_unique_smiles:
    get_fp(smi)

def fp_lookup(smi):
    """Return the cached fingerprint for ``smi``, computing it via ``get_fp`` on a miss.

    Null input (as judged by ``pd.isnull``) returns ``None``. A cached value of
    ``None`` (a SMILES string that previously failed) is returned as-is.
    """
    if pd.isnull(smi):
        return None
    return fp_cache[smi] if smi in fp_cache else get_fp(smi)

# Attach one fingerprint column per SMILES column (target column, source column).
for fp_column, smiles_column in (
    ('fp_original', 'smiles'),
    ('fp_with_stereo', 'canonical_smiles_with_stereo'),
    ('fp_no_stereo', 'canonical_smiles_no_stereo'),
):
    df[fp_column] = df[smiles_column].apply(fp_lookup)

[01:10:46] Conflicting single bond directions around double bond at index 7.
[01:10:46]   BondStereo set to STEREONONE and single bond directions set to NONE.


<br>
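For each receptor, go through all molecules that we need to rank (active_V and inactive_V) and, for each one, compute its bulk Tanimoto similarities against the query set and against the active_T set, keeping the maximum similarity to each set. The two maxima are combined (the headline score is their average; each maximum on its own and the larger of the two are also reported), and the validation set is ranked from the greatest similarity score to the least. Finally, we calculate EF1% for each ranking, along with ROC AUC and average precision.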

In [3]:
import pandas as pd
from rdkit import DataStructs
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

def ef1_percent(labels, scores, seed=0):
    """Compute the enrichment factor of the top 1% (EF1%) of a ranked list.

    Items are ranked by descending score. EF1% is the number of actives found
    in the top 1% of the ranking divided by the number expected there if the
    ranking were random.

    Ties are broken at random but reproducibly: the items are shuffled with a
    seeded generator and then stably sorted, so only exactly equal scores are
    affected and repeated calls give the same answer.

    Args:
        labels: Sequence of 0/1 activity labels (1 = active).
        scores: Sequence of scores aligned with ``labels``; higher ranks first.
        seed: Seed for the tie-breaking generator. Pass ``None`` for a fresh,
            non-reproducible tie order on every call.

    Returns:
        The EF1% value, or 0 when the input is empty or contains no actives.

    Raises:
        ValueError: If ``labels`` and ``scores`` differ in length.
    """
    labels = np.asarray(labels)
    scores = np.asarray(scores, dtype=float)
    N_total = len(labels)
    if len(scores) != N_total:
        raise ValueError("labels and scores must have the same length")
    if N_total == 0:
        # Nothing to rank; avoids dividing by N_total below.
        return 0
    N_actives = np.sum(labels)
    # At least one item is always inspected, even for lists shorter than 100.
    top_1pct = max(1, round(0.01 * N_total))
    # Shuffle first, then stable-sort: exact ties end up in shuffled order,
    # while distinct scores keep their true relative order.
    shuffled = np.random.default_rng(seed).permutation(N_total)
    order = shuffled[np.argsort(-scores[shuffled], kind='stable')]
    top_labels = labels[order[:top_1pct]]
    n_actives_top1pct = np.sum(top_labels)
    expected_random = N_actives * (top_1pct / N_total)
    ef1 = n_actives_top1pct / expected_random if expected_random > 0 else 0
    return ef1

def perfect_ef1(labels):
    """Compute the maximum possible EF1% for a given set of labels.

    This is the EF1% of an ideal ranking that places every active ahead of
    every inactive; it is used to normalise observed EF1% values.

    Args:
        labels: Sequence of 0/1 activity labels (1 = active).

    Returns:
        The best achievable EF1%, or 0 when the input is empty or contains
        no actives.
    """
    N_total = len(labels)
    if N_total == 0:
        # No molecules at all; avoids dividing by N_total below.
        return 0
    N_actives = np.sum(labels)
    top_1pct = max(1, round(0.01 * N_total))
    # The best possible: all actives at the top, capped by the size of the top 1%.
    n_actives_top1pct = min(N_actives, top_1pct)
    expected_random = N_actives * (top_1pct / N_total)
    ef1 = n_actives_top1pct / expected_random if expected_random > 0 else 0
    return ef1

def roc_auc_score_nansafe(labels, scores):
    """ROC AUC that yields NaN instead of failing.

    Returns ``np.nan`` when fewer than two distinct label values are present,
    or when ``roc_auc_score`` raises; otherwise returns its result.
    """
    y_true = np.array(labels)
    if np.unique(y_true).size < 2:
        return np.nan
    try:
        auc = roc_auc_score(y_true, scores)
    except Exception:
        auc = np.nan
    return auc

def average_precision_score_nansafe(labels, scores):
    """Average precision that yields NaN instead of failing.

    Returns ``np.nan`` when fewer than two distinct label values are present,
    or when ``average_precision_score`` raises; otherwise returns its result.
    """
    y_true = np.array(labels)
    if np.unique(y_true).size < 2:
        return np.nan
    try:
        avg_precision = average_precision_score(y_true, scores)
    except Exception:
        avg_precision = np.nan
    return avg_precision

def _valid_fps(receptor_df, mol_type, fp_col):
    """Return the non-None fingerprints in ``fp_col`` for rows whose 'type' equals ``mol_type``."""
    column = receptor_df.loc[receptor_df['type'] == mol_type, fp_col]
    return [fp for fp in column if fp is not None]


def _maxpool_scores(val_fps, reference_fps):
    """Return, per validation fingerprint, its highest Tanimoto similarity to ``reference_fps``.

    Every score is 0.0 when ``reference_fps`` is empty.
    """
    if not reference_fps:
        return [0.0] * len(val_fps)
    # Only the maximum is kept, so the full similarity lists are never all held at once.
    return [np.max(DataStructs.BulkTanimotoSimilarity(vfp, reference_fps)) for vfp in val_fps]


def run_eval(fp_col):
    """Score each receptor's validation set by max-pooled Tanimoto similarity and tabulate metrics.

    For every receptor in the module-level ``receptors`` list, the validation
    molecules (rows of ``df`` with type 'active_V', labelled 1, and 'inactive_V',
    labelled 0) are scored with four strategies:

    * ``maxpool_queries``  - highest similarity to any 'query' fingerprint
    * ``maxpool_activesT`` - highest similarity to any 'active_T' fingerprint
    * ``maxpool_union``    - the larger of the two scores above
    * ``maxpool_avg``      - the mean of the two scores above

    Each strategy is evaluated with EF1% (``_ef1``), EF1% divided by the best
    achievable EF1% (``_nef1``), ROC AUC (``_AUC``) and average precision (``_AP``).

    Args:
        fp_col: Name of the ``df`` column holding the fingerprints to use.
            Entries that are None are ignored.

    Returns:
        A DataFrame with one row per receptor, followed by a 'Mean' row and a
        'Median' row computed over the numeric columns.
    """
    rows = []
    for receptor in receptors:
        # Restrict to this receptor once instead of re-masking the full frame per type.
        receptor_df = df[df['receptor'] == receptor]

        # Sets remove repeated fingerprint objects; the lists are built once here
        # rather than once per validation molecule.
        query_fps = list(set(_valid_fps(receptor_df, 'query', fp_col)))
        active_T_fps = list(set(_valid_fps(receptor_df, 'active_T', fp_col)))

        active_V_fps = _valid_fps(receptor_df, 'active_V', fp_col)
        inactive_V_fps = _valid_fps(receptor_df, 'inactive_V', fp_col)
        val_fps = active_V_fps + inactive_V_fps
        labels = [1] * len(active_V_fps) + [0] * len(inactive_V_fps)

        scores_queries = _maxpool_scores(val_fps, query_fps)
        scores_activesT = _maxpool_scores(val_fps, active_T_fps)
        # Insertion order fixes both the evaluation order and the output column order.
        scores_by_strategy = {
            'maxpool_queries': scores_queries,
            'maxpool_activesT': scores_activesT,
            # maxpool_union: max of the two max-pools, not union of sets
            'maxpool_union': [max(q, t) for q, t in zip(scores_queries, scores_activesT)],
            # maxpool_avg: average of the two max-pools
            'maxpool_avg': [(q + t) / 2.0 for q, t in zip(scores_queries, scores_activesT)],
        }

        row = {"Receptor": receptor}

        # EF1%
        for strategy, scores in scores_by_strategy.items():
            row[f'{strategy}_ef1'] = ef1_percent(labels, scores)

        # Normalized EF1% (nef1): observed EF1% relative to the best achievable one
        perfect = perfect_ef1(labels)
        for strategy in scores_by_strategy:
            row[f'{strategy}_nef1'] = row[f'{strategy}_ef1'] / perfect if perfect > 0 else np.nan

        # ROC AUC and Average Precision for each scoring method
        for strategy, scores in scores_by_strategy.items():
            row[f'{strategy}_AUC'] = roc_auc_score_nansafe(labels, scores)
        for strategy, scores in scores_by_strategy.items():
            row[f'{strategy}_AP'] = average_precision_score_nansafe(labels, scores)

        rows.append(row)

    results_df = pd.DataFrame(rows)

    # Summary rows: aggregate the numeric columns, then label them in 'Receptor'.
    summary_frames = []
    for summary_label, summary in (
        ('Mean', results_df.mean(numeric_only=True)),
        ('Median', results_df.median(numeric_only=True)),
    ):
        summary = summary.reindex(results_df.columns, fill_value=np.nan).astype(object)
        summary['Receptor'] = summary_label
        summary_frames.append(pd.DataFrame([summary]))

    results_df = pd.concat([results_df, *summary_frames], ignore_index=True)
    return results_df

# Evaluate using the fingerprints stored in the 'fp_original' column.
results = run_eval('fp_original')

# Print all stats at the end
print("\nAll statistics for all max-pooling strategies (per receptor, plus mean/median):")
display(results)


All statistics for all max-pooling strategies (per receptor, plus mean/median):


Unnamed: 0,Receptor,maxpool_queries_ef1,maxpool_activesT_ef1,maxpool_union_ef1,maxpool_avg_ef1,maxpool_queries_nef1,maxpool_activesT_nef1,maxpool_union_nef1,maxpool_avg_nef1,maxpool_queries_AUC,maxpool_activesT_AUC,maxpool_union_AUC,maxpool_avg_AUC,maxpool_queries_AP,maxpool_activesT_AP,maxpool_union_AP,maxpool_avg_AP
0,ADRB2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182683,0.28296,0.27217,0.197328,3.7e-05,4.4e-05,4.3e-05,3.8e-05
1,ALDH1,2.535453,4.996333,4.996333,4.325184,0.12782,0.25188,0.25188,0.218045,0.52826,0.607342,0.607438,0.609452,0.062779,0.09538,0.095385,0.095955
2,ESR1_ago,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.602606,0.469714,0.555066,0.535977,0.005523,0.006055,0.006317,0.005549
3,ESR1_ant,4.095,4.095,4.095,4.095,0.125,0.125,0.125,0.125,0.423627,0.579647,0.603476,0.534509,0.038031,0.047331,0.050695,0.038905
4,FEN1,2.198762,5.496905,5.496905,4.397524,0.021978,0.054945,0.054945,0.043956,0.466011,0.62341,0.622576,0.598303,0.001087,0.003661,0.003656,0.003918
5,GBA,2.440376,14.642254,14.642254,17.08263,0.02439,0.146341,0.146341,0.170732,0.519137,0.554894,0.556826,0.568724,0.000808,0.011118,0.011119,0.015192
6,IDH1,0.0,11.111361,11.111361,11.111361,0.0,0.111111,0.111111,0.111111,0.393429,0.533244,0.485112,0.492361,9e-05,0.002519,0.002175,0.001137
7,KAT2A,2.083579,4.167159,4.167159,4.167159,0.020833,0.041667,0.041667,0.041667,0.397125,0.490947,0.490561,0.482224,0.000575,0.001276,0.001276,0.001065
8,MAPK1,1.300993,7.805959,7.805959,6.504966,0.012987,0.077922,0.077922,0.064935,0.535496,0.585387,0.59025,0.583192,0.007299,0.011305,0.011538,0.015406
9,MTORC1,0.0,4.1501,4.1501,8.300201,0.0,0.041667,0.041667,0.083333,0.49287,0.503766,0.503222,0.498127,0.003159,0.005626,0.005623,0.006167


In [6]:
# The last two rows of `results` are the 'Mean' and 'Median' rows appended by run_eval.
print("\nMean and Median values:")
display(results.iloc[[-2, -1]])



Mean and Median values:


Unnamed: 0,Receptor,maxpool_queries_ef1,maxpool_activesT_ef1,maxpool_union_ef1,maxpool_avg_ef1,maxpool_queries_nef1,maxpool_activesT_nef1,maxpool_union_nef1,maxpool_avg_nef1,maxpool_queries_AUC,maxpool_activesT_AUC,maxpool_union_AUC,maxpool_avg_AUC,maxpool_queries_AP,maxpool_activesT_AP,maxpool_union_AP,maxpool_avg_AP
15,Mean,2.33863,4.346968,4.396022,5.830797,0.036226,0.062531,0.063021,0.07598,0.479326,0.531556,0.537023,0.526867,0.016657,0.015668,0.016938,0.0175
16,Median,1.300993,4.1501,4.1501,4.325184,0.012987,0.041667,0.041667,0.064935,0.49287,0.546697,0.55295,0.535977,0.004491,0.00598,0.006317,0.006874
