#### This notebook audits the lit-pcba dataset by analyzing data leakage between query, training, and validation sets.

Download and extract all data from lit-pcba

In [2]:
import requests 
import tarfile
import os

# Create lit-pcba directory if it doesn't exist
lit_pcba_path = "lit-pcba"
os.makedirs(lit_pcba_path, exist_ok=True)

# URL for downloading the AVE_unbiased dataset
url = "http://drugdesign.unistra.fr/LIT-PCBA/Files/AVE_unbiased.tgz"
response = requests.get(url)

# Save downloaded content to a temporary .tgz file
with open("AVE_unbiased.tgz", "wb") as file:
    file.write(response.content)

# Extract contents of .tgz file to lit-pcba directory and clean up
with tarfile.open("AVE_unbiased.tgz", "r:gz") as tar:
    tar.extractall(path=lit_pcba_path)
    os.remove("AVE_unbiased.tgz")

  tar.extractall(path=lit_pcba_path)



<br>
Get all receptor names from the lit-pcba data we've just downloaded.

In [5]:
import os

receptor_names = []

lit_pcba_path = "lit-pcba"
if os.path.exists(lit_pcba_path):
    # Get all items in the directory
    items = os.listdir(lit_pcba_path)
    
    # Filter for directories only (receptor folders)
    for item in items:
        item_path = os.path.join(lit_pcba_path, item)
        if os.path.isdir(item_path):
            receptor_names.append(item)

# Sort the list for consistency
receptor_names.sort()

receptor_names
    

['ADRB2',
 'ALDH1',
 'ESR1_ago',
 'ESR1_ant',
 'FEN1',
 'GBA',
 'IDH1',
 'KAT2A',
 'MAPK1',
 'MTORC1',
 'OPRK1',
 'PKM2',
 'PPARG',
 'TP53',
 'VDR']

<br>
Load mapping of pdb to ligand ids from pdb_to_ligand_mapping.csv

In [7]:
import csv

pdb_to_ligand_id = {}

with open('pdb_to_ligand_mapping.csv', 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    
    for row in reader:
        pdb_id = row['pdb_id']
        ligand_id = row['ligand_id']
        pdb_to_ligand_id[pdb_id] = ligand_id

pdb_to_ligand_id

{'3P0G': 'P0G',
 '3PDS': 'ERC',
 '3SN6': 'P0G',
 '4LDE': 'P0G',
 '4LDL': 'XQC',
 '4LDO': 'ALE',
 '4QKX': '35V',
 '6MXT': 'K5Y',
 '4WP7': '3SR',
 '4WPN': '3ST',
 '4X4L': '3XG',
 '5AC2': 'K9P',
 '5L2M': '6ZY',
 '5L2N': '6ZU',
 '5L2O': '6ZW',
 '5TEI': 'M39',
 '1L2I': 'ETC',
 '2B1V': '458',
 '2B1Z': '17M',
 '2P15': 'EZT',
 '2Q70': 'DC8',
 '2QR9': 'HZ3',
 '2QSE': '1HP',
 '2QZO': 'KN1',
 '4IVW': '1GJ',
 '4PPS': 'ESE',
 '5DRJ': '5EU',
 '5DU5': '5G2',
 '5DUE': '5FY',
 '5DZI': '5KF',
 '5E1C': '5K8',
 '1XP1': 'AIH',
 '1XQC': 'AEJ',
 '2AYR': 'L4G',
 '2IOG': 'IOG',
 '2IOK': 'IOK',
 '2OUZ': 'C3D',
 '2POG': 'WST',
 '2R6W': 'LLB',
 '3DT3': '369',
 '5AAU': 'XBR',
 '5FQV': 'VQI',
 '5T92': '77W',
 '5UFX': '86Y',
 '6B0F': 'C6V',
 '6CHW': 'F3D',
 '5FV7': 'R3Z',
 '2V3D': 'NBV',
 '2V3E': 'NND',
 '2XWD': 'LGS',
 '2XWE': 'AMF',
 '3RIK': '3RI',
 '3RIL': '3RK',
 '4I3K': '1BX',
 '4I3L': '1BZ',
 '4UMX': 'VVS',
 '4XRX': '42V',
 '4XS3': '42W',
 '5DE1': '59D',
 '5L57': '6N3',
 '5L58': '6MX',
 '5LGE': '6VN',
 '5SUN':

<br>
To ensure that we have the correct smiles, we take all .smi files from each receptor and use the comp_id to request smiles string from PDB Bank. Then we save the new smiles into lit-pcba_all_data.csv

In [8]:
import os
import csv
import requests

def get_smiles_from_rcsb(comp_id):
    """
    Fetch SMILES string for a compound from RCSB PDB using GraphQL API
    
    Args:
        comp_id (str): PDB compound identifier
        
    Returns:
        str: SMILES string for the compound
    """
    resp = requests.post(
        "https://data.rcsb.org/graphql",
        json={
            "query": """
            query($id: String!) {
                chem_comp(comp_id: $id) {
                rcsb_chem_comp_descriptor { SMILES_stereo }
                }
            }
            """,
            "variables": {"id": comp_id}
        }
    )
    resp.raise_for_status()
    return resp.json()["data"]["chem_comp"]["rcsb_chem_comp_descriptor"]["SMILES_stereo"]


# Output CSV file path and column headers
output_csv = "lit-pcba_all_data.csv"
fieldnames = ["receptor", "mol_id", "smiles", "type"]

# Open output CSV file to write headers
with open(output_csv, "w", newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()

    # Process each receptor
    for receptor_name in receptor_names:
        # Process active/inactive training/validation sets
        for split in ["active_T", "active_V", "inactive_T", "inactive_V"]:
            filename = f"lit-pcba/{receptor_name}/{split}.smi"
            if not os.path.exists(filename):
                continue
            # Read SMILES and molecule IDs from .smi files
            with open(filename, "r") as f:
                for line in f:
                    parts = line.strip().split()
                    if len(parts) == 2:
                        smiles, mol_id = parts
                        writer.writerow({
                            "receptor": str(receptor_name),
                            "mol_id": str(mol_id),
                            "smiles": str(smiles),
                            "type": str(split)
                        })

        # Process query ligands from mol2 files
        receptor_dir = f"lit-pcba/{receptor_name}"
        if os.path.exists(receptor_dir):
            for fname in os.listdir(receptor_dir):
                if fname.endswith("_ligand.mol2"):
                    # Extract PDB code and get corresponding compound ID
                    pdb_code = fname.split("_ligand.mol2")[0]
                    comp_id = pdb_to_ligand_id.get(pdb_code.upper())
                    if comp_id:
                        # Fetch SMILES from RCSB PDB
                        smiles = get_smiles_from_rcsb(comp_id)
                        if smiles:
                            writer.writerow({
                                "receptor": str(receptor_name),
                                "mol_id": str(pdb_code),
                                "smiles": str(smiles),
                                "type": "query"
                            })

In [9]:
import pandas as pd

df = pd.read_csv("lit-pcba_all_data.csv", low_memory=False)
receptor_names = df['receptor'].unique()

In the following, we write the code that produces the table from the paper with statistics for each receptor.

In [11]:
import pandas as pd

df = pd.read_csv("lit-pcba_all_data.csv", low_memory=False)

receptor_summary = []
for receptor in receptor_names:
    sub = df[df["receptor"] == receptor].copy()
    sub["group"] = sub["type"].replace({"query": "queries"})

    queries = sub[sub["group"] == "queries"]
    active_T = sub[sub["group"] == "active_T"]
    active_V = sub[sub["group"] == "active_V"]
    inactive_T = sub[sub["group"] == "inactive_T"]
    inactive_V = sub[sub["group"] == "inactive_V"]

    # Calculate totals
    active_total = len(active_T) + len(active_V)
    inactive_total = len(inactive_T) + len(inactive_V)

    receptor_summary.append({
        "Target": receptor,
        "Queries": len(queries),
        "Act.": active_total,
        "Act. (T)": len(active_T),
        "Act. (V)": len(active_V),
        "Inact.": inactive_total,
        "Inact. (T)": len(inactive_T),
        "Inact. (V)": len(inactive_V)
    })

# Order columns as specified
columns_order = [
    "Target", "Queries", "Act.", "Act. (T)", "Act. (V)",
    "Inact.", "Inact. (T)", "Inact. (V)"
]
receptor_summary_df = pd.DataFrame(receptor_summary)[columns_order]
receptor_summary_df

Unnamed: 0,Target,Queries,Act.,Act. (T),Act. (V),Inact.,Inact. (T),Inact. (V)
0,ADRB2,8,17,13,4,311748,233957,77791
1,ALDH1,8,5363,4020,1343,101874,76577,25297
2,ESR1_ago,15,13,10,3,4378,3470,908
3,ESR1_ant,15,88,63,25,3820,3026,794
4,FEN1,1,360,269,91,350718,263771,86947
5,GBA,6,163,122,41,291241,219042,72199
6,IDH1,14,39,30,9,358757,269664,89093
7,KAT2A,3,194,146,48,342729,258067,84662
8,MAPK1,15,308,231,77,61567,46317,15250
9,MTORC1,11,97,73,24,32972,24729,8243


<br>
Next, we canonicalize smiles in two ways (with stereo and without stereo) and save to csv.

In [12]:
from rdkit import Chem

def canonicalize_smiles(smiles, keep_stereo=True):
    """
    Convert SMILES string to canonical form, optionally keeping stereochemistry.

    Args:
        smiles (str): Input SMILES string
        keep_stereo (bool): Whether to keep stereochemistry (default True)
    Returns:
        str or None: Canonical SMILES, or None if invalid SMILES
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return Chem.MolToSmiles(mol, canonical=True, isomericSmiles=keep_stereo)

# Initialize lists to store canonical SMILES with and without stereo
canonical_smiles_with_stereo = []
canonical_smiles_no_stereo = []

# Process molecules in batches to manage memory usage
batch_size = 250000
total = len(df)

# Iterate through dataframe in batches
for i in range(0, total, batch_size):
    batch = df["smiles"].iloc[i:i+batch_size]
    batch_rows = df.iloc[i:i+batch_size]
    batch_canonical_with_stereo = []
    batch_canonical_no_stereo = []

    for idx, (smi, row) in enumerate(zip(batch, batch_rows.itertuples(index=False))):
        try:
            canon_with_stereo = canonicalize_smiles(smi, keep_stereo=True)
            canon_no_stereo = canonicalize_smiles(smi, keep_stereo=False)
            batch_canonical_with_stereo.append(canon_with_stereo)
            batch_canonical_no_stereo.append(canon_no_stereo)
        except Exception as e:
            print(f"Error processing molecule at global index {i+idx}:")
            print(f"  SMILES: {smi}")
            print(f"  Row: {row}")
            print(f"  Exception: {e}")
            batch_canonical_with_stereo.append(None)
            batch_canonical_no_stereo.append(None)

    canonical_smiles_with_stereo.extend(batch_canonical_with_stereo)
    canonical_smiles_no_stereo.extend(batch_canonical_no_stereo)
    print(f"Processed {min(i+batch_size, total)} / {total}")

df["canonical_smiles_with_stereo"] = canonical_smiles_with_stereo
df["canonical_smiles_no_stereo"] = canonical_smiles_no_stereo
df.to_csv("lit-pcba_all_data.csv", index=False)


[00:42:14] Conflicting single bond directions around double bond at index 7.
[00:42:14]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:42:14] Conflicting single bond directions around double bond at index 7.
[00:42:14]   BondStereo set to STEREONONE and single bond directions set to NONE.


Processed 250000 / 2652106
Processed 500000 / 2652106


[00:44:26] Conflicting single bond directions around double bond at index 7.
[00:44:26]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:44:26] Conflicting single bond directions around double bond at index 7.
[00:44:26]   BondStereo set to STEREONONE and single bond directions set to NONE.


Processed 750000 / 2652106


[00:45:53] Conflicting single bond directions around double bond at index 7.
[00:45:53]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:45:53] Conflicting single bond directions around double bond at index 7.
[00:45:53]   BondStereo set to STEREONONE and single bond directions set to NONE.


Processed 1000000 / 2652106
Processed 1250000 / 2652106


[00:47:37] Conflicting single bond directions around double bond at index 7.
[00:47:37]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:47:37] Conflicting single bond directions around double bond at index 7.
[00:47:37]   BondStereo set to STEREONONE and single bond directions set to NONE.


Processed 1500000 / 2652106


[00:49:17] Conflicting single bond directions around double bond at index 7.
[00:49:17]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:49:17] Conflicting single bond directions around double bond at index 7.
[00:49:17]   BondStereo set to STEREONONE and single bond directions set to NONE.


Processed 1750000 / 2652106


[00:49:56] Conflicting single bond directions around double bond at index 7.
[00:49:56]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:49:56] Conflicting single bond directions around double bond at index 7.
[00:49:56]   BondStereo set to STEREONONE and single bond directions set to NONE.


Processed 2000000 / 2652106


[00:51:33] Conflicting single bond directions around double bond at index 7.
[00:51:33]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:51:33] Conflicting single bond directions around double bond at index 7.
[00:51:33]   BondStereo set to STEREONONE and single bond directions set to NONE.


Processed 2250000 / 2652106


[00:52:18] Conflicting single bond directions around double bond at index 7.
[00:52:18]   BondStereo set to STEREONONE and single bond directions set to NONE.
[00:52:18] Conflicting single bond directions around double bond at index 7.
[00:52:18]   BondStereo set to STEREONONE and single bond directions set to NONE.


Processed 2500000 / 2652106
Processed 2652106 / 2652106


In [13]:
df.head()

Unnamed: 0,receptor,mol_id,smiles,type,canonical_smiles_with_stereo,canonical_smiles_no_stereo
0,ADRB2,852502,CC(Nc1nc(n2c(C)cc(C)n2)nc(NC(C)C)n1)C,active_T,Cc1cc(C)n(-c2nc(NC(C)C)nc(NC(C)C)n2)n1,Cc1cc(C)n(-c2nc(NC(C)C)nc(NC(C)C)n2)n1
1,ADRB2,855847,CC[C@H]([C@H](c1c(c2c(O)cc1)ccc(O)n2)O)[NH2+]C...,active_T,CC[C@@H]([NH2+]C(C)C)[C@@H](O)c1ccc(O)c2nc(O)c...,CCC([NH2+]C(C)C)C(O)c1ccc(O)c2nc(O)ccc12
2,ADRB2,7976579,CCNc1nc(n2c(C)cc(C)n2)nc(NCC)n1,active_T,CCNc1nc(NCC)nc(-n2nc(C)cc2C)n1,CCNc1nc(NCC)nc(-n2nc(C)cc2C)n1
3,ADRB2,11532855,CCC[NH2+][C@@H](C(Nc1c(C)cccc1)=O)C,active_T,CCC[NH2+][C@H](C)C(=O)Nc1ccccc1C,CCC[NH2+]C(C)C(=O)Nc1ccccc1C
4,ADRB2,11532990,COc1ccc([C@@H]([NH2+]C[C@@H](c2cc(NC=O)c(O)cc2...,active_T,COc1ccc([C@H](C)[NH2+]C[C@H](O)c2ccc(O)c(NC=O)...,COc1ccc(C(C)[NH2+]CC(O)c2ccc(O)c(NC=O)c2)cc1


In [17]:
print("Number of unique smiles:", df["smiles"].nunique(dropna=True))
print("Number of unique canonical_smiles_with_stereo:", df["canonical_smiles_with_stereo"].nunique(dropna=True))
print("Number of unique canonical_smiles_no_stereo:", df["canonical_smiles_no_stereo"].nunique(dropna=True))

Number of unique smiles: 382863
Number of unique canonical_smiles_with_stereo: 382856
Number of unique canonical_smiles_no_stereo: 378028


In [None]:
grouped = df.groupby("canonical_smiles_no_stereo")["canonical_smiles_with_stereo"].nunique()
ambiguous = grouped[grouped > 1]

variant_counts = ambiguous.value_counts().sort_index()

print("\nStats: Number of canonical_smiles_no_stereo with N distinct canonical_smiles_with_stereo variants (N > 1):")
for n_variants, count in variant_counts.items():
    print(f"{count} molecules have {n_variants} stereovariants")


Stats: Number of canonical_smiles_no_stereo with N distinct canonical_smiles_with_stereo variants (N > 1):
2996 molecules have 2 stereovariants
304 molecules have 3 stereovariants
137 molecules have 4 stereovariants
44 molecules have 5 stereovariants
34 molecules have 6 stereovariants
32 molecules have 7 stereovariants
27 molecules have 8 stereovariants
3 molecules have 9 stereovariants
1 molecules have 10 stereovariants
1 molecules have 11 stereovariants
1 molecules have 14 stereovariants
2 molecules have 16 stereovariants


Let's determine the number of duplicate entries within each set for every receptor.

In [21]:
summary_rows = []

for receptor in receptor_names:
    # Subset for this receptor
    df_receptor = df[df["receptor"] == receptor].copy()

    # Training set only
    train_types = ["active_T", "inactive_T"]
    df_train = df_receptor[df_receptor["type"].isin(train_types)].copy()
    n_train_total = df_train.shape[0]
    n_train_no_stereo = df_train["canonical_smiles_no_stereo"].nunique()

    # Find ambiguous molecules: those with >1 stereovariant in training set
    ambiguous = df_train.groupby("canonical_smiles_no_stereo")["canonical_smiles_with_stereo"].nunique()
    ambiguous = ambiguous[ambiguous > 1]
    n_ambiguous = ambiguous.shape[0]

    # Overlap: molecules with stereovariants in both actives and inactives
    ambiguous_no_stereo = set(ambiguous.index)
    ambiguous_sub = df_train[df_train["canonical_smiles_no_stereo"].isin(ambiguous_no_stereo)]
    both_active_inactive = ambiguous_sub.groupby("canonical_smiles_no_stereo")["type"].apply(lambda x: {"active_T", "inactive_T"}.issubset(set(x)))
    n_both = both_active_inactive.sum()

    percent_ambiguous = 100 * n_ambiguous / n_train_no_stereo if n_train_no_stereo else 0
    percent_both_of_ambiguous = 100 * n_both / n_ambiguous if n_ambiguous else 0
    percent_both_of_total = 100 * n_both / n_train_no_stereo if n_train_no_stereo else 0

    summary_rows.append({
        "receptor": receptor,
        "n_train_total": n_train_total,
        "n_train_no_stereo": n_train_no_stereo,
        "n_ambiguous": n_ambiguous,
        "percent_ambiguous": percent_ambiguous,
        "n_both": n_both,
        "percent_both_of_ambiguous": percent_both_of_ambiguous,
        "percent_both_of_total": percent_both_of_total,
    })

summary_df = pd.DataFrame(summary_rows)

# Only show the most important columns for the "story"
pd.set_option('display.float_format', '{:.3f}'.format)
cols = [
    "receptor",
    "n_train_total",
    "n_train_no_stereo",
    "n_ambiguous",
    "percent_ambiguous",
    "n_both",
    "percent_both_of_ambiguous",
    "percent_both_of_total"
]
display(summary_df)

Unnamed: 0,receptor,n_train_total,n_train_no_stereo,n_ambiguous,percent_ambiguous,n_both,percent_both_of_ambiguous,percent_both_of_total
0,ADRB2,233970,232892,901,0.387,0,0.0,0.0
1,ALDH1,80597,80101,431,0.538,7,1.624,0.009
2,ESR1_ago,3480,3441,35,1.017,0,0.0,0.0
3,ESR1_ant,3089,3058,27,0.883,0,0.0,0.0
4,FEN1,264040,262705,1099,0.418,0,0.0,0.0
5,GBA,219164,217622,1307,0.601,1,0.077,0.0
6,IDH1,269694,267918,1313,0.49,0,0.0,0.0
7,KAT2A,258213,256628,1277,0.498,0,0.0,0.0
8,MAPK1,46548,46152,343,0.743,0,0.0,0.0
9,MTORC1,24802,24802,0,0.0,0,0.0,0.0


In [22]:
# Compute two tables: one for no stereo, one for with stereo, and display both at the end

receptor_summary_no_stereo = []
receptor_summary_stereo = []

for receptor in receptor_names:
    sub = df[df["receptor"] == receptor].copy()
    sub["group"] = sub["type"].replace({"query": "queries"})

    queries = sub[sub["group"] == "queries"]
    active_T = sub[sub["group"] == "active_T"]
    active_V = sub[sub["group"] == "active_V"]
    inactive_T = sub[sub["group"] == "inactive_T"]
    inactive_V = sub[sub["group"] == "inactive_V"]

    row_no_stereo = {
        "receptor": receptor,
        "queries_no_stereo": queries["canonical_smiles_no_stereo"].duplicated().sum(),
        "active_T_no_stereo": active_T["canonical_smiles_no_stereo"].duplicated().sum(),
        "active_V_no_stereo": active_V["canonical_smiles_no_stereo"].duplicated().sum(),
        "inactive_T_no_stereo": inactive_T["canonical_smiles_no_stereo"].duplicated().sum(),
        "inactive_V_no_stereo": inactive_V["canonical_smiles_no_stereo"].duplicated().sum(),
    }
    row_stereo = {
        "receptor": receptor,
        "queries_stereo": queries["canonical_smiles_with_stereo"].duplicated().sum(),
        "active_T_stereo": active_T["canonical_smiles_with_stereo"].duplicated().sum(),
        "active_V_stereo": active_V["canonical_smiles_with_stereo"].duplicated().sum(),
        "inactive_T_stereo": inactive_T["canonical_smiles_with_stereo"].duplicated().sum(),
        "inactive_V_stereo": inactive_V["canonical_smiles_with_stereo"].duplicated().sum(),
    }
    receptor_summary_no_stereo.append(row_no_stereo)
    receptor_summary_stereo.append(row_stereo)

print("Table: Number of duplicate canonical SMILES within each set (per receptor) [no stereo]")
receptor_summary_no_stereo_df = pd.DataFrame(receptor_summary_no_stereo)
display(receptor_summary_no_stereo_df)

print("Table: Number of duplicate canonical SMILES within each set (per receptor) [with stereo]")
receptor_summary_stereo_df = pd.DataFrame(receptor_summary_stereo)
display(receptor_summary_stereo_df)

Table: Number of duplicate canonical SMILES within each set (per receptor) [no stereo]


Unnamed: 0,receptor,queries_no_stereo,active_T_no_stereo,active_V_no_stereo,inactive_T_no_stereo,inactive_V_no_stereo
0,ADRB2,2,0,0,1078,135
1,ALDH1,0,5,1,484,30
2,ESR1_ago,0,0,0,39,2
3,ESR1_ant,0,1,0,30,1
4,FEN1,0,3,0,1332,172
5,GBA,0,1,0,1540,136
6,IDH1,0,0,0,1776,203
7,KAT2A,0,1,0,1584,155
8,MAPK1,0,0,0,396,37
9,MTORC1,6,0,0,0,0


Table: Number of duplicate canonical SMILES within each set (per receptor) [with stereo]


Unnamed: 0,receptor,queries_stereo,active_T_stereo,active_V_stereo,inactive_T_stereo,inactive_V_stereo
0,ADRB2,2,0,0,0,0
1,ALDH1,0,0,0,0,0
2,ESR1_ago,0,0,0,0,0
3,ESR1_ant,0,0,0,0,0
4,FEN1,0,0,0,0,0
5,GBA,0,0,0,0,0
6,IDH1,0,0,0,0,0
7,KAT2A,0,0,0,0,0
8,MAPK1,0,0,0,0,0
9,MTORC1,6,0,0,0,0


In [25]:
import itertools

group_names = ["queries", "active_T", "active_V", "inactive_T", "inactive_V"]

# Dataframes for overlap counts (no stereo and with stereo)
rows_no_stereo = []
rows_stereo = []

for receptor in receptor_names:
    sub = df[df["receptor"] == receptor].copy()
    sub["group"] = sub["type"].replace({"query": "queries"})

    queries = sub[sub["group"] == "queries"]
    active_T = sub[sub["group"] == "active_T"]
    active_V = sub[sub["group"] == "active_V"]
    inactive_T = sub[sub["group"] == "inactive_T"]
    inactive_V = sub[sub["group"] == "inactive_V"]

    # Build sets for intersections
    sets_no_stereo = {
        "queries": set(queries["canonical_smiles_no_stereo"].dropna()),
        "active_T": set(active_T["canonical_smiles_no_stereo"].dropna()),
        "active_V": set(active_V["canonical_smiles_no_stereo"].dropna()),
        "inactive_T": set(inactive_T["canonical_smiles_no_stereo"].dropna()),
        "inactive_V": set(inactive_V["canonical_smiles_no_stereo"].dropna()),
    }
    sets_stereo = {
        "queries": set(queries["canonical_smiles_with_stereo"].dropna()),
        "active_T": set(active_T["canonical_smiles_with_stereo"].dropna()),
        "active_V": set(active_V["canonical_smiles_with_stereo"].dropna()),
        "inactive_T": set(inactive_T["canonical_smiles_with_stereo"].dropna()),
        "inactive_V": set(inactive_V["canonical_smiles_with_stereo"].dropna()),
    }

    row_no_stereo = {"receptor": receptor}
    row_stereo = {"receptor": receptor}

    for g1, g2 in itertools.combinations(group_names, 2):
        key = f"{g1} ∩ {g2}"
        row_no_stereo[key] = len(sets_no_stereo[g1] & sets_no_stereo[g2])
        row_stereo[key] = len(sets_stereo[g1] & sets_stereo[g2])

    rows_no_stereo.append(row_no_stereo)
    rows_stereo.append(row_stereo)

# Display DataFrame for no stereo
print("Table: Number of overlapping canonical SMILES between sets (per receptor) [no stereo]")
receptor_summary_no_stereo_df = pd.DataFrame(rows_no_stereo)
display(receptor_summary_no_stereo_df)

# Display DataFrame for with stereo
print("Table: Number of overlapping canonical SMILES between sets (per receptor) [with stereo]")
receptor_summary_stereo_df = pd.DataFrame(rows_stereo)
display(receptor_summary_stereo_df)

Table: Number of overlapping canonical SMILES between sets (per receptor) [no stereo]


Unnamed: 0,receptor,queries ∩ active_T,queries ∩ active_V,queries ∩ inactive_T,queries ∩ inactive_V,active_T ∩ active_V,active_T ∩ inactive_T,active_T ∩ inactive_V,active_V ∩ inactive_T,active_V ∩ inactive_V,inactive_T ∩ inactive_V
0,ADRB2,0,0,0,0,0,0,0,0,0,526
1,ALDH1,0,0,0,0,0,7,3,12,0,219
2,ESR1_ago,0,0,0,0,0,0,0,0,0,18
3,ESR1_ant,0,0,0,0,0,0,0,2,0,15
4,FEN1,0,0,0,0,0,0,1,2,0,698
5,GBA,0,0,0,0,0,1,0,2,0,707
6,IDH1,0,0,0,0,0,0,0,0,0,897
7,KAT2A,0,0,0,0,0,0,1,2,0,781
8,MAPK1,0,0,0,0,0,0,0,1,1,214
9,MTORC1,0,0,0,0,0,0,0,0,0,0


Table: Number of overlapping canonical SMILES between sets (per receptor) [with stereo]


Unnamed: 0,receptor,queries ∩ active_T,queries ∩ active_V,queries ∩ inactive_T,queries ∩ inactive_V,active_T ∩ active_V,active_T ∩ inactive_T,active_T ∩ inactive_V,active_V ∩ inactive_T,active_V ∩ inactive_V,inactive_T ∩ inactive_V
0,ADRB2,0,0,0,0,0,0,0,0,0,0
1,ALDH1,0,0,0,0,0,0,0,0,0,0
2,ESR1_ago,0,0,0,0,0,0,0,0,0,0
3,ESR1_ant,0,0,0,0,0,0,0,0,0,0
4,FEN1,0,0,0,0,0,0,0,0,0,0
5,GBA,0,0,0,0,0,0,0,0,0,0
6,IDH1,0,0,0,0,0,0,0,0,0,0
7,KAT2A,0,0,0,0,0,0,0,0,0,0
8,MAPK1,0,0,0,0,0,0,0,0,0,0
9,MTORC1,0,0,0,0,0,0,0,0,0,0



<br>
For each receptor, use canonicalized smiles to find repeats within: <br>
- queries/queries<br>
- queries/active_T<br>
- queries/active_V<br>
- active_T/active_T<br>
- active_T/active_V<br>
- inactive_T/inactive_T<br>
- inactive_T/inactive_V<br>

We also keep track of overall stats about repeats and print at the end.

In [32]:
import collections

# Initialize sets to track stats of overlaps and repeats across all receptors
overall_queries_overlap = set()  # Query molecules that overlap with train/val sets
overall_queries_overlap_val = set()  # Query molecules that overlap with validation set
overall_train_val_overlaps = {"active": set(), "inactive": set()}  # Molecules in both train and val sets
overall_repeats = {group: set() for group in ["queries", "active_T", "active_V", "inactive_T", "inactive_V"]}  # Repeated molecules by group

# Process each receptor separately
receptors = df["receptor"].unique()
for receptor in receptors:
    found_any = False  # Track if any overlaps/repeats found for this receptor
    sub = df[df["receptor"] == receptor].copy()
    # Map type to group name (queries for query type)
    sub["group"] = sub["type"].apply(lambda t: "queries" if t == "query" else t)

    # Group by canonical SMILES (no stereo) to find overlaps
    grouped = sub.groupby("canonical_smiles_no_stereo")["group"].agg(lambda x: set(x.dropna()))
    grouped_type = sub.groupby("canonical_smiles_no_stereo")["type"].agg(lambda x: set(x.dropna()))

    # Find query molecules that overlap with train/val sets
    queries_overlap = []
    queries_overlap_val = []
    queries_overlap_examples = []
    for smi, groups in grouped.items():
        if "queries" in groups and (("active_T" in groups) or ("active_V" in groups) or ("inactive_T" in groups) or ("inactive_V" in groups)):
            queries_overlap.append(smi)
            overall_queries_overlap.add(smi)
            if ("active_V" in groups) or ("inactive_V" in groups):
                queries_overlap_val.append(smi)
                overall_queries_overlap_val.add(smi)
            # Store example rows for reporting
            rows = sub[sub["canonical_smiles_no_stereo"] == smi]
            example = {
                "smi": smi,
                "rows": [
                    {
                        "group": row['group'],
                        "type": row['type'],
                        "mol_id": row['mol_id']
                    }
                    for _, row in rows.iterrows()
                ]
            }
            queries_overlap_examples.append(example)

    # Find molecules that overlap between train and validation sets
    train_val_overlaps = {}
    train_val_examples = {}
    for label in ["active", "inactive"]:
        t_type = f"{label}_T"
        v_type = f"{label}_V"
        t_smiles = set(sub[sub["type"] == t_type]["canonical_smiles_no_stereo"].dropna())
        v_smiles = set(sub[sub["type"] == v_type]["canonical_smiles_no_stereo"].dropna())
        overlap = t_smiles & v_smiles
        train_val_overlaps[label] = overlap
        overall_train_val_overlaps[label].update(overlap)
        train_val_examples[label] = []
        if overlap:
            # Store example rows for reporting
            for smi in overlap:
                rows = sub[sub["canonical_smiles_no_stereo"] == smi]
                example = {
                    "smi": smi,
                    "rows": [
                        {
                            "group": row['group'],
                            "type": row['type'],
                            "mol_id": row['mol_id']
                        }
                        for _, row in rows.iterrows()
                    ]
                }
                train_val_examples[label].append(example)

    # Find repeated molecules within each group
    repeat_types = [
        ("query", "queries"),
        ("active_T", "active_T"),
        ("active_V", "active_V"), 
        ("inactive_T", "inactive_T"),
        ("inactive_V", "inactive_V"),
    ]
    repeats = {}
    repeat_examples = {}
    for t, group_name in repeat_types:
        smiles_list = sub[sub["type"] == t]["canonical_smiles_no_stereo"].dropna()
        counter = collections.Counter(smiles_list)
        repeated = [smi for smi, count in counter.items() if count > 1]
        repeats[group_name] = repeated
        overall_repeats[group_name].update(repeated)
        repeat_examples[group_name] = []
        # Store example rows for reporting (up to 3)
        for smi in repeated[:3]:
            rows = sub[(sub["type"] == t) & (sub["canonical_smiles_no_stereo"] == smi)]
            example = {
                "smi": smi,
                "rows": [
                    {
                        "group": row['group'],
                        "type": row['type'],
                        "mol_id": row['mol_id']
                    }
                    for _, row in rows.iterrows()
                ]
            }
            repeat_examples[group_name].append(example)

    # Generate report for this receptor
    output_lines = []
    if queries_overlap_val:
        found_any = True
        output_lines.append(f"\n{'='*80}\n[Receptor: {receptor}] Canonical SMILES Overlaps (no stereo)")
        output_lines.append("-"*80)
        output_lines.append(f"For receptor {receptor}, {len(queries_overlap_val)} query molecule(s) are also present in the validation set active_V.")
        output_lines.append("  Example(s) of query/active_V overlap (top 3):")
        for example in queries_overlap_examples[:3]:
            output_lines.append(f"    SMILES: {example['smi']}")
            for row in example['rows']:
                output_lines.append(f"      group: {row['group']}, type: {row['type']}, mol_id: {row['mol_id']}")
            output_lines.append("")
    elif queries_overlap:
        found_any = True
        output_lines.append(f"\n{'='*80}\n[Receptor: {receptor}] Canonical SMILES Overlaps (no stereo)")
        output_lines.append("-"*80)
        output_lines.append(f"For receptor {receptor}, {len(queries_overlap)} query molecule(s) are also present in the training set active_T.")
        output_lines.append("  Example(s) of query/active_T overlap (top 3):")
        for example in queries_overlap_examples[:3]:
            output_lines.append(f"    SMILES: {example['smi']}")
            for row in example['rows']:
                output_lines.append(f"      group: {row['group']}, type: {row['type']}, mol_id: {row['mol_id']}")
            output_lines.append("")

    # Report train/val overlaps
    for label in ["active", "inactive"]:
        overlap = train_val_overlaps[label]
        if overlap:
            if not found_any:
                output_lines.append(f"\n{'='*80}\n[Receptor: {receptor}] Canonical SMILES Overlaps (no stereo)")
                output_lines.append("-"*80)
                found_any = True
            output_lines.append(f"For receptor {receptor}, {len(overlap)} {label} molecule(s) are in both training and validation sets.")
            output_lines.append(f"  Example(s) of {label}_T/{label}_V overlap (top 3):")
            for example in train_val_examples[label][:3]:
                output_lines.append(f"    SMILES: {example['smi']}")
                for row in example['rows']:
                    output_lines.append(f"      group: {row['group']}, type: {row['type']}, mol_id: {row['mol_id']}")
                output_lines.append("")

    # Report repeats
    for t, group_name in repeat_types:
        repeated = repeats[group_name]
        if repeated:
            if not found_any:
                output_lines.append(f"\n{'='*80}\n[Receptor: {receptor}] Canonical SMILES Overlaps (no stereo)")
                output_lines.append("-"*80)
                found_any = True
            output_lines.append(f"For receptor {receptor}, {len(repeated)} repeating {group_name} molecule(s) found in the set.")
            output_lines.append(f"  Example(s) of repeating {group_name} (top 3):")
            for example in repeat_examples[group_name]:
                output_lines.append(f"    SMILES: {example['smi']}")
                for row in example['rows']:
                    output_lines.append(f"      group: {row['group']}, type: {row['type']}, mol_id: {row['mol_id']}")
                output_lines.append("")

    # Print report for this receptor if any issues found
    if found_any:
        output_lines.append("-"*80 + "\n")
        print("\n".join(output_lines))

# Check if any issues found across all receptors
overall_found = (
    len(overall_queries_overlap) > 0 or
    len(overall_queries_overlap_val) > 0 or
    len(overall_train_val_overlaps['active']) > 0 or
    len(overall_train_val_overlaps['inactive']) > 0 or
    any(len(overall_repeats[group]) > 0 for group in ['queries', 'active_T', 'inactive_T', 'active_V', 'inactive_V'])
)

# Print overall statistics if any issues found
if overall_found:
    print("\n" + "="*80)
    print("OVERALL CANONICAL SMILES OVERLAP STATISTICS ACROSS ALL RECEPTORS (no stereo)")
    print("-"*80)
    if len(overall_queries_overlap) > 0:
        print(f"Number of query molecules also in training set (overall): {len(overall_queries_overlap)}")
    if len(overall_queries_overlap_val) > 0:
        print(f"Number of query molecules also in validation set (overall): {len(overall_queries_overlap_val)}")
    if len(overall_train_val_overlaps['active']) > 0:
        print(f"Number of active molecules in both train and val (overall): {len(overall_train_val_overlaps['active'])}")
    if len(overall_train_val_overlaps['inactive']) > 0:
        print(f"Number of inactive molecules in both train and val (overall): {len(overall_train_val_overlaps['inactive'])}")
    for group in ['queries', 'active_T', 'inactive_T', 'active_V', 'inactive_V']:
        if len(overall_repeats[group]) > 0:
            print(f"Number of repeating molecules in {group} (overall): {len(overall_repeats[group])}")
    print("="*80 + "\n")



[Receptor: ADRB2] Canonical SMILES Overlaps (no stereo)
--------------------------------------------------------------------------------
For receptor ADRB2, 526 inactive molecule(s) are in both training and validation sets.
  Example(s) of inactive_T/inactive_V overlap (top 3):
    SMILES: COC(=O)C=C1SC(NC(=O)c2cccc([N+](=O)[O-])c2)=[NH+]C1=O
      group: inactive_T, type: inactive_T, mol_id: 49714892
      group: inactive_V, type: inactive_V, mol_id: 49817206

    SMILES: C[NH+]1CCN(C(=O)C(=Cc2cccs2)NC(=O)c2cccs2)CC1
      group: inactive_T, type: inactive_T, mol_id: 17506379
      group: inactive_V, type: inactive_V, mol_id: 17508328

    SMILES: CCCN1C(=O)SC(=Cc2ccc(N3CCOCC3)o2)C1=O
      group: inactive_T, type: inactive_T, mol_id: 26662643
      group: inactive_V, type: inactive_V, mol_id: 24810986

For receptor ADRB2, 1 repeating queries molecule(s) found in the set.
  Example(s) of repeating queries (top 3):
    SMILES: Cc1ccccc1CC(C)(C)NCC(O)c1ccc(O)c2c1OCC(=O)N2
      group: 

<br>
Here we look at tanimoto similarity within the queries set for each receptor. 

In [28]:
from rdkit.Chem import rdFingerprintGenerator, DataStructs
import numpy as np
import pandas as pd

# Number of top most similar pairs to find for each receptor within the set of query molecules
top_n = 5
results = {}

# Iterate through each receptor
for receptor in receptor_names:
    # Get query molecules for this receptor
    query_df = df[(df['receptor'] == receptor) & (df['type'] == 'query')].copy()
    # Skip if less than 2 query molecules
    if query_df.empty or len(query_df) < 2:
        continue

    # Convert SMILES to RDKit molecules if needed
    query_mols = query_df['rdmol'].tolist() if 'rdmol' in query_df.columns else [Chem.MolFromSmiles(smi) for smi in query_df['smiles']]

    # Generate Morgan fingerprints with radius 2 and 4096 bits
    fp_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2**12)
    query_fps = [fp_generator.GetFingerprint(mol) for mol in query_mols]

    # Calculate Tanimoto similarity between all pairs
    n = len(query_fps)
    pairs = []
    mol_ids = query_df['mol_id'].tolist()
    for i in range(n):
        for j in range(i+1, n):
            # Get Tanimoto similarity between fingerprints
            sim = DataStructs.TanimotoSimilarity(query_fps[i], query_fps[j])
            pairs.append((sim, mol_ids[i], mol_ids[j]))

    # Get top N most similar pairs
    top_pairs = sorted(pairs, reverse=True)[:top_n]
    results[receptor] = top_pairs

# Print results for each receptor
for receptor, top_pairs in results.items():
    print(f"\nReceptor: {receptor}")
    print("Top 5 most similar query pairs (Tanimoto):")
    for sim, mol_id1, mol_id2 in top_pairs:
        print(f"  {mol_id1} - {mol_id2}: similarity = {sim:.4f}")



Receptor: ADRB2
Top 5 most similar query pairs (Tanimoto):
  4lde - 3p0g: similarity = 1.0000
  3sn6 - 4lde: similarity = 1.0000
  3sn6 - 3p0g: similarity = 1.0000
  4qkx - 3pds: similarity = 0.5634
  4ldo - 4ldl: similarity = 0.5366

Receptor: ALDH1
Top 5 most similar query pairs (Tanimoto):
  4wpn - 4wp7: similarity = 0.5882
  5l2m - 5l2n: similarity = 0.2794
  5l2n - 5l2o: similarity = 0.2712
  5l2m - 5l2o: similarity = 0.2258
  4x4l - 5tei: similarity = 0.2043

Receptor: ESR1_ago
Top 5 most similar query pairs (Tanimoto):
  5dzi - 5e1c: similarity = 0.4800
  2qzo - 4ivw: similarity = 0.4754
  5du5 - 5drj: similarity = 0.4000
  2b1v - 2q70: similarity = 0.3051
  5e1c - 2qr9: similarity = 0.2963

Receptor: ESR1_ant
Top 5 most similar query pairs (Tanimoto):
  2iog - 2iok: similarity = 0.6438
  1xp1 - 5ufx: similarity = 0.4286
  2r6w - 5ufx: similarity = 0.4250
  2r6w - 2ayr: similarity = 0.3780
  1xp1 - 2r6w: similarity = 0.3671

Receptor: GBA
Top 5 most similar query pairs (Tanimot


Here we look at Maximum Common Substructure ratio (MCS) within the queries set for each receptor. Ratio of '1' between means that the smaller query molecule is completely embedded in the larger query molecule. 

In [29]:
from rdkit.Chem import rdFMCS

# Number of top most similar pairs to find for each receptor within the set of query molecules
top_n = 5
mcs_results = {}

# Iterate through each receptor
for receptor in receptor_names:
    # Get query molecules for this receptor
    query_df = df[(df['receptor'] == receptor) & (df['type'] == 'query')].copy()
    # Skip if less than 2 query molecules
    if query_df.empty or len(query_df) < 2:
        continue

    # Remove duplicate SMILES to avoid redundant comparisons
    query_df_nodup = query_df.drop_duplicates(subset='smiles').copy()
    if query_df_nodup.empty or len(query_df_nodup) < 2:
        continue

    # Convert SMILES to RDKit molecules if needed
    query_mols = query_df_nodup['rdmol'].tolist() if 'rdmol' in query_df_nodup.columns else [Chem.MolFromSmiles(smi) for smi in query_df_nodup['smiles']]
    mol_ids = query_df_nodup['mol_id'].tolist()
    n = len(query_mols)
    pairs = []

    # Compare each pair of molecules
    for i in range(n):
        for j in range(i+1, n):
            mol1 = query_mols[i]
            mol2 = query_mols[j]
            # Find Maximum Common Substructure between the two molecules
            # completeRingsOnly=True ensures rings are matched as complete units
            # timeout=10 prevents hanging on difficult comparisons
            res = rdFMCS.FindMCS([mol1, mol2], completeRingsOnly=True, ringMatchesRingOnly=True, timeout=10)
            
            # Calculate similarity ratio based on MCS size
            if res.canceled or res.numAtoms == 0:
                ratio = 0.0
            else:
                mcs_smarts = res.smartsString
                mcs_mol = Chem.MolFromSmarts(mcs_smarts)
                if mcs_mol is not None:
                    # Calculate ratio of MCS atoms to smaller molecule's atoms
                    mcs_num_atoms = mcs_mol.GetNumAtoms()
                    min_atoms = min(mol1.GetNumAtoms(), mol2.GetNumAtoms())
                    ratio = mcs_num_atoms / min_atoms if min_atoms > 0 else 0.0
                else:
                    ratio = 0.0
            pairs.append((ratio, mol_ids[i], mol_ids[j]))

    # Get top N most similar pairs
    top_pairs = sorted(pairs, reverse=True)[:top_n]
    mcs_results[receptor] = top_pairs

# Print results for each receptor
for receptor, top_pairs in mcs_results.items():
    print(f"\nReceptor: {receptor}")
    print("Top 5 most similar query pairs (MCS ratio):")
    for ratio, mol_id1, mol_id2 in top_pairs:
        print(f"  {mol_id1} - {mol_id2}: MCS ratio = {ratio:.4f}")



Receptor: ADRB2
Top 5 most similar query pairs (MCS ratio):
  4qkx - 4ldo: MCS ratio = 1.0000
  4ldo - 4ldl: MCS ratio = 1.0000
  6mxt - 4ldo: MCS ratio = 0.9231
  4qkx - 3pds: MCS ratio = 0.9231
  3sn6 - 4ldo: MCS ratio = 0.9231

Receptor: ALDH1
Top 5 most similar query pairs (MCS ratio):
  4wpn - 4wp7: MCS ratio = 0.7143
  5l2n - 5l2o: MCS ratio = 0.7059
  5l2m - 5l2o: MCS ratio = 0.7059
  5l2m - 5l2n: MCS ratio = 0.5417
  4x4l - 5ac2: MCS ratio = 0.3636

Receptor: ESR1_ago
Top 5 most similar query pairs (MCS ratio):
  5dzi - 5e1c: MCS ratio = 1.0000
  2b1z - 2p15: MCS ratio = 1.0000
  2qzo - 4ivw: MCS ratio = 0.8750
  2b1z - 1l2i: MCS ratio = 0.7143
  2b1v - 2q70: MCS ratio = 0.6842

Receptor: ESR1_ant
Top 5 most similar query pairs (MCS ratio):
  2iog - 2iok: MCS ratio = 1.0000
  3dt3 - 2ayr: MCS ratio = 0.9259
  5fqv - 5t92: MCS ratio = 0.8148
  1xqc - 5t92: MCS ratio = 0.7667
  3dt3 - 2ouz: MCS ratio = 0.6296

Receptor: GBA
Top 5 most similar query pairs (MCS ratio):
  3ril - 3r

<br>
Here we calculate Tanimoto similarity scores between active molecules from the training set and active molecules from the validation set. We set the similarity threshold as 0.6. If any pairs of molecules from training and validation sets have similarity scores above this threshold, it indicates data leakage between the sets.

In [30]:
from rdkit.Chem import rdFingerprintGenerator, DataStructs

# Create Morgan fingerprint generator with radius 2 and 4096 bits
fp_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2**12)

# Set minimum similarity threshold for considering two molecules similar
similarity_threshold = 0.6

for receptor in receptor_names:
    # Get active compounds from training and validation sets for this receptor
    active_T_df = df[(df['receptor'] == receptor) & (df['type'] == 'active_T')].copy()
    active_V_df = df[(df['receptor'] == receptor) & (df['type'] == 'active_V')].copy()
    if active_T_df.empty or active_V_df.empty:
        continue

    # Remove duplicate SMILES strings
    active_T_df_nodup = active_T_df.drop_duplicates(subset='smiles').copy()
    active_V_df_nodup = active_V_df.drop_duplicates(subset='smiles').copy()
    if active_T_df_nodup.empty or active_V_df_nodup.empty:
        continue

    # Convert SMILES to RDKit molecules if needed
    active_T_mols = active_T_df_nodup['rdmol'].tolist() if 'rdmol' in active_T_df_nodup.columns else [Chem.MolFromSmiles(smi) for smi in active_T_df_nodup['smiles']]
    active_V_mols = active_V_df_nodup['rdmol'].tolist() if 'rdmol' in active_V_df_nodup.columns else [Chem.MolFromSmiles(smi) for smi in active_V_df_nodup['smiles']]
    active_T_ids = active_T_df_nodup['mol_id'].tolist()
    active_V_ids = active_V_df_nodup['mol_id'].tolist()

    # Generate Morgan fingerprints for all molecules
    active_T_fps = [fp_generator.GetFingerprint(mol) for mol in active_T_mols]
    active_V_fps = [fp_generator.GetFingerprint(mol) for mol in active_V_mols]

    # Compare each training active against each validation active
    found_similar = []
    for i, afp in enumerate(active_T_fps):
        for j, vfp in enumerate(active_V_fps):
            # Calculate Tanimoto similarity between fingerprints
            sim = DataStructs.TanimotoSimilarity(afp, vfp)
            if sim > similarity_threshold:
                found_similar.append((sim, active_T_ids[i], active_V_ids[j]))

    # Print results for this receptor
    print(f"\nReceptor: {receptor} | Validation active_T vs active_V")
    print(f"Number of pairs with Tanimoto similarity > {similarity_threshold}: {len(found_similar)}")
    if found_similar:
        print(f"Pairs with Tanimoto similarity > {similarity_threshold}:")
        for sim, mol_id1, mol_id2 in sorted(found_similar, reverse=True):
            print(f"  {mol_id1} (active_T) - {mol_id2} (active_V): similarity = {sim:.4f}")
    else:
        print(f"No pairs with Tanimoto similarity > {similarity_threshold}")


Receptor: ADRB2 | Validation active_T vs active_V
Number of pairs with Tanimoto similarity > 0.6: 0
No pairs with Tanimoto similarity > 0.6

Receptor: ALDH1 | Validation active_T vs active_V
Number of pairs with Tanimoto similarity > 0.6: 323
Pairs with Tanimoto similarity > 0.6:
  17513169 (active_T) - 26663676 (active_V): similarity = 0.9796
  17514456 (active_T) - 17413119 (active_V): similarity = 0.8909
  849849 (active_T) - 851058 (active_V): similarity = 0.8852
  17514672 (active_T) - 17413119 (active_V): similarity = 0.8750
  865822 (active_T) - 866152 (active_V): similarity = 0.8636
  24781614 (active_T) - 24782909 (active_V): similarity = 0.8507
  4242197 (active_T) - 7973338 (active_V): similarity = 0.8485
  14742523 (active_T) - 26661991 (active_V): similarity = 0.8462
  4244867 (active_T) - 4241016 (active_V): similarity = 0.8421
  24828484 (active_T) - 24830679 (active_V): similarity = 0.8387
  7968537 (active_T) - 7975778 (active_V): similarity = 0.8367
  7967221 (active