Tool to generate files for decoy generation
Written by Claude Sonnet 4.5

In [4]:
import pandas as pd

# ----------------------------------------------------------------------------
# SETTINGS (edit before running)
# ----------------------------------------------------------------------------
# Columns looked up in the input CSV; adjust both to match your CSV column names
SMILES_COLUMN = "canonical_smiles"
ID_COLUMN = "ligand_chembl_id"

# CSV file that is read
INPUT_CSV = "P08173_random_subset.csv"

# Files that are written
OUTPUT_CSV = "P08173_random_subset_with_active.csv"
OUTPUT_ISM = "actives.ism"
OUTPUT_DECOY_CONFIG = "decoy_generation.in"

# ============================================================================
# MAIN SCRIPT
# ============================================================================

# Load the input table
print(f"Reading {INPUT_CSV}...")
df = pd.read_csv(INPUT_CSV)
print(f"Loaded {len(df)} rows")

# Label every row with the constant string 'active'
df = df.assign(active='active')
print("Added 'active' column")

# Write the labelled table to a new CSV, leaving out the row index
df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved modified CSV to {OUTPUT_CSV}")

# Create the .ism file (one "SMILES ID" record per line)
print(f"\nCreating {OUTPUT_ISM}...")

# Check the configured columns up front so that a wrong column name fails with
# a readable message before the output file is created or truncated.
missing_columns = [col for col in (SMILES_COLUMN, ID_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Column(s) {missing_columns} not found in {INPUT_CSV}; "
        f"available columns: {list(df.columns)}"
    )

# A row lacking a SMILES or an ID would be formatted as the literal text "nan",
# which is not a usable record, so such rows are left out of the file.
ism_records = df[[SMILES_COLUMN, ID_COLUMN]].dropna()
skipped_rows = len(df) - len(ism_records)

with open(OUTPUT_ISM, 'w') as f:
    for smiles, mol_id in zip(ism_records[SMILES_COLUMN], ism_records[ID_COLUMN]):
        f.write(f"{smiles} {mol_id}\n")

# Report the number of records actually written, not the size of the table
print(f"Saved {len(ism_records)} SMILES to {OUTPUT_ISM}")
if skipped_rows:
    print(f"WARNING: skipped {skipped_rows} row(s) with a missing SMILES or ID")

print("\nFirst 3 lines of the .ism file:")
with open(OUTPUT_ISM, 'r') as f:
    # zip() stops after three lines, or earlier when the file is shorter
    for line_number, line in zip(range(3), f):
        print(line.strip())

# Write the decoy generation configuration file
print(f"\nCreating {OUTPUT_DECOY_CONFIG}...")

# Text of the configuration file, written out verbatim below
decoy_config = """SMILES YES
PROTONATE YES
MWT 0 500
LOGP 0 3.6
RB 0 5
HBA 0 4
HBD 0 3
CHARGE 0 0
LIGAND TC RANGE 0.0 0.35
MINIMUM DECOYS PER LIGAND 1
DECOYS PER LIGAND 50
MAXIMUM TC BETWEEN DECOYS 0.8
TANIMOTO YES
"""

with open(OUTPUT_DECOY_CONFIG, 'w') as config_file:
    config_file.write(decoy_config)

print(f"Saved decoy generation config to {OUTPUT_DECOY_CONFIG}")

# Closing summary: one bullet per file produced by this cell
print("\nDone!")
print(f"\nGenerated files:")
for generated_path in (OUTPUT_CSV, OUTPUT_ISM, OUTPUT_DECOY_CONFIG):
    print(f"  - {generated_path}")

Reading P08173_random_subset.csv...
Loaded 100 rows
Added 'active' column
Saved modified CSV to P08173_random_subset_with_active.csv

Creating actives.ism...
Saved 100 SMILES to actives.ism

First 3 lines of the .ism file:
CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)CC[C@]5(C)[C@H]4[C@@H](O)C[C@]3(C)[C@]2(C(=O)CO)O1 CHEMBL1201012
Cc1cc2c(cc1Cc1ccc(-c3cocn3)cc1)C(=O)N([C@H]1CCOC[C@@H]1O)C2 CHEMBL4067722
Nc1c(CC(=O)O)cccc1C(=O)c1ccc(Br)cc1 CHEMBL1077

Creating decoy_generation.in...
Saved decoy generation config to decoy_generation.in

Done!

Generated files:
  - P08173_random_subset_with_active.csv
  - actives.ism
  - decoy_generation.in


In [None]:
import pandas as pd

# ----------------------------------------------------------------------------
# SETTINGS FOR DECOY APPENDING
# ----------------------------------------------------------------------------
# Files that are read
INPUT_CSV_WITH_ACTIVES = "P08173_random_subset_with_active.csv"
DECOYS_SMI_FILE = "decoys.smi"

# File that is written
OUTPUT_CSV_WITH_DECOYS = "P08173_random_subset_with_decoys.csv"

# ============================================================================
# APPEND DECOYS TO CSV
# ============================================================================

print(f"Reading decoys from {DECOYS_SMI_FILE}...")
# Parse decoys.smi: one whitespace-separated record per line, with the SMILES
# in the first field and the molecule ID in the second. Any further fields on
# a line are ignored.
decoys_list = []
skipped_lines = []  # (line number, text) of records that lack a SMILES or an ID
with open(DECOYS_SMI_FILE, 'r') as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        # Blank lines and '#' comment lines carry no record
        if not line or line.startswith('#'):
            continue
        parts = line.split()
        if len(parts) < 2:
            # Remember the line instead of dropping it silently, so that lost
            # decoys show up in the output below
            skipped_lines.append((line_no, line))
            continue
        smiles, mol_id = parts[0], parts[1]
        decoys_list.append({
            'canonical_smiles': smiles,
            # The decoy file holds a single SMILES per record, so the same
            # string fills both SMILES columns
            'std_smiles': smiles,
            'ligand_chembl_id': mol_id,
            'active': 'decoy'
        })

print(f"Loaded {len(decoys_list)} decoys from {DECOYS_SMI_FILE}")
if skipped_lines:
    print(f"WARNING: skipped {len(skipped_lines)} line(s) without both a SMILES and an ID")
    # Show only the first few offenders to keep the output short
    for line_no, text in skipped_lines[:5]:
        print(f"  line {line_no}: {text}")

# Read the CSV with actives
print(f"\nReading {INPUT_CSV_WITH_ACTIVES}...")
df_actives = pd.read_csv(INPUT_CSV_WITH_ACTIVES)
print(f"Loaded {len(df_actives)} active compounds")

# Create DataFrame from decoys
df_decoys = pd.DataFrame(decoys_list)

# Columns present only in the decoy records are added by the concat below and
# left empty for every active. That usually means the column names used here
# do not match the actives CSV, so point it out.
unmatched_columns = [col for col in df_decoys.columns if col not in df_actives.columns]
if unmatched_columns:
    print(f"WARNING: {INPUT_CSV_WITH_ACTIVES} has no column(s) {unmatched_columns}; "
          "they will be empty for the actives")

# Append decoys to the actives DataFrame; columns missing on either side are
# filled with NaN, and the row index is renumbered from 0
df_combined = pd.concat([df_actives, df_decoys], ignore_index=True)

print(f"\nCombined dataset:")
print(f"  Total compounds: {len(df_combined)}")
print(f"  Actives: {(df_combined['active'] == 'active').sum()}")
print(f"  Decoys: {(df_combined['active'] == 'decoy').sum()}")

# Save the combined CSV
df_combined.to_csv(OUTPUT_CSV_WITH_DECOYS, index=False)
print(f"\nSaved combined CSV to {OUTPUT_CSV_WITH_DECOYS}")

print("\nFirst 3 decoy entries:")
print(df_combined[df_combined['active'] == 'decoy'].head(3)[['canonical_smiles', 'ligand_chembl_id', 'active']])

print("\nDone!")

Reading decoys from decoys.smi...


NameError: name 'SMILES_COLUMN' is not defined