In [None]:
import time
import warnings

import pandas as pd
import pubchempy as pcp
from tqdm import tqdm

In [None]:
def get_smiles_robust(drug_name, max_retries=3, retry_delay=1.0):
    """Look up a drug by name on PubChem and return its canonical SMILES.

    Parameters
    ----------
    drug_name : str
        Name to search for in PubChem's ``'name'`` namespace. Surrounding
        whitespace is ignored.
    max_retries : int, optional
        Maximum number of lookup attempts (at least one is always made).
    retry_delay : float, optional
        Base pause in seconds between attempts; the pause grows linearly
        with the attempt number.

    Returns
    -------
    str or None
        ``canonical_smiles`` of the first matching compound, or ``None`` when
        the name is missing/blank, PubChem returns no match, or every attempt
        raised an error (a warning is emitted in that last case).
    """
    # Missing values (None, or the float NaN pandas uses for empty cells) and
    # blank strings cannot be looked up; never send them to PubChem.
    if not isinstance(drug_name, str) or not drug_name.strip():
        return None
    query = drug_name.strip()

    attempts = max(1, int(max_retries))
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            results = pcp.get_compounds(query, 'name')
        except (pcp.PubChemPyError, OSError) as exc:
            # Service-side or network failure: back off and try again rather
            # than aborting the caller's whole batch.
            last_error = exc
            if attempt < attempts:
                time.sleep(retry_delay * attempt)
            continue
        # NOTE(review): newer PubChemPy releases may deprecate
        # `canonical_smiles` — confirm against the installed version.
        return results[0].canonical_smiles if results else None

    warnings.warn(
        f"PubChem lookup failed for {drug_name!r} after {attempts} attempt(s): {last_error}"
    )
    return None

In [None]:
# Load the raw drug-drug interaction table.
df = pd.read_csv('db_drug_interactions.csv')

# Collect every distinct drug name appearing on either side of an interaction.
# Missing names are dropped first so NaN is never sent to PubChem as a query.
unique_drugs = pd.concat([df['Drug 1'], df['Drug 2']]).dropna().unique()
print(f"Total unique drugs: {len(unique_drugs)}")

# Map each drug name to its SMILES string (None when the lookup yields nothing)
# and keep a separate list of the names that could not be resolved.
smiles_dict = {}
failed = []
for drug in tqdm(unique_drugs, desc="Fetching SMILES"):
    smiles = get_smiles_robust(drug)
    smiles_dict[drug] = smiles
    if smiles is None:
        # Record the miss so the report below actually lists it.
        failed.append(drug)

print("\nFailed drugs:")
for drug in failed:
    print(f"  - {drug}")

"""
Could not get SMILES for:
  - Polythiazide
  - Radium Ra 223 dichloride
  - Verteporfin
  - Sucralfate
  - Kaolin
  - Methadyl acetate
  - Nitric Oxide
  - Nitroprusside
  - Pentosan polysulfate
  - Mipomersen
  - Polymyxin B
"""

In [None]:
# Attach one SMILES column per side of the interaction pair by looking each
# drug name up in the name -> SMILES mapping built earlier.
smiles_columns = {'Drug1_SMILES': 'Drug 1', 'Drug2_SMILES': 'Drug 2'}
for smiles_col, name_col in smiles_columns.items():
    df[smiles_col] = df[name_col].map(smiles_dict)

# Keep only interactions where both drugs have a SMILES string.
df_cleaned = df.dropna(subset=list(smiles_columns))

# Report how many rows survived the filter.
n_total = len(df)
n_kept = len(df_cleaned)
print(f"\nOriginal dataset size: {n_total} rows")
print(f"Cleaned dataset size: {n_kept} rows")
print(f"Removed: {n_total - n_kept} rows")

# Persist the filtered table without the pandas index column.
df_cleaned.to_csv('drug_interactions_cleaned.csv', index=False)