In [2]:
import os
import pandas as pd
datas = []
for fold in os.listdir('folds'):
    for file in os.listdir(f'folds/{fold}'):
        data = pd.read_csv(f'folds/{fold}/{file}')
        datas.append(data)
combined_data = pd.concat(datas, ignore_index=True)
filtered_data = combined_data[['dlip_id', 'SMILES']]
filtered_data = filtered_data.drop_duplicates().reset_index(drop=True)
filtered_data.to_csv('mod_filtered_data.csv', index=False)

In [9]:
filtered_data.head()

Unnamed: 0,dlip_id,SMILES
0,T002ZI,Cc1cccc(Cl)c1C(=O)N[C@@H](Cc1ccc(NC(=O)c2c(Cl)...
1,T0026U,N=C(N)NCCC[C@@H]1NC(=O)[C@H]2COCCN2C(=O)[C@@H]...
2,T007JD,O=C(O)CCNC(=O)c1ccc2c(c1)C(=O)N(CCC1CCNCC1)C2
3,P0008W,CC(C)S(=O)(=O)NCCCn1c2c(c3ccccc31)C(=O)CN1CCCC...
4,T003RI,NCCCC[C@H](N)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)...


In [25]:
import requests
import json

def search_pubchem(smiles):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/{smiles}/JSON"
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.text)
    else:
        return None

# Example SMILES for testing
test_smiles = "Cc1cccc(Cl)c1C(=O)N[C@@H](Cc1ccc(NC(=O)c2c(Cl)cccc2Cl)cc1)C(=O)O"

result = search_pubchem(test_smiles)

if result:
    print(json.dumps(result, indent=2))
else:
    print("No results found or an error occurred.")


No results found or an error occurred.


In [12]:
import pubchempy as pcp
import os
import tempfile
import pandas as pd
from rdkit import Chem
from tqdm import tqdm

# Create mol_pdb folder if it doesn't exist
mol_pdb_folder = "mol_pdb"
os.makedirs(mol_pdb_folder, exist_ok=True)
error_list = []

# Load filtered_data
filtered_data = pd.read_csv('mod_filtered_data.csv')

for index, row in tqdm(filtered_data.iterrows(), total=len(filtered_data)):
    smiles = row['SMILES']
    dlip_id = row['dlip_id']
    if os.path.exists(f'{mol_pdb_folder}/{dlip_id}.pdb'):
        continue
    try:
        compound = pcp.get_compounds(smiles, 'smiles', record_type='3d')
        if compound:
            cid = compound[0].cid
            
            # Download SDF to a temporary file
            with tempfile.NamedTemporaryFile(mode="w", suffix=".sdf", delete=False) as temp_sdf_file:
                pcp.download('SDF', temp_sdf_file.name, cid, 'cid', overwrite=True, record_type='3d')
                
                # Convert SDF to PDB
                mol_supplier = Chem.SDMolSupplier(temp_sdf_file.name)
                for mol in mol_supplier:
                    if mol is not None:
                        pdb_file_path = os.path.join(mol_pdb_folder, f"{dlip_id}.pdb")
                        Chem.MolToPDBFile(mol, pdb_file_path)
                        print(f"Processed {pdb_file_path}")
                        break
                else:
                    error_list.append((dlip_id,smiles))
                    # print(f"No valid molecule found for {dlip_id}")

            # Clean up the temporary SDF file
            os.unlink(temp_sdf_file.name)
        else:
            error_list.append((dlip_id,smiles))
            # print(f"No compound found for {dlip_id} and smiles: {smiles}")
    except Exception as e:
        error_list.append((dlip_id,smiles))
        # print(f"Error processing {dlip_id}: {str(e)}")

print("All PDB files have been saved to the 'mol_pdb' folder.")
print(f"total length: {len(filtered_data)} and downloaded length: {len(os.listdir('mol_pdb'))}")

  0%|          | 0/21781 [00:00<?, ?it/s]

 83%|████████▎ | 18012/21781 [02:59<00:30, 125.03it/s]

In [10]:
mod_list = list(set(filtered_data['SMILES'].tolist()))
print(f"mod_list length: {len(mod_list)}")

mod_list length: 9817


In [11]:
error_list

[('T003RI',
  'NCCCC[C@H](N)C(=O)NCC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)NCCCCCC(=O)N[C@@H](CCCCNC(=O)CCCC[C@H]1SC[C@H]2NC(=O)N[C@H]21)C(=O)NCCCCCC(=O)NCCCC[C@@H]1CNC(=O)C[C@H](CCCCNC(=O)CCCCCNC(=O)[C@H](CCCCNC(=O)CCCC[C@H]2SC[C@H]3NC(=O)N[C@H]32)NC(=O)CCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)CNC(=O)[C@@H](N)CCCCN)NC[C@H](CCCCNC(=O)CCCCCNC(=O)[C@H](CCCCNC(=O)CCCC[C@H]2SC[C@H]3NC(=O)N[C@H]32)NC(=O)CCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)CNC(=O)[C@@H](N)CCCCN)NC(=O)C1'),
 ('T003RI', 'C[NH+](C)CCn1nnnc1Sc1cc(C2CC2)nc2ncnn12.[Cl-]'),
 ('P000CE',
  'CC[C@H](NC(=O)[C@H](C)NC)C(=O)N1C[C@@H](O)C[C@H]1Cc1c(-c2[nH]c3cc(F)ccc3c2C[C@@H]2C[C@H](O)CN2C(=O)[C@@H](CC)NC(=O)[C@H](C)NC)[nH]c2cc(F)ccc12'),
 ('C1012P',
  'CC1(C)CCC(c2ccc(Cl)cc2)=C(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(N[C@H](CCN5CCOCC5)CSc5ccccc5)c(S(=O)(=O)C(F)(F)F)c4)cc3)CC2)C1'),
 ('P000CE',
  'CC(C)(C)OC(=O)/N=C(\\NCCNC[C@@H]1O[C@H](CC(=O)NCCc2c[nH]c3ccccc23)[C@@H]2OC(C)(C)O[C@H]