In [2]:
import os
import pandas as pd
# Gather every file under folds/<fold>/ into one frame and keep the unique
# (dlip_id, SMILES) pairs.
#
# os.listdir() returns entries in arbitrary order, so both directory levels
# are sorted: drop_duplicates() keeps the first occurrence, which makes the
# row order of the saved CSV (and any positional lookup on filtered_data)
# depend on the traversal order.
datas = []
for fold in sorted(os.listdir('folds')):
    fold_dir = os.path.join('folds', fold)
    # Skip stray files sitting next to the fold directories (e.g. .DS_Store);
    # calling os.listdir() on them would raise NotADirectoryError.
    if not os.path.isdir(fold_dir):
        continue
    for file in sorted(os.listdir(fold_dir)):
        file_path = os.path.join(fold_dir, file)
        # Skip nested directories (e.g. .ipynb_checkpoints); read_csv cannot
        # open them.
        if not os.path.isfile(file_path):
            continue
        datas.append(pd.read_csv(file_path))

# pd.concat() raises an unhelpful "No objects to concatenate" on an empty
# list, so fail with a message that names the actual problem.
if not datas:
    raise FileNotFoundError("No input files found under 'folds/'")

combined_data = pd.concat(datas, ignore_index=True)
filtered_data = (
    combined_data[['dlip_id', 'SMILES']]
    .drop_duplicates()
    .reset_index(drop=True)
)
filtered_data.to_csv('mod_filtered_data.csv', index=False)

In [5]:
# Peek at the value in the first row, second column of filtered_data.
filtered_data.iat[0, 1]

'Cc1cccc(Cl)c1C(=O)N[C@@H](Cc1ccc(NC(=O)c2c(Cl)cccc2Cl)cc1)C(=O)O'

In [25]:
import requests
import json

def search_pubchem(smiles, timeout=30):
    """Query the PubChem PUG REST substructure endpoint for a SMILES string.

    The SMILES is sent as a URL query argument instead of being pasted into
    the URL path. SMILES strings may contain characters such as '/', '#',
    '?', '+' or '%' that change the meaning of a URL when left unescaped;
    passing the value through ``params`` lets ``requests`` percent-encode it.

    Args:
        smiles: SMILES string to search for.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, connection error or timeout).

    NOTE(review): PubChem's non-"fast" structure searches appear to be
    asynchronous and may answer with a non-200 status plus a ListKey that
    has to be polled; if so, this function reports that as ``None``.
    Confirm against the PUG REST documentation.
    """
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/substructure/smiles/JSON"
    try:
        response = requests.get(url, params={"smiles": smiles}, timeout=timeout)
    except requests.RequestException:
        # Network failures are reported the same way as "no result" so the
        # caller's `if result:` check keeps working.
        return None
    if response.status_code == 200:
        return json.loads(response.text)
    return None

# Smoke-test the lookup with a single SMILES string.
test_smiles = "Cc1cccc(Cl)c1C(=O)N[C@@H](Cc1ccc(NC(=O)c2c(Cl)cccc2Cl)cc1)C(=O)O"
result = search_pubchem(test_smiles)

# A falsy result (None or an empty payload) is reported as a failure;
# anything else is pretty-printed as JSON.
if not result:
    print("No results found or an error occurred.")
else:
    print(json.dumps(result, indent=2))


No results found or an error occurred.


In [4]:
import pubchempy as pcp
import os
import tempfile
import pandas as pd
from rdkit import Chem
from tqdm import tqdm

# For every (dlip_id, SMILES) row, look the compound up on PubChem, download
# its 3D SDF record and convert it to <mol_pdb_folder>/<dlip_id>.pdb.
# Rows whose PDB file already exists are skipped, so the cell can be re-run
# to resume an interrupted download.
mol_pdb_folder = "mol_pdb"
os.makedirs(mol_pdb_folder, exist_ok=True)
error_list = []      # (dlip_id, smiles) pairs for which no PDB file was written
error_reasons = {}   # dlip_id -> short description of why that row failed


def _sdf_to_pdb(sdf_path, pdb_path):
    """Write the first readable molecule in sdf_path to pdb_path.

    Returns True if a molecule was written, False if the SDF held none.
    """
    for mol in Chem.SDMolSupplier(sdf_path):
        if mol is not None:
            Chem.MolToPDBFile(mol, pdb_path)
            return True
    return False


def _fetch_pdb(smiles, pdb_path):
    """Download the 3D record for smiles and save it as a PDB file.

    Returns None on success, or a short reason string when PubChem has no
    matching compound or the downloaded SDF contains no readable molecule.
    Exceptions from the network or from RDKit propagate to the caller.
    """
    compounds = pcp.get_compounds(smiles, 'smiles', record_type='3d')
    if not compounds:
        return "no compound found"
    cid = compounds[0].cid

    # Reserve a temporary path and close our handle right away: the file is
    # written by pcp.download() and read by RDKit through its name, and a
    # still-open handle can block that on some platforms.
    fd, sdf_path = tempfile.mkstemp(suffix=".sdf")
    os.close(fd)
    try:
        pcp.download('SDF', sdf_path, cid, 'cid', overwrite=True, record_type='3d')
        if not _sdf_to_pdb(sdf_path, pdb_path):
            return "no valid molecule in SDF"
    finally:
        # Always remove the temporary SDF, including when the download or
        # the conversion raises.
        os.unlink(sdf_path)
    return None


# Load filtered_data
filtered_data = pd.read_csv('mod_filtered_data.csv')

for index, row in tqdm(filtered_data.iterrows(), total=len(filtered_data)):
    smiles = row['SMILES']
    dlip_id = row['dlip_id']
    pdb_file_path = os.path.join(mol_pdb_folder, f"{dlip_id}.pdb")
    if os.path.exists(pdb_file_path):
        continue
    try:
        failure = _fetch_pdb(smiles, pdb_file_path)
    except Exception as e:
        # Best effort: one bad row must not abort the whole run, but keep
        # the reason so failures can be inspected afterwards.
        failure = f"{type(e).__name__}: {e}"

    if failure is None:
        # tqdm.write() prints without breaking the progress bar.
        tqdm.write(f"Processed {pdb_file_path}")
    else:
        error_list.append((dlip_id, smiles))
        error_reasons[dlip_id] = failure

# Count only the .pdb files in the configured output folder.
downloaded_count = sum(
    name.endswith('.pdb') for name in os.listdir(mol_pdb_folder)
)
print("All PDB files have been saved to the 'mol_pdb' folder.")
print(f"total length: {len(filtered_data)} and downloaded length: {downloaded_count}")
print(f"failed in this run: {len(error_list)}")

  0%|          | 0/21781 [00:00<?, ?it/s]

100%|██████████| 21781/21781 [03:39<00:00, 99.28it/s] 

All PDB files have been saved to the 'mol_pdb' folder.
total length: 21781 and downloaded length: 10025



