In [4]:
import requests
import json
import os
import csv
import tempfile
import re
import pandas as pd
from rdkit import Chem
import os


def get_info_from_pubchem(inchikey):
    response = requests.get(
        f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/JSON"
    )
    if response.status_code == 200:
        return response.json()  # Access the JSON data using .json property
    else:
        return None


def get_conformer_ids_from_cid(cid):
    response = requests.get(
        f" https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/conformers/JSON"
    )
    if response.status_code == 200:
        return response.json()  # Access the JSON data using .json property
    else:
        return None


def get_sdf_from_conformer_id(conformer_id):
    response = requests.get(
        f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/conformers/{conformer_id}/SDF"
    )
    if response.status_code == 200:
        return response.text  # Access the JSON data using .json property
    else:
        return None


def get_sdf_from_csv(csv_path):
    with open(csv_path, "r") as f:
        reader = csv.reader(f)
        inchikey = None
        for i, row in enumerate(reader):
            if i == 4:
                inchikey = row[1]
                break
        if inchikey is None:
            print("InChIKey not found in the CSV file.")
            return None
    print(inchikey)
    info = get_info_from_pubchem(inchikey)
    if info is None:
        print("Failed to retrieve information from PubChem.")
        return None

    cid = info["PC_Compounds"][0]["id"]["id"]["cid"]
    print(f"Compound ID (CID): {cid}")

    conformer_ids = get_conformer_ids_from_cid(cid)
    if conformer_ids is None:
        print("Failed to retrieve conformer IDs from PubChem.")
        return None
    # print(json.dumps(conformer_ids, indent=4))

    first_conformer_id = conformer_ids["InformationList"]["Information"][0][
        "ConformerID"
    ][0]
    print(f"First Conformer ID: {first_conformer_id}")

    sdf = get_sdf_from_conformer_id(first_conformer_id)
    if sdf is None:
        print("Failed to retrieve SDF data from PubChem.")
        return None

    return sdf, inchikey

def slugify(name):
    return re.sub(r"[\W_.-]+", "-", name)

def process_sdf(sdf, pdb_file_path):
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_sdf_file:
        temp_sdf_file.write(sdf)
    mol_supplier = Chem.SDMolSupplier(temp_sdf_file.name)
    for i, mol in enumerate(mol_supplier):
        if mol is not None:
            Chem.MolToPDBFile(mol, pdb_file_path)
            print(f"Processed {pdb_file_path}")
            return True
    print("mol is none")
    return False


def process_file(root, file, list_of_failures, downloads_path, sdf_to_csv):
    name = None
    try:
        df = pd.read_csv(os.path.join(root, file), header=None)
        name = df.iloc[4, 1]
        pdb_file_path = f"{downloads_path}/{slugify(name)}.pdb"
        print(pdb_file_path)
        open(sdf_to_csv, "a").write(f"{pdb_file_path}\t{os.path.join(root,file)}\n")

    except Exception as e:
        print(f"Reading problem in file {file}: {e}")
        list_of_failures.append(name)
        return False
    if not os.path.exists(pdb_file_path):
        try:
            sdf, inchikey = get_sdf_from_csv(os.path.join(root, file))
            if sdf is not None:
                return process_sdf(sdf, pdb_file_path)
        except Exception as e:
            print(f"Problem processing SDF for file {file}: {e}")
            list_of_failures.append(name)
            return False
    else:
        print(f"File {file} already exists")
        return True

import concurrent.futures

def process_directory(path, downloads_path, sdf_to_csv):
    idx = 0
    total = 0
    success = 0
    already_exist = 0
    failed = 0
    new_added = 0
    list_of_failures = []

    def process_file_wrapper(file):
        nonlocal idx, total, success, already_exist, failed, new_added
        idx += 1
        print(f"idx = {idx}")
        if file.endswith(".csv"):
            total += 1
            if process_file(
                root, file, list_of_failures, downloads_path, sdf_to_csv
            ):
                success += 1
            else:
                failed += 1
        if idx % 50 == 0:
            print(
                f"success = {success}, exist = {already_exist}, failed = {failed}"
            )

    with concurrent.futures.ThreadPoolExecutor() as executor:
        for root, dirs, files in os.walk(path):
            executor.map(process_file_wrapper, files)

    print(f"success = {success}, exist = {already_exist}, failed = {failed}")


In [5]:
csv_folder = "dlip_files"
inactive_csv_folder = "dlip_files_inactive"
compound_folder = "compound_files"
sdf_to_csv = "sdf_to_csv.txt"
process_directory(csv_folder,compound_folder,sdf_to_csv)
process_directory(inactive_csv_folder,compound_folder,sdf_to_csv)


idx = 1idx = 2

idx = 3
idx = 4
idx = 5
idx = 6
idx = 7
idx = 8
compound_files/CRSZBBYHHDCMTP-UHFFFAOYSA-N.pdb
compound_files/QSEDWPXFGFLBIO-UHFFFAOYSA-N.pdb
idx = 9
compound_files/ZNGZDOVQBYPJEY-UHFFFAOYSA-N.pdb
compound_files/CKBWTRSIUARPEC-UHFFFAOYSA-N.pdb
idx = 10
File compound_I0006Y.csv already exists
File compound_T0081H.csv already exists
compound_files/FNOAOFIHRVSHJT-UHFFFAOYSA-N.pdb
File compound_C101QF.csv already exists
idx = 11
idx = 12
idx = 13
idx = 14
idx = 15
compound_files/BUDMKOHZVRWDQQ-UHFFFAOYSA-N.pdb
compound_files/OXNCVPXHKUULSH-UHFFFAOYSA-N.pdb
compound_files/MFZKFMUYNLFXKY-UHFFFAOYSA-N.pdb
File compound_I000B7.csv already exists
idx = 16
File compound_I00068.csv already exists
idx = 17
compound_files/BMGQMFHTPSJIIW-UHFFFAOYSA-N.pdb
compound_files/FOYRFLMZVXPASD-UHFFFAOYSA-N.pdb
compound_files/ZOMSUTGXSSYGLG-UHFFFAOYSA-N.pdb
compound_files/FQAUQBOULILEAV-UHFFFAOYSA-N.pdb
compound_files/FQPKIJJOVFKDLD-KRWDZBQOSA-N.pdb
compound_files/MLNZGNREXNDJBQ-VQTJNVASSA-N.pd