In [2]:
import pandas as pd
import os
from lxml import etree
import numpy as np
from tqdm import tqdm
from pandarallel import pandarallel
from rdkit import Chem 


In [2]:
pandarallel.initialize(progress_bar=True)
hmdb_path = "/home/prajkumar/Documents/GitHub/spectral_translation_datasets/data/hmdb_experimental_msms_spectra/"

def divide_chunks(l, n):
    for i in range(0, len(l), n): 
        yield l[i:i + n]

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [9]:
file_list = sorted(os.listdir(hmdb_path))
list_of_file_lists = list(divide_chunks(file_list,13000))
for j in range(len(list_of_file_lists)):
    processed_data = pd.DataFrame(columns=["hmdb_id", "splash_key", "ionization_mode","collision_energy_voltage","collision_energy_level","peaklist"])
    for i, file in enumerate(tqdm(list_of_file_lists[j])):
        root = etree.parse(os.path.join(hmdb_path, file)).getroot()
        if file.endswith(".xml"):
            hmdb_id = file[:11]
            try:
                splash_key = root.findall(".//splash-key")[0].text
            except:
                splash_key = None
            try:
                ionization_mode = root.findall(".//ionization-mode")[0].text
            except:
                ionization_mode = None
            try:
                collision_energy_voltage = float(root.findall(".//collision-energy-voltage")[0].text)
            except:
                collision_energy_voltage = None
            try:
                collision_energy_level = root.findall(".//collision-energy-level")[0].text
            except:
                collision_energy_level = None
            try:
                mz = [float(mz.text) for mz in root.findall(".//mass-charge")]
            except:
                mz = None
            try:
                intensity = [float(intensity.text) for intensity in root.findall(".//intensity")]
            except:
                intensity = None

            try:
                instrument = root.findall(".//instrument-type")[0].text
            except:
                instrument = None
            
            
            peaklist = np.array([mz,intensity]).T
            processed_data.loc[i] = {"hmdb_id":hmdb_id,
                                    "splash_key":splash_key,
                                    "ionization_mode":ionization_mode,
                                    "collision_energy_voltage":collision_energy_voltage,
                                    "collision_energy_level":collision_energy_level,
                                    "instrument":instrument,
                                    "peaklist":peaklist}
        else:
            print(file)
    processed_data.to_pickle(f"./data/processed_hmdb_data/hmdb_spectral_data_{j}.pkl")

  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_id":hmdb_id,
  processed_data.loc[i] = {"hmdb_i

In [16]:
hmdb_output_path = "./data/processed_hmdb_data"
hmdb_processed_file_list = sorted(os.listdir(hmdb_output_path))

frame = pd.DataFrame(columns=["hmdb_id", "splash_key", "ionization_mode","collision_energy_voltage","collision_energy_level","peaklist"])
for file in hmdb_processed_file_list:
    processed_data = pd.read_pickle(os.path.join(hmdb_output_path, file))
    frame = pd.concat((frame,processed_data),axis=0)

frame.to_pickle("./data/processed_hmdb_data/hmdb_spectral_data_all.pkl")

  frame = pd.concat((frame,processed_data),axis=0)


In [4]:
all_gnps_smiles_chunks = pd.read_csv("./data/processed_gnps_data/cleaned_spectra_processed.csv",usecols=[1],chunksize=1, header=None)

def composition(molecule):
    try:
        mol = Chem.MolFromSmiles(molecule)
    except:
        return None
    total = 0
    if mol:
        for atom in mol.GetAtoms():
            total += 1
    return total

In [None]:
RANDOM_SEED = 42
dataset_size = 50000

all_smiles = pd.read_csv("./data/smiles_data/chembl_22_clean_1576904_sorted_std_final.smi",usecols=[0], header=None,sep="\t")
all_smiles_restricted = all_smiles.drop(all_smiles[all_smiles[0].map(len) < 4].index)
all_smiles_restricted = all_smiles_restricted.drop(all_smiles_restricted[all_smiles_restricted[0].map(len) > 28].index)
all_smiles_restricted.rename(columns={0:"smiles"},inplace=True)
all_smiles_sample = all_smiles_restricted.sample(n=dataset_size, random_state=RANDOM_SEED).reset_index(drop=True)


In [None]:
def pred_mols(x):
    replace_chars = ["=", "#", ":", "+", "-", "[", "]", "(", ")", "/", "\\", "\@", ".", "\%"]
    return len(''.join(c for c in x if c not in replace_chars))

def num_mols(x):
    x = Chem.MolFromSmiles(x)
    try:
        return len(x.GetAtoms())
    except:
        return 0
    
all_smiles_sample.to_csv("./data/smiles_data/chembl_smiles_sample_50000.csv",sep=",",index=False,header=False)

In [None]:
all_smiles_28 = all_smiles.drop(all_smiles[all_smiles[0].map(len) < 28].index)
all_smiles_28 = all_smiles_28.drop(all_smiles_28[all_smiles_28[0].map(len) > 70].index)
all_smiles_28 = all_smiles_28.drop(all_smiles_28[all_smiles_28[0].map(pred_mols) < 25].index)
all_smiles_28 = all_smiles_28.drop(all_smiles_28[all_smiles_28[0].map(pred_mols) > 35].index)
print(len(all_smiles_28))
all_smiles_28 = all_smiles_28.drop(all_smiles_28[all_smiles_28[0].map(num_mols) != 28].index)
#all_smiles_28 = all_smiles_28.drop(all_smiles_28[all_smiles_28[0].map(num_mols) < 25].index)

In [None]:
all_smiles_28.to_csv("./data/smiles_data/chembl_smiles_atom_ct_28.csv",sep=",",index=False,header=False)