# Prepare Training Data  
  
In this step we read the data from [Maier, L., et al. (2018)](https://www.nature.com/articles/nature25979) and prepare it for training, testing and validation of the XGBoost model. 

In [2]:
# Read Libraries
import os

import numpy as np
import pandas as pd
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.SaltRemover import SaltRemover
from tqdm import tqdm

from dataset.dataset_representation import process_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Global variables  

In [3]:
# Raw input data is read from here
INPUT_DIR = '../raw_data/maier_microbiome'

# P-value threshold used to decide the binary label
PVAL_CUTOFF = 0.05

# Everything this notebook produces goes here; make sure the folder exists
OUTPUT_DIR = "../data/01.prepare_training_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Read data and binarize
  
Here we read in Supplementary Table 3 from the study and binarize it: an entry becomes 1 when its value is at or below `PVAL_CUTOFF`, and 0 otherwise.

In [3]:
# Read the raw data, drop the annotation columns, index by compound ID and
# binarize: 1 where the value is at or below PVAL_CUTOFF, 0 otherwise
# (missing values compare as False and therefore become 0).
screen_df = (
    pd.read_excel(os.path.join(INPUT_DIR, "screen_results_info_SF3.xlsx"))
    .drop(columns=["chemical_name", "drug_class", "n_hit"])
    .set_index("prestwick_ID")
    .le(PVAL_CUTOFF)
    .astype(int)
)
screen_df.head()



Unnamed: 0_level_0,Akkermansia muciniphila (NT5021),Bacteroides caccae (NT5050),Bacteroides fragilis (ET) (NT5033),Bacteroides fragilis (NT) (NT5003),Bacteroides ovatus (NT5054),Bacteroides thetaiotaomicron (NT5004),Bacteroides uniformis (NT5002),Bacteroides vulgatus (NT5001),Bacteroides xylanisolvens (NT5064),Bifidobacterium adolescentis (NT5022),...,Parabacteroides merdae (NT5071),Prevotella copri (NT5019),Roseburia hominis (NT5079),Roseburia intestinalis (NT5011),Ruminococcus bromii (NT5045),Ruminococcus gnavus (NT5046),Ruminococcus torques (NT5047),Streptococcus parasanguinis (NT5072),Streptococcus salivarius (NT5038),Veillonella parvula (NT5017)
prestwick_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Prestw-1109,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Prestw-1399,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Prestw-145,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Prestw-1464,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Prestw-31,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


## Gather SMILES
  
We use the chemical names to gather the SMILES from PubChem using [PubChemPy](https://pubchempy.readthedocs.io/en/latest/)


In [None]:
# FUNCTIONS FOR OBTAINING SMILES

def clean_names_chemlibrary(original_name):
    """Strip trailing annotations from a chemical name.

    Everything from the first " (" onwards is discarded, then everything from
    the first " [" onwards, and finally any trailing whitespace.

    Parameters
    ----------
    original_name : str
        Chemical name as it appears in the library.

    Returns
    -------
    str
        The shortened name.
    """
    before_paren, _, _ = original_name.partition(" (")
    before_bracket, _, _ = before_paren.partition(" [")
    return before_bracket.rstrip()

def get_pubchemid(name):
    """Look up a chemical name on PubChem and return its identifiers.

    The name is first queried exactly as given. If PubChem returns nothing,
    the query is repeated with the name produced by
    ``clean_names_chemlibrary`` -- but only when cleaning actually changed the
    name, so an identical request is never sent twice.

    Parameters
    ----------
    name : str
        Chemical name to search for.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index ``0``) with the columns ``name``, ``cid``,
        ``pchem_canonical_smile``, ``pchem_isomeric_smile``, ``pchem_inchi``
        and ``pchem_inchikey``. ``name`` is always the name that was passed
        in. The other fields come from the first PubChem hit, or hold the
        string ``"not_found"`` when there is no hit.
    """
    # Attempt to find a result with the name exactly as given
    results = pcp.get_compounds(name, 'name')

    # If that did not work, retry with the cleaned name
    if len(results) == 0:
        clean_name = clean_names_chemlibrary(name)
        # Repeating the very same query cannot give a different answer
        if clean_name != name:
            results = pcp.get_compounds(clean_name, "name")

    # Now prepare the output
    if len(results) > 0:
        # Only the first hit is kept
        top_hit = results[0]
        result_dict = {
            "name": name,
            "cid": top_hit.cid,
            "pchem_canonical_smile": top_hit.canonical_smiles,
            "pchem_isomeric_smile": top_hit.isomeric_smiles,
            "pchem_inchi": top_hit.inchi,
            "pchem_inchikey": top_hit.inchikey,
        }
    else:
        result_dict = {
            "name": name,
            "cid": "not_found",
            "pchem_canonical_smile": "not_found",
            "pchem_isomeric_smile": "not_found",
            "pchem_inchi": "not_found",
            "pchem_inchikey": "not_found",
        }

    return pd.DataFrame(result_dict, index=[0])


def cid_info(cid, df):
    """Fetch the PubChem record for a known CID and format it as one row.

    Parameters
    ----------
    cid : int
        PubChem compound identifier to retrieve.
    df : pandas.DataFrame
        Table with ``cid`` and ``name`` columns; the name reported for the
        compound is the first ``name`` whose ``cid`` equals ``cid``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index ``0``) with the columns ``name``, ``cid``,
        ``pchem_canonical_smile``, ``pchem_isomeric_smile``, ``pchem_inchi``
        and ``pchem_inchikey``.
    """
    # Ask PubChem for the compound
    compound = pcp.Compound.from_cid(cid)

    # Names registered for this CID in the lookup table
    names_for_cid = df.loc[df["cid"] == cid, "name"]

    record = {
        "name": names_for_cid.values[0],
        "cid": cid,
        "pchem_canonical_smile": compound.canonical_smiles,
        "pchem_isomeric_smile": compound.isomeric_smiles,
        "pchem_inchi": compound.inchi,
        "pchem_inchikey": compound.inchikey,
    }

    return pd.DataFrame(record, index=[0])

In [7]:
# Load the chemical library table and index it by prestwick_ID
maier_chemicals = pd.read_excel(
    os.path.join(INPUT_DIR, "chem_library_info_SF1.xlsx")
).set_index("prestwick_ID")

I will use the **chemical name** field to query PubChem (via pubchempy) to find their SMILES and other relevant information.

In [None]:
# Query PubChem once per distinct chemical name in the library
unique_names = maier_chemicals["chemical name"].unique()
cid_search = pd.concat([get_pubchemid(name) for name in tqdm(unique_names)])

# These chemicals could not be found by name, so their CIDs are supplied by hand
manual_cid = pd.DataFrame(
    [
        ("(-)-Eseroline fumarate salt", 16219298),
        ("Clonixin Lysinate", 3080836),
        ("Ziprasidone  Hydrochloride", 219099),
        ("Clavulanate potassium salt", 23665591),
        ("Oxibendazol", 4622),
        ("Morpholinoethylamino-3-benzocyclohepta-(5,6-c)-pyridazine dihydrochloride", 195164),
        ("Gabazine bromide", 71316800),
        ("Colistin sulfate", 91885449),
        ("Bacitracin", 11980094),
    ],
    columns=["name", "cid"],
)

# Retrieve the PubChem record of every hand-curated CID
manual_try = pd.concat([cid_info(c, df=manual_cid) for c in tqdm(manual_cid["cid"].tolist())])

# Keep the successful name searches, append the manual records, drop exact duplicates
found_by_name = cid_search.loc[cid_search["cid"].ne("not_found")]
chem_smiles = pd.concat([found_by_name, manual_try]).drop_duplicates()


We can now get the RDKit versions of these SMILES

In [None]:
# Parse each PubChem SMILES once and reuse the molecule for every derived column
canonical_mols = [Chem.MolFromSmiles(smi) for smi in chem_smiles["pchem_canonical_smile"]]
isomeric_mols = [Chem.MolFromSmiles(smi) for smi in chem_smiles["pchem_isomeric_smile"]]

# MolFromSmiles returns None for a SMILES it cannot parse; stop here with the
# offending strings instead of failing later with an opaque error
unparsed_smiles = [
    smi
    for smi, mol in zip(
        list(chem_smiles["pchem_canonical_smile"]) + list(chem_smiles["pchem_isomeric_smile"]),
        canonical_mols + isomeric_mols,
    )
    if mol is None
]
if unparsed_smiles:
    raise ValueError(f"RDKit could not parse these PubChem SMILES: {unparsed_smiles}")

# Obtain canonical smiles (stereochemistry not written)
chem_smiles["rdkit_canonical_smile"] = [
    Chem.MolToSmiles(mol, canonical=True, isomericSmiles=False) for mol in canonical_mols
]

# Obtain isomeric smiles. These must start from PubChem's isomeric SMILES:
# PubChem's canonical SMILES carries no stereochemistry, so deriving this
# column from it just reproduces the non-isomeric string.
chem_smiles["rdkit_isomeric_smile"] = [
    Chem.MolToSmiles(mol, canonical=True, isomericSmiles=True) for mol in isomeric_mols
]

# Obtain chemical metadata
chem_smiles["n_atoms"] = [mol.GetNumAtoms() for mol in canonical_mols]
chem_smiles["n_bonds"] = [mol.GetNumBonds() for mol in canonical_mols]
chem_smiles["ExactMolWt"] = [Descriptors.ExactMolWt(mol) for mol in canonical_mols]


# Remove the salts (starting from the RDKit canonical SMILES, as before)
remover = SaltRemover()
chem_smiles["rdkit_no_salt"] = [
    Chem.MolToSmiles(remover.StripMol(Chem.MolFromSmiles(smi)))
    for smi in chem_smiles["rdkit_canonical_smile"]
]


Now we can combine this information with the provided data

In [None]:
# Re-key the library on the chemical name; prestwick_ID goes back to being a column
maier_chemicals = maier_chemicals.reset_index().set_index("chemical name")

# Key the SMILES table on the name that was queried so both tables share an index
chem_smiles = chem_smiles.set_index("name")

# Left join: every library row is kept, with SMILES information where available
chemical_metadata = maier_chemicals.join(chem_smiles)
chemical_metadata.head()

With that, we can write the final output

In [None]:
# Write the annotated library into the notebook's output directory. The path is
# built from OUTPUT_DIR so it cannot drift from the other files written here.
os.makedirs(OUTPUT_DIR, exist_ok=True)
chemical_metadata.to_csv(os.path.join(OUTPUT_DIR, "prestwick_library.tsv.gz"), sep='\t')

## Molecular representation and data splitting. 
  
Now that we have determined the labels, we can represent the chemical library using MolE, ECFP4 and Chemical Descriptors. At the same time, we can split the dataset using scaffold splitting.

In [8]:
# Keep only the library compounds whose prestwick_ID appears in the screen
in_screen = chemical_metadata["prestwick_ID"].isin(screen_df.index)
chemical_metadata_screened = chemical_metadata.loc[in_screen]

# Save the subset; the representation steps read it back from this file
screened_library_path = os.path.join(OUTPUT_DIR, "prestwick_library_screened.tsv.gz")
chemical_metadata_screened.to_csv(screened_library_path, sep='\t')

In [68]:
# MolE representation, together with the scaffold split of the screened library
mole_settings = {
    "dataset_path": os.path.join(OUTPUT_DIR, "prestwick_library_screened.tsv.gz"),
    "pretrain_architecture": "gin_concat",
    "pretrained_model": "model_ginconcat_btwin_100k_d8000_l0.0001",
    "split_approach": "scaffold",
    "validation_proportion": 0.1,
    "test_proportion": 0.1,
    "smile_column_str": "rdkit_no_salt",
    "id_column_str": "prestwick_ID",
}

maier_scaffold_split, mole_representation = process_dataset(**mole_settings)

About to generate scaffolds
About to sort in scaffold sets
../pretrained_model/model_ginconcat_btwin_100k_d8000_l0.0001/model.pth
x_embedding1.weight
x_embedding2.weight
gnns.0.mlp.0.weight
gnns.0.mlp.0.bias
gnns.0.mlp.1.weight
gnns.0.mlp.1.bias
gnns.0.mlp.1.running_mean
gnns.0.mlp.1.running_var
gnns.0.mlp.1.num_batches_tracked
gnns.0.mlp.3.weight
gnns.0.mlp.3.bias
gnns.0.edge_embedding1.weight
gnns.0.edge_embedding2.weight
gnns.1.mlp.0.weight
gnns.1.mlp.0.bias
gnns.1.mlp.1.weight
gnns.1.mlp.1.bias
gnns.1.mlp.1.running_mean
gnns.1.mlp.1.running_var
gnns.1.mlp.1.num_batches_tracked
gnns.1.mlp.3.weight
gnns.1.mlp.3.bias
gnns.1.edge_embedding1.weight
gnns.1.edge_embedding2.weight
gnns.2.mlp.0.weight
gnns.2.mlp.0.bias
gnns.2.mlp.1.weight
gnns.2.mlp.1.bias
gnns.2.mlp.1.running_mean
gnns.2.mlp.1.running_var
gnns.2.mlp.1.num_batches_tracked
gnns.2.mlp.3.weight
gnns.2.mlp.3.bias
gnns.2.edge_embedding1.weight
gnns.2.edge_embedding2.weight
gnns.3.mlp.0.weight
gnns.3.mlp.0.bias
gnns.3.mlp.1.weigh

In [4]:
# ECFP4 representation; the first returned value is discarded because the
# split is already taken from the MolE call
ecfp4_settings = {
    "dataset_path": os.path.join(OUTPUT_DIR, "prestwick_library_screened.tsv.gz"),
    "pretrain_architecture": "ECFP4",
    "pretrained_model": None,
    "split_approach": "scaffold",
    "validation_proportion": 0.1,
    "test_proportion": 0.1,
    "smile_column_str": "rdkit_no_salt",
    "id_column_str": "prestwick_ID",
}

_, ecfp4_representation = process_dataset(**ecfp4_settings)

About to generate scaffolds
About to sort in scaffold sets


In [17]:
# Chemical-descriptor representation; the first returned value is discarded
# NOTE(review): this call reads "pchem_isomeric_smile" whereas the MolE and ECFP4
# calls read "rdkit_no_salt" -- confirm the difference is intended.
# NOTE(review): chemdesc_representation is not among the objects written at the
# end of the notebook (chemdesc_df is written instead) -- confirm.
chemdesc_settings = {
    "dataset_path": os.path.join(OUTPUT_DIR, "prestwick_library_screened.tsv.gz"),
    "pretrain_architecture": "ChemDesc",
    "pretrained_model": None,
    "split_approach": "scaffold",
    "validation_proportion": 0.1,
    "test_proportion": 0.1,
    "smile_column_str": "pchem_isomeric_smile",
    "id_column_str": "prestwick_ID",
}

_, chemdesc_representation = process_dataset(**chemdesc_settings)
  


About to generate scaffolds
About to sort in scaffold sets
Could not compute descriptors for Prestw-919


In [15]:
# Load a precomputed chemical-descriptor table and rename its ID column so it
# matches the prestwick_ID naming used in the rest of this notebook.
# NOTE(review): the file lives outside this project's directories and is not
# produced by any cell here -- confirm where it comes from.
chemdesc_df = pd.read_csv(
    "../../udl_microbiome/data/02.pretrained_representation/prestwick_chemical_descriptors.tsv.gz",
    sep='\t',
).rename(columns={"chem_id": "prestwick_ID"})

In [16]:
# Display the chemical-descriptor table that was read from disk
chemdesc_df

Unnamed: 0,prestwick_ID,MolWt,BertzCT,MolLogP,MolMR,HeavyAtomCount,NumHAcceptors,NumHDonors,NumValenceElectrons,RingCount,...,SlogP_VSA8,SlogP_VSA9,SlogP_VSA10,SlogP_VSA11,pyLabuteASA,Asphericity,Eccentricity,InertialShapeFactor,RadiusOfGyration,SpherocityIndex
0,Prestw-1,152.117,453.984413,-1.5057,35.1313,11,5,3,56,2,...,0.000000,0.0,5.817863,0.000000,66.539526,0.302836,0.930303,0.002449,2.154798,0.007502
1,Prestw-10,214.250,740.896704,-0.7691,53.8500,14,3,3,76,1,...,0.000000,0.0,5.687386,0.000000,95.870160,0.505477,0.978004,0.002762,2.875382,0.223066
2,Prestw-100,301.327,1283.150192,3.0337,79.8729,21,5,2,106,3,...,11.033401,0.0,10.742876,0.000000,138.891400,0.590360,0.986737,0.001068,4.134612,0.071995
3,Prestw-1000,602.641,2844.364318,-0.6108,151.3916,43,13,9,234,4,...,10.052764,0.0,0.000000,5.749512,301.180439,0.636237,0.989825,0.000281,6.476668,0.179723
4,Prestw-1001,390.520,2000.204789,3.3174,103.2538,28,5,1,156,4,...,0.000000,0.0,0.000000,0.000000,214.992665,0.535310,0.980926,0.000779,4.234209,0.315925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,Prestw-994,379.891,1526.920304,1.5976,87.1324,23,5,3,128,3,...,0.000000,0.0,5.687386,0.000000,165.760464,0.477294,0.975161,0.000698,3.961669,0.240218
1196,Prestw-995,434.411,1608.973768,2.9133,103.1260,30,8,3,168,1,...,2.862399,0.0,13.171245,0.000000,206.730055,0.279689,0.931818,0.000358,3.688724,0.209779
1197,Prestw-997,500.579,2195.862246,4.4300,120.8678,34,6,1,188,4,...,0.000000,0.0,13.171245,0.000000,244.748148,0.408376,0.962859,0.000449,4.234840,0.465431
1198,Prestw-998,473.897,1808.411949,5.0799,128.1748,29,4,1,158,4,...,5.573105,0.0,0.000000,0.000000,231.987662,0.476328,0.974754,0.000412,4.325693,0.064414


In [8]:
# Display the ECFP4 representation returned by process_dataset
ecfp4_representation

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
Prestw-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Prestw-10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Prestw-100,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Prestw-1000,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Prestw-1001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Prestw-994,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Prestw-995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Prestw-997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Prestw-998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


## Write files

In [18]:
# Binarised screening labels. The recorded listing of OUTPUT_DIR contains
# "maier_screening_results.tsv.gz", yet no cell wrote screen_df, so a fresh run
# could not reproduce that file.
# NOTE(review): file name taken from that listing; confirm the expected layout
# (tab-separated, prestwick_ID as index).
screen_df.to_csv(os.path.join(OUTPUT_DIR, "maier_screening_results.tsv.gz"), sep='\t')

# Scaffold split and the MolE / ECFP4 representations (index is written)
maier_scaffold_split.to_csv(os.path.join(OUTPUT_DIR, "maier_scaffold_split.tsv.gz"), sep='\t')
mole_representation.to_csv(os.path.join(OUTPUT_DIR, "maier_mole_representation.tsv.gz"), sep='\t')
ecfp4_representation.to_csv(os.path.join(OUTPUT_DIR, "maier_ecfp4_representation.tsv.gz"), sep='\t')

# Chemical descriptors: prestwick_ID is a regular column here, so the index is not written
chemdesc_df.to_csv(os.path.join(OUTPUT_DIR, "maier_chemdesc_representation.tsv.gz"), sep='\t', index=False)

## Written files

In [10]:
# List the files now present in the output directory
os.listdir(OUTPUT_DIR)

['maier_ecfp4_representation.tsv.gz',
 'prestwick_library.tsv.gz',
 'prestwick_library_screened.tsv.gz',
 'maier_scaffold_split.tsv.gz',
 'maier_chemdesc_representation.tsv.gz',
 'maier_mole_representation.tsv.gz',
 'maier_screening_results.tsv.gz']