### Load libraries

In [1]:
import pandas as pd
import numpy as np
import os, sys, glob, tqdm, pickle, joblib,json, shutil
import re,time,argparse,logging, tempfile
from tqdm import tqdm
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import rdkit
from rdkit import Chem
import rdkit.Chem
import rdkit.Chem.AllChem
import rdkit.Chem.MolStandardize
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import AllChem, Descriptors
from sklearn.preprocessing import LabelEncoder
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from collections import OrderedDict
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import MolStandardize
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from rdkit.Chem import SmilesMolSupplier, SDMolSupplier, SDWriter, SmilesWriter, MolStandardize, MolToSmiles, MolFromSmiles


### new data work

In [2]:
def create_necessary_dirs():
    dirs_list = ['../data_main/data_new_stand', \
                 '../data_main/data_new_models', \
                 '../data_main/data_stand_cleaned_new', \
                '../data_main/data_new_split_stand', '../scalers', \
                '../rdk_descriptors', '../data_main/added_new_data/']
    for _dir in dirs_list:
        if not os.path.isdir(_dir):
            os.mkdir(_dir)

In [3]:
create_necessary_dirs()

In [4]:
# New approach resolves chiral issue
def Standardize(stdzr, remove_isomerism, molReader, molWriter):
    n_mol=0; 
    for mol in molReader:
        n_mol+=1
        molname = mol.GetProp('_Name') if mol.HasProp('_Name') else ''
        logging.debug('%d. %s:'%(n_mol, molname))
        mol2 = StdMol(stdzr, mol, remove_isomerism)
        output = rdkit.Chem.MolToSmiles(mol2, isomericSmiles=True) if mol2 else None
        return output
#############################################################################
def MyNorms():
    norms = list(MolStandardize.normalize.NORMALIZATIONS)
    for i in range(len(norms)-1, 0, -1):
        norm = norms[i]
        if norm.name == "Sulfoxide to -S+(O-)-":
            del(norms[i])
    norms.append(MolStandardize.normalize.Normalization("[S+]-[O-] to S=O",
    "[S+:1]([O-:2])>>[S+0:1](=[O-0:2])"))
    logging.info("Normalizations: {}".format(len(norms)))
    return(norms)

#############################################################################
def MyStandardizer(norms):
    stdzr = MolStandardize.Standardizer(
        normalizations = norms,
        max_restarts = MolStandardize.normalize.MAX_RESTARTS,
        prefer_organic = MolStandardize.fragment.PREFER_ORGANIC,
        acid_base_pairs = MolStandardize.charge.ACID_BASE_PAIRS,
        charge_corrections = MolStandardize.charge.CHARGE_CORRECTIONS,
        tautomer_transforms = MolStandardize.tautomer.TAUTOMER_TRANSFORMS,
        tautomer_scores = MolStandardize.tautomer.TAUTOMER_SCORES,
        max_tautomers = MolStandardize.tautomer.MAX_TAUTOMERS
        )
    return(stdzr)

#############################################################################
def StdMol(stdzr, mol, remove_isomerism=False):
    smi = MolToSmiles(mol, isomericSmiles=(not remove_isomerism)) if mol else None
    mol_std = stdzr.standardize(mol) if mol else None
    smi_std = MolToSmiles(mol_std, isomericSmiles=(not remove_isomerism)) if mol_std else None
    logging.debug(f"{smi:>28s} >> {smi_std}")
    return(mol_std)

#############################################################################
def preprocess_smi(smi):
    norms = MolStandardize.normalize.NORMALIZATIONS

    test_smiles = [smi]
    test_label = [1] # dummy list
    temp_dir = tempfile.mkdtemp()
    df = pd.DataFrame(zip(test_smiles, test_label), columns=['SMILES', 'Label'])

    df.to_csv(temp_dir+'/temp_file.csv', index=False)

    try:
        molReader = SmilesMolSupplier(temp_dir+'/temp_file.csv', delimiter=',', smilesColumn=0, nameColumn=1, titleLine=True, sanitize=True)

        molWriter = SmilesWriter(temp_dir+'/temp_outfile.csv', delimiter=',', nameHeader='Name',
        includeHeader=True, isomericSmiles = (True), kekuleSmiles=False)
        stdzr = MyStandardizer(norms)
        stand_smiles = Standardize(stdzr, True, molReader, molWriter)
        shutil.rmtree(temp_dir)
        
        return stand_smiles
    except:
        return '' 
def standardize_smiles(data_stand_dir, data_type):
    if not os.path.isdir(data_stand_dir):
        os.mkdir(data_stand_dir)
        print('Directory for standard_smiles is created!')
    def clean_smiles(df):
        smiles = df['SMILES'].to_list()
        smiles_stand = []
        count_none = 0

        for i in tqdm(range(len(smiles))):
            smi_stand = preprocess_smi(smiles[i])
            if smi_stand == '':
                count_none+=1
            smiles_stand.append(smi_stand)

        df.insert(2, 'SMILES_stand', smiles_stand)
        df['SMILES_stand'].replace('', np.nan, inplace=True)
        # Before
        df_before = df
        df.dropna(subset=['SMILES_stand'], inplace=True)
        df.reset_index(drop=True, inplace=True)
        # After
        print(f"Total NaN removed {len(df_before)-len(df)}")
        print('Total number of unique Standard smiles', len(set(df['SMILES_stand'].tolist())))
        print('SMILES_stand not found: ', count_none)
        return df
        
    
    _file = '../ncats_data_v2/NCATS-HTS_mined20200921.xlsx'
    df = pd.read_excel(open(_file, 'rb'), sheet_name = 'default')
    filename,_ = os.path.splitext(os.path.basename(_file))
    clean_df = clean_smiles(df)
    clean_df.to_csv(os.path.join(data_stand_dir, filename+'.csv'), index=False)

In [5]:
standardize_smiles('../data_main/data_new_stand/', 'new')

 21%|██        | 2241/10687 [00:11<00:46, 180.90it/s]RDKit ERROR: [09:40:07] Explicit valence for atom # 21 N, 4, is greater than permitted
RDKit ERROR: [09:40:07] ERROR: Could not sanitize molecule on line 1
 22%|██▏       | 2377/10687 [00:12<00:46, 177.83it/s]RDKit ERROR: [09:40:07] ERROR: Explicit valence for atom # 21 N, 4, is greater than permitted
RDKit ERROR: [09:40:08] Explicit valence for atom # 6 O, 3, is greater than permitted
RDKit ERROR: [09:40:08] ERROR: Could not sanitize molecule on line 1
 24%|██▍       | 2575/10687 [00:13<00:50, 160.01it/s]RDKit ERROR: [09:40:08] ERROR: Explicit valence for atom # 6 O, 3, is greater than permitted
RDKit ERROR: [09:40:09] Explicit valence for atom # 9 O, 3, is greater than permitted
RDKit ERROR: [09:40:09] ERROR: Could not sanitize molecule on line 1
RDKit ERROR: [09:40:09] ERROR: Explicit valence for atom # 9 O, 3, is greater than permitted
RDKit ERROR: [09:40:09] Explicit valence for atom # 3 Mg, 4, is greater than permitted
RDKit ER

Total NaN removed 0
Total number of unique Standard smiles 10665
SMILES_stand not found:  15


In [6]:
def phys_chem_filters(data_standard_file, activity, data_stand_cleaned):# data here is standardized data
    print('Remove "nan" from activity column')
    data = pd.read_csv(data_standard_file)
    temp = data
    data = data[data[activity]!=np.nan]
    temp2 = data # For the case when no filters are used
    data.dropna(subset = [activity], inplace =True)
    print(f"'nan' removed {len(temp)-len(data)}")
    data_hm_before = data[data[activity].isin(['HIGH', 'MODERATE'])]
    data_l_before = data[data[activity].isin(['LOW'])]
    print(f'Total number of actives and inactives before filters:\
        {len(data_hm_before)}, {len(data_l_before)}')
#     use_filters_list = ['CPE.ACTIVITY','cytotox.ACTIVITY','AlphaLISA.ACTIVITY','TruHit.ACTIVITY']
#     not_use_filters_list = ['CoV1-PPE.ACTIVITY','CoV1-PPE_cs.ACTIVITY',\
#                                      'MERS-PPE.ACTIVITY','MERS-PPE_cs.ACTIVITY',\
#                                      'hCYTOX.ACTIVITY','ACE2.ACTIVITY','3CL.ACTIVITY']
#         if activity_type in use_filters_list:
    data_filt = data[(data['logPow {predicted by ochem.eu/model/535 in Log unit}']>=1) \
          & (data['logPow {predicted by ochem.eu/model/535 in Log unit}']<=9) \
         & (data['Aqueous Solubility {predicted by ochem.eu/model/536 in log(mol/L)}']<=-3) \
         &(data['Aqueous Solubility {predicted by ochem.eu/model/536 in log(mol/L)}']>=-7.5)] 
#         else:
#             data_filt = data
    data_hm_after = data_filt[data_filt[activity].isin(['HIGH', 'MODERATE'])]
    data_l_after = data_filt[data_filt[activity].isin(['LOW'])]
    print(f'Total number of actives and inactives after filters:\
        {len(data_hm_after)}, {len(data_l_after)}')
    removed_actives = 100*((len(data_hm_before)-len(data_hm_after))/len(data_hm_before))
    removed_inactives = 100*((len(data_l_before)-len(data_l_after))/len(data_l_before))
    print(f'Removed actives %: {removed_actives}')
    print(f'Removed inactives %: {removed_inactives}')
#     if activity in not_use_filters_list:
    if removed_actives >15.0:
        # Do not use filters
        print('Filters not used')
        data_filt = temp2
    else:
        print('Filters used')
        
    data_filt_before = len(data_filt)
    data_filt = data_filt[data_filt[activity.split('.')[0]+'.SIGNIFICANCE']!='INCONCLUSIVE']
    data_filt_after = len(data_filt)
    print(f'INCONCLUSIVE removed {data_filt_before - data_filt_after}')
    # Label the data
    data_filt.insert(len(data_filt.columns), 'Label', ['ACTIVE' if \
                                            i=='HIGH' or i=='MODERATE' else 'INACTIVE' \
                                             for i in data_filt[activity].to_list()])
    data_filt.sort_values(by = 'Label', ascending = True)
    data_filt = data_filt.drop_duplicates(subset='SMILES_stand', keep="first") # , inplace=True removed
    if not os.path.isdir(data_stand_cleaned):
        os.mkdir(data_stand_cleaned)
    # will be saved in data_stand_cleaned (given, eg: data_old_models (5), data_new_models (10))
    data_filt.to_csv(data_stand_cleaned+'/'+activity.split('.')[0]+'.csv', index=False)

In [7]:
activity_list_new = ['CoV1-PPE.ACTIVITY','CoV1-PPE_cs.ACTIVITY',\
                                 'MERS-PPE.ACTIVITY','MERS-PPE_cs.ACTIVITY',\
                                 'hCYTOX.ACTIVITY','ACE2.ACTIVITY','3CL.ACTIVITY',\
                       'CPE.ACTIVITY','cytotox.ACTIVITY','AlphaLISA.ACTIVITY','TruHit.ACTIVITY']
for activity in activity_list_new:
    print('==========Working with=============', activity)
    phys_chem_filters('../data_main/data_new_stand/NCATS-HTS_mined20200921.csv', activity, '../data_main/data_stand_cleaned_new')

Remove "nan" from activity column
'nan' removed 8009
Total number of actives and inactives before filters:        923, 1740
Total number of actives and inactives after filters:        665, 844
Removed actives %: 27.952329360780066
Removed inactives %: 51.49425287356322
Filters not used
INCONCLUSIVE removed 32
Remove "nan" from activity column
'nan' removed 8009
Total number of actives and inactives before filters:        166, 2497
Total number of actives and inactives after filters:        119, 1390
Removed actives %: 28.313253012048197
Removed inactives %: 44.33319983980777
Filters not used
INCONCLUSIVE removed 6
Remove "nan" from activity column
'nan' removed 8009
Total number of actives and inactives before filters:        540, 2123
Total number of actives and inactives after filters:        432, 1077
Removed actives %: 20.0
Removed inactives %: 49.26990108337259
Filters not used
INCONCLUSIVE removed 55
Remove "nan" from activity column
'nan' removed 8009
Total number of actives and

In [8]:
def update_data():
    def balance_and_split(update_data):
        df2 = pd.read_csv(update_data)
        df2_positive_only = df2[df2['Label']=='ACTIVE']
        df2_negative_only = df2[~df2['SMILES_stand'].isin(df2_positive_only['SMILES_stand'].to_list())]
        df2_negative_reduced = df2_negative_only.sample(n=len(df2_positive_only), random_state=42) 
        df_balanced = pd.concat([df2_positive_only, df2_negative_reduced])
        df_balanced.reset_index(drop = True, inplace = True)

        x = df_balanced.iloc[:,:-1].values
        y = df_balanced.iloc[:,-1].values
        val=test=0.15
        x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.15, random_state=42, stratify=y)
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,test_size=\
                                        (round((val/(1-test)), 3)),random_state=42, stratify = y_train)
        # Resize y for concatenation
        y_train = y_train.reshape(len(y_train),1)
        y_val = y_val.reshape(len(y_val),1)
        y_test = y_test.reshape(len(y_test),1)
        print(len(y_train), len(y_val), len(y_test))
        # Conctenation
        tr = np.concatenate((x_train, y_train), axis=1)
        va = np.concatenate((x_val, y_val), axis=1)
        te = np.concatenate((x_test, y_test), axis=1)

        df_new = [tr,va,te]
        return df_new

    for _file in glob.glob('../data_main/data_stand_cleaned_new/*.csv'):
        d,_ = os.path.splitext(os.path.basename(_file))
        temp_df = pd.read_csv(_file)
        # Select necessary columns only
        # Get train, validation, and test split of new files 
        new_split = balance_and_split(_file)
        
        splits = ['tr', 'va', 'te']
        # Change Label names ACTIVE, INACTIVE to 1 and 0 resp.
        if not os.path.isdir('../data_main/data_new_models/'+d):
            os.mkdir('../data_main/data_new_models/'+d)
        
        for s, n in zip(splits, new_split):
            pd.DataFrame(n).to_csv('../data_main/data_new_split_stand/'+\
                   d+ "-balanced_randomsplit7_70_15_15"+"_"+ s +".csv",\
                   header = list(temp_df.columns), index=False)
            df_n = pd.read_csv('../data_main/data_new_split_stand/'+\
                               d+ "-balanced_randomsplit7_70_15_15"+"_"+ s +".csv")
            df_n["Label"].replace({"ACTIVE": 1, "INACTIVE": 0}, inplace=True)
            df_n = df_n.loc[:, ~df_n.columns.str.contains('^Unnamed')]
            df_n.to_csv('../data_main/data_new_models'+'/'+d+'/'+\
                               d+ "-balanced_randomsplit7_70_15_15"+"_"+ s +".csv", index=False)

In [9]:
update_data() # using filters (only two (before I used for 4))

1093 234 235
1587 340 341
229 49 50
1247 267 268
1181 253 254
257 55 56
678 146 146
224 48 48
420 90 90
538 116 116
698 150 150
