### Load libraries

In [1]:
import pandas as pd
import numpy as np
import os, sys, glob, tqdm, pickle, joblib,json, shutil
import re,time,argparse,logging, tempfile
from tqdm import tqdm
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import rdkit
from rdkit import Chem
import rdkit.Chem
import rdkit.Chem.AllChem
import rdkit.Chem.MolStandardize
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import AllChem, Descriptors
from sklearn.preprocessing import LabelEncoder
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from collections import OrderedDict
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import MolStandardize
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from rdkit.Chem import SmilesMolSupplier, SDMolSupplier, SDWriter, SmilesWriter, MolStandardize, MolToSmiles, MolFromSmiles

### new data work

In [2]:
def create_necessary_dirs():
    dirs_list = ['../data_main', '../data_volsurf', '../data_main/data_new_stand', \
                 '../data_main/data_new_models', \
                 '../data_main/data_stand_cleaned_new', \
                '../data_main/data_new_split_stand', '../scalers', \
                '../rdk_descriptors', '../data_main/added_new_data/', \
                '../volsurf_descriptors', '../data_volsurf/data_new_split_stand_volsurf', \
                '../data_volsurf/data_new_stand_volsurf/', '../data_volsurf/data_stand_cleaned_new_volsurf', \
                '../data_volsurf/data_new_models_volsurf']
    for _dir in dirs_list:
        if not os.path.isdir(_dir):
            os.mkdir(_dir)

In [3]:
create_necessary_dirs()

In [4]:
# New approach resolves chiral issue
def Standardize(stdzr, remove_isomerism, molReader, molWriter):
    n_mol=0; 
    for mol in molReader:
        n_mol+=1
        molname = mol.GetProp('_Name') if mol.HasProp('_Name') else ''
        logging.debug('%d. %s:'%(n_mol, molname))
        mol2 = StdMol(stdzr, mol, remove_isomerism)
        output = rdkit.Chem.MolToSmiles(mol2, isomericSmiles=True) if mol2 else None
        return output
#############################################################################
def MyNorms():
    norms = list(MolStandardize.normalize.NORMALIZATIONS)
    for i in range(len(norms)-1, 0, -1):
        norm = norms[i]
        if norm.name == "Sulfoxide to -S+(O-)-":
            del(norms[i])
    norms.append(MolStandardize.normalize.Normalization("[S+]-[O-] to S=O",
    "[S+:1]([O-:2])>>[S+0:1](=[O-0:2])"))
    logging.info("Normalizations: {}".format(len(norms)))
    return(norms)

#############################################################################
def MyStandardizer(norms):
    stdzr = MolStandardize.Standardizer(
        normalizations = norms,
        max_restarts = MolStandardize.normalize.MAX_RESTARTS,
        prefer_organic = MolStandardize.fragment.PREFER_ORGANIC,
        acid_base_pairs = MolStandardize.charge.ACID_BASE_PAIRS,
        charge_corrections = MolStandardize.charge.CHARGE_CORRECTIONS,
        tautomer_transforms = MolStandardize.tautomer.TAUTOMER_TRANSFORMS,
        tautomer_scores = MolStandardize.tautomer.TAUTOMER_SCORES,
        max_tautomers = MolStandardize.tautomer.MAX_TAUTOMERS
        )
    return(stdzr)

#############################################################################
def StdMol(stdzr, mol, remove_isomerism=False):
    smi = MolToSmiles(mol, isomericSmiles=(not remove_isomerism)) if mol else None
    mol_std = stdzr.standardize(mol) if mol else None
    smi_std = MolToSmiles(mol_std, isomericSmiles=(not remove_isomerism)) if mol_std else None
    logging.debug(f"{smi:>28s} >> {smi_std}")
    return(mol_std)

#############################################################################
def preprocess_smi(smi):
    norms = MolStandardize.normalize.NORMALIZATIONS

    test_smiles = [smi]
    test_label = [1] # dummy list
    temp_dir = tempfile.mkdtemp()
    df = pd.DataFrame(zip(test_smiles, test_label), columns=['SMILES', 'Label'])

    df.to_csv(temp_dir+'/temp_file.csv', index=False)

    try:
        molReader = SmilesMolSupplier(temp_dir+'/temp_file.csv', delimiter=',', smilesColumn=0, nameColumn=1, titleLine=True, sanitize=True)

        molWriter = SmilesWriter(temp_dir+'/temp_outfile.csv', delimiter=',', nameHeader='Name',
        includeHeader=True, isomericSmiles = (True), kekuleSmiles=False)
        stdzr = MyStandardizer(norms)
        stand_smiles = Standardize(stdzr, True, molReader, molWriter)
        shutil.rmtree(temp_dir)
        
        return stand_smiles
    except:
        return '' 
def standardize_smiles(data_stand_dir, data_type):
    if not os.path.isdir(data_stand_dir):
        os.mkdir(data_stand_dir)
        print('Directory for standard_smiles is created!')
    def clean_smiles(df):
        smiles = df['SMILES'].to_list()
        smiles_stand = []
        count_none = 0

        for i in tqdm(range(len(smiles))):
            smi_stand = preprocess_smi(smiles[i])
            if smi_stand == '':
                count_none+=1
            smiles_stand.append(smi_stand)

        df.insert(2, 'SMILES_stand', smiles_stand)
        df['SMILES_stand'].replace('', np.nan, inplace=True)
        # Before
        df_before = df
        df.dropna(subset=['SMILES_stand'], inplace=True)
        df.reset_index(drop=True, inplace=True)
        # After
        print(f"Total NaN removed {len(df_before)-len(df)}")
        print('Total number of unique Standard smiles', len(set(df['SMILES_stand'].tolist())))
        print('SMILES_stand not found: ', count_none)
        return df
    
    _file = '../ncats_data_v2/NCATS-HTS_mined20200921.xlsx'
    _file2 = '../ncats_data_v2/NCATS-HTS_mined20200921_volsurf.csv'
    if data_type == 'new':
        df = pd.read_excel(open(_file, 'rb'), sheet_name = 'default')
        filename,_ = os.path.splitext(os.path.basename(_file))
    else:
        df = pd.read_csv(_file2)
        filename,_ = os.path.splitext(os.path.basename(_file2))
    clean_df = clean_smiles(df)
    clean_df.to_csv(os.path.join(data_stand_dir, filename+'_stand.csv'), index=False)

In [5]:
# standardize_smiles('../data_main/data_new_stand/', 'new')

In [6]:
standardize_smiles('../data_volsurf/data_new_stand_volsurf/', 'volsurf')

  if (await self.run_code(code, result,  async_=asy)):
 24%|██▍       | 2544/10532 [00:13<00:58, 135.77it/s]RDKit ERROR: [09:46:14] Explicit valence for atom # 19 N, 4, is greater than permitted
RDKit ERROR: [09:46:14] ERROR: Could not sanitize molecule on line 1
RDKit ERROR: [09:46:14] ERROR: Explicit valence for atom # 19 N, 4, is greater than permitted
RDKit ERROR: [09:46:14] Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [09:46:14] ERROR: Could not sanitize molecule on line 1
RDKit ERROR: [09:46:14] ERROR: Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [09:46:14] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [09:46:14] ERROR: Could not sanitize molecule on line 1
RDKit ERROR: [09:46:14] ERROR: Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [09:46:14] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [09:46:14] ERROR: Could not sanitize molecule

Total NaN removed 0
Total number of unique Standard smiles 10520
SMILES_stand not found:  6


In [7]:
def phys_chem_filters(data_standard_file, activity, data_stand_cleaned):# data here is standardized data
    print('Remove "nan" from activity column')
    data = pd.read_csv(data_standard_file, low_memory=False)
    temp = data
    data = data[data[activity]!=np.nan]
    temp2 = data # For the case when no filters are used
    data.dropna(subset = [activity], inplace =True)
    print(f"'nan' removed {len(temp)-len(data)}")
    data_hm_before = data[data[activity].isin(['HIGH', 'MODERATE'])]
    data_l_before = data[data[activity].isin(['LOW'])]
    print(f'Total number of actives and inactives before filters:\
        {len(data_hm_before)}, {len(data_l_before)}')
#     use_filters_list = ['CPE.ACTIVITY','cytotox.ACTIVITY','AlphaLISA.ACTIVITY','TruHit.ACTIVITY']
#     not_use_filters_list = ['CoV1-PPE.ACTIVITY','CoV1-PPE_cs.ACTIVITY',\
#                                      'MERS-PPE.ACTIVITY','MERS-PPE_cs.ACTIVITY',\
#                                      'hCYTOX.ACTIVITY','ACE2.ACTIVITY','3CL.ACTIVITY']
#         if activity_type in use_filters_list:
    data_filt = data[(data['logPow {predicted by ochem.eu/model/535 in Log unit}']>=1) \
          & (data['logPow {predicted by ochem.eu/model/535 in Log unit}']<=9) \
         & (data['Aqueous Solubility {predicted by ochem.eu/model/536 in log(mol/L)}']<=-3) \
         &(data['Aqueous Solubility {predicted by ochem.eu/model/536 in log(mol/L)}']>=-7.5)] 
#         else:
#             data_filt = data
    data_hm_after = data_filt[data_filt[activity].isin(['HIGH', 'MODERATE'])]
    data_l_after = data_filt[data_filt[activity].isin(['LOW'])]
    print(f'Total number of actives and inactives after filters:\
        {len(data_hm_after)}, {len(data_l_after)}')
    removed_actives = 100*((len(data_hm_before)-len(data_hm_after))/len(data_hm_before))
    removed_inactives = 100*((len(data_l_before)-len(data_l_after))/len(data_l_before))
    print(f'Removed actives %: {removed_actives}')
    print(f'Removed inactives %: {removed_inactives}')
#     if activity in not_use_filters_list:
    if removed_actives >15.0:
        # Do not use filters
        print('Filters not used')
        data_filt = temp2
    else:
        print('Filters used')
        
    data_filt_before = len(data_filt)
    data_filt = data_filt[data_filt[activity.split('.')[0]+'.SIGNIFICANCE']!='INCONCLUSIVE']
    data_filt_after = len(data_filt)
    print(f'INCONCLUSIVE removed {data_filt_before - data_filt_after}')
    # Label the data
    data_filt.insert(len(data_filt.columns), 'Label', ['ACTIVE' if \
                                            i=='HIGH' or i=='MODERATE' else 'INACTIVE' \
                                             for i in data_filt[activity].to_list()])
    data_filt.sort_values(by = 'Label', ascending = True)
    data_filt = data_filt.drop_duplicates(subset='SMILES_stand', keep="first") # , inplace=True removed
    if not os.path.isdir(data_stand_cleaned):
        os.mkdir(data_stand_cleaned)
    # will be saved in data_stand_cleaned (given, eg: data_old_models (5), data_new_models (10))
    data_filt.to_csv(data_stand_cleaned+'/'+activity.split('.')[0]+'.csv', index=False)

In [8]:
activity_list_new = ['CoV1-PPE.ACTIVITY','CoV1-PPE_cs.ACTIVITY',\
                                 'MERS-PPE.ACTIVITY','MERS-PPE_cs.ACTIVITY',\
                                 'hCYTOX.ACTIVITY','ACE2.ACTIVITY','3CL.ACTIVITY',\
                       'CPE.ACTIVITY','cytotox.ACTIVITY','AlphaLISA.ACTIVITY','TruHit.ACTIVITY']
for activity in activity_list_new:
    print('==========Working with=============', activity)
    phys_chem_filters('../data_volsurf/data_new_stand_volsurf/NCATS-HTS_mined20200921_volsurf_stand.csv', activity, '../data_volsurf/data_stand_cleaned_new_volsurf')

Remove "nan" from activity column
'nan' removed 7899
Total number of actives and inactives before filters:        906, 1721
Total number of actives and inactives after filters:        659, 840
Removed actives %: 27.262693156732894
Removed inactives %: 51.19116792562464
Filters not used
INCONCLUSIVE removed 30
Remove "nan" from activity column
'nan' removed 7899
Total number of actives and inactives before filters:        160, 2467
Total number of actives and inactives after filters:        117, 1382
Removed actives %: 26.875
Removed inactives %: 43.980543169841916
Filters not used
INCONCLUSIVE removed 5
Remove "nan" from activity column
'nan' removed 7899
Total number of actives and inactives before filters:        535, 2092
Total number of actives and inactives after filters:        431, 1068
Removed actives %: 19.439252336448597
Removed inactives %: 48.94837476099426
Filters not used
INCONCLUSIVE removed 53
Remove "nan" from activity column
'nan' removed 7899
Total number of actives 

In [9]:
def update_data():
    def balance_and_split(update_data):
        df2 = pd.read_csv(update_data, low_memory=False)
        df2_positive_only = df2[df2['Label']=='ACTIVE']
        df2_negative_only = df2[~df2['SMILES_stand'].isin(df2_positive_only['SMILES_stand'].to_list())]
        df2_negative_reduced = df2_negative_only.sample(n=len(df2_positive_only), random_state=42) 
        df_balanced = pd.concat([df2_positive_only, df2_negative_reduced])
        df_balanced.reset_index(drop = True, inplace = True)

        x = df_balanced.iloc[:,:-1].values
        y = df_balanced.iloc[:,-1].values
        val=test=0.15
        x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.15, random_state=42, stratify=y)
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,test_size=\
                                        (round((val/(1-test)), 3)),random_state=42, stratify = y_train)
        # Resize y for concatenation
        y_train = y_train.reshape(len(y_train),1)
        y_val = y_val.reshape(len(y_val),1)
        y_test = y_test.reshape(len(y_test),1)
        print(os.path.basename(update_data), len(y_train), len(y_val), len(y_test))
        # Conctenation
        tr = np.concatenate((x_train, y_train), axis=1)
        va = np.concatenate((x_val, y_val), axis=1)
        te = np.concatenate((x_test, y_test), axis=1)

        df_new = [tr,va,te]
        return df_new

    for _file in glob.glob('../data_volsurf/data_stand_cleaned_new_volsurf/*.csv'):
        d,_ = os.path.splitext(os.path.basename(_file))
        temp_df = pd.read_csv(_file, low_memory=False)
        # Select necessary columns only
        # Get train, validation, and test split of new files 
        new_split = balance_and_split(_file)
        
        splits = ['tr', 'va', 'te']
        # Change Label names ACTIVE, INACTIVE to 1 and 0 resp.
        if not os.path.isdir('../data_volsurf/data_new_models_volsurf/'+d):
            os.mkdir('../data_volsurf/data_new_models_volsurf/'+d)
        
        for s, n in zip(splits, new_split):
            pd.DataFrame(n).to_csv('../data_volsurf/data_new_split_stand_volsurf/'+\
                   d+ "-balanced_randomsplit7_70_15_15"+"_"+ s +".csv",\
                   header = list(temp_df.columns), index=False)
            df_n = pd.read_csv('../data_volsurf/data_new_split_stand_volsurf/'+\
                               d+ "-balanced_randomsplit7_70_15_15"+"_"+ s +".csv", low_memory=False)
            df_n["Label"].replace({"ACTIVE": 1, "INACTIVE": 0}, inplace=True)
            df_n = df_n.loc[:, ~df_n.columns.str.contains('^Unnamed')]
            df_n.to_csv('../data_volsurf/data_new_models_volsurf'+'/'+d+'/'+\
                               d+ "-balanced_randomsplit7_70_15_15"+"_"+ s +".csv", index=False)

In [10]:
update_data() # using filters (only two (before I used for 4))

AlphaLISA.csv 1089 233 234
cytotox.csv 1569 336 337
ACE2.csv 228 49 49
CoV1-PPE.csv 1226 263 263
TruHit.csv 1175 251 252
MERS-PPE_cs.csv 250 54 54
MERS-PPE.csv 674 145 145
CoV1-PPE_cs.csv 216 47 47
3CL.csv 404 87 87
hCYTOX.csv 530 114 114
CPE.csv 691 148 149


# Get rdkit Descriptors Scalers for all

In [11]:
def rdk_features(data_folder):
    def rdk_descriptors(csvFile):
        calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
        fpdict ={}
        fpdict['rdkDes'] = lambda m: calc.CalcDescriptors(m)
        df = pd.read_csv(csvFile)
        stand_smi = df['SMILES_stand'].tolist()
        rdkit_des = []
        
        def CalculateFP(fp_name, smiles):
            m = Chem.MolFromSmiles(smiles)
            return fpdict[fp_name](m)
        for i in range(len(stand_smi)):
            fp = CalculateFP('rdkDes', stand_smi[i])
            fp = np.asarray(fp)
            fp = fp.reshape(1,200)
            rdkit_des.append(fp)
        X = np.array(rdkit_des)
        X = X.reshape(len(rdkit_des),200)
        df = pd.DataFrame.from_records(X)
        print(df.isnull().sum().sum())
        for col in df.columns:
            df[col].fillna(df[col].mean(), inplace=True)
        X = df.iloc[:,0:].values
        X = np.vstack(X).astype(np.float32)
        scaler = MinMaxScaler()
        X = np.nan_to_num(X) # For 'inf' or '-inf' values. replaces by large num.
        X = scaler.fit_transform(X)
        return scaler
    for d in os.listdir(data_folder):
        for f in glob.glob(data_folder+'/'+d+'/*.csv'):
            if f.endswith('_tr.csv'):
                scaler = rdk_descriptors(f)
                pickle.dump(scaler, open('../scalers/'+d+'-rdkDes_scaler.pkl', 'wb'))

In [12]:
rdk_features('../data_volsurf/data_new_models_volsurf')

0
0
0
0
0
0
0
0
0
0
0


In [13]:
from sklearn.preprocessing import MinMaxScaler
def volsurf_scaler(data_folder):
    def volsurf_descriptors(csvFile):
        df = pd.read_csv(csvFile)
        descriptors = df.iloc[:, 33:-1] # Give value of i where vs descriptors start.
        scaler = MinMaxScaler()
        X = scaler.fit_transform(descriptors)
        return scaler
    for d in os.listdir(data_folder):
        for f in glob.glob(data_folder+'/'+d+'/*.csv'):
            if f.endswith('_tr.csv'):
                scaler = volsurf_descriptors(f)
                pickle.dump(scaler, open('../scalers/'+d+'-volsurf_scaler.pkl', 'wb'))

In [14]:
volsurf_scaler('../data_volsurf/data_new_models_volsurf')

### Get rdkDescriptors using saved Scalers

In [15]:
def get_rdk_features(data_folder):
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    fpdict ={}
    fpdict['rdkDes'] = lambda m: calc.CalcDescriptors(m)

    def CalculateFP(fp_name, smiles):
        m = Chem.MolFromSmiles(smiles)
        return fpdict[fp_name](m)

    for d in os.listdir(data_folder):
        if d=='.DS_Store':
            continue
        pos = []
        trg = []
        for csv in glob.glob(data_folder\
                             +'/'+d+'/*.csv'):
            des = []
            not_found = []
            df = pd.read_csv(csv)
            clean_smi = df['SMILES_stand'].tolist()
            rdkit_des = []
            
            for i in range(len(clean_smi)):
                fp = CalculateFP('rdkDes', clean_smi[i])
                fp = np.asarray(fp)
                fp = fp.reshape(1,200)
                rdkit_des.append(fp)

            X = np.array(rdkit_des)
            X = X.reshape(len(rdkit_des),200)
            ndf = pd.DataFrame.from_records(X)
            ndf.isnull().sum().sum()
            r, _ = np.where(df.isna())
            ndf.isnull().sum().sum()

            for col in ndf.columns:
                ndf[col].fillna(ndf[col].mean(), inplace=True)
            ndf.isnull().sum().sum() 
            
            X = ndf.iloc[:, 0:].values
            X = np.vstack(X).astype(np.float32)
            load_scaler = pickle.load(open('../scalers/'+d+'-'+'rdkDes_scaler.pkl', 'rb'))
            X = np.nan_to_num(X) #np.nan_to_num(X) "replace nan with zero and inf with finite numbers"
            X = load_scaler.transform(X)
#             labelencoder = LabelEncoder()                       #Converting 'str' label to numeric label
#             Y = labelencoder.fit_transform(df['Label'].values) 
            Y = df['Label'].values
            Y = Y.reshape(Y.shape[0], 1)

            fp_array = ( np.asarray((X), dtype=object) ) # Do i need this line
            X = np.delete(fp_array, not_found, axis=0)
            X = X.astype(np.float32) 
            print(fp_array.shape)
            final_array = np.concatenate((X, Y), axis=1)
            out = os.path.splitext(os.path.basename(csv))[0]
            print(out,'---',final_array.shape)
            pos.append(np.count_nonzero(Y))
            trg.append(out)
            print('pos',pos[-1])
            np.save('../rdk_descriptors/'+\
                    'rdkDes-'+out+'.npy', np.asarray((final_array), dtype=np.float32))

In [16]:
get_rdk_features('../data_volsurf/data_new_models_volsurf')

(250, 200)
MERS-PPE_cs-balanced_randomsplit7_70_15_15_tr --- (250, 201)
pos 125
(54, 200)
MERS-PPE_cs-balanced_randomsplit7_70_15_15_te --- (54, 201)
pos 27
(54, 200)
MERS-PPE_cs-balanced_randomsplit7_70_15_15_va --- (54, 201)
pos 27
(216, 200)
CoV1-PPE_cs-balanced_randomsplit7_70_15_15_tr --- (216, 201)
pos 108
(47, 200)
CoV1-PPE_cs-balanced_randomsplit7_70_15_15_te --- (47, 201)
pos 24
(47, 200)
CoV1-PPE_cs-balanced_randomsplit7_70_15_15_va --- (47, 201)
pos 23
(87, 200)
3CL-balanced_randomsplit7_70_15_15_va --- (87, 201)
pos 43
(404, 200)
3CL-balanced_randomsplit7_70_15_15_tr --- (404, 201)
pos 202
(87, 200)
3CL-balanced_randomsplit7_70_15_15_te --- (87, 201)
pos 44
(336, 200)
cytotox-balanced_randomsplit7_70_15_15_va --- (336, 201)
pos 168
(337, 200)
cytotox-balanced_randomsplit7_70_15_15_te --- (337, 201)
pos 169
(1569, 200)
cytotox-balanced_randomsplit7_70_15_15_tr --- (1569, 201)
pos 784
(691, 200)
CPE-balanced_randomsplit7_70_15_15_tr --- (691, 201)
pos 345
(149, 200)
CPE-balan

In [17]:
get_rdk_features('../data_volsurf/externals')

(6, 200)
3cl_external_set_stand --- (6, 201)
pos 6
(24, 200)
cpe_external_set_after_phys-chem-filters_stand --- (24, 201)
pos 24


In [18]:
def get_volsurf_descriptors(data_folder):
    for d in os.listdir(data_folder):
        if d=='.DS_Store':
            continue
        pos = []
        trg = []
        for csv in glob.glob(data_folder\
                             +'/'+d+'/*.csv'):
            des = []
            not_found = []
            df = pd.read_csv(csv)
            values = df.iloc[:, 33:-1]
            load_scaler = pickle.load(open('../scalers/'+d+'-'+'volsurf_scaler.pkl', 'rb'))
            X = load_scaler.transform(values)
            Y = df['Label'].values
            Y = Y.reshape(Y.shape[0],1)
    #         fp_array = ( np.asarray((X), dtype=object) )
    #         X = np.delete(fp_array, not_found, axis=0)
            X = X.astype(np.float32) 
            final_array = np.concatenate((X, Y), axis=1)
            out = os.path.splitext(os.path.basename(csv))[0]
            print(out,'---',final_array.shape)
            pos.append(np.count_nonzero(Y))
            trg.append(out)
            print('pos',pos[-1])
            np.save('../volsurf_descriptors/'+\
                    'volsurf-'+out+'.npy', np.asarray((final_array), dtype=np.float32))

In [19]:
get_volsurf_descriptors('../data_volsurf/data_new_models_volsurf')

MERS-PPE_cs-balanced_randomsplit7_70_15_15_tr --- (250, 129)
pos 125
MERS-PPE_cs-balanced_randomsplit7_70_15_15_te --- (54, 129)
pos 27
MERS-PPE_cs-balanced_randomsplit7_70_15_15_va --- (54, 129)
pos 27
CoV1-PPE_cs-balanced_randomsplit7_70_15_15_tr --- (216, 129)
pos 108
CoV1-PPE_cs-balanced_randomsplit7_70_15_15_te --- (47, 129)
pos 24
CoV1-PPE_cs-balanced_randomsplit7_70_15_15_va --- (47, 129)
pos 23
3CL-balanced_randomsplit7_70_15_15_va --- (87, 129)
pos 43
3CL-balanced_randomsplit7_70_15_15_tr --- (404, 129)
pos 202
3CL-balanced_randomsplit7_70_15_15_te --- (87, 129)
pos 44
cytotox-balanced_randomsplit7_70_15_15_va --- (336, 129)
pos 168
cytotox-balanced_randomsplit7_70_15_15_te --- (337, 129)
pos 169
cytotox-balanced_randomsplit7_70_15_15_tr --- (1569, 129)
pos 784
CPE-balanced_randomsplit7_70_15_15_tr --- (691, 129)
pos 345
CPE-balanced_randomsplit7_70_15_15_te --- (149, 129)
pos 75
CPE-balanced_randomsplit7_70_15_15_va --- (148, 129)
pos 74
AlphaLISA-balanced_randomsplit7_70_15_

In [20]:
def get_volsurf_descriptors_ext(data_folder):
    for d in os.listdir(data_folder):
        if d=='.DS_Store':
            continue
        pos = []
        trg = []
        for csv in glob.glob(data_folder\
                             +'/'+d+'/*.csv'):
            des = []
            not_found = []
            df = pd.read_csv(csv)
            values = df.iloc[:, 3:-1]
            load_scaler = pickle.load(open('../scalers/'+d+'-'+'volsurf_scaler.pkl', 'rb'))
            X = load_scaler.transform(values)
            Y = df['Label'].values
            Y = Y.reshape(Y.shape[0],1)
    #         fp_array = ( np.asarray((X), dtype=object) )
    #         X = np.delete(fp_array, not_found, axis=0)
            X = X.astype(np.float32) 
            final_array = np.concatenate((X, Y), axis=1)
            out = os.path.splitext(os.path.basename(csv))[0]
            print(out,'---',final_array.shape)
            pos.append(np.count_nonzero(Y))
            trg.append(out)
            print('pos',pos[-1])
            np.save('../volsurf_descriptors_ext/'+\
                    'volsurf-'+out+'.npy', np.asarray((final_array), dtype=np.float32))

In [21]:
get_volsurf_descriptors_ext('../data_volsurf/externals_volsurf')

3cl_external_set_stand --- (6, 129)
pos 6
cpe_external_set_after_phys-chem-filters_stand --- (24, 129)
pos 24
