In [1]:
# METADATA:
__author__='Leela Sarath Kumar Konda'
#########################################################################################################
# README                                                                                                #
#########################################################################################################
# Before executing this code install the following                                                      #
#                                                                                                       #
# 1) postgresql                                                                                         #
#   linux            - sudo apt install postgresql postgresql-contrib                                   #
#                    - sudo systemctl start postgresql                                                  #
#   windows          - download the file from                                                           #
#                    - https://www.enterprisedb.com/downloads/postgres-postgresql-downloads             #
#   if you are unable to install it then install chembl-webresource-client by using the command given   #
#   in the below optional section                                                                       #
#                                                                                                       #
# 2) chembl sql dump - Download the file from ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/ #
#   1. Log into PostgreSQL database server where you intend to load chembl data and run the following   #
#   command to create new database:                                                                     #
#   pgdb=# create database chembl_27;                                                                   #
#   2. Logout of database and run the following command to load data. You will need to replace          #
#   USERNAME, HOST and PORT with local settings. Depending on your database setup you may not need      #
#   the host and port arguments.                                                                        #
#   $> pg_restore --no-owner -h HOST -p PORT -U USERNAME -d chembl_27 chembl_27_postgresql.dmp          #
#                                                                                                       #
# 3) anaconda        - Download the file from https://www.anaconda.com/ and                             #
#                    - follow the given instructions                                                    #
#                                                                                                       #
# 4) sqlalchemy      - pip install SQLAlchemy                                                           #
#                                                                                                       #
# 5) psycopg2        - pip install psycopg2                                                             #
#                                                                                                       #
# 6) rdkit           - conda install -c conda-forge rdkit                                               #
#                                                                                                       #
# 7) mordred         - pip install mordred                                                              #
#                                                                                                       #
# 8) MolVS           - pip install MolVS                                                                #
#                                                                                                       #
# 9) pycaret         - pip install pycaret                                                              #
#                                                                                                       #
# 10) ngboost        - pip install ngboost                                                              #
#                                                                                                       #
# 11) smogn          - pip install smogn                                                                #
#                                                                                                       #
# 12) java                                                                                              #
#     linux          - sudo apt install default-jdk default-jre                                         #
#     windows        - download the file from                                                           #
#                    - https://www.oracle.com/in/java/technologies/javase-jdk15-downloads.html          #
#                                                                                                       #
# 13) psutil         - pip install psutil                                                               #
#                                                                                                       #
# 14) easygui        - pip install easygui                                                              #
#                                                                                                       #
#########################################################################################################
#                                                                                                       #
# optional packages to install                                                                          #
#                                                                                                       #
# 1) chembl-web     - pip install chembl-webresource-client                                             #
#    if it throws any error while using chembl-webresource-client package after installing it by using  #
#    pip. Try to re-install it using                                                                    #
#                   - conda install -c conda-forge chembl_webresource_client                            #
#                                                                                                       #
# 2) PaDEL Descriptor python wrapper - pip install padelpy                                              #
#                                                                                                       #
# 3) weka python wrapper             - pip install python-weka-wrapper3                                 #
#                                                                                                       #
#########################################################################################################
#                                                                                                       #
# user input file                                                                                       #
#                                                                                                       #
# for user input csv file must contain following column names to execute the code properly in any       #
# sequence. column names = mol_id, standard_type, pvalue, canonical_smiles                              #
#                                                                                                       #
#########################################################################################################
# for AttributeError: 'Remove_100' object has no attribute 'columns_to_drop' use following command to fix
# pip install --upgrade --user git+https://github.com/pycaret/pycaret.git


In [2]:
# imports
import sys
import warnings
import time
import os
from os.path import join
from os.path import splitext
from os.path import basename
import platform
from psutil import virtual_memory
import easygui

from sqlalchemy import create_engine #RDBS connections
from chembl_webresource_client.new_client import new_client
import pandas
import numpy
import re
from subprocess import call
import itertools
from xml.etree import ElementTree as ET
from collections import Counter

from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Avalon.pyAvalonTools import GetAvalonFP
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit import SimDivFilters

from molvs import Standardizer
from molvs.standardize import standardize_smiles

from mordred import Calculator
from mordred import descriptors

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
#from sklearn.linear_model import LassoCV
#from sklearn.linear_model import Lasso
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
#from lightgbm import LGBMClassifier
import scipy.stats as stats

# import imbalance learn oversampling techniques
from imblearn.over_sampling import *
# SMOTE for Regression with Gaussian Noise. similar to imblearn but for regression
import smogn
#import pycaret classification module
from pycaret import classification
#import pycaret regression module
from pycaret import regression
from sklearn.metrics import confusion_matrix
from ngboost import NGBClassifier

In [3]:
def get_assay_conc(assay_desc, mol_wt):
    pattern = 'at(| | <| >)(\d+(\.\d+)?) (nM|uM|mM|ug/mL|ug/ml)'
    pattern = re.search(pattern, assay_desc)
    if pattern is not None:
        pattern = pattern[0].replace('at ','')
        molar = pattern.split(' ')[1]
        if '<' in pattern.split(' ')[0] or '>' in pattern.split(' ')[0]:
            pattern = pattern.translate(str.maketrans({'<': '', '>': ''}))
        if molar == 'nM':
            return(float(pattern.split(' ')[0])/(1000))
        elif molar == 'mM':
            return(float(pattern.split(' ')[0])*(1000))
        elif molar == 'uM':
            return(float(pattern.split(' ')[0]))
        elif molar == 'ug/mL' or molar == 'ug/ml':
            return(float(pattern.split(' ')[0])/mol_wt)
        #else: 
        # add other unit types as well if required
        #return

def std_conversion(std_val, std_unit, mol_wt, counter):
    if std_unit == 'nM':
        pass
    elif std_unit == 'uM':
        std_val = (float(std_val) * 1000)
    elif (std_unit == 'ng.mL-1'
          or std_unit == 'ng.ml-1'
          or std_unit == 'ng/mL'
          or std_unit == 'ng/ml'):
        std_val = (float(std_val) / mol_wt)
    elif (std_unit == 'ug.mL-1'
          or std_unit == 'ug.ml-1'
          or std_unit == 'ug/mL'
          or std_unit == 'ug/mL'):
        std_val = ((float(std_val) / mol_wt) * 1000)
    else:
        #print('add this "{}" new standard units'.format(std_units))
        # 'p.p.m', 'ppm', 'ug cm**-2', 'ug disk-1', 'ug mg-1', 'microg', 'ug', 'microg/cm3', 'uL/ml'
        # '10'-2micromol/ml', '10'-2 umol/ml', '10'-2umol', '10'-3uM/ml', '10'-3micromol/ml'
        counter += 1
    return(std_val)

In [4]:
def get_raw_data(records, counter, max_molwt, std_type):
    records['assay_conc_uM'] = [get_assay_conc(record, mol_wt)
                                if std_type == 'Inhibition' 
                                else numpy.nan 
                                for record, mol_wt in zip(records.assay_description
                                                          , records.mol_wt)]
    records['value'] = [std_conversion(std_val, std_unit
                                       , mol_wt, counter)
                        if std_type != 'Inhibition' 
                        else numpy.nan 
                        for std_val, std_unit, mol_wt in zip(records.standard_value
                                                             , records.standard_units
                                                             , records.mol_wt)]
    # replace '>=' to '>' and '<=' to '<' in RELATION column to ease the grouping of RELATION column
    # make changes in raw data
    records.standard_relation.replace({'>=': '>', '<=': '<'}, inplace=True)
    # convert data type of potency column from object to numeric
    records[['value', 'assay_conc_uM']] = records[['value', 'assay_conc_uM']].apply(pandas.to_numeric)#.astype('float64')
    # replace empty cells with Zeros and make changes in raw data
    records.document_year.fillna(0, inplace=True)
    # replace empty entries in raw data with #. eg:- '' to # and make changes in raw data
    records.standard_relation.fillna('#', inplace=True)
    return(records)

In [5]:
def customsort(group_keys):
    #assay_sort = {'B':0, 'F':1, 'A':2, 'T':3, 'P':4, 'U':5}
    #sign_sort = {'=':0, '#':1, '<':2, '>':3}
    try_dict = {}
    for key in group_keys:
        if (key == 'B' 
            or key == '=' 
            or key == 'vStrong' 
            or key == 4): try_dict[key]=0
        elif (key == 'F' 
              or key == '#'
              or key == 'Strong'
              or key == 3): try_dict[key]=1
        elif (key == 'A'
              or key == '<'
              or key == 'Moderate'
              or key == 2): try_dict[key]=2
        elif (key == 'T'
              or key == '>'
              or key == 'Weak'
              or key == 1): try_dict[key]=3
        elif (key == 'P'
              or key == 'Inactive'
              or key == 0): try_dict[key]=4
        elif (key == 'U'
              or key == 'vInactive'
              or key == 40): try_dict[key]=5
        elif (key == 'sInactive'
              or key == 30): try_dict[key]=6
        elif (key == 'mInactive'
              or key == 20): try_dict[key]=7
        elif (key == 'wInactive'
              or key == 10): try_dict[key]=8
        else : try_dict[key]=9
    sorted_dict = {key: value for key, value in sorted(try_dict.items(), key=lambda item: item[1])}
    return(list(sorted_dict.keys()))

In [6]:
def get_unique_mols(raw_data, std_type):
    # create a empty dataframe
    unique_data = test_data = pandas.DataFrame()
    id_groups = raw_data.groupby('mol_id', sort=False)
    molids = id_groups.groups.keys()
    for molid in molids:
        idgroup = id_groups.get_group(molid)
        dategroups = idgroup.groupby('document_year', sort=False)
        # sort dates in descending order
        dates = sorted(dategroups.groups.keys(), reverse=True)
        date = dategroups.get_group(dates[0])
        assays = date.groupby('assay_type', sort=False)
        sorted_assays = customsort(assays.groups.keys())
        assay = assays.get_group(sorted_assays[0])
        signs = assay.groupby('standard_relation', sort=False)
        sorted_signs = customsort(signs.groups.keys())
        entry = signs.get_group(sorted_signs[0])
        if len(entry) > 1: # len(entry) = number of rows
            if std_type != 'Inhibition':
                entry['pvalue'] = abs(numpy.log10(entry['value']*(10**-9)))
                entry['pvalue_diff'] = abs(entry['pvalue'].diff())
                lessthan_count = greaterthan_count = 0
                gt_idx_list = []
                for value_diff in entry['pvalue_diff']:
                    if value_diff > 0.5:
                        greaterthan_count += 1
                        # index.items() throwing error and .tolist() .values giving list and array
                        gt_idx_list = [idx for idx in entry[entry['pvalue_diff']==value_diff].index]
                    else:
                        lessthan_count += 1
                if lessthan_count > greaterthan_count:
                    entry.drop(gt_idx_list, inplace=True)
                    entry.drop('pvalue_diff', axis=1, inplace=True)
                    pvalue_mean = entry['pvalue'].mean()
                    entry.drop_duplicates('mol_id', inplace=True)
                    entry.pvalue = entry.pvalue.replace(entry.pvalue.iloc[0], pvalue_mean)
                    # add entries to temporary dataframe
                    unique_data = unique_data.append(entry, ignore_index=True)
                else: # add to test set dataframe
                    entry.drop('pvalue_diff', axis=1, inplace=True)
                    entry.drop_duplicates('mol_id', inplace=True)
                    # add entries to temporary dataframe
                    test_data = test_data.append(entry, ignore_index=True)
            else:
                entry['pvalue'] = numpy.nan
                lessthan_count = greaterthan_count = 0
                gt_idx_list = []
                for value_diff in entry['standard_value']:
                    if value_diff >= 50:
                        greaterthan_count += 1
                        # index.items() throwing error and .tolist() .values giving list and array
                        gt_idx_list = [idx for idx in entry[entry['standard_value']==value_diff].index]
                    else:
                        lessthan_count += 1
                        gt_idx_list = [idx for idx in entry[entry['standard_value']==value_diff].index]
                if lessthan_count == greaterthan_count:
                    entry.drop_duplicates('mol_id', inplace=True)
                    # add entries to temporary dataframe
                    test_data = test_data.append(entry, ignore_index=True)
                else: # add to test set dataframe
                    entry = entry.loc[gt_idx_list]
                    value_mean = entry['standard_value'].mean()
                    entry.drop_duplicates('mol_id', inplace=True)
                    entry.value = entry.value.replace(entry.value.iloc[0], value_mean)
                    # add entries to temporary dataframe
                    unique_data = unique_data.append(entry, ignore_index=True)                 
        else:
            if std_type != 'Inhibition':
                entry['pvalue'] = abs(numpy.log10(entry['value']*(10**-9)))
            else:
                entry['pvalue'] = numpy.nan
            # add entries to temporary dataframe
            unique_data = unique_data.append(entry, ignore_index=True)
    return(unique_data, test_data)

In [7]:
def stdsmiles(smiles):
    std = Standardizer()
    return(std.standardize(Chem.MolFromSmiles(smiles)))

def removesalt(std_mol):
    remover = SaltRemover()
    res = remover.StripMol(std_mol, dontRemoveEverything=True)
    if res.GetNumAtoms() <= 1: return
    else: return(Chem.MolToSmiles(res)) # return rdkit smiles

In [8]:
def get_std_data(dataframe, name, cutoffs, choice):
       
    if not dataframe.empty or cutoffs is not None:
        # drop the duplicate entries using 'ID' column
        dataframe.drop_duplicates('mol_id', inplace=True)
        # get standardized smiles
        dataframe['standard_smiles'] = [removesalt(stdsmiles(smiles)) 
                                        for smiles in dataframe.canonical_smiles]
        dataframe.drop_duplicates('standard_smiles', inplace=True)
        multiclass = []
        #options for the user to choose
        options = ['ChEMBL target list', 'Molecules list']
        
        if choice == options[0]:
            for std_value, assay_conc, percent_value, log_value in zip(dataframe.standard_type.values
                                                                       , dataframe.assay_conc_uM.values
                                                                       , dataframe.value.values
                                                                       , dataframe.pvalue.values):
                for i in range(0, len(cutoffs)):
                    if std_value  == 'Inhibition':
                        if assay_conc <= float(cutoffs[i]):
                            if percent_value >= 50: multiclass.append(len(cutoffs)-i)
                            else: multiclass.append((len(cutoffs)-i)*10)
                            break
                    else:
                        if log_value >= abs(numpy.log10(float(cutoffs[i])*(10**-6))):
                            multiclass.append(len(cutoffs)-i)
                            break
                else: multiclass.append(0)
        else:
            for log_value in dataframe.pvalue.values:
                for i in range(0, len(cutoffs)):
                    if log_value >= abs(numpy.log10(float(cutoffs[i])*(10**-6))):
                            multiclass.append(len(cutoffs)-i)
                            break
                else: multiclass.append(0)
        dataframe['multiclass'] = multiclass
        print('|_ {} {} molecules were found'.format(dataframe.shape[0], name))
    else:
        print('|_ {} {} molecules were found'.format(dataframe.shape[0], name))
    return(dataframe)

In [9]:
def get_data_from_chembldb(target_id, targ_dir, cutoffs, thresholds, choice):
    # define minimum molecular weight threshold
    max_molwt = 850
    print(target_id)
    std_types = ['IC50', 'Ki', 'Inhibition']#,'MIC' and 'Potency' include as well and check the code running status
    unique_data = test_data = pandas.DataFrame()
    for std_type in std_types:
        mol_query="""
        -- Retrieve compound activity details for a target
        SELECT md.CHEMBL_ID AS mol_id
            , cs.CANONICAL_SMILES
            , d.YEAR as document_year
            , act.STANDARD_TYPE
            , act.STANDARD_RELATION
            , act.STANDARD_VALUE
            , act.STANDARD_UNITS
            , a.ASSAY_TYPE
            , a.DESCRIPTION AS assay_description
            , cp.MW_FREEBASE as mol_wt
            , td.CHEMBL_ID AS target_id
            , td.PREF_NAME
        FROM target_dictionary td
            JOIN ASSAYS a ON td.TID = a.TID
            JOIN ACTIVITIES act ON a.ASSAY_ID = act.ASSAY_ID
            JOIN DOCS d ON act.DOC_ID = d.DOC_ID
            JOIN MOLECULE_DICTIONARY md ON act.MOLREGNO = md.MOLREGNO
            JOIN COMPOUND_STRUCTURES cs ON md.MOLREGNO = cs.MOLREGNO
            JOIN COMPOUND_PROPERTIES cp ON cs.MOLREGNO = cp.MOLREGNO
        WHERE cs.CANONICAL_SMILES IS NOT NULL
            AND (act.STANDARD_VALUE IS NOT NULL
                AND act.STANDARD_UNITS IS NOT NULL)
            AND (NOT (act.STANDARD_TEXT_VALUE LIKE 'inconclusive'
                    OR act.STANDARD_TEXT_VALUE LIKE 'undetermined')
                OR act.STANDARD_TEXT_VALUE IS NULL)
            AND a.CONFIDENCE_SCORE IN (9, 8)
            AND cp.MW_FREEBASE <= '{}'
            AND act.STANDARD_TYPE = '{}'
            AND td.CHEMBL_ID = '{}';
        """.format(max_molwt, std_type, target_id)
        
        records = pandas.read_sql(mol_query, engine)
        #records['assay_conc_uM'] = ''
        counter = 0
        if records.empty: print('|_ For activity type {} 0 records were found'.format(std_type))
        else:
            print('|_ For activity type {} {} records were found'.format(std_type, records.shape[0]))
            # get raw dataframe
            raw_data = get_raw_data(records, counter, max_molwt, std_type)
            print('|  |_ {} records were found in Raw data'.format(raw_data.shape[0]))
            raw_data.to_csv('{}_{}_rawdata.csv'.format(join(targ_dir, target_id)
                                                       , std_type), index=False)
            # get unique molecule entries
            unique_mols, test_mols = get_unique_mols(raw_data, std_type)
            print('|  |_ {} unique records were found'.format(unique_mols.shape[0]))
            print('|  |_ {} test records were found'.format(test_mols.shape[0]))
            unique_mols.to_csv('{}_{}_uniquemols.csv'.format(join(targ_dir, target_id)
                                                             , std_type), index=False)
            test_mols.to_csv('{}_{}_testmols.csv'.format(join(targ_dir, target_id)
                                                         , std_type), index=False)
            unique_data = unique_data.append(unique_mols, ignore_index=True)
            test_data = test_data.append(test_mols, ignore_index=True)                   
    if not unique_data.empty:
        std_records = get_std_data(unique_data, 'uniquedata', cutoffs, choice)
        std_records.to_csv('{}_uniquedata.csv'.format(join(targ_dir, target_id)), index=False)
    if not test_data.empty:
        test_records = get_std_data(test_data, 'testdata', cutoffs, choice)
        test_records.to_csv('{}_testdata.csv'.format(join(targ_dir, target_id)), index=False)
    else: test_records=None
    return(std_records, test_records)

In [10]:
def get_data_from_chemblwbc(target_id, targ_dir, cutoffs, thresholds, choice):
    # define minimum molecular weight threshold
    max_molwt = 850
    print(target_id)
    std_types = ['IC50', 'Ki', 'Inhibition']#,'MIC' and 'Potency' include as well and check the code running status
    unique_data = test_data = pandas.DataFrame()
    for std_type in std_types:
        records = new_client.activity.filter(target_chembl_id=target_id
                                             , standard_type=std_type
                                             , confidence_score=9|8
                                             , molecule_properties__mw_freebase__lte=850
                                             , standard_value__isnull=False
                                             , standard_units__isnull=False
                                             , canonical_smiles__isnull=False
                                            ).only(['molecule_chembl_id'
                                                    , 'canonical_smiles'
                                                    , 'document_year'
                                                    , 'standard_relation'
                                                    , 'standard_value'
                                                    , 'standard_units'
                                                    , 'assay_type'
                                                    , 'assay_description'
                                                    , 'standard_text_value'
                                                    , 'target_chembl_id'
                                                    , 'target_pref_name'])
        records = [record for record in records if not (record['standard_text_value']=='inconclusive'
                                                        or record['standard_text_value']=='undetermined'
                                                        or record['standard_text_value']!=None)]
        records = pandas.DataFrame(records, columns=['molecule_chembl_id', 'canonical_smiles'
                                                     , 'document_year', 'type'
                                                     , 'standard_relation', 'standard_value'
                                                     , 'standard_units', 'assay_type'
                                                     , 'assay_description', 'target_chembl_id'
                                                     , 'target_pref_name'])
        records.rename(columns={'molecule_chembl_id': 'mol_id', 'target_pref_name':'pref_name'
                                , 'type':'standard_type', 'target_chembl_id':'target_id'
                                , 'target_pref_name':'pref_name'}, inplace=True)
        molwt=[]
        for mol_id in records.mol_id.values:
            props = new_client.molecule.filter(molecule_chembl_id=mol_id).only(['molecule_properties'])
            for mw in props:
                molwt.append(mw['molecule_properties']['mw_freebase'])
        molwt = pandas.DataFrame(molwt, columns =['mol_wt'])
        records = pandas.concat([records, molwt], axis=1)
        counter = 0
        if records.empty: print('|_ For activity type {} 0 records were found'.format(std_type))
        else:
            print('|_ For activity type {} {} records were found'.format(std_type, records.shape[0]))
            # get raw dataframe
            raw_data = get_raw_data(records, counter, max_molwt, std_type)
            print('|  |_ {} records were found in Raw data'.format(raw_data.shape[0]))
            raw_data.to_csv('{}_{}_rawdata.csv'.format(join(targ_dir, target_id)
                                                       , std_type), index=False)
            # get unique molecule entries
            unique_mols, test_mols = get_unique_mols(raw_data, std_type)
            print('|  |_ {} unique records were found'.format(unique_mols.shape[0]))
            print('|  |_ {} test records were found'.format(test_mols.shape[0]))
            unique_mols.to_csv('{}_{}_uniquemols.csv'.format(join(targ_dir, target_id)
                                                             , std_type), index=False)
            test_mols.to_csv('{}_{}_testmols.csv'.format(join(targ_dir, target_id)
                                                         , std_type), index=False)
            unique_data = unique_data.append(unique_mols, ignore_index=True)
            test_data = test_data.append(test_mols, ignore_index=True)                   
    if not unique_data.empty:
        std_records = get_std_data(unique_data, 'uniquedata', cutoffs, choice)
        std_records.to_csv('{}_uniquedata.csv'.format(join(targ_dir, target_id)), index=False)
    if not test_data.empty:
        test_records = get_std_data(test_data, 'testdata', cutoffs, choice)
        test_records.to_csv('{}_testdata.csv'.format(join(targ_dir, target_id)), index=False)
    else: test_records=None
    return(std_records, test_records)

In [11]:
def smi_to_sdf(std_records, target_id, targ_dir ):
    # from the standard dataframe reads, the smiles and id column
    # using RDkit functions converts smiles to sdf molecule
    # includes the ID as name or title of the each entry
    # and returns the sdf filename  
    sd_filename = '{}.sdf'.format(join(targ_dir, target_id))
    sdwriter = Chem.SDWriter(sd_filename)
    for mol, title in zip(std_records.standard_smiles, std_records.mol_id):
        mol = Chem.MolFromSmiles(mol)
        hmol = Chem.AddHs(mol)
        hmol.SetProp("_Name", title.strip())
        sdwriter.write(hmol)
    sdwriter.close()
    return(sd_filename)

In [12]:
def build_xml(fp_type):
    
    def indent(elem, level=0):
        i = "\n" + level*'    '
        if len(elem):
            if not elem.text or not elem.text.strip(): elem.text = i + '    '
            if not elem.tail or not elem.tail.strip(): elem.tail = i
            for elem in elem:
                indent(elem, level+1)
            if not elem.tail or not elem.tail.strip(): elem.tail = i
        else:
            if level and (not elem.tail or not elem.tail.strip()): elem.tail = i
    root = ET.Element('Root')
    group = ET.SubElement(root, 'Group')
    group.set('name','Fingerprint')
    descriptor = ET.SubElement(group, 'Descriptor')
    descriptor.set('name',fp_type)
    descriptor.set('value','true')
    indent(root)
    # create a new XML file with the results
    mydata = ET.tostring(root)
    data = ET.ElementTree(root)
    xml_file = join(os.getcwd(), 'fp.xml')
    data.write(xml_file, encoding='utf-8', method="xml") #'    '
    return(xml_file)

In [13]:
def calc_desc_fp(sd_filename):
    # calculates the descriptors using PaDEL, Mordred and rdkit
    # generates individual pandas dataframes
    # and return them as a dict
    desc_frames = {}
    # padel descriptor calculation:
    # pass the smiles files
    # PaDEL-Descriptor should be kept in working directory
    _PADEL_PATH = join(os.getcwd(), 'PaDEL-Descriptor', 'PaDEL-Descriptor.jar')
    _JCM_PATH = join(os.getcwd(), 'jcompound-mapper', 'jCMapperCLI.jar')

    def pdl_desc_calc():
        pdl_output_file = '{}_pdl_desc.csv'.format(splitext(sd_filename)[0])
        # -maxruntime Maximum running time per molecule(in milliseconds). Use -1 for unlimited.
        # 300000 milliseconds = 2 minutes
        _PARAMETERS = "-maxruntime 120000 -threads -1 -2d -removesalt "
        _PARAMETERS += "-standardizenitro -detectaromaticity -retainorder "
        command = 'java -jar -splash:disable {}'.format(_PADEL_PATH)
        command += ' {}'.format(_PARAMETERS)
        command += ' -dir {}'.format(sd_filename)
        command += ' -file {}'.format(pdl_output_file)
        #running the command using subprocess.call
        #call([command], shell=True)
        os.system(command)
        # removing the first column of the dataframe as it contains both name and class
        # as comma separated and we are going to add them separately in the end
        pdl_frame = pandas.read_csv(pdl_output_file)
        pdl_frame.drop('Name', axis=1, inplace=True)
        print("|  |_ PaDEL Descriptors calculation was completed")
        return (pdl_frame)

    # mordred descriptor calculation:
    # pass a list of molecules or the rdkit smiles molecule supplier
    def mrd_desc_calc():
        #create mrd descriptor calculator:
        mrd_calc = Calculator(descriptors)
        #pass the smiles input file to rdkit using SDMolSupplier from rdkit
        mols = Chem.SDMolSupplier(sd_filename)
        #pandas method to calculate multiple molecules( returns pandas dataframe):
        mrd_frame = mrd_calc.pandas(mols, quiet=True)  # quiet=True is for don’t show progress bar
        print("|  |_ Mordred Descriptors calculation was completed")
        return (mrd_frame)

    # rdkit descriptor calculation:
    # pass a list of molecules or the rdkit smiles molecule supplier
    def rdk_desc_calc():
        # get the descriptors list to calculate:
        desc_names = list(numpy.array(Descriptors._descList)[:, 0])
        # create rdk descriptor calculator:
        rdk_calc = MolecularDescriptorCalculator(desc_names)
        #pass the smiles input file to rdkit using SDMolSupplier from rdkit
        mols = Chem.SDMolSupplier(sd_filename)
        rdk_desc = []
        for mol in mols:
            descriptor = rdk_calc.CalcDescriptors(mol)
            # using itertools
            items = list(itertools.chain(descriptor))
            rdk_desc.append(items)

        # rdk pandas dataframe:
        rdk_frame = pandas.DataFrame(rdk_desc, columns=desc_names)
        print("|  |_ RDKit Descriptors calculation was completed")
        return (rdk_frame)

    def pdl_fp_calc(fp_type):
        _fp_file = build_xml(fp_type)
        #_fp_file = join(os.getcwd(), xml_file)
        pdl_output_file = '{}_pdl_{}.csv'.format(splitext(sd_filename)[0], fp_type)
        _PARAMETERS = "-threads -1 -fingerprints -removesalt "
        _PARAMETERS += "-standardizenitro -detectaromaticity -retainorder -descriptortypes "
        command = 'java -jar -splash:disable {}'.format(_PADEL_PATH)
        command += ' {}{}'.format(_PARAMETERS, _fp_file)
        command += ' -dir {}'.format(sd_filename)
        command += ' -file {}'.format(pdl_output_file)
        #running the command using subprocess.call
        #call([command], shell=True)
        os.system(command)
        # padel descriptor output file to pandas data frame
        pdl_frame = pandas.read_csv(pdl_output_file)
        # removing the first column of the dataframe as it contains both name and class
        # as comma separated and we are going to add them separately in the end
        pdl_frame.drop('Name', axis=1, inplace=True)
        print("|  |_ PaDEL {} fingerprints calculation was completed".format(fp_type))
        return (pdl_frame)

    def rdk_fp_calc():
        fps_frames = {}

        #pass the smiles input file to rdkit using SDMolSupplier from rdkit
        mols = Chem.SDMolSupplier(sd_filename)
        # MorganFingerprint
        mg_fp = [[bit for bit in AllChem.GetMorganFingerprintAsBitVect(mol, radius=2).ToBitString()]
                 for mol in mols]
        mg_fp_name = ['RDMFP{}'.format(i) for i in range(0, len(mg_fp[0]))]
        mg_df = pandas.DataFrame(mg_fp, columns=mg_fp_name)
        fps_frames['MorganFingerprint'] = mg_df
        print("|  |_ RDKit MorganFingerprint calculation was completed")
        # RDKitFingerprint
        rd_fp = [[bit for bit in Chem.RDKFingerprint(mol).ToBitString()] for mol in mols]
        rd_fp_name = ['RDRFP{}'.format(i) for i in range(0, len(rd_fp[0]))]
        rd_df = pandas.DataFrame(rd_fp, columns=rd_fp_name)
        fps_frames['RDKitFingerprint'] = rd_df
        print("|  |_ RDKit RDKitFingerprint calculation was completed")
        # HashedAtomPairFingerprint
        ap_fp = [[bit for bit in AllChem.GetHashedAtomPairFingerprintAsBitVect(mol).ToBitString()]
                 for mol in mols]
        ap_fp_name = ['RDAFP{}'.format(i) for i in range(0, len(ap_fp[0]))]
        ap_df = pandas.DataFrame(ap_fp, columns=ap_fp_name)
        fps_frames['HashedAtomPairFingerprint'] = ap_df
        print("|  |_ RDKit HashedAtomPairFingerprint calculation was completed")
        # HashedTopologicalTorsionFingerprint
        tp_fp = [[bit for bit in AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(mol).ToBitString()]
                 for mol in mols]
        tp_fp_name = ['RDTFP{}'.format(i) for i in range(0, len(tp_fp[0]))]
        tp_df = pandas.DataFrame(tp_fp, columns=tp_fp_name)
        fps_frames['HashedTopologicalTorsionFingerprint'] = tp_df
        print("|  |_ RDKit HashedTopologicalTorsionFingerprint calculation was completed")
        # ErGFingerprint
        er_fp = [AllChem.GetErGFingerprint(mol) for mol in mols]
        er_fp_name = ['RDEFP{}'.format(i) for i in range(0, len(er_fp[0]))]
        er_df = pandas.DataFrame(er_fp, columns=er_fp_name)
        fps_frames['ErGFingerprint'] = er_df
        print("|  |_ RDKit ErGFingerprint calculation was completed")
        '''
        # FingerprintMol
        fp_fp = [[bit for bit in FingerprintMols.FingerprintMol(mol).ToBitString()] for mol in mols]
        fp_fp_name = ['RDFFP{}'.format(i) for i in range(0, 2048)]# using fp_fp[0] instead 2048 creates an error 
                                                                  # "AssertionError:/ValueError:
                                                                  # * columns passed, passed data had * columns"
        fp_df = pandas.DataFrame(fp_fp, columns=[fp_fp_name])
        fp_df.fillna(0, inplace=True)
        fps_frames['FingerprintMol'] = fp_df
        print("|  |_ RDKit FingerprintMol calculation was completed")
        '''
        # AvalonFingerprint
        av_fp = [[bit for bit in GetAvalonFP(mol).ToBitString()] for mol in mols]
        av_fp_name = ['RDVFP{}'.format(i) for i in range(0, len(av_fp[0]))]
        av_df = pandas.DataFrame(av_fp, columns=av_fp_name)
        fps_frames['AvalonFingerprint'] = av_df
        print("|  |_ RDKit AvalonFingerprint calculation was completed")

        # rdk pandas dataframe:
        #rdk_frame = pandas.concat(fps_frames, axis=1)
        print("|  |_ RDKit Fingerprints calculation was completed")
        return (fps_frames)
    
    def jcm_fp_calc(fp_type):
        jcm_output_file = '{}_jcm_{}.csv'.format(splitext(sd_filename)[0], fp_type)
        command = 'java -jar {}'.format(_JCM_PATH)
        command += ' -f {}'.format(sd_filename)
        command += ' -c {}'.format(fp_type)
        command += ' -ff FULL_CSV'
        command += ' -o {}'.format(jcm_output_file)
        os.system(command)
        jcm_frame = pandas.read_csv(jcm_output_file, header=None)
        jcm_frame.drop(jcm_frame.columns[0], axis=1, inplace=True)
        jcm_frame.columns = ['jCM-{}{}'.format(fp_type, i) for i in range(0, jcm_frame.shape[1])]
        print("|  |_ jCompound-Mapper {} fingerprints calculation was completed".format(fp_type))
        return(jcm_frame)

    #to calculate descriptors using padel descriptor jar file
    pdl_frame = pdl_desc_calc()
    desc_frames['padel'] = pdl_frame
    #to calculate descriptors using mordred
    mrd_frame = mrd_desc_calc()
    desc_frames['mordred'] = mrd_frame
    #to calculate descriptors using rdkit
    rdk_frame = rdk_desc_calc()
    desc_frames['rdkit'] = rdk_frame
    #to calculate fingerprints using padel descriptor jar file
    fingerprints = ['Fingerprinter', 'ExtendedFingerprinter'
                    , 'EStateFingerprinter', 'GraphOnlyFingerprinter'
                    , 'MACCSFingerprinter', 'PubchemFingerprinter'
                    , 'SubstructureFingerprinter', 'KlekotaRothFingerprinter'
                    , 'AtomPairs2DFingerprinter']
    for fp_type in fingerprints:
        pdl_fp_frame = pdl_fp_calc(fp_type)
        desc_frames[fp_type] = pdl_fp_frame
    #to calculate fingerprints using rdkit
    rdk_fp_frame = rdk_fp_calc()
    desc_frames.update(rdk_fp_frame)
    #to calculate fingerprints using jCompoundMapper jar file
    fingerprints = ['DFS', 'ASP', 'AT2D', 'CATS2D', 'PHAP2POINT2D'
                    , 'PHAP3POINT2D', 'ECFP', 'ECFPVariant', 'LSTAR']#, 'SHED']
    for fp_type in fingerprints:
        jcm_fp_frame = jcm_fp_calc(fp_type)
        desc_frames[fp_type+'Fingerprint'] = jcm_fp_frame
    return(desc_frames)

In [14]:
def get_class(std_dataframe, dataframe, threshold, cutoffs):
    dataframe = pandas.concat([std_dataframe.reset_index(drop=True)
                               , dataframe.reset_index(drop=True)], axis=1)
    if threshold == 'TNO': dataframe['class'] = dataframe.pvalue
    elif threshold == 'TMC': dataframe['class'] = dataframe.multiclass
    else:
        for i  in range(0, len(cutoffs)):
            if threshold == 'T{}'.format(cutoffs[i]):
                dataframe['class'] = dataframe.multiclass.apply(lambda x: 1 if (x >= len(cutoffs)-i
                                                                                and x<10) else 0)
    return(dataframe)

In [15]:
def get_sets(rows):
    test_set_size = round(rows.shape[0] * 0.2)
    # make a list of mols
    mol_list = [Chem.MolFromSmiles(mol) for mol in rows.standard_smiles]
    # make a list of fingerprints (fp)
    fp_list = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048) for mol in mol_list]  #fp length to 2048
    start_with = 0
    how_many_to_pick = test_set_size
    mmp = SimDivFilters.MaxMinPicker()
    picks = mmp.LazyBitVectorPick(fp_list, len(fp_list)
                                  , start_with + how_many_to_pick
                                  , list(range(start_with)))
    trainframe = rows.drop(rows.index[picks])
    testframe = rows.iloc[picks]
    return(trainframe, testframe)

In [16]:
def get_train_test_sets(dataframe, threshold, drop_cols):
    train_frame = pandas.DataFrame()
    test_frame = pandas.DataFrame()
    
    if threshold == 'TNO':
        # check the highest count in the standard_type column values and return it
        key_max = dataframe.standard_type.value_counts().idxmax()
        # fetch rows equals to key_max only 
        max_type_rows = dataframe.loc[dataframe.standard_type.isin([key_max])]
        trainframe, testframe = get_sets(max_type_rows)
        train_frame = train_frame.append(trainframe, ignore_index=True)         
        test_frame = test_frame.append(testframe, ignore_index=True)
    else:
        # get specific rows based on the Class column values
        clas_group = dataframe.groupby('class')
        for clas in clas_group.groups:
            rows_selected = clas_group.get_group(clas)
            if rows_selected.shape[0] < 5:
                train_frame = train_frame.append(rows_selected, ignore_index=True)
            else:
                trainframe, testframe = get_sets(rows_selected)
                train_frame = train_frame.append(trainframe, ignore_index=True)
                test_frame = test_frame.append(testframe, ignore_index=True)
    train_frame.drop(drop_cols, axis=1, inplace=True)
    test_frame.drop(drop_cols, axis=1, inplace=True)
    return(train_frame, test_frame)

In [17]:
# Data Preprocess
# convert string to Nan
def str_to_Nan(dataframe):
    # to convert all non Numeric values to Nan
    # and return the dataframe
    # Iterate through columns of Pandas DataFrame
    # Where string value exist replace with Nan

    # Get list of DataFrame column names
    #cols = list(dataframe)
    # Loop through columns
    for column in dataframe.columns:
        if column != 'class':
            # Transfer column to independent series
            col_data = dataframe[column]
            # Replace string data with Nan
            string = pandas.to_numeric(col_data, errors='coerce')
            dataframe[column] = string
    return(dataframe)

# remove the columns with all Nan values:
def drop_all_Nan(dataframe):
    # to remove columns with all Nan values
    # and return the dataframe
    data = str_to_Nan(dataframe)
    dataframe = data.dropna(axis=1, how='all')  #, inplace=True)
    return(dataframe)

# Replace missing numerical data with median
def fillNan(dataframe):
    # to replace all Nan values with median of individual columns respectively
    # and return the dataframe
    data = drop_all_Nan(dataframe)
    data.replace([numpy.inf, -numpy.inf], numpy.nan, inplace=True)
    dataframe = data.fillna(data.median())
    return(dataframe)

In [18]:
# scaling and normalization
def data_transformation(set_name, desc_set, threshold):#frames
    
    transformed_frames = {}
    # Data preprocessing
    frame = fillNan(desc_set)
    #drop columns with same value and consider remaining columns
    frame = frame[[col for col in frame if not frame[col].nunique()==1]]
    # Define X and y variables
    X = frame.drop('class', 1)
    y = frame['class']
    # X = frame[frame.columns[:-1]]#.T
    # y = frame[frame.columns[-1]]
    # Extract feature names
    feature_name = X.columns.tolist()
    
    # no data transformation
    def no_transform():
        ntframe = frame
        return(ntframe)

    # scaling
    def standardize():
        scaler = StandardScaler()
        X_std = scaler.fit_transform(X)
        X_frame = pandas.DataFrame(X_std, columns=feature_name)
        stdframe = pandas.concat([X_frame, y], axis=1)
        return(stdframe)

    # robust scaling
    def robustscale():
        robscale = RobustScaler()
        X_rbst = robscale.fit_transform(X)
        X_frame = pandas.DataFrame(X_rbst, columns=feature_name)
        rbstframe = pandas.concat([X_frame, y], axis=1)
        return(rbstframe)

    # minmax normalization
    def normalize():
        normalizer = MinMaxScaler()
        X_norm = normalizer.fit_transform(X)
        X_frame = pandas.DataFrame(X_norm, columns=feature_name)
        normframe = pandas.concat([X_frame, y],axis=1)
        return(normframe)
    
    if set_name in ['padel', 'mordred', 'rdkit']:
        notrans = no_transform()
        transformed_frames[threshold + '_notrans_'+ set_name]=notrans

        std = standardize()
        transformed_frames[threshold + '_std_'+ set_name]=std

        rbst = robustscale()
        transformed_frames[threshold + '_rbst_'+ set_name]=rbst

        norm = normalize()
        transformed_frames[threshold + '_minmax_'+ set_name]=norm

    else:
        notrans = no_transform()
        transformed_frames[threshold + '_notrans_'+ set_name]=notrans
    
    return(transformed_frames)

In [19]:
# t.test two sample unequal variance (binary class), f-oneway anova test (multi class) and Kolmogorov-Smirnov(ks) test (continous class)
def pvalue_calc(dataframe, threshold):
    # to calculate the pvalue of each column using two sample unequal variance t.test
    # for binary class dataset and f-oneway anova test for multiclass dataset
    # filter columns that pass the criteria of pvalue <= 0.05
    # transpose the filtered dataframe
    # append it to the class column
    # return the whole pvalue dataframe
    if threshold == 'TNO':
        pvalue_frame = pandas.DataFrame()
        for col in dataframe.columns:
            t, p = stats.kstest(dataframe[col], 'norm')
            if p <= 0.05: pvalue_frame = pvalue_frame.append(dataframe[col])
        if not pvalue_frame.empty:
            pvalue_frame = pvalue_frame.T
            pvalue_frame['class'] = dataframe['class']
            return(pvalue_frame)
        else:
            print('|  |_No Descriptor is having a pvalue less than 0.05')
            return(dataframe)
    else:
        pvalue_frame = pandas.DataFrame()
        row_list = []
        clas_group = dataframe.groupby('class')
        for clas in clas_group.groups:
            rows_selected = clas_group.get_group(clas)
            row_list.append(rows_selected)
        for col in dataframe.columns:
            if col != 'class':
                if len(row_list) == 2:
                    t, p = stats.ttest_ind(row_list[0][col], row_list[1][col], equal_var=False)
                    if p <= 0.05: pvalue_frame = pvalue_frame.append(dataframe[col])
                elif len(row_list) == 3:
                    t, p = stats.f_oneway(row_list[0][col], row_list[1][col], row_list[2][col])
                    if p <= 0.05: pvalue_frame = pvalue_frame.append(dataframe[col])
                elif len(row_list) == 4:
                    t, p = stats.f_oneway(row_list[0][col], row_list[1][col]
                                          , row_list[2][col], row_list[3][col])
                    if p <= 0.05: pvalue_frame = pvalue_frame.append(dataframe[col])
                elif len(row_list) == 5:
                    t, p = stats.f_oneway(row_list[0][col], row_list[1][col]
                                          , row_list[2][col], row_list[3][col]
                                          , row_list[4][col])
                    if p <= 0.05: pvalue_frame = pvalue_frame.append(dataframe[col])
                elif len(row_list) == 6:
                    t, p = stats.f_oneway(row_list[0][col], row_list[1][col]
                                          , row_list[2][col], row_list[3][col] 
                                          ,row_list[4][col], row_list[5][col])
                    if p <= 0.05: pvalue_frame = pvalue_frame.append(dataframe[col])
                elif len(row_list) == 7:
                    t, p = stats.f_oneway(row_list[0][col], row_list[1][col]
                                          , row_list[2][col], row_list[3][col]
                                          , row_list[4][col], row_list[5][col]
                                          , row_list[6][col])
                    if p <= 0.05: pvalue_frame = pvalue_frame.append(dataframe[col])
        if not pvalue_frame.empty:
            pvalue_frame = pvalue_frame.T
            pvalue_frame['class'] = dataframe['class']
            return(pvalue_frame)
        else:
            print('|  |_No Descriptor is having a pvalue less than 0.05')
            return(dataframe)

In [20]:
def group_replace(dataframe):
    clas_group =  dataframe.groupby('class')
    sorted_clas = customsort(clas_group.groups.keys())
    '''
    for group in sorted_clas:
        for i in range(0, len(sorted_clas)):
            if group == sorted_clas[i]:
                dataframe['class'].replace({group: len(sorted_clas)-i}, inplace=True)
                break
    '''
    for group in sorted_clas:
        if group == 'vStrong': dataframe['class'].replace({group: 4}, inplace=True)
        elif group == 'Strong': dataframe['class'].replace({group: 3}, inplace=True)
        elif group == 'Moderate': dataframe['class'].replace({group: 2}, inplace=True)
        elif (group == 'Weak'
              or group == 'Active'): dataframe['class'].replace({group: 1}, inplace=True)
        elif group == 'Inactive': dataframe['class'].replace({group: 0}, inplace=True)
        elif group == 'vInactive': dataframe['class'].replace({group: 40}, inplace=True)
        elif group == 'sInactive': dataframe['class'].replace({group: 30}, inplace=True)
        elif group == 'mInactive': dataframe['class'].replace({group: 20}, inplace=True)
        elif group == 'wInactive': dataframe['class'].replace({group: 10}, inplace=True)
        elif group == 4: dataframe['class'].replace({group: 'vStrong'}, inplace=True)
        elif group == 3: dataframe['class'].replace({group: 'Strong'}, inplace=True)
        elif group == 2: dataframe['class'].replace({group: 'Moderate'}, inplace=True)
        elif group == 1:
            if len(clas_group) == 2: dataframe['class'].replace({group: 'Active'}, inplace=True)
            else: dataframe['class'].replace({group: 'Weak'}, inplace=True)
        elif group == 0: dataframe['class'].replace({group: 'Inactive'}, inplace=True)
        elif group == 40: dataframe['class'].replace({group: 'vInactive'}, inplace=True)
        elif group == 30: dataframe['class'].replace({group: 'sInactive'}, inplace=True)
        elif group == 20: dataframe['class'].replace({group: 'mInactive'}, inplace=True)
        elif group == 10: dataframe['class'].replace({group: 'wInactive'}, inplace=True)

    return(dataframe)

In [21]:
# WEKA
def listToString(s):
    # return string   
    return('{{{}}}'.format(','.join(s)))

def dataframetoarff(frame, threshold):
    if threshold == 'TNO': frame = frame
    else: frame = group_replace(frame) 
    arff = join(os.getcwd(), 'temp.arff')
    rows = frame.to_csv(header=None, index=None).split('\n')
    f = open(arff, "w")
    f.write('@relation temp\n\n')
    for name in frame.columns:
        if name != 'class': f.write('@attribute {} numeric\n'.format(name))#numeric REAL real
        else:
            if threshold == 'TNO': f.write('@attribute {} numeric\n'.format(name))#nominal numeric REAL real
            else:
                clas_group =  frame.groupby('class')
                sorted_clas = customsort(clas_group.groups.keys())
                dtyp = listToString(sorted_clas)
                f.write('@attribute {} {}\n'.format(name, dtyp))
    f.write('@data\n')
    for row in rows:
        f.write('{}\n'.format(row))
    f.close()
    return(arff)

# arff file to pandas dataframe;
def arfftodataframe(arffFile, threshold):
    # total memory in bytes are converted to gigabytes and 70% is going to provide for weka
    memory = round(((virtual_memory().total)/1024**3)*0.7)
    csv = join(os.getcwd(), 'temp-arff.csv')
    _WEKA_PATH = join(os.getcwd(), 'weka-3-9-3', 'weka.jar')
    _PARAMETERS = 'weka.core.converters.CSVSaver'
    command = 'java -Xmx{}g -cp {}'.format(memory, _WEKA_PATH)
    command += ' {}'.format(_PARAMETERS)
    command += ' -i {}'.format(arffFile)
    command += ' -o {}'.format(csv)
    #call([command], shell=True)
    os.system(command)
    dataframe = pandas.read_csv(csv)
    if threshold == 'TNO': return(dataframe)
    else: return(group_replace(dataframe))

In [22]:
def feature_selection(frames, threshold, featset_dir):
    # Filter: variance, correlation(pearson's), chi-square
    # wrapper: recurssive feature elimination(RFE), mlxtend.SequentialFeatureSelector([fwd, bwd, bid]) and mlxtend.ExhaustiveFeatureSelector()
    # embeded or hybrid: log.reg.(LASSO Regularization[L1 penality]) random forest, lightgbm, lasso, weka([fwd, bwd, bid])
    selected_frames = {}
    for name, frame in frames.items():
        # Define X and y variables
        X = frame.drop('class', 1)
        y = frame['class']
        # X = frame[frame.columns[:-1]]
        # y = frame[frame.columns[-1]]

        # Extract feature names
        feature_name = X.columns.tolist()

        def variance():
            # filter method
            '''
            As an example, suppose that we have a dataset with boolean features, 
            and we want to remove all features that are either one or zero (on or off) 
            in more than 80% of the samples. Boolean features are Bernoulli random variables, 
            and the variance of such variables is given by 
            Var[X] = p(1-p)
            so we can select using the threshold .8 * (1 - .8)
            '''
            try:
                # Create and fit selector
                variance_filter = VarianceThreshold(threshold=(0.8*(1-0.8)))  #threshold=0
                features = variance_filter.fit_transform(X)
                # Get columns to keep
                variance_support = variance_filter.get_support()
                variance_feature = X.loc[:, variance_support].columns.tolist()

                frame_name = ('|  |_ For {} descriptor set Variance'
                              ' selected {} descriptors'.format(name, len(variance_feature)))

                if len(variance_feature) > 0:
                    # Create new dataframe with only desired columns
                    variance_features = frame[variance_feature]
                    variance_frame = pandas.concat([variance_features, y], axis=1)
                    print(frame_name)
                    variance_frame.to_csv('{}.csv'.format(join(featset_dir, name+'_Var')), index=False)
                    return(variance_frame)
                else: print(frame_name)
            except ValueError:
                print('|  |_ For {} descriptor set Variance selected {} descriptors'.format(name, 0))

        def correl():
            # filter method
            # Get correlation coefficients of all columns
            cor = X.corr()
            columns = numpy.full((cor.shape[0], ), True, dtype=bool)
            for i in range(cor.shape[0]):
                for j in range(i + 1, cor.shape[0]):
                    if cor.iloc[i, j] >= 0.75:
                        if columns[j]:
                            columns[j] = False
            # Get columns to keep i.e below a thershold of 0.75 pearson correlation
            cor_feature = X.columns[columns]

            frame_name = ('|  |_ For {} descriptor set Correlation'
                          ' selected {} descriptors'.format(name, len(cor_feature)))

            if len(cor_feature) > 0:
                # Create new dataframe with only desired columns
                cor_features = X[cor_feature]
                cor_frame = pandas.concat([cor_features, y], axis=1)
                print(frame_name)
                cor_frame.to_csv('{}.csv'.format(join(featset_dir, name+'_Cor')), index=False)
                return(cor_frame)
            else: print(frame_name)

        def chi_square():
            # filter method
            # Normalize the X values
            X_norm = MinMaxScaler().fit_transform(X)
            # Create and fit selector
            if X_norm.shape[1] <10:
                chi_selector = SelectKBest(chi2, k='all')
            else:
                chi_selector = SelectKBest(chi2)
            chi_selector.fit(X_norm, y)
            # Get columns to keep
            chi_support = chi_selector.get_support()
            chi_feature = X.loc[:, chi_support].columns.tolist()

            frame_name = ('|  |_ For {} descriptor set Chi-Square'
                          ' selected {} descriptors'.format(name, len(chi_feature)))
            if len(chi_feature) > 0:
                # Create new dataframe with only desired columns
                chi_features = frame[chi_feature]
                chi_frame = pandas.concat([chi_features, y], axis=1)
                print(frame_name)
                chi_frame.to_csv('{}.csv'.format(join(featset_dir, name+'_Chi')), index=False)
                return(chi_frame)
            else: print(frame_name)

        def rfe():
            # wrapper method
            # Create and fit selector
            rfe_selector = RFE(estimator)#DecisionTreeClassifier RandomForestClassifier(n_jobs=-1) LogisticRegression(n_jobs=-1)
            rfe_selector.fit(X, y)
            # Get columns to keep
            rfe_support = rfe_selector.get_support()
            rfe_feature = X.loc[:, rfe_support].columns.tolist()

            frame_name = ('|  |_ For {} descriptor set Recursive Feature Elimination'
                          ' selected {} descriptors'.format(name, len(rfe_feature)))
            if len(rfe_feature) > 0:
                # Create new dataframe with only desired columns
                rfe_features = frame[rfe_feature]
                rfe_frame = pandas.concat([rfe_features, y], axis=1)
                print(frame_name)
                rfe_frame.to_csv('{}.csv'.format(join(featset_dir, name+'_Rfe')), index=False)
                return(rfe_frame)
            else: print(frame_name)

        def logreg():
            # embeded method
            # Create and fit selector
            lr_selector = SelectFromModel(LogisticRegression(penalty='l1'
                                                             , random_state=369
                                                             , solver='saga'
                                                             , n_jobs=-1))
            lr_selector.fit(X, y)
            # Get columns to keep
            lr_support = lr_selector.get_support()
            lr_feature = X.loc[:, lr_support].columns.tolist()

            frame_name = ('|  |_ For {} descriptor set Log.Reg.-L1'
                          ' selected {} descriptors'.format(name, len(lr_feature)))
            if len(lr_feature) > 0:
                # Create new dataframe with only desired columns
                lr_features = frame[lr_feature]
                lr_frame = pandas.concat([lr_features, y], axis=1)
                print(frame_name)
                lr_frame.to_csv('{}.csv'.format(join(featset_dir, name+'_Lor')), index=False)
                return(lr_frame)
            else: print(frame_name)

        def rf():
            # embeded method
            # Create and fit selector
            rf_selector = SelectFromModel(RandomForestClassifier(n_jobs=-1))
            rf_selector.fit(X, y)
            # Get columns to keep
            rf_support = rf_selector.get_support()
            rf_feature = X.loc[:, rf_support].columns.tolist()
            frame_name = ('|  |_ For {} descriptor set Random Forest'
                          ' selected {} descriptors'.format(name, len(rf_feature)))
            if len(rf_feature) > 0:
                # Create new dataframe with only desired columns
                rf_features = frame[rf_feature]
                rf_frame = pandas.concat([rf_features, y], axis=1)
                print(frame_name)
                rf_frame.to_csv('{}.csv'.format(join(featset_dir, name+'_Rf')), index=False)
                return(rf_frame)
            else: print(frame_name)
        
        def rfr():
            # embeded method
            # Create and fit selector
            rfr_selector = SelectFromModel(RandomForestRegressor(n_jobs=-1))
            rfr_selector.fit(X, y)
            # Get columns to keep
            rfr_support = rfr_selector.get_support()
            rfr_feature = X.loc[:, rfr_support].columns.tolist()
            frame_name = ('|  |_ For {} descriptor set Random Forest'
                          ' selected {} descriptors'.format(name, len(rfr_feature)))
            if len(rfr_feature) > 0:
                # Create new dataframe with only desired columns
                rfr_features = frame[rfr_feature]
                rfr_frame = pandas.concat([rfr_features, y], axis=1)
                print(frame_name)
                rfr_frame.to_csv('{}.csv'.format(join(featset_dir, name+'_rfr')), index=False)
                return(rfr_frame)
            else: print(frame_name)

        def lgb():
            # embeded method
            # Create and fit selector
            lgb_selector = SelectFromModel(LGBMClassifier(n_jobs=-1))
            lgb_selector.fit(X.astype(float), y)
            # Get columns to keep
            lgb_support = lgb_selector.get_support()
            lgb_feature = X.loc[:, lgb_support].columns.tolist()
            frame_name = ('|  |_ For {} descriptor set LightGBM'
                          ' selected {} descriptors'.format(name, len(lgb_feature)))
            if len(lgb_feature) > 0:
                # Create new dataframe with only desired columns
                lgb_features = frame[lgb_feature]
                lgb_frame = pandas.concat([lgb_features, y], axis=1)
                print(frame_name)
                lgb_frame.to_csv('{}.csv'.format(join(featset_dir, name+'_Lgb')), index=False)
                return(lgb_frame)
            else: print(frame_name)

        def lasso():
            # embeded method
            # Create and fit selector
            lasso_selector = SelectFromModel(Lasso())
            lasso_selector.fit(X, y)
            # Get columns to keep
            lasso_support = lasso_selector.get_support()
            lasso_feature = X.loc[:, lasso_support].columns.tolist()
            frame_name = ('|  |_ For {} descriptor set Lasso'
                          ' selected {} descriptors'.format(name, len(lasso_feature)))
            if len(lasso_feature) > 0:
                # Create new dataframe with only desired columns
                lasso_features = frame[lasso_feature]
                lasso_frame = pandas.concat([lasso_features, y], axis=1)
                print(frame_name)
                lasso_frame.to_csv('{}.csv'.format(join(featset_dir, name+'_Lasso')), index=False)
                return(lasso_frame)
            else: print(frame_name)
        
        def rfecv(estimator):
            # wrapper method
            # Create and fit selector
            rfecv_selector = RFECV(estimator, n_jobs=-1)
            rfecv_selector.fit(X, y)
            # Get columns to keep
            rfecv_support = rfecv_selector.get_support()
            rfecv_feature = X.loc[:, rfecv_support].columns.tolist()

            frame_name = ('|  |_ For {} descriptor set Recursive Feature Elimination CV'
                          ' selected {} descriptors'.format(name, len(rfecv_feature)))
            if len(rfecv_feature) > 0:
                # Create new dataframe with only desired columns
                rfecv_features = frame[rfecv_feature]
                rfecv_frame = pandas.concat([rfecv_features, y], axis=1)
                print(frame_name)
                rfecv_frame.to_csv('{}.csv'.format(join(featset_dir, name+'_Rfecv')), index=False)
                return(rfecv_frame)
            else: print(frame_name)

        def wekadescSelect():
            # total memory in bytes are converted to gigabytes and 70% is going to provide for weka
            memory = round(((virtual_memory().total)/1024**3)*0.7)
            _WEKA_PATH = join(os.getcwd(), 'weka-3-9-3', 'weka.jar')
            selector = "weka.filters.supervised.attribute.AttributeSelection"
            evaluator = '-E "weka.attributeSelection.CfsSubsetEval -P 1 -E 1"'  # evaluator method
            search = '-S "weka.attributeSelection.BestFirst -D'  #search method
            command = 'java -Xmx{}g -cp {}'.format(memory, _WEKA_PATH)
            command += ' {}'.format(selector)
            command += ' {}'.format(evaluator)
            command += ' {}'.format(search)
            desc_frames = {}
            frame_names = []
            arff = dataframetoarff(frame, threshold)
            desc = join(os.getcwd(), 'desc.arff')
            directions = ['fwd', 'bid']  # 0,1,2
            for direc in range(len(directions)):
                option = ' {} -N 5"'.format(direc+1)
                inp = ' -i {}'.format(arff)
                outp = ' -o {}'.format(desc)
                proc = command + option + inp + outp
                #call([proc], shell=True)
                os.system(proc)
                descframe = arfftodataframe(desc, threshold)
                desc_frames[name + '_' + directions[direc]]=descframe
                descframe.to_csv('{}.csv'.format(join(featset_dir, name+'_'+directions[direc])), index=False)
                proc = ''
            # check for similar sets in these three feature reduction sets
            if list(list(desc_frames.values())[0].columns) == list(list(desc_frames.values())[1].columns):
                desc_frames.pop(list(desc_frames.keys())[1], None)
                frame_name = ('|  |_ For {} descriptor set weka-cfs-Bestfirst'
                              '-{} selected {} descriptors'.format(name
                                                                   , list(desc_frames)[0].split('_')[-1]
                                                                   , descframe.shape[1]-1))
                print(frame_name)
                print('|  |  |_ fwd and bid are same. Deleting bid')
                return(desc_frames)
            else:
                for key in desc_frames.keys():
                    frame_name = ('|  |_ For {} descriptor set weka-cfs-Bestfirst'
                                  '-{} selected {} descriptors'.format(name, key.split('_')[-1], descframe.shape[1]-1))
                    print(frame_name)
                return(desc_frames)

                
        if '_nopvalue' in name or len(feature_name) <= 2:
            frame = group_replace(frame)
            selected_frames[name +'-noFeatSel'] = frame
            frame.to_csv('{}.csv'.format(join(featset_dir, name+'-noFeatSel')), index=False)
            print('|  |_ For {} descriptor set NO METHOD'
                  ' selected {} descriptors'.format(name, frame.shape[1]-1))
        else:
            if threshold == 'TNO': # for continous variables
                desc_variance = variance()
                if desc_variance is not None:
                    selected_frames[name +'_Var']=desc_variance

                # correl giving a lot of features which will lead to overfitting
                #desc_correl = correl()
                #selected_frames[name +'_Cor']=desc_correl

                #desc_lasso = lasso()
                #if desc_lasso is not None:
                #    selected_frames[name +'_Lasso']=desc_lasso
                
                #estimator = RandomForestRegressor(n_jobs=-1)
                #desc_rfecv = rfecv(estimator)
                #if desc_rfecv is not None:
                #    selected_frames[name +'_Rfecv']=desc_rfecv
                
                desc_rf = rfr()
                if desc_rf is not None:
                    selected_frames[name +'_Rf']=desc_rf

                desc_weka = wekadescSelect()
                if desc_weka is not None:
                    selected_frames.update(desc_weka)
            else: # for binary or multi class variables
                desc_variance = variance()
                if desc_variance is not None:
                    selected_frames[name +'_Var']=desc_variance

                # correl giving a lot of features which will lead to overfitting
                #desc_correl = correl()
                #selected_frames[name +'_Cor']=desc_correl

                # only defined number of features will be selected. Need auto feature selection method. Not manually defined.
                #desc_chisquare = chi_square()
                #selected_frames[name +'_Chi']=desc_chisquare

                #desc_rfe = rfe()
                #selected_frames[name +'_Rfe']=desc_rfe

                desc_logreg = logreg()
                if desc_logreg is not None:
                    selected_frames[name +'_Lor']=desc_logreg
                
                desc_rf = rf()
                if desc_rf is not None:
                    selected_frames[name +'_Rf']=desc_rf
                
                # LightGBMError: Do not support special JSON characters in feature name. -- have to figure out
                # Error is only with FingerprintMol
                #desc_lgb = lgb()
                #selected_frames[name +'_Lgb']=desc_lgb
                
                #desc_lasso = lasso()
                #if desc_lasso is not None:
                #    selected_frames[name +'_Lasso']=desc_lasso
                
                #estimator = RandomForestClassifier(n_jobs=-1)
                #desc_rfecv = rfecv(estimator)
                #if desc_rfecv is not None:
                #    selected_frames[name +'_Rfecv']=desc_rfecv

                desc_weka = wekadescSelect()
                if desc_weka is not None:
                    selected_frames.update(desc_weka)
    return(selected_frames)

In [23]:
def build_models(threshold, frame, featset, name, reg_algo, clas_algo, fix=False, imbl_method=None):
    exp_name = '{}_{}'.format(featset, name)
    #print('{}, {}, {}, {}, {}'.format(threshold, frame, exp_name, fix, imbl_method))
    if threshold == 'TNO':
        model_conf = regression.setup(data=frame, target='class', session_id=99
                                      , log_experiment=True, experiment_name=exp_name
                                      , silent=True, verbose=False)#, use_gpu=True)
        #['lasso', 'ridge', 'en', 'lar'
        #, 'llar', 'omp', 'br', 'ard'
        #, 'par', 'ransac', 'tr', 'huber'
        #, 'kr', 'knn', 'dt', 'et'
        #, 'ada', 'gbr', 'mlp', 'xgboost'
        #, 'lightgbm', 'catboost']
        models = regression.compare_models(exclude=reg_algo, verbose=False)
        results = regression.pull()
    else:
        model_conf = classification.setup(data=frame, target='class', fix_imbalance=fix
                                          , fix_imbalance_method=imbl_method, session_id=99
                                          , log_experiment=True, experiment_name=exp_name
                                          , silent=True, verbose=False)#, use_gpu=True)
        # return best model
        # cannot use include and exclude together..
        #verbose: bool, default = True. Score grid is not printed when verbose is set to False.
        #['knn', 'nb', 'dt', 'rbfsvm'
        #, 'gpc', 'mlp', 'ridge', 'qda'
        #, 'ada', 'gbc' , 'lda', 'et', 'svm'
        #, 'xgboost', 'lightgbm', 'catboost']
        models = classification.compare_models(exclude=clas_algo, verbose=False)
        results = classification.pull()
    name = exp_name.split('_')[-1]
    #if not results.empty:
    results['sampling_method'] = name
    #results['threshold'] = threshold
    results['thresh_std_feature_set'] = featset
    results['features_count'] = frame.shape[1]-1
    #results['features'] = list(frame.columns)
    return(results)

In [24]:
def ml_workflow(std_records, target_id, targ_dir, cutoffs, thresholds, reg_algo, clas_algo):
    data_end_time = time.time()
    # get the sdf filename
    sd_filename = smi_to_sdf(std_records, target_id, targ_dir)
    # get descriptor sets
    print('|_ Features generation was initiated')
    desc_sets = calc_desc_fp(sd_filename)
    desc_time = time.time()
    print('|_ Features generation was completed')
    if (desc_time-data_end_time) < 60:
        print('|_ {} sec for descriptors and fingerprints'
              ' calculation'.format(round((desc_time-data_end_time), 2)))
    elif ((desc_time-data_end_time)/60) < 60:
        print('|_ {} min for descriptors and fingerprints'
              ' calculation'.format(round((desc_time-data_end_time)/60, 2)))
    else: print('|_ {} hrs for descriptors and fingerprints'
                ' calculation'.format(round((desc_time-data_end_time)/3600, 2)))
    target_model_results = pandas.DataFrame()
    ngboost = NGBClassifier()
    #thresholds = ['TNO', 'TMC', 'T01K', 'T1K', 'T10K'] #'T01K', 'T1K', 'T10K', 'TMC', 'TNO'
    for threshold in thresholds:
        train_frames = {}
        test_frames = {}
        single_class = ''
        # empty dataframe for appending the threshold_results
        threshold_results = pandas.DataFrame()
        print('|_ For threshold {}, Feature transformation'
              ' and pvalue selection was initiated'.format(threshold))
        # get the feat time
        feat_trans_time0 = time.time()
        drop_cols = list(std_records.columns)
        for set_name, desc_frame in desc_sets.items():
            dataframe = get_class(std_records, desc_frame, threshold, cutoffs)
            if len(dataframe.groupby('class').groups.keys()) > 1:
                train_desc, test_desc = get_train_test_sets(dataframe, threshold, drop_cols)
                transformed_sets = data_transformation(set_name, train_desc, threshold)
                for trans_name, trans_frame in transformed_sets.items():
                    #trans_list = trans_frame.iloc[:, [0, 10, 30, 50, -1]] 
                    #print('{}:\n{}'.format(trans_name, trans_list))
                    if 'Fingerprint' in trans_name:
                        pvalue_frame = pvalue_calc(trans_frame, threshold)
                        train_frames[trans_name]=pvalue_frame
                        print('|  |_ For set:{}, {}/{} Selected_Features(pvalue)/Total_Features'.format(trans_name, pvalue_frame.shape[1]-1, train_desc.shape[1]-1))
                        name = '{}_nopvalue'.format(trans_name)
                        train_frames[name]=trans_frame
                        print('|  |_ For set:{}, {}/{} Selected_Features(pvalue)/Total_Features'.format(name, trans_frame.shape[1]-1, train_desc.shape[1]-1))
                    else:
                        pvalue_frame = pvalue_calc(trans_frame, threshold)
                        train_frames[trans_name]=pvalue_frame
                        print('|  |_ For set:{}, {}/{} Selected_Features(pvalue)/Total_Features'.format(trans_name, pvalue_frame.shape[1]-1, train_desc.shape[1]-1))
            
            else:
                single_class = dataframe['class'].unique().tolist()
                print('|  |_Found only single class: Class {}. Skipping feature transformation and pvalue selection.'.format(single_class[0]))
                break
        # get the feat time
        feat_trans_time1 = time.time()
        print('|_ For threshold: {}, feature transformation'
              ' and pvalue selection was completed'.format(threshold))
        # processing time
        if (feat_trans_time1-feat_trans_time0) < 60:
            print('|_ {} sec for feature transformation'
                  ' and pvalue selection'.format(round((feat_trans_time1-feat_trans_time0), 2)))
        elif ((feat_trans_time1-feat_trans_time0)/60) < 60:
            print('|_ {} min for feature transformation'
                  ' and pvalue selection'.format(round((feat_trans_time1-feat_trans_time0)/60, 2)))
        else: print('|_ {} hrs for feature transformation'
                    ' and pvalue based feature selection'.format(round((feat_trans_time1
                                                                        -feat_trans_time0)/3600, 2)))
        print('|_ For {}, a total of {} feature sets are transformed'
              ' and are ready for feature selection'.format(threshold, len(train_frames)))
        print('|_ For threshold: {}, feature selection was initiated'.format(threshold))

        featset_dir = join(targ_dir, '{}_{}'.format('feature-sets',threshold))
        os.mkdir(featset_dir)
        if not train_frames:# if train_frames are empty
            print('|  |_Found only single class: Class {}. Skipping feature selection.'.format(single_class[0]))
            print('|_Found only single class: Class {}. Skipping training the models.'.format(single_class[0]))
        else:
            # get the feat time
            feat_sel_time0 = time.time()
            # get the selected features
            feats_selected = feature_selection(train_frames, threshold, featset_dir)
            # get the feat time
            feat_sel_time1 = time.time()
            print('|_ For threshold: {}, feature selection was completed'.format(threshold))
            if (feat_sel_time1-feat_sel_time0) < 60:
                print('|_ {} sec for feature selection'.format(round((feat_sel_time1
                                                                      -feat_sel_time0), 2)))
            elif ((feat_sel_time1-feat_sel_time0)/60) < 60:
                print('|_ {} min for feature selection'.format(round((feat_sel_time1
                                                                      -feat_sel_time0)/60, 2)))
            else: print('|_ {} hrs for feature selection'.format(round((feat_sel_time1
                                                                        -feat_sel_time0)/3600, 2)))
            print('|_ For {}, a total of {} feature sets'
                  ' are ready for model generation'.format(threshold, len(feats_selected)))#count))

            # models
            # do model training and get training set results
            model_time0 = time.time()
            for featset, frame in feats_selected.items():
                #print('printing frame name: {} and shape: {}'.format(featset, frame.shape))
                try:
                    if threshold == 'TNO':
                        name = 'None'
                        results = build_models(threshold, frame, featset, name, reg_algo, clas_algo)
                        #print(results)
                        threshold_results = pandas.concat([threshold_results, results], ignore_index=True)
                        #threshold_results.drop('TT (Sec)', axis=1, inplace=True)
                        print('Results for {}: model results\n {}'.format(featset, threshold_results))
                    else:
                        sample_size = frame.groupby('class').size().values[0]# sample size of the minor class
                        if sample_size <=5: neighbors = sample_size-1
                        else: neighbors = 5
                        if sample_size < 2 :
                            name = 'None'
                            results = build_models(threshold, frame, featset, name, reg_algo, clas_algo)
                            threshold_results = pandas.concat([threshold_results, results], ignore_index=True)
                            #threshold_results.drop('TT (Sec)', axis=1, inplace=True)
                            print('Results for {}: model results\n {}'.format(featset, threshold_results))
                        else:
                            # sampling_strategy= 'not majority': resample all classes but the majority class;
                            imbl_dict = {'None':None
                                         , 'SMOTE':SMOTE(random_state=369
                                                         , sampling_strategy='not majority'
                                                         , k_neighbors=neighbors, n_jobs=14)
                                         , 'SVMSMOTE':SVMSMOTE(random_state=369
                                                               , sampling_strategy='not majority'
                                                               , k_neighbors=neighbors, n_jobs=14)
                                         , 'BLSMOTE':BorderlineSMOTE(random_state=369
                                                                     , sampling_strategy='not majority'
                                                                     , k_neighbors=neighbors, n_jobs=14)
                                         , 'ADASYN':ADASYN(random_state=369
                                                           , sampling_strategy='not majority'
                                                           , n_neighbors=neighbors, n_jobs=14)}
                                         #, 'KMeansSMOTE':KMeansSMOTE(random_state=369
                                         #                            , sampling_strategy='not majority'
                                         #                            , k_neighbors=neighbors, n_jobs=14)
                            for name, imbl_method in imbl_dict.items():
                                if name == 'None': fix=False
                                else: fix=True
                                results = build_models(threshold, frame, featset, name, reg_algo, clas_algo, fix, imbl_method)
                                threshold_results = pandas.concat([threshold_results, results]
                                                                  , ignore_index=True)
                                #threshold_results.drop('TT (Sec)', axis=1, inplace=True)
                            print('Results for {}: model results\n {}'.format(featset, threshold_results))
                except RuntimeError as rte:
                    print('RuntimeError for {} with shape {}'
                          '\nError: {}'.format(featset, frame.shape, rte))
                    print(frame['class'])
                    pass
                except ValueError as ve:
                    print('ValueError for {} with shape {}'
                          '\nError: {}'.format(featset, frame.shape, ve))
                    print(frame['class'])
                    pass
                #except:
                #    # handle all other exceptions
                #    pass

            if threshold == 'TNO':
                threshold_results.sort_values('MAE', axis=0, ascending=True, inplace=True) # MAE
                threshold_results.to_csv('{}_{}_training_set_results.csv'.format(join(targ_dir, target_id)
                                                                                 ,threshold), index=False)
            else:
                threshold_results.sort_values(['MCC', 'Accuracy'], axis=0
                                              , ascending = (False, False), inplace=True)
                threshold_results.to_csv('{}_{}_training_set_results.csv'.format(join(targ_dir, target_id)
                                                                                 , threshold), index=False)
            print(threshold_results)
            display(threshold_results)
            model_time1 = time.time()
            print('For threshold: {},'
                  ' Total Models generated: {}'.format(threshold, threshold_results.shape[0]))
            # processing time
            if (model_time1-model_time0) < 60:
                print('|_ {} sec for training the models'.format(round((model_time1-model_time0), 2)))
            elif ((model_time1-model_time0)/60) < 60:
                print('|_ {} min for training the models'.format(round((model_time1-model_time0)/60, 2)))
            else: print('|_ {} hrs for training the models'.format(round((model_time1
                                                                          -model_time0)/3600, 2)))
            target_model_results = pandas.concat([target_model_results, threshold_results]
                                                 , ignore_index=True)
    # get the end time
    end_time = time.time()
    print('For Target: {}, Total Models generated: {}'.format(target_id, target_model_results.shape[0]))
    if (end_time-start_time) < 60:
        print('|_ {} sec for data to models generation'.format(round((end_time-start_time), 2)))
    elif ((end_time-start_time)/60) < 60:
        print('|_ {} min for data to models generation'.format(round((end_time-start_time)/60, 2)))
    else: print('|_ {} hrs for data to models generation'.format(round((end_time-start_time)/3600, 2)))
    target_model_results.to_csv('{}_training_set_results.csv'.format(join(targ_dir, target_id))
                                , index=False)
    print('\n\n\n')
        
    return

In [25]:
def get_thresholds():
    cutoffs = easygui.multchoicebox(msg='Choose atleast one from given thresholds in μM (microMolar):'
                                   , title='Threshold for model building'
                                   , choices=[0.1, 1, 10, 30]
                                   , preselect=[2])
    if not cutoffs:
        return(cutoffs, ['TNO'])
    else:
        if len(cutoffs)>1:
            thresholds = ['T{}'.format(c) for c in cutoffs]
            thresholds.extend(['TMC', 'TNO'])
        else:
            thresholds = ['T{}'.format(''.join(cutoffs)),'TNO']
        return(cutoffs, thresholds)

In [26]:
def set_aglorithms(threshold):
    if threshold == 'TNO':
        algo_list = ['AdaBoost Regressor','Automatic Relevance Determination'
                     ,'Bayesian Ridge', 'CatBoost Regressor'
                     , 'Decision Tree Regressor', 'Elastic Net'
                     , 'Extra Trees Regressor', 'Gradient Boosting Regressor'
                     , 'Huber Regressor', 'K Neighbors Regressor'
                     , 'Kernel Ridge', 'Least Angle Regression'
                     , 'Lasso Regression', 'Light Gradient Boosting Machine'
                     , 'Lasso Least Angle Regression', 'Linear Regression'
                     , 'MLP Regressor', 'Orthogonal Matching Pursuit'
                     , 'Passive Aggressive Regressor', 'Random Sample Consensus'
                     , 'Random Forest Regressor', 'Ridge Regression'
                     , 'Support Vector Regression', 'TheilSen Regressor'
                     , 'Extreme Gradient Boosting']
        algo_dict = {'ada' : 'AdaBoost Regressor', 'ard' : 'Automatic Relevance Determination'
                     , 'br' : 'Bayesian Ridge', 'catboost' : 'CatBoost Regressor'
                     , 'dt' : 'Decision Tree Regressor', 'en' : 'Elastic Net'
                     , 'et' : 'Extra Trees Regressor', 'gbr' : 'Gradient Boosting Regressor'
                     , 'huber' : 'Huber Regressor', 'knn' : 'K Neighbors Regressor'
                     , 'kr' : 'Kernel Ridge', 'lar' : 'Least Angle Regression'
                     , 'lasso' : 'Lasso Regression', 'lightgbm' : 'Light Gradient Boosting Machine'
                     , 'llar' : 'Lasso Least Angle Regression', 'lr' : 'Linear Regression'
                     , 'mlp' : 'MLP Regressor', 'omp' : 'Orthogonal Matching Pursuit'
                     , 'par' : 'Passive Aggressive Regressor', 'ransac' : 'Random Sample Consensus'
                     , 'rf' : 'Random Forest Regressor', 'ridge' : 'Ridge Regression'
                     , 'svm' : 'Support Vector Regression', 'tr' : 'TheilSen Regressor'
                     , 'xgboost' : 'Extreme Gradient Boosting'}
    else:
        algo_list = ['Ada Boost Classifier', 'CatBoost Classifier'
                     , 'Decision Tree Classifier', 'Extra Trees Classifier'
                     , 'Gradient Boosting Classifier', 'Gaussian Process Classifier'
                     , 'K Neighbors Classifier', 'Linear Discriminant Analysis'
                     , 'Light Gradient Boosting Machine', 'Logistic Regression'
                     , 'MLP Classifier', 'Naive Bayes'
                     , 'Quadratic Discriminant Analysis', 'Radial Kernel'
                     , 'Random Forest Classifier', 'Ridge Classifier'
                     , 'Linear Kernel', 'Extreme Gradient Boosting']
        
        algo_dict = {'ada' : 'Ada Boost Classifier',  'catboost' : 'CatBoost Classifier'
                     , 'dt' : 'Decision Tree Classifier', 'et' : 'Extra Trees Classifier'
                     , 'gbc' : 'Gradient Boosting Classifier', 'gpc' : 'Gaussian Process Classifier'
                     , 'knn' : 'K Neighbors Classifier', 'lda' : 'Linear Discriminant Analysis'
                     , 'lightgbm' : 'Light Gradient Boosting Machine', 'lr' : 'Logistic Regression'
                     , 'mlp' : 'MLP Classifier', 'nb' : 'Naive Bayes'
                     , 'qda' : 'Quadratic Discriminant Analysis', 'rbfsvm' : 'SVM - Radial Kernel'
                     , 'rf' : 'Random Forest Classifier', 'ridge' : 'Ridge Classifier'
                     , 'svm' : 'SVM - Linear Kernel', 'xgboost' : 'Extreme Gradient Boosting'}
    return(algo_list, algo_dict)

def get_algorithms(threshold):
    algo_list, algo_dict = set_aglorithms(threshold)
    if threshold == 'TNO':
        typ = 'Regression'
        select = [20]
    else:
        typ = 'Classification'
        select = [14]
    model_keys = easygui.multchoicebox(msg='Choose the Algorithms for {}'.format(typ)
                                   , title='Models for Training the {} models'.format(typ)
                                   , choices=algo_list
                                   , preselect=select)
    algorithms = []
    for key, value in algo_dict.items():
        for model in model_keys:
            if model == value:
                algorithms.append(key)
    algorithms = list(set(algo_dict.keys()) - set(algorithms))
    return(algorithms)

In [None]:
if __name__ == "__main__":
    warnings.filterwarnings('ignore')
    warnings.simplefilter("ignore")
    # get the start time
    start_time = time.time()
    
    #options for the user to choose
    options = ['ChEMBL target list', 'Molecules list']
    # create a easygui choice box window to get input from user
    choice = easygui.choicebox(msg='Choose the input file type:'
                               , title='Input list and file'
                               , choices=options
                               , preselect=[])
    print('Selected {} as input list.'.format(choice))
    
    # input for the ChEMBL Target ID List
    if choice == options[0]:
        input_file = easygui.fileopenbox(title='Select the {} file'.format(choice),
                                         default='*txt', filetypes=['*.txt', 'TEXT files'])
        print('Input file is {}\n'.format(input_file))
        cutoffs, thresholds = get_thresholds()
        if cutoffs is not None:
            clas_algo = get_algorithms(thresholds[0])
            reg_algo = get_algorithms(thresholds[-1])
        else:
            clas_algo = None
            reg_algo = get_algorithms(thresholds[-1])
        try:
            # creates an engine to connect to the database using a URL containing
            # databasedialect[+driver]://user:password@host/dbname
            engine = create_engine("postgresql+psycopg2://sarath:jbl@cci@localhost/chembl_27")
            engine.connect()
            chembl_web = False
        except Exception as e:
            chembl_web = True
            print('Error: {}'.format(e))
        with open(input_file, 'r') as target_ids:
            for target_id in target_ids:
                target_id = target_id.strip()
                # start time for the data fetching
                target_start_time = time.time()
                # detect the current working directory and create a new directory using the input target id
                targ_dir = join(os.getcwd(), target_id)
                os.mkdir(targ_dir)
                if not chembl_web:
                    print('Retrieving the data from chembl_27 database')
                    # get data from chembl database
                    std_records, test_records = get_data_from_chembldb(target_id, targ_dir, cutoffs, thresholds, choice)
                else:
                    from chembl_webresource_client.new_client import new_client
                    print('Kill the process and rerun the code'
                          ' or will retrieve data using chembl webresource client\n'
                          '... ... ... ... ...\n... ... ... ... ...\n'
                          'Retrieving the data by using chembl webresource client')
                    # get data thru chembl web client
                    std_records, test_records = get_data_from_chemblwbc(target_id, targ_dir, cutoffs, thresholds, choice)
                    #get_data_from_chemblwbc(target_id, targ_dir)
                # stop time for the data fetching
                data_end_time = time.time()
                if (data_end_time-target_start_time) < 60:
                    print('|_ {} sec to fetch the standard data'
                          ' from ChEMBL-27 database'.format(round(data_end_time-target_start_time, 2)))
                else: print('|_ {} min to fetch the standard data'
                            ' from ChEMBL-27 database'.format(round((data_end_time-target_start_time)/60, 2)))
                ml_workflow(std_records, target_id, targ_dir, cutoffs, thresholds, reg_algo, clas_algo)
        
    # input for Molecule List
    elif choice == options[1]:
        # for user input, csv file must contain following column names to execute the code properly
        # mol_id, standard_type, pvalue, canonical_smiles
        input_file = easygui.fileopenbox(title='Select the {} file'.format(choice)
                                         , default='*csv', filetypes=['*.csv', 'CSV files'])
        print('Input file is {}\n'.format(input_file))
        target_id = splitext(basename(input_file))[0]
        print(basename(input_file))
        cutoffs, thresholds = get_thresholds()
        if cutoffs is not None:
            clas_algo = get_algorithms(thresholds[0])
            reg_algo = get_algorithms(thresholds[-1])
        else:
            clas_algo = None
            reg_algo = get_algorithms(thresholds[-1])
        # start time for the data fetching
        target_start_time = time.time()
        # detect the current working directory and create a new directory using the input target id
        targ_dir = join(os.getcwd(),target_id)
        os.mkdir(targ_dir)
        #std_records = pandas.read_csv(input_file)
        dataframe = pandas.read_csv(input_file)
        std_records = get_std_data(dataframe, 'user_data', cutoffs, choice)
        std_records.to_csv('{}_uniquedata.csv'.format(join(targ_dir, target_id)), index=False)
        # stop time for the data fetching
        data_end_time = time.time()
        if (data_end_time-target_start_time) < 60:
            print('|_ {} sec to read and standardize the data'
                  ' from {}'.format(round(data_end_time-target_start_time, 2), basename(input_file)))
        else: print('|_ {} sec to read and standardize the data'
                  ' from {}'.format(round((data_end_time-target_start_time)/60, 2)
                    , basename(input_file)))
        ml_workflow(std_records, target_id, targ_dir, cutoffs, thresholds, reg_algo, clas_algo)
        
    else: # Error message
        easygui.msgbox(msg='Re-run !!! the code and choose a input type.',
                       title='Error !!!', ok_button='OK')
    print('Job is done')

[]
Selected Molecules list as input list.
Input file is D:\user_std-data.csv

user_std-data.csv
[2]
[14]
[20]
|_ 47 user_data molecules were found
|_ 1.13 sec to read and standardize the data from user_std-data.csv
|_ Features generation was initiated
|  |_ PaDEL Descriptors calculation was completed
|  |_ Mordred Descriptors calculation was completed
|  |_ RDKit Descriptors calculation was completed
|  |_ PaDEL Fingerprinter fingerprints calculation was completed
|  |_ PaDEL ExtendedFingerprinter fingerprints calculation was completed
|  |_ PaDEL EStateFingerprinter fingerprints calculation was completed
|  |_ PaDEL GraphOnlyFingerprinter fingerprints calculation was completed
|  |_ PaDEL MACCSFingerprinter fingerprints calculation was completed
|  |_ PaDEL PubchemFingerprinter fingerprints calculation was completed
|  |_ PaDEL SubstructureFingerprinter fingerprints calculation was completed
|  |_ PaDEL KlekotaRothFingerprinter fingerprints calculation was completed
|  |_ PaDEL AtomPa

|  |_No Descriptor is having a pvalue less than 0.05
|  |_ For set:T10_notrans_AT2DFingerprint, 2664/4096 Selected_Features(pvalue)/Total_Features
|  |_ For set:T10_notrans_AT2DFingerprint_nopvalue, 2664/4096 Selected_Features(pvalue)/Total_Features
|  |_No Descriptor is having a pvalue less than 0.05
|  |_ For set:T10_notrans_CATS2DFingerprint, 0/4096 Selected_Features(pvalue)/Total_Features
|  |_ For set:T10_notrans_CATS2DFingerprint_nopvalue, 0/4096 Selected_Features(pvalue)/Total_Features
|  |_No Descriptor is having a pvalue less than 0.05
|  |_ For set:T10_notrans_PHAP2POINT2DFingerprint, 8/4096 Selected_Features(pvalue)/Total_Features
|  |_ For set:T10_notrans_PHAP2POINT2DFingerprint_nopvalue, 8/4096 Selected_Features(pvalue)/Total_Features
|  |_No Descriptor is having a pvalue less than 0.05
|  |_ For set:T10_notrans_PHAP3POINT2DFingerprint, 160/4096 Selected_Features(pvalue)/Total_Features
|  |_ For set:T10_notrans_PHAP3POINT2DFingerprint_nopvalue, 160/4096 Selected_Features(p

|  |_ For T10_notrans_GraphOnlyFingerprinter descriptor set Random Forest selected 54 descriptors
|  |_ For T10_notrans_GraphOnlyFingerprinter descriptor set weka-cfs-Bestfirst-fwd selected 25 descriptors
|  |  |_ fwd and bid are same. Deleting bid
|  |_ For T10_notrans_GraphOnlyFingerprinter_nopvalue descriptor set NO METHOD selected 348 descriptors
|  |_ For T10_notrans_MACCSFingerprinter descriptor set Variance selected 18 descriptors
|  |_ For T10_notrans_MACCSFingerprinter descriptor set Log.Reg.-L1 selected 7 descriptors
|  |_ For T10_notrans_MACCSFingerprinter descriptor set Random Forest selected 20 descriptors
|  |_ For T10_notrans_MACCSFingerprinter descriptor set weka-cfs-Bestfirst-fwd selected 14 descriptors
|  |  |_ fwd and bid are same. Deleting bid
|  |_ For T10_notrans_MACCSFingerprinter_nopvalue descriptor set NO METHOD selected 84 descriptors
|  |_ For T10_notrans_PubchemFingerprinter descriptor set Variance selected 35 descriptors
|  |_ For T10_notrans_PubchemFingerp

|  |_ For T10_notrans_PHAP2POINT2DFingerprint descriptor set weka-cfs-Bestfirst-fwd selected 1 descriptors
|  |  |_ fwd and bid are same. Deleting bid
|  |_ For T10_notrans_PHAP2POINT2DFingerprint_nopvalue descriptor set NO METHOD selected 8 descriptors
|  |_ For T10_notrans_PHAP3POINT2DFingerprint descriptor set Variance selected 5 descriptors
|  |_ For T10_notrans_PHAP3POINT2DFingerprint descriptor set Log.Reg.-L1 selected 33 descriptors
|  |_ For T10_notrans_PHAP3POINT2DFingerprint descriptor set Random Forest selected 31 descriptors
|  |_ For T10_notrans_PHAP3POINT2DFingerprint descriptor set weka-cfs-Bestfirst-fwd selected 18 descriptors
|  |  |_ fwd and bid are same. Deleting bid
|  |_ For T10_notrans_PHAP3POINT2DFingerprint_nopvalue descriptor set NO METHOD selected 160 descriptors
|  |_ For T10_notrans_ECFPFingerprint descriptor set Variance selected 53 descriptors
|  |_ For T10_notrans_ECFPFingerprint descriptor set Log.Reg.-L1 selected 11 descriptors
|  |_ For T10_notrans_ECF

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Results for T10_notrans_padel_Var: model results
                       Model  Accuracy  AUC  Recall   Prec.    F1  Kappa  MCC  \
0  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   

   TT (Sec) sampling_method thresh_std_feature_set  features_count  
0     4.305            None  T10_notrans_padel_Var             536  
Results for T10_notrans_padel_Rf: model results
                       Model  Accuracy  AUC  Recall   Prec.    F1  Kappa  MCC  \
0  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
1  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   

   TT (Sec) sampling_method thresh_std_feature_set  features_count  
0     4.305            None  T10_notrans_padel_Var             536  
1     0.175            None   T10_notrans_padel_Rf              64  
Results for T10_notrans_padel_fwd: model results
                       Model  Accuracy  AUC  Recall   Prec.    F1  Kappa  MCC  \
0  Random Forest Classif

Results for T10_rbst_padel_fwd: model results
                       Model  Accuracy  AUC  Recall   Prec.    F1  Kappa  MCC  \
0  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
1  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
2  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
3  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
4  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
5  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
6  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
7  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
8  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
9  Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   

   TT (Sec) sampling_method thresh_std_feature_set  features_count  
0  

Results for T10_notrans_mordred_Var: model results
                        Model  Accuracy  AUC  Recall   Prec.    F1  Kappa  MCC  \
0   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
1   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
2   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
3   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
4   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
5   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
6   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
7   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
8   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
9   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
10  Random Forest Classifier    0.9667  0.0     1.0  0.96

Results for T10_std_mordred_Var: model results
                        Model  Accuracy  AUC  Recall   Prec.    F1  Kappa  MCC  \
0   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
1   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
2   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
3   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
4   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
5   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
6   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
7   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
8   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
9   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
10  Random Forest Classifier    0.9667  0.0     1.0  0.9667  

Results for T10_std_mordred_fwd: model results
                        Model  Accuracy  AUC  Recall   Prec.    F1  Kappa  MCC  \
0   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
1   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
2   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
3   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
4   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
5   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
6   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
7   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
8   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
9   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
10  Random Forest Classifier    0.9667  0.0     1.0  0.9667  

Results for T10_rbst_mordred_fwd: model results
                        Model  Accuracy  AUC  Recall   Prec.    F1  Kappa  MCC  \
0   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
1   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
2   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
3   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
4   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
5   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
6   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
7   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
8   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
9   Random Forest Classifier    0.9667  0.0     1.0  0.9667  0.98    NaN  0.0   
10  Random Forest Classifier    0.9667  0.0     1.0  0.9667 

Results for T10_minmax_mordred_Lor: model results
                        Model  Accuracy  AUC  Recall   Prec.      F1  Kappa  \
0   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
1   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
2   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
3   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
4   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
5   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
6   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
7   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
8   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
9   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
10  Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
11  Random Fo

Results for T10_minmax_mordred_fwd: model results
                        Model  Accuracy  AUC  Recall   Prec.      F1  Kappa  \
0   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
1   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
2   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
3   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
4   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
5   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
6   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
7   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
8   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
9   Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
10  Random Forest Classifier    0.9667  0.0    1.00  0.9667  0.9800    NaN   
11  Random Fo

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "E:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-27-c95bc1027ee6>", line 100, in <module>
    ml_workflow(std_records, target_id, targ_dir, cutoffs, thresholds, reg_algo, clas_algo)
  File "<ipython-input-24-90bb7feda03d>", line 117, in ml_workflow
    results = build_models(threshold, frame, featset, name, reg_algo, clas_algo)
  File "<ipython-input-23-6b3325354612>", line 17, in build_models
    model_conf = classification.setup(data=frame, target='class', fix_imbalance=fix
  File "E:\Anaconda3\lib\site-packages\pycaret\classification.py", line 580, in setup
    return pycaret.internal.tabular.setup(
  File "E:\Anaconda3\lib\site-packages\pycaret\internal\tabular.py", line 1308, in setup
    test_data = prep_pipe.transform(test_data)
  File "E:\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 549, in _transform
    Xt = 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "E:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-27-c95bc1027ee6>", line 100, in <module>
    ml_workflow(std_records, target_id, targ_dir, cutoffs, thresholds, reg_algo, clas_algo)
  File "<ipython-input-24-90bb7feda03d>", line 117, in ml_workflow
    results = build_models(threshold, frame, featset, name, reg_algo, clas_algo)
  File "<ipython-input-23-6b3325354612>", line 17, in build_models
    model_conf = classification.setup(data=frame, target='class', fix_imbalance=fix
  File "E:\Anaconda3\lib\site-packages\pycaret\classification.py", line 580, in setup
    return pycaret.internal.tabular.setup(
  File "E:\Anaconda3\lib\site-packages\pycaret\internal\tabular.py", line 1308, in setup
    test_data = prep_pipe.transform(test_data)
  File "E:\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 549, in _transform
    Xt = 