In [1]:
# METADATA:
__author__='Leela Sarath Kumar Konda'
#########################################################################################################
# README                                                                                                #
#########################################################################################################
# Before executing this code install the following                                                      #
#                                                                                                       #
# 1) anaconda        - Download the file from https://www.anaconda.com/ and                             #
#                    - follow the given instructions                                                    #
#                                                                                                       #
# 2) rdkit           - conda install -c conda-forge rdkit                                               #
#                                                                                                       #
# 3) MolVS           - pip install MolVS                                                                #
#                                                                                                       #
# 4) java                                                                                               #
#     linux          - sudo apt install default-jdk default-jre                                         #
#     windows        - download the file from                                                           #
#                    - https://www.oracle.com/in/java/technologies/javase-jdk15-downloads.html          #
#                                                                                                       #
# 5) easygui        - pip install easygui                                                               #
#                                                                                                       #
#########################################################################################################

In [1]:
# imports
import sys
import warnings
import os
from os.path import join
from os.path import splitext
from os.path import dirname
from os.path import basename

from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover
from molvs import Standardizer
from molvs.standardize import standardize_smiles

import pandas
import numpy

import itertools
from subprocess import call

# import easygui

In [2]:
# jar files
# Both paths are resolved against the current working directory at the moment
# this cell runs, so the notebook expects the 'PaDEL-Descriptor' and
# 'weka-3-9-3' folders to sit next to wherever the kernel was started.
_padel_jar_ = os.path.join(os.getcwd(), 'PaDEL-Descriptor', 'PaDEL-Descriptor.jar')
_weka_jar_ = os.path.join(os.getcwd(), 'weka-3-9-3', 'weka.jar')

#print(_padel_jar_)
#print(_weka_jar_)

In [3]:
def std_smiles(smiles):
    """Parse a SMILES string and return its MolVS-standardized RDKit molecule.

    Parameters
    ----------
    smiles : str
        SMILES string describing one structure.

    Returns
    -------
    rdkit molecule or None
        The object returned by ``Standardizer.standardize``. ``None`` is
        returned when ``smiles`` is not a string (for example an empty CSV
        cell that pandas read as NaN) or when RDKit cannot parse it, so that
        one bad record no longer aborts a whole batch.
    """
    # Empty CSV cells arrive as float NaN; RDKit only accepts strings.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    # RDKit reports an unparseable SMILES by returning None rather than raising.
    if mol is None:
        return None

    std = Standardizer()
    std_mol = std.standardize(mol)
    # Alternative kept from the original author: molvs.standardize_smiles(smiles)
    # does the SMILES -> mol -> SMILES round trip itself and returns a string.
    return std_mol

In [4]:
def remove_salt(std_mol):
    """Strip salt fragments from a molecule and return the remaining SMILES.

    Parameters
    ----------
    std_mol : rdkit molecule or None
        Molecule to desalt. ``None`` (an upstream parse failure) is accepted.

    Returns
    -------
    str or None
        SMILES of the stripped molecule, or ``None`` when no molecule was
        supplied or when at most one atom is left after stripping.
    """
    # Nothing to desalt: propagate the "no structure" marker to the caller.
    if std_mol is None:
        return None

    remover = SaltRemover()
    # dontRemoveEverything=True is passed through to RDKit's StripMol unchanged.
    res = remover.StripMol(std_mol, dontRemoveEverything=True)

    # Single atoms (or nothing at all) are treated as not being a usable structure.
    if res.GetNumAtoms() <= 1:
        return None
    return Chem.MolToSmiles(res)

In [5]:
def standardframe(csv_file):
    """Read the input CSV and add a standardized, desalted SMILES column.

    Parameters
    ----------
    csv_file : str
        Path of a CSV file that has a ``SMILES`` column.

    Returns
    -------
    pandas.DataFrame
        The CSV content plus a ``Std_SMILES`` column. Rows containing any
        missing value (including structures for which ``remove_salt`` gave
        ``None``) are dropped, and the index is renumbered 0..n-1.
    """
    dataframe = pandas.read_csv(csv_file)
    dataframe['Std_SMILES'] = [
        remove_salt(std_smiles(smiles)) for smiles in dataframe.SMILES
    ]
    # Renumber after dropping rows: downstream code combines this frame with
    # tables that are indexed 0..n-1, and pandas aligns column assignment on
    # index labels, so gaps in the index would shift or blank out results.
    std_dataframe = dataframe.dropna().reset_index(drop=True)
    return std_dataframe

In [None]:
def smi_to_sdf(standard_records, csv_file):
    """Write the standardized structures to an SD file next to the input CSV.

    Parameters
    ----------
    standard_records : pandas.DataFrame
        Frame with ``Std_SMILES`` and ``ID`` columns.
    csv_file : str
        Path of the input CSV; the SD file uses the same path with the
        extension replaced by ``.sdf``.

    Returns
    -------
    str
        Path of the SD file that was written.
    """
    sd_filename = '{}.sdf'.format(os.path.splitext(csv_file)[0])
    sdwriter = Chem.SDWriter(sd_filename)
    try:
        for smiles, title in zip(standard_records.Std_SMILES, standard_records.ID):
            mol = Chem.MolFromSmiles(smiles)
            # Explicit hydrogens are added; no coordinates are generated here.
            hmol = Chem.AddHs(mol)
            # IDs read from CSV may be numeric, so convert before stripping.
            hmol.SetProp("_Name", str(title).strip())
            sdwriter.write(hmol)
    finally:
        # Always flush and release the file, even if a record fails.
        sdwriter.close()
    return sd_filename

SyntaxError: unterminated string literal (detected at line 1) (1266064593.py, line 1)

In [7]:
# Data preprocessing
# convert non-numeric (string) values to NaN
def str_to_Nan(dataframe):
    """Coerce every column of ``dataframe`` to numeric, in place.

    Each column is passed through ``pandas.to_numeric(..., errors='coerce')``,
    so any value that cannot be read as a number becomes NaN. The frame that
    was passed in is modified and also returned.
    """
    for name in dataframe.columns.tolist():
        dataframe[name] = pandas.to_numeric(dataframe[name], errors='coerce')
    return dataframe

# Replace missing numerical data with median
def fillNan(dataframe):
    """Return ``dataframe`` with missing values replaced by column medians.

    The frame is first run through ``str_to_Nan`` (which coerces non-numeric
    entries to NaN, modifying the input frame), then every NaN is filled with
    the median of its own column.
    """
    numeric = str_to_Nan(dataframe)
    column_medians = numeric.median()
    return numeric.fillna(column_medians)

In [8]:
def calc_features(std_dataframe, csv_file):
    """Compute PaDEL descriptors for the standardized structures.

    The structures are written to an SD file with ``smi_to_sdf``, PaDEL is run
    on that file through ``java``, and the resulting CSV is loaded and imputed
    with ``fillNan``.

    Parameters
    ----------
    std_dataframe : pandas.DataFrame
        Frame accepted by ``smi_to_sdf`` (``Std_SMILES`` and ``ID`` columns).
    csv_file : str
        Path of the original input CSV; used to derive the output file names.

    Returns
    -------
    pandas.DataFrame
        PaDEL output with non-numeric values coerced to NaN and NaN replaced
        by column medians.

    Raises
    ------
    RuntimeError
        If the Java process exits with a non-zero status.
    """
    sdf_filename = smi_to_sdf(std_dataframe, csv_file)

    fp_file = os.path.join(os.getcwd(), 'PaDEL-Descriptor', 'descriptors.xml')
    pdl_output_file = '{}_pdl_desc.csv'.format(os.path.splitext(sdf_filename)[0])

    # An argument list (no shell) keeps paths containing spaces intact.
    command = [
        'java', '-jar', '-splash:disable', _padel_jar_,
        '-threads', '-1',
        '-2d',
        '-fingerprints',
        '-removesalt',
        '-standardizenitro',
        '-detectaromaticity',
        '-retainorder',
        '-descriptortypes', fp_file,
        '-dir', sdf_filename,
        '-file', pdl_output_file,
    ]
    status = call(command)
    if status != 0:
        # Without this check a stale output file from an earlier run could be
        # read silently below.
        raise RuntimeError(
            'PaDEL-Descriptor exited with status {}'.format(status))

    # Load the descriptor table and impute missing values with column medians.
    dataframe = pandas.read_csv(pdl_output_file)
    pdl_frame = fillNan(dataframe)
    print("PaDEL Descriptor calculation was completed")
    return pdl_frame

In [9]:
def feature_selection(desc_set, feat_file):
    """Keep only the descriptors listed in a feature file.

    Parameters
    ----------
    desc_set : pandas.DataFrame
        Descriptor table; it is not modified.
    feat_file : str
        Path of a header-less text/CSV file whose first column lists feature
        names. The last entry is taken as the name of the target variable.

    Returns
    -------
    tuple
        ``(selected_features, y_variable)`` where ``selected_features`` holds
        the columns of ``desc_set`` named in the file plus a ``y_variable``
        column filled with ``'?'``.
    """
    feature_names = pandas.read_csv(feat_file, header=None)[0].tolist()
    y_variable = feature_names[-1]

    wanted = desc_set.columns.intersection(feature_names)
    # Explicit copy: the new column is added to an independent frame, never to
    # something pandas may consider a slice of ``desc_set``.
    selected_features = desc_set[wanted].copy()
    selected_features[y_variable] = '?'
    return (selected_features, y_variable)

In [10]:
# WEKA
def dataframetoarff(frame, y_variable, dtyp):  # pass the frame
    """Write ``frame`` to ``temp.arff`` in the working directory.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to export; every column becomes one attribute.
    y_variable : str
        Name of the target column. It is declared with ``dtyp``; every other
        column is declared ``numeric``.
    dtyp : str
        ARFF type declaration for the target, e.g. ``'{Active,Inactive}'``.

    Returns
    -------
    str
        The file name that was written (``'temp.arff'``).
    """
    arff = 'temp.arff'
    # '?' is the ARFF marker for a missing value; an empty field is not valid.
    # splitlines() copes with either line-ending style in the CSV text.
    rows = frame.to_csv(header=False, index=False, na_rep='?').splitlines()

    with open(arff, "w") as f:
        f.write('@relation temp\n\n')

        for name in frame.columns:
            if name != y_variable:
                f.write('@attribute {} numeric\n'.format(name))
            else:
                f.write('@attribute {} {}\n'.format(name, dtyp))

        f.write('\n@data\n')

        for row in rows:
            f.write('{}\n'.format(row))

    return (arff)

In [11]:
def get_ml_predictions(test_file, algo, model):
    """Run a saved Weka model on an ARFF file and return its predictions.

    Parameters
    ----------
    test_file : str
        Path of the ARFF file to score.
    algo : str
        Fully qualified Weka classifier class name.
    model : str
        Path of the serialized Weka model passed to ``-l``.

    Returns
    -------
    pandas.DataFrame
        Weka's CSV prediction output with columns 0, 1 and 3 removed and the
        ``1:`` / ``2:`` prefixes stripped from the ``predicted`` column.

    Raises
    ------
    RuntimeError
        If the Java process exits with a non-zero status.
    """
    result_csv = 'temp_result.csv'
    # An argument list (no shell) keeps paths containing spaces intact.
    command = [
        'java', '-splash:disable', '-Xmx8g', '-cp', _weka_jar_, algo,
        '-l', model,
        '-T', test_file,
        '-classifications', 'weka.classifiers.evaluation.output.prediction.CSV',
    ]
    # Capture Weka's standard output in the result file (replaces the shell '>').
    with open(result_csv, 'w') as out:
        status = call(command, stdout=out)
    if status != 0:
        raise RuntimeError(
            'Weka ({}) exited with status {}'.format(algo, status))

    # The first four lines of the output are skipped before the CSV header.
    predictions = pandas.read_csv(result_csv, delimiter=',', skiprows=4)
    predictions = predictions.drop(predictions.columns[[0, 1, 3]], axis=1)
    # Assign the result back instead of calling replace(inplace=True) on an
    # attribute-accessed column, which does not update the frame under
    # pandas Copy-on-Write.
    predictions['predicted'] = predictions['predicted'].replace(
        regex={r'1:': '', '2:': ''})
    return (predictions)

def get_dl_predictions(test_frame, y_variable, model):
    """Score ``test_frame`` with a saved H2O model.

    All columns of ``test_frame`` except the last one are sent to the model;
    ``y_variable`` is accepted but not used. The predictions are returned as a
    pandas DataFrame.
    """
    # NOTE(review): `h2o` is not imported in the cells visible here - confirm
    # it is imported (and initialised) before this function is called.
    loaded_model = h2o.load_model(model)
    feature_columns = test_frame.columns[:-1]
    h2o_frame = h2o.H2OFrame(test_frame[feature_columns])
    raw_predictions = loaded_model.predict(h2o_frame)
    return raw_predictions.as_data_frame(use_pandas=True, header=True)

In [12]:
def get_consensus(t1, t2, t3):
    """Combine three parallel prediction sequences into one label per record.

    For each position the first sequence that says ``"Active"`` decides the
    label: ``t1`` gives ``"Strong"``, ``t2`` gives ``"Moderate"``, ``t3`` gives
    ``"Weak"``. If none is ``"Active"`` the value from ``t3`` is passed through.
    """
    def _label(first, second, third):
        if first == "Active":
            return "Strong"
        if second == "Active":
            return "Moderate"
        return "Weak" if third == "Active" else third

    return [_label(a, b, c) for a, b, c in zip(t1, t2, t3)]

In [14]:
# Silence Python warnings unless the interpreter was started with explicit -W
# options. Only the filter is conditional; the pipeline below always runs.
if not sys.warnoptions:
    warnings.simplefilter("ignore")


def _class_score(predictions, class_weights, use_probability):
    """Map the predicted class labels to signed numeric scores.

    ``class_weights`` maps each label in ``predictions['predicted']`` to a
    number. When ``use_probability`` is true the score is additionally
    multiplied by ``predictions['prediction']``.
    """
    score = predictions['predicted'].replace(class_weights).astype(float)
    if use_probability:
        score = score * predictions['prediction']
    return score


# Feature file -> models run on it, as
# (Weka functions classifier, model file, class weights, multiply by probability, result column).
_BBB_MODELS = {
    'bid_T2.txt': [
        ('MultilayerPerceptron', 'MLP_BID_T2.model', {'BBB+': 0.93, 'BBB-': -0.73}, True, 'BBB-T2-MLP'),
        ('SMO', 'SMO_BID_T2.model', {'BBB+': 0.97, 'BBB-': -0.65}, False, 'BBB-T2-SMO'),
    ],
    'bwd_T1.txt': [
        ('MultilayerPerceptron', 'MLP_BWD.model', {'BBB+': 0.83, 'BBB-': -0.64}, True, 'BBB-T1-MLP'),
    ],
    'fwd_T1.txt': [
        ('SMO', 'SMO_FWD.model', {'BBB+': 0.93, 'BBB-': -0.55}, False, 'BBB-T1-SMO'),
    ],
}

# Feature file -> (RandomForest model file, result column, convert to signed score).
# NOTE(review): the feature files are spelled '405_*' (digit zero) while the
# model files are spelled '4o5_*' (letter o) - names kept as found; confirm
# against the files on disk.
_HERG_MODELS = {
    '6_fwd.txt': ('6_fwd_RF.model', 'hERG-01', False),
    '5_bwd.txt': ('5_bwd_RF.model', 'hERG-10', False),
    '405_fwd.txt': ('4o5_fwd_RF.model', 'hERG-30-fwd', True),
    '405_bwd.txt': ('4o5_bwd_RF.model', 'hERG-30-bwd', True),
}

# get the input file
# input_file = 'selected_vs-hits_list.csv'
# input_file = easygui.fileopenbox(title='Select the csv file',
#                                  default='*csv',
#                                  filetypes=['*.csv', 'CSV files'])
# print('Input file is {}\n'.format(input_file))
input_file = 'example_file.csv'
# get standardized dataframe
std_dataframe = standardframe(input_file)
# get descriptor sets
desc_set = calc_features(std_dataframe, input_file)

# Result table. It is built from plain lists so that its index is 0..n-1,
# the same as the prediction tables read back from Weka; pandas aligns column
# assignment on index labels, so a gapped index would misplace the results.
endpoint_pred = pandas.DataFrame({
    'ID': list(std_dataframe.ID),
    'SMILES': list(std_dataframe.Std_SMILES),
})

cwd = os.getcwd()

# for BBB predictions
bbb_dir = os.path.join(cwd, 'BBB')
if os.path.isdir(bbb_dir):
    for feat_name, model_specs in _BBB_MODELS.items():
        feat_file = os.path.join(bbb_dir, feat_name)
        if not os.path.isfile(feat_file):
            raise FileNotFoundError('Missing BBB feature file: {}'.format(feat_file))
        # get selected features
        selected_features, y_variable = feature_selection(desc_set, feat_file)
        dtyp = '{BBB+,BBB-}'
        arff = os.path.join(cwd, dataframetoarff(selected_features, y_variable, dtyp))
        for algo_name, model_name, weights, use_probability, column in model_specs:
            algo = 'weka.classifiers.functions.{}'.format(algo_name)
            model = os.path.join(bbb_dir, model_name)
            predictions = get_ml_predictions(arff, algo, model)
            endpoint_pred[column] = _class_score(predictions, weights, use_probability)

    # Consensus per tier: sign of the mean of the two model scores.
    endpoint_pred['BBB-T1-CONS'] = endpoint_pred[['BBB-T1-MLP', 'BBB-T1-SMO']].mean(axis=1).map(lambda x: 'BBB+' if x > 0 else 'BBB-')
    endpoint_pred = endpoint_pred.drop(columns=['BBB-T1-MLP', 'BBB-T1-SMO'])
    endpoint_pred['BBB-T2-CONS'] = endpoint_pred[['BBB-T2-MLP', 'BBB-T2-SMO']].mean(axis=1).map(lambda x: 'BBB+' if x > 0 else 'BBB-')
    endpoint_pred = endpoint_pred.drop(columns=['BBB-T2-MLP', 'BBB-T2-SMO'])

# for hERG predictions
herg_dir = os.path.join(cwd, 'hERG')
if os.path.isdir(herg_dir):
    algo = 'weka.classifiers.trees.RandomForest'
    for feat_name, (model_name, column, as_score) in _HERG_MODELS.items():
        feat_file = os.path.join(herg_dir, feat_name)
        if not os.path.isfile(feat_file):
            raise FileNotFoundError('Missing hERG feature file: {}'.format(feat_file))
        # get selected features
        selected_features, y_variable = feature_selection(desc_set, feat_file)
        dtyp = '{Active,Inactive}'
        arff = os.path.join(cwd, dataframetoarff(selected_features, y_variable, dtyp))
        model = os.path.join(herg_dir, model_name)
        predictions = get_ml_predictions(arff, algo, model)
        if as_score:
            endpoint_pred[column] = _class_score(predictions, {'Active': 1, 'Inactive': -1}, True)
        else:
            endpoint_pred[column] = predictions['predicted']

    # The two hERG-30 models vote with their signed probabilities.
    endpoint_pred['hERG-30'] = endpoint_pred[['hERG-30-fwd', 'hERG-30-bwd']].sum(axis=1).map(lambda x: 'Active' if x > 0 else 'Inactive')
    endpoint_pred = endpoint_pred.drop(columns=['hERG-30-fwd', 'hERG-30-bwd'])
    consensus = get_consensus(endpoint_pred['hERG-01'], endpoint_pred['hERG-10'], endpoint_pred['hERG-30'])
    endpoint_pred['hERG-CONS'] = consensus

# TODO: NaV, CaV, ABK and FLT3 predictions are not implemented yet.

dir_name = dirname(input_file)
output_file = splitext(basename(input_file))[0]
# index=False belongs to to_csv (it was previously swallowed by str.format).
endpoint_pred.to_csv('{}_results.csv'.format(join(dir_name, output_file)), index=False)

NameError: name 'smi_to_sdf' is not defined

In [None]:
# Show the assembled prediction table as this cell's output.
endpoint_pred