In [1]:
import bioalerts
from bioalerts import LoadMolecules, Alerts, FPCalculator
import numpy as np 
from rdkit.Chem.Draw import IPythonConsole 
from rdkit.Chem import PandasTools, MolFromSmiles, AllChem
from rdkit import DataStructs
import pandas as pd 

from sklearn.neighbors import KNeighborsRegressor #knn
from sklearn.ensemble import RandomForestRegressor #RF
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold

## A. Calculate significant substructure features

In [2]:
#load IC50 values and smis into dictionary
# Fixed seed so the shuffle below - and therefore the train/test split and every
# score derived from it - is reproducible. Change it to draw a different split.
SPLIT_SEED = 0

substD = {}#key = chemblID, val = (smi, bio)
# `with` closes the file even if a malformed row raises while parsing.
with open("data/5-alpha-reductase1_finalIC50.csv", "r", encoding='UTF8') as fr:
    lines = fr.readlines()
for i, line in enumerate(lines):
    if i == 0:
        continue#header row
    # NOTE(review): a plain comma split assumes no quoted field contains a comma - confirm against the CSV.
    splitted = line.split(",")
    smi = splitted[37]
    bio = splitted[4]
    chemblID = splitted[1]
    if bio == "":
        continue#no activity value recorded for this row
    substD[chemblID] = (smi, bio)

#shuffle the records; dicts keep insertion order (Python 3.7+), so the shuffled order is preserved
tmpItems = list(substD.items())
# A local Generator keeps the seeding from affecting any other user of numpy's global random state.
np.random.default_rng(SPLIT_SEED).shuffle(tmpItems)
substD = dict(tmpItems)


In [3]:
#divide the data into smi_training, smi_test, bio_training, and bio_test
# The first 80% of the keys (in substD's current order) form the training set,
# the remaining keys form the test set.
tr_th = int(len(substD) * 0.8)#train has 80% of the whole data
ordered_ids = list(substD)
train_ids, test_ids = ordered_ids[:tr_th], ordered_ids[tr_th:]

# Each substD value is a (smi, bio) pair; bio is stored as text and converted to float here.
smi_training = {mol_id: substD[mol_id][0] for mol_id in train_ids}
bio_training = {mol_id: float(substD[mol_id][1]) for mol_id in train_ids}
smi_test = {mol_id: substD[mol_id][0] for mol_id in test_ids}
bio_test = {mol_id: float(substD[mol_id][1]) for mol_id in test_ids}
        


In [4]:
#extract all substructures - see the code in LoadMolecules.py
# Only the training SMILES are passed in, so the resulting dictionary is built from training molecules alone.
training_dataset_info = LoadMolecules.GetDataSetInfo(name_field=None)
training_dataset_info.extract_substructure_information(
    smi_dict=smi_training,
    radii=[2, 3, 4, 5, 6],
)

# Number of entries in the extracted substructure dictionary.
print(len(training_dataset_info.substructure_dictionary))

5159


In [5]:
#set the threshold for toxicity classification among the training data instances
def calculate_bioactivity_threshold(bio, threshold):
    """Return the bioactivity value found at a fractional position of the sorted values.

    Parameters
    ----------
    bio : iterable of numbers
        Bioactivity values; they are sorted in ascending order internally.
    threshold : float
        Fractional position in [0, 1]. The element at index
        ``int(len(bio) * threshold)`` of the sorted values is returned;
        ``threshold == 1`` returns the largest value.

    Returns
    -------
    The element of ``bio`` at the requested position.

    Raises
    ------
    ValueError
        If ``threshold`` is outside [0, 1] or ``bio`` is empty.
    """
    if not 0 <= threshold <= 1:
        # A negative fraction would otherwise silently index from the end of the list.
        raise ValueError("threshold must be within [0, 1], got %r" % (threshold,))
    sorted_bio = sorted(bio)
    if not sorted_bio:
        raise ValueError("bio must contain at least one value")
    # int() truncates; clamp so threshold == 1.0 selects the last element instead of indexing past it.
    position = min(int(len(sorted_bio) * threshold), len(sorted_bio) - 1)
    threshold_bio_value = sorted_bio[position]
    return threshold_bio_value

# Fractional position (0.6) within the sorted training activities used as the cut-off.
threshold_ = 0.6
threshold_bio_value = calculate_bioactivity_threshold(
    bio=list(bio_training.values()),
    threshold=threshold_,
)
# Bare expression so the notebook displays the chosen cut-off value.
threshold_bio_value

188.0

In [6]:
#extract substructures with high activity - see the code in Alerts.py
Alerts_continuous_high_activity = Alerts.CalculateContinuous(radii_ext=[2, 3, 4, 5, 6])

#the significant feature should be found only inside the training set.
# smi_training and bio_training were filled in the same loop, so the activity
# array and the id list below line up by position.
#threshold_high_act_nb_substructure = N of molecules involved significant substructure with high activity
#threshold_high_act_ratio = N of molecules involved significant substructure with high activity / N of molecules involved significant substructure
#threshold_bioactivity = threshold defined above
alert_search_settings = dict(
    smi_dict=smi_training,
    substructure_dictionary=training_dataset_info.substructure_dictionary,
    bioactivities=np.array(list(bio_training.values())),
    mols_ids=list(smi_training.keys()),
    threshold_nb_substructures=5,
    threshold_pvalue=0.05,
    threshold_ratio=0.2,
    threshold_high_act_nb_substructures=7,#not used for now
    threshold_high_act_ratio=0.6,#not used for now
    threshold_bioactivity=threshold_bio_value,
)
significant_substructure = (
    Alerts_continuous_high_activity
    .get_significant_substructure_with_high_bioactivity(**alert_search_settings)
)

# Number of significant substructures found.
print(len(significant_substructure))
#key, value
#rdkit generated substructure id, chembl id
#3454070996 ['CHEMBL15917', ... 'CHEMBL341004']

# Invert the mapping: ChEMBL id -> list of the significant substructure ids it appears under.
rev_sig_substructure = {}
for substructure_id in significant_substructure:#key is a substructure id of rdkit
    for chemblid in significant_substructure[substructure_id]:
        # setdefault creates the list on first sight of a ChEMBL id, then we append to it.
        rev_sig_substructure.setdefault(chemblid, []).append(substructure_id)

1


## B. Insert the substructure feature

In [7]:
#MACCS fingerprint
wholeD = {}
# `with` closes the file even if a row fails to parse.
with open("data/5-alpha-reductase-maccs.csv", "r", encoding='UTF8') as fr:
    lines = fr.readlines()
for i, line in enumerate(lines):
    if i == 0:
        continue#header row
    splitted = line.split(",")
    bio = splitted[1]
    if bio == "":
        continue
    wholeD[splitted[0]] =  [float(ele) for ele in splitted[2:]]#key is chembl id, and we exclude the bio(we already have it)

maccsD_training = {}
maccsD_test = {}
#yD = {}#we have already bio_training and bio_test
# Assign each fingerprint to the split its molecule already belongs to. Ids that
# are in neither split are skipped: putting them in the test set would fail later
# when their activity is looked up in bio_test.
unassigned_ids = []
for akey in wholeD.keys():
    if akey in smi_training:
        maccsD_training[akey] = wholeD[akey]
    elif akey in smi_test:
        maccsD_test[akey] = wholeD[akey]
    else:
        unassigned_ids.append(akey)
if unassigned_ids:
    print("skipped", len(unassigned_ids), "MACCS rows whose id is in neither the training nor the test split")
        


In [8]:
#throughout the testing, morgan is worse than MACCS - weird??
morgan_training = {}
morgan_test = {}
radii_ext = [2,3,4,5,6]
# The largest radius in radii_ext is the one used for every fingerprint.
largest_radius = max(radii_ext)
for akey, record in substD.items():
    # record is the (smi, bio) pair; only the SMILES string is needed here.
    mol = MolFromSmiles(record[0])
    mfp = AllChem.GetMorganFingerprintAsBitVect(mol, largest_radius, nBits=1024)
    # arr receives the fingerprint contents from ConvertToNumpyArray.
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(mfp, arr)
    # Same split rule as before: ids in smi_training go to training, all others to test.
    destination = morgan_training if akey in smi_training else morgan_test
    destination[akey] = arr

In [9]:
import copy
# X_* will have the substructure feature appended; X_*_wo stay as the plain
# fingerprints ("without feature") for comparison.
# All four are independent copies: if X_training merely aliased maccsD_training,
# the in-place feature insertion would also alter maccsD_training, and re-running
# this cell would then copy already-augmented vectors into the *_wo baseline.
X_training, X_test = copy.deepcopy(maccsD_training), copy.deepcopy(maccsD_test)#morgan_training, morgan_test
X_training_wo, X_test_wo = copy.deepcopy(maccsD_training), copy.deepcopy(maccsD_test)

In [10]:
#insert the significant feature into train dataset and test dataset
# The single present/absent flag is repeated this many times at the end of each vector.
ALERT_FEATURE_WIDTH = 100
existsF =    [1.] * ALERT_FEATURE_WIDTH
nonExistsF = [0.] * ALERT_FEATURE_WIDTH


def _with_alert_feature(fingerprints, flagged_ids):
    """Return a new dict of vectors with the alert flag block appended.

    fingerprints : dict mapping molecule id -> 1-D sequence of numbers.
    flagged_ids  : container of molecule ids that carry a significant substructure.

    Ids found in ``flagged_ids`` get ``existsF`` appended, all others ``nonExistsF``.
    The input dict and its vectors are left untouched.
    """
    return {
        mol_id: np.append(vector, existsF if mol_id in flagged_ids else nonExistsF, axis=0)
        for mol_id, vector in fingerprints.items()
    }


# Build the augmented sets from the untouched "*_wo" baselines rather than
# appending to X_training / X_test in place. On a fresh run the baselines hold
# the same vectors, so the result is unchanged, but re-running this cell no longer
# appends a second block of columns, and the source fingerprint dicts are not modified.
# NOTE: if the fingerprint type is switched in the previous cell, switch the *_wo copies as well.
X_training = _with_alert_feature(X_training_wo, rev_sig_substructure)
X_test = _with_alert_feature(X_test_wo, rev_sig_substructure)
     

## C. Model training

In [11]:
#preprocessing for y/bio values: every activity becomes 9 - log10(activity)
# NOTE: both dicts are overwritten in place, so running this cell a second time
# applies the transform to values that were already transformed.
for activity_by_id in (bio_training, bio_test):
    for mol_id, raw_activity in activity_by_id.items():
        activity_by_id[mol_id] = 9 - np.log10(raw_activity)

#make lists of train_x, train_y and those of the test
# y values are looked up by the keys of X_*, so each target lines up with its feature vector.
x_train = list(X_training.values())
y_train = [bio_training[mol_id] for mol_id in X_training]
x_test = list(X_test.values())
y_test = [bio_test[mol_id] for mol_id in X_test]

#make above list without features
x_train_wo = list(X_training_wo.values())
x_test_wo = list(X_test_wo.values())

#print(x_train[0])
#print( x_train_wo[0])

### C.1 knn 

In [12]:

# Two k-NN regressors with identical settings: one trained on vectors that include
# the substructure feature, one on the plain fingerprints. Both are scored by RMSE
# on the same test targets.

#model training with all train data - with features
neigh = KNeighborsRegressor(n_neighbors=5, metric='jaccard', weights='distance')
y_test_pred = neigh.fit(x_train, y_train).predict(x_test) # > train, then predict
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

#model training with all train data - without features
neigh2 = KNeighborsRegressor(n_neighbors=5, metric='jaccard', weights='distance')
y_test_pred2 = neigh2.fit(x_train_wo, y_train).predict(x_test_wo) # > train, then predict
rmse2 = np.sqrt(mean_squared_error(y_test, y_test_pred2))

print("rmse:", rmse, "(feature equipped) /", rmse2, "(without feature)")

# Results recorded from earlier runs:
#1st-rmse: 0.6665674996275878(feature equipped) / 0.6506157409564809 (without feature)
#2nd-rmse: 0.6616526937642756(feature equipped) / 0.6470203235135135 (without feature)
#3rd-rmse: 0.7400772882780433(feature equipped) / 0.7191645589172299 (without feature)
#4th-rmse: 0.5530300099595452(feature equipped) / 0.5530300099595452 (without feature)
#5th-rmse: 0.6194732073893036 (feature equipped) / 0.5797533674300656 (without feature)
#6th-rmse: 0.8833799018674932 (feature equipped) / 0.8712460159188644 (without feature)

#conclusion - too much variance between performances + feature is not that helpful

rmse: 0.7107505198683638 (feature equipped) / 0.620211320461115 (without feature)


### C.2 RF

In [13]:

# Two random forests with identical settings (30 trees, random_state=0): one trained
# on vectors that include the substructure feature, one on the plain fingerprints.
# Both are scored by RMSE on the same test targets.

#model training with all train data
RF = RandomForestRegressor(n_estimators=30, random_state=0)
y_test_pred = RF.fit(x_train, y_train).predict(x_test) # > train, then predict
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

#model training with all train data - without features
RF2 = RandomForestRegressor(n_estimators=30, random_state=0)
y_test_pred2 = RF2.fit(x_train_wo, y_train).predict(x_test_wo) # > train, then predict
rmse2 = np.sqrt(mean_squared_error(y_test, y_test_pred2))

print("rmse:", rmse, "(feature equipped) / ", rmse2, "(without feature)")

# Results recorded from earlier runs:
#1st-rmse: 0.5607542696180774(feature equipped) / 0.5951389693139828 (without feature)
#2nd-rmse: 0.5456417432809881(feature equipped) / 0.5537994883596704 (without feature)
#3rd-rmse: 0.6635918018542765(feature equipped) / 0.6596744885913872 (without feature)
#4th-rmse: 0.6097764195923148(feature equipped) / 0.5979487754062859 (without feature)
#5th-rmse: 0.5374167128801874 (feature equipped) /  0.5277560043097669 (without feature)
#6th-rmse: 0.7463623098161197 (feature equipped) /  0.7443525413936887 (without feature)

#conclusion - too much variance between performances + feature is not that helpful

rmse: 0.6108052363979027 (feature equipped) /  0.5945098451686438 (without feature)


### C.3 SVR 