In [1]:
import bioalerts
from bioalerts import LoadMolecules, Alerts, FPCalculator
import sys

In [2]:
import numpy as np 

from rdkit.Chem.Draw import IPythonConsole 
from rdkit.Chem import PandasTools 

import pandas as pd 

from sklearn.neighbors import KNeighborsRegressor #knn
from sklearn.ensemble import RandomForestRegressor #RF
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold

## A. calculate significant substructure feature

In [3]:
molecules = bioalerts.LoadMolecules.LoadMolecules("./tutorial/datasets/5AR.smi",name_field=None)
molecules.ReadMolecules()

Format of the structures file = SMILES
All molecules in the input file were processed correctly


In [4]:
cox_bio = np.genfromtxt('./tutorial/datasets/5AR.bio.txt',skip_header=0) 
cox_bio.shape
arr = np.arange(0,len(cox_bio))
mask = np.ones(arr.shape,dtype=bool)
mask[molecules.molserr]=0
cox_bio = cox_bio[mask]
print len(cox_bio)
print len(molecules.mols)

278
278


In [5]:
dataset_info = bioalerts.LoadMolecules.GetDataSetInfo(name_field=None)
dataset_info.extract_substructure_information(radii=[2,3,4,5,6],mols=molecules.mols)

In [6]:
Alerts_continuous_high_activity = bioalerts.Alerts.CalculatePvaluesContinuous(radii_ext=[2,3,4,5,6])

In [7]:
def calculate_bioactivity_threshold(bio,threshold):
    
    sorted_bio = sorted(bio)
    threshold_bio_value = sorted_bio[int(len(sorted_bio)*threshold)]
    return threshold_bio_value

threshold_= 0.6
threshold_bio_value = calculate_bioactivity_threshold(cox_bio,threshold_)
threshold_bio_value

7.416801226

In [8]:
significant_substructure = Alerts_continuous_high_activity.get_significant_substructure_with_high_bioactivity(
mols = molecules.mols,
     substructure_dictionary = dataset_info.substructure_dictionary,
     bioactivities = cox_bio,
     mols_ids = molecules.mols_ids[:],
     threshold_nb_substructures = 5,
     threshold_pvalue = 0.05,
     threshold_ratio=0.2,
     threshold_high_act_nb_substructures=10,
     threshold_high_act_ratio = 0.6,                                                                        
     threshold_bioactivity=threshold_bio_value)

#threshold_high_act_nb_substructure = N of molecules involved significant substructure with high activity  
#threshold_high_act_ratio = N of molecules involved significant substructure with high activity / N of molecules involved significant substructure
#threshold_bioactivity = Top 40% by total bioactivity

#return value is significant_substructure dictionary with high activity

In [9]:
mol_id_set = set()
for k,v in significant_substructure.items():
    for mol_id in v: 
        mol_id_set.add(mol_id)

## B. insert feature

In [23]:
wdir =  'C:/jupyter_devel/kist-europe/QSAR/AOP_data/'
csv = '5-alpha-reductase-maccs-remcols-stdval-dltnan.csv'
result_wdir = 'C:/jupyter_devel/kist-europe/QSAR/AOP_data/model_result/'

df = pd.read_csv(wdir+csv)

In [24]:
y = df['Standard Value']
x = df.drop(['Molecule','Standard Value'],axis=1)

In [25]:
y_pic50 = 9 - np.log10(y)

In [26]:
#insert feature into only train dataset
for iter_ in x.index:
    for i in range(8): 
        if iter_ in mol_id_set : 
            x.loc[iter_,'significant_feature_{}'.format(i)] = 1.0
        else :
            x.loc[iter_,'significant_feature_{}'.format(i)] = 0.0

In [27]:
test_size = 0.1 
x_train, x_test, y_train, y_test = train_test_split(x,y_pic50,test_size = test_size, shuffle = True)

## C. model train

In [28]:
x_test_idx = []
for row in x_test.index:
    x_test_idx.append(row)

### C.1 knn 

In [29]:
#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(x_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []

#DataFrame generate
y_pic_pred_result = pd.DataFrame()
y_pic_pred_result['idx'] = pd.Series([i for i in range(len(y))])


#model generate
neigh = KNeighborsRegressor(n_neighbors = 5,metric='jaccard',weights = 'distance')

#cross-val training
for train_idx, test_idx in kf.split(x_train):
    print("iteration num : {}".format(k))
    #print("train : ", train_idx, "\ntest : ", test_idx)
    x_cv_train , x_cv_test = x_train.iloc[train_idx], x_train.iloc[test_idx]
    y_cv_train , y_cv_test = y_train.iloc[train_idx], y_train.iloc[test_idx]
    
    neigh.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = neigh.predict(x_cv_test)
    cv_rmse.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    cv_rmse_mean.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    
    y_pic_pred_result['y_cv_testidx_{}'.format(k)] = pd.Series()
    y_pic_pred_result['y_cv_pred_{}'.format(k)] = pd.Series()
    
    for j in range(len(y_cv_testidx)):
        y_pic_pred_result.loc[y_cv_testidx[j],'y_cv_testidx_{}'.format(k)] = y_cv_testidx[j]
        y_pic_pred_result.loc[y_cv_testidx[j],'y_cv_pred_{}'.format(k)] = y_cv_pred[j]
        
    
    
    #y_pic_pred_result['y_cv_testidx_{}'.format(i)] = pd.Series(y_cv_testidx)
    #y_pic_pred_result['y_cv_pred_{}'.format(i)] = pd.Series(y_cv_pred)
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
        

print("total fold mean : ", round(np.mean(cv_rmse),4)) # mean of 50 cv rmse (1회 iter돌 때의 모든 rmse의 평균 (50회의 평균)) 
print("total cv mean : ",round(np.mean(tmp),4)) # mean of cv's mean of 1 iter (각 cv rmse 평균의 평균) # we need var of this value 

#model retrain with all train data  
neigh.fit(x_train,y_train) # > model train
y_pic_test_pred = neigh.predict(x_test) # > model predidction

# 
y_pic_test_pred_df = pd.DataFrame(columns=['y_pic_test_idx','y_pic_test_pred'])
for j in range(len(x_test_idx)):
        y_pic_test_pred_df.loc[x_test_idx[j],'y_pic_test_idx'] = x_test_idx[j]
        y_pic_test_pred_df.loc[x_test_idx[j],'y_pic_test_pred'] = y_pic_test_pred[j]
#y_pic_test_pred_df = pd.DataFrame(y_pic_test_pred,columns=['y_pic_test_pred'])
y_pic_pred_result = pd.concat([y_pic_pred_result, y_pic_test_pred_df], axis=1, ignore_index=False)

#save csv 
#y_pic_pred_result.to_csv(result_wdir+'5-alpha-reductase_dltnan_y_pic_pred_result+with_significant_feature.csv',index=True)


iteration num : 0
iteration num : 1
iteration num : 2
iteration num : 3
iteration num : 4
[0.5816, 0.9582, 0.6899, 0.8618, 0.5794]
('mean : ', 0.73418)
iteration num : 5
iteration num : 6
iteration num : 7
iteration num : 8
iteration num : 9
[0.7596, 0.7974, 0.6246, 0.8691, 0.7737]
('mean : ', 0.76488)
iteration num : 10
iteration num : 11
iteration num : 12
iteration num : 13
iteration num : 14
[0.8788, 0.7533, 0.7233, 0.7713, 0.5937]
('mean : ', 0.7440800000000001)
iteration num : 15
iteration num : 16
iteration num : 17
iteration num : 18
iteration num : 19
[0.7135, 0.7799, 0.7286, 0.7669, 0.6463]
('mean : ', 0.72704)
iteration num : 20
iteration num : 21
iteration num : 22
iteration num : 23
iteration num : 24
[0.6745, 0.8123, 0.71, 0.7142, 0.6853]
('mean : ', 0.71926)
iteration num : 25
iteration num : 26
iteration num : 27
iteration num : 28
iteration num : 29
[0.7615, 0.7864, 0.6471, 0.6456, 0.8358]
('mean : ', 0.7352799999999999)
iteration num : 30
iteration num : 31
iteration 

In [30]:
#RESULT
print("total fold rmse mean : ", round(np.mean(cv_rmse),4)) # mean of 50 cv rmse (1회 iter돌 때의 모든 rmse의 평균 (50회의 평균)) 
print("total cv rmse mean : ",round(np.mean(tmp),4)) # mean of cv's mean of 1 iter (각 cv rmse 평균의 평균) # we need var of this value 
print("each cv rmse average : \n{} \nvariance of rmse of every folds : {} ".format(tmp,round(np.var(cv_rmse),4)))
print("train data size : {} ({}%)\n".format(len(x_train),round((len(x_train)/len(x)),2)), "test data size : {} ({}%)".format(len(x_test),round((len(x_test)/len(x)),2)))

('total fold rmse mean : ', 0.7407)
('total cv rmse mean : ', 0.7407)
each cv rmse average : 
[0.73418, 0.76488, 0.7440800000000001, 0.72704, 0.71926, 0.7352799999999999, 0.73804, 0.73514, 0.7575000000000001, 0.7519199999999999] 
variance of rmse of every folds : 0.0098 
('train data size : 250 (0.0%)\n', 'test data size : 28 (0.0%)')


### C.2 RF

In [31]:
#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(x_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []

#DataFrame generate
y_pic_pred_result = pd.DataFrame()
y_pic_pred_result['idx'] = pd.Series([i for i in range(len(y))])


#model generate
RF = RandomForestRegressor(n_estimators= 30,random_state=0)
#cross-val training
for train_idx, test_idx in kf.split(x_train):
    print("iteration num : {}".format(k))
    #print("train : ", train_idx, "\ntest : ", test_idx)
    x_cv_train , x_cv_test = x_train.iloc[train_idx], x_train.iloc[test_idx]
    y_cv_train , y_cv_test = y_train.iloc[train_idx], y_train.iloc[test_idx]
    
    RF.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = RF.predict(x_cv_test)
    cv_rmse.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    cv_rmse_mean.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    
    y_pic_pred_result['y_cv_testidx_{}'.format(k)] = pd.Series()
    y_pic_pred_result['y_cv_pred_{}'.format(k)] = pd.Series()
    
    for j in range(len(y_cv_testidx)):
        y_pic_pred_result.loc[y_cv_testidx[j],'y_cv_testidx_{}'.format(k)] = y_cv_testidx[j]
        y_pic_pred_result.loc[y_cv_testidx[j],'y_cv_pred_{}'.format(k)] = y_cv_pred[j]
        
    
    
    #y_pic_pred_result['y_cv_testidx_{}'.format(i)] = pd.Series(y_cv_testidx)
    #y_pic_pred_result['y_cv_pred_{}'.format(i)] = pd.Series(y_cv_pred)
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
        

print("total fold mean : ", round(np.mean(cv_rmse),4)) # mean of 50 cv rmse (1회 iter돌 때의 모든 rmse의 평균 (50회의 평균)) 
print("total cv mean : ",round(np.mean(tmp),4)) # mean of cv's mean of 1 iter (각 cv rmse 평균의 평균) # we need var of this value 

#model retrain with all train data  
RF.fit(x_train,y_train) # > model train
y_pic_test_pred = RF.predict(x_test) # > model predidction

# 
y_pic_test_pred_df = pd.DataFrame(columns=['y_pic_test_idx','y_pic_test_pred'])
for j in range(len(x_test_idx)):
        y_pic_test_pred_df.loc[x_test_idx[j],'y_pic_test_idx'] = x_test_idx[j]
        y_pic_test_pred_df.loc[x_test_idx[j],'y_pic_test_pred'] = y_pic_test_pred[j]
#y_pic_test_pred_df = pd.DataFrame(y_pic_test_pred,columns=['y_pic_test_pred'])
y_pic_pred_result = pd.concat([y_pic_pred_result, y_pic_test_pred_df], axis=1, ignore_index=False)

#save csv 
#y_pic_pred_result.to_csv(result_wdir+'5-alpha-reductase_dltnan_y_pic_pred_result+with_significant_feature.csv',index=True)


iteration num : 0
iteration num : 1
iteration num : 2
iteration num : 3
iteration num : 4
[0.6012, 0.7671, 0.7016, 0.6691, 0.6997]
('mean : ', 0.68774)
iteration num : 5
iteration num : 6
iteration num : 7
iteration num : 8
iteration num : 9
[0.7866, 0.6576, 0.6153, 0.5898, 0.6186]
('mean : ', 0.65358)
iteration num : 10
iteration num : 11
iteration num : 12
iteration num : 13
iteration num : 14
[0.6836, 0.6286, 0.6577, 0.6397, 0.6106]
('mean : ', 0.6440400000000001)
iteration num : 15
iteration num : 16
iteration num : 17
iteration num : 18
iteration num : 19
[0.7629, 0.6269, 0.6652, 0.7297, 0.7313]
('mean : ', 0.7032)
iteration num : 20
iteration num : 21
iteration num : 22
iteration num : 23
iteration num : 24
[0.7477, 0.6807, 0.6734, 0.6605, 0.7694]
('mean : ', 0.70634)
iteration num : 25
iteration num : 26
iteration num : 27
iteration num : 28
iteration num : 29
[0.5529, 0.7337, 0.6762, 0.8005, 0.7487]
('mean : ', 0.7024)
iteration num : 30
iteration num : 31
iteration num : 32
it

In [32]:
#RESULT
print("total fold rmse mean : ", round(np.mean(cv_rmse),4)) # mean of 50 cv rmse (1회 iter돌 때의 모든 rmse의 평균 (50회의 평균)) 
print("total cv rmse mean : ",round(np.mean(tmp),4)) # mean of cv's mean of 1 iter (각 cv rmse 평균의 평균) # we need var of this value 
print("each cv rmse average : \n{} \nvariance of rmse of every folds : {} ".format(tmp,round(np.var(cv_rmse),4)))
print("train data size : {} ({}%)\n".format(len(x_train),round((len(x_train)/len(x)),2)), "test data size : {} ({}%)".format(len(x_test),round((len(x_test)/len(x)),2)))

('total fold rmse mean : ', 0.6782)
('total cv rmse mean : ', 0.6782)
each cv rmse average : 
[0.68774, 0.65358, 0.6440400000000001, 0.7032, 0.70634, 0.7024, 0.66996, 0.6620600000000001, 0.67676, 0.67584] 
variance of rmse of every folds : 0.0051 
('train data size : 250 (0.0%)\n', 'test data size : 28 (0.0%)')


### C.3 SVR 

In [None]:
#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(x_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []

#DataFrame generate
y_pic_pred_result = pd.DataFrame()
y_pic_pred_result['idx'] = pd.Series([i for i in range(len(y))])


#model generate


#cross-val training
for train_idx, test_idx in kf.split(x_train):
    print("iteration num : {}".format(k))
    #print("train : ", train_idx, "\ntest : ", test_idx)
    x_cv_train , x_cv_test = x_train.iloc[train_idx], x_train.iloc[test_idx]
    y_cv_train , y_cv_test = y_train.iloc[train_idx], y_train.iloc[test_idx]
    
    RF.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = RF.predict(x_cv_test)
    cv_rmse.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    cv_rmse_mean.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    
    y_pic_pred_result['y_cv_testidx_{}'.format(k)] = pd.Series()
    y_pic_pred_result['y_cv_pred_{}'.format(k)] = pd.Series()
    
    for j in range(len(y_cv_testidx)):
        y_pic_pred_result.loc[y_cv_testidx[j],'y_cv_testidx_{}'.format(k)] = y_cv_testidx[j]
        y_pic_pred_result.loc[y_cv_testidx[j],'y_cv_pred_{}'.format(k)] = y_cv_pred[j]
        
    
    
    #y_pic_pred_result['y_cv_testidx_{}'.format(i)] = pd.Series(y_cv_testidx)
    #y_pic_pred_result['y_cv_pred_{}'.format(i)] = pd.Series(y_cv_pred)
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
        

print("total fold mean : ", round(np.mean(cv_rmse),4)) # mean of 50 cv rmse (1회 iter돌 때의 모든 rmse의 평균 (50회의 평균)) 
print("total cv mean : ",round(np.mean(tmp),4)) # mean of cv's mean of 1 iter (각 cv rmse 평균의 평균) # we need var of this value 

#model retrain with all train data  
RF.fit(x_train,y_train) # > model train
y_pic_test_pred = RF.predict(x_test) # > model predidction

# 
y_pic_test_pred_df = pd.DataFrame(columns=['y_pic_test_idx','y_pic_test_pred'])
for j in range(len(x_test_idx)):
        y_pic_test_pred_df.loc[x_test_idx[j],'y_pic_test_idx'] = x_test_idx[j]
        y_pic_test_pred_df.loc[x_test_idx[j],'y_pic_test_pred'] = y_pic_test_pred[j]
#y_pic_test_pred_df = pd.DataFrame(y_pic_test_pred,columns=['y_pic_test_pred'])
y_pic_pred_result = pd.concat([y_pic_pred_result, y_pic_test_pred_df], axis=1, ignore_index=False)

#save csv 
#y_pic_pred_result.to_csv(result_wdir+'5-alpha-reductase_dltnan_y_pic_pred_result+with_significant_feature.csv',index=True)


In [None]:
#RESULT
print("total fold rmse mean : ", round(np.mean(cv_rmse),4)) # mean of 50 cv rmse (1회 iter돌 때의 모든 rmse의 평균 (50회의 평균)) 
print("total cv rmse mean : ",round(np.mean(tmp),4)) # mean of cv's mean of 1 iter (각 cv rmse 평균의 평균) # we need var of this value 
print("each cv rmse average : \n{} \nvariance of rmse of every folds : {} ".format(tmp,round(np.var(cv_rmse),4)))
print("train data size : {} ({}%)\n".format(len(x_train),round((len(x_train)/len(x)),2)), "test data size : {} ({}%)".format(len(x_test),round((len(x_test)/len(x)),2)))

In [None]:
'''
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect as _GetMorganFingerprintAsBitVect, GetMorganFingerprint as _GetMorganFingerprint
from bioalerts import LoadMolecules, Alerts, FPCalculator 
from rdkit import Chem 

# >>> _GetMorganFingerprint #Returns a Morgan fingerprint for a molecule
# >>> _GetMorganFingerprintAsBitVect  #Returns a Morgan fingerprint for a molecule as a bit vector

molecules = LoadMolecules.LoadMolecules("./tutorial/datasets/5AR.smi",name_field=None)
molecules.ReadMolecules() 
stride = int(len(molecules.mols)*0.9)
training = molecules.mols[0:stride]
test = molecules.mols[stride:len(molecules.mols)]
print (len(molecules.mols), len(test), len(training))

radii = [2,3,4,5,6]


def extract_substructure_information(radii,mols):
    substructure_dictionary = {}
    for i,m in enumerate(mols):
        info = {}
        fp = _GetMorganFingerprint(m,max(radii),bitInfo=info)
        for k,v in info.items():
            if v[0][1] in radii:
                if k in substructure_dictionary.keys():
                    substructure_dictionary[k].append(i)
                else : 
                    substructure_dictionary.update({k:[i]})
    return substructure_dictionary

'''