In [1]:
import bioalerts
from bioalerts import LoadMolecules, Alerts, FPCalculator
import sys

In [55]:
import numpy as np 
import scipy
import matplotlib.pyplot as plt

from rdkit.Chem.Draw import IPythonConsole 
from rdkit.Chem import PandasTools 

import pandas as pd 

from sklearn.neighbors import KNeighborsRegressor #knn
from sklearn.ensemble import RandomForestRegressor #RF
from sklearn.svm import SVR #SVR
from sklearn.neural_network import MLPRegressor #MLP 

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [4]:
import sys, numpy as np, scipy as sc, rdkit, matplotlib as pylab, pandas as pd, IPython
#print " Python:", sys.version, "\n"
#print " Numpy:", np.__version__
#print " Scipy:", sc.__version__
#print " Rdkit:", rdkit.rdBase.rdkitVersion
#print " Matplotlib:", pylab.__version__
#print " Pandas:", pd.__version__
#print " Ipython:", IPython.__version__
#print " Scikit-Learn:", sklearn.__version__
#print " Scipy:", scipy.__version__




## A. Calculating Morgan fingerprints

In this section, we will see how to calculate hashed and unhashed Morgan fingerprints for a set of molecules, and will train some models. 

In [5]:
AR_mols = bioalerts.LoadMolecules.LoadMolecules("./tutorial/datasets/5AR.smi",name_field=None) 
#COX2.smi is a set of only SMILES data

In [6]:
AR_mols.ReadMolecules()

Format of the structures file = SMILES
All molecules in the input file were processed correctly


In [7]:
AR_bio = np.genfromtxt('./tutorial/datasets/5AR.bio.txt',skip_header=0) 
print len(AR_mols.mols)
print len(AR_bio)

278
278


In [8]:
stride = int(len(AR_mols.mols) * 0.9)
training = AR_mols.mols[0:stride]
test = AR_mols.mols[stride:len(AR_mols.mols)]
print len(AR_mols.mols), len(test), len(training)

bio_training = AR_bio[0:stride]
bio_test = AR_bio[stride:len(AR_mols.mols)]
print len(AR_bio), len(bio_test), len(bio_training)

278 28 250
278 28 250


### A.1 Computating HASHED Morgan fingerprints for the TRAINING set

We first initialize the class bioalerts.FPCalculator.CalculateFPs()

In [9]:
fps_training = bioalerts.FPCalculator.CalculateFPs(radii=[0,1,2],mols=training)

In [10]:
fps_training.calculate_hashed_fps_binary_quick(nBits=128)

In [11]:
print fps_training.fps_hashed_binary_quick.shape

(250L, 128L)


The method "calculate_hashed_fps_binary_quick()" considers all substructures with a bond radius smaller or equal to the maximum value of the argument radii. In this example, 2. Thus, in this case this method will consider all substructures with a bond radius of 0, 1 and 2.

In [12]:
fps_training.calculate_hashed_fps(nBits=128)

In [13]:
print fps_training.fps_hashed_binary.shape
print fps_training.fps_hashed_counts.shape

(250L, 128L)
(250L, 128L)


The method **"calculate_hashed_fps()"** computes hashed fingerpints in **(i) count** and **(ii) also binary format**. The difference of the method *calculate_hashed_fps()* with respect to the method *calculate_hashed_fps_binary_quick()* is that the **former only consider the substructures with a bond radius in the list input** to the argument radii when instantiating the class bioalerts.FPCalculator.CalculateFPs(). In the example above: [1,2].

Therefore, the hashed fingerprints in binary format calculated by these two methods will be *identical* if the bond radii especified in the aforesaid argument radii comprise all natural number from 0 to the maximum bond radius. In this example, this would be correspond to setting the value of the argument radii to [0,1,2].

In [20]:
print (fps_training.fps_hashed_binary_quick == fps_training.fps_hashed_binary).all()

True


In [38]:
toto = fps_training.fps_hashed_counts
#toto[1:10,1:10]
toto.shape

(250L, 128L)

In [16]:
fps_training.fps_hashed_binary[1:10,1:10]

array([[1, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 1, 1, 0]])

In [17]:
fps_training.fps_hashed_binary_quick[1:10,1:10]

array([[1, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 1, 1, 0]])

### A.2 Computating UNHASHED Morgan fingerprints for the TRAINING set

In [21]:
fps_training.calculate_unhashed_fps(draw_substructures=True)

No input set of keys for the substructures. 
Thus, the substructures present in the input molecules will be considered for the calculation of unhashed fingerprints.


## A.3 Computating Morgan fingerprints for the TEST set 

### A.3.1 Computating UNHASHED Morgan fingerprints for the test set using a different set of molecules:

Since the positions of the substructures in the unhashed fingerprints depend on the training set, the method calculate_unhashed_fps() allows the computation of unhashed fingerprints for new compounds using a basis defined by the substructures present in the training set. This basis is defined by the keys of the substructure dictionary calculated for the molecules from the training set. This reference substructure dictionary is input to the class CalculateFPs() using the argument reference_substucture_keys.

*This ensures that substructures in new compounds map to the same locations on the fingerprint and allows enhanced model interpretation by noting which exact substructures are deemed important by the learning algorithm.*

In [22]:
#  Reference molecules
reference_molecules = bioalerts.LoadMolecules.GetDataSetInfo(name_field=None)

In [23]:
fps_training.substructure_ids

array([ 218993025, 3334525955,  757425542, ..., 1842898132, 1861965050,
       1101907775], dtype=int64)

In [24]:
reference_molecules.extract_substructure_information(radii=[0,1,2],mols=training)

In [26]:
reference_keys = reference_molecules.substructure_dictionary.keys()
print len(reference_keys)
reference_keys

1205


[2245273601L,
 171200514,
 3325788165L,
 1462992903,
 4257302536L,
 3341916169L,
 2074365963,
 559345677,
 203749390,
 844021776,
 2394513426L,
 2128345568,
 2132284755,
 2458968089L,
 203491355,
 2272196637L,
 3931877381L,
 2251845666L,
 3346153511L,
 1884262887,
 3315826729L,
 2344097836L,
 3820826669L,
 2246961199L,
 29763632,
 469438472,
 3628319739L,
 4004630581L,
 4277401654L,
 1423712863,
 2970601533L,
 2488692798L,
 331483199,
 251179073,
 1620013122,
 1970409539,
 3000223812L,
 2619926597L,
 3527817293L,
 4084308046L,
 74537039,
 2245384272L,
 414398545,
 2178795604L,
 1480134744,
 118059098,
 2477914203L,
 1973221468,
 1901242461,
 1026654305,
 891920484,
 1999904870,
 1140799591,
 272019561,
 1596002410,
 3873941611L,
 3614376044L,
 699682834,
 3890010222L,
 3420254321L,
 2245277810L,
 3880022399L,
 567853172,
 1362518133,
 1486598263,
 2086410360,
 2308118649L,
 3124242557L,
 1049880707,
 1224122500,
 2077658817,
 1740632203,
 2591432844L,
 3214112909L,
 1360035982,
 431433

In [29]:
fps_test = bioalerts.FPCalculator.CalculateFPs(radii=[0,1,2],
                                         mols=test,
                                         reference_substructure_keys=reference_keys)

In [30]:
fps_test.calculate_unhashed_fps()

In [31]:
fps_test.fps_unhashed_counts.shape

(28L, 1205L)

In [35]:
len(fps_training.columns_unhashed)

1205

### A.3.2 Computating HASHED Morgan fingerprints for the test set (e.g. using a different set of molecules)

In [39]:
fps_test.calculate_hashed_fps(nBits=128)

## A.4 Training models 

### A.4.1 Using hashed fps

In [40]:
#RF
seed = 23
RF_hashed_counts = RandomForestRegressor(n_estimators=100,random_state=seed,n_jobs=2)
RF_hashed_counts.fit(fps_training.fps_hashed_counts,bio_training)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
           oob_score=False, random_state=23, verbose=0, warm_start=False)

In [106]:
# MLP 
seed = 23
MLP_hashed_counts = MLPRegressor(hidden_layer_sizes=(50,25,10),activation='relu', solver='adam',learning_rate='adaptive', random_state = seed)
MLP_hashed_counts.fit(fps_training.fps_hashed_counts,bio_training)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 25, 10), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=23, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [None]:
# KNN 

### A.4.2 Using unhashed fps

In [41]:
#RF
seed = 23
RF_UNhashed_counts = RandomForestRegressor(n_estimators=100,random_state=seed,n_jobs=2)
RF_UNhashed_counts.fit(fps_training.fps_unhashed_counts,bio_training)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
           oob_score=False, random_state=23, verbose=0, warm_start=False)

In [107]:
#MLP
seed = 23 
MLP_UNhashed_counts = MLPRegressor(hidden_layer_sizes=(50,25,10),activation='relu', solver='adam',learning_rate='adaptive', random_state = seed)
MLP_UNhashed_counts.fit(fps_training.fps_unhashed_counts,bio_training)                                

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 25, 10), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=23, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [None]:
#KNN 

### (optional) A.4.3 10 samplings of 5 fold cross validation 

In [None]:
def Rsquared(pred,true):
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(true,pred)
    return r_value**2

def RMSE(pred,true):
    rmse = np.sqrt(mean_squared_error(true,pred))
    return rmse

In [61]:
test_size = 0
x_train, x_test, y_train, y_test = train_test_split(fps_training.fps_unhashed_counts,bio_training,test_size = test_size, shuffle = True)

In [63]:
#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(x_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []

#cross-val training
for train_idx, test_idx in kf.split(x_train):
    #print("iteration num : {}".format(k))
    #print("train : ", train_idx, "\ntest : ", test_idx)
    x_cv_train , x_cv_test = x_train[train_idx], x_train[test_idx]
    y_cv_train , y_cv_test = y_train[train_idx], y_train[test_idx]
    
    RF_UNhashed_counts.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = RF_UNhashed_counts.predict(x_cv_test)
    cv_rmse.append(round(RMSE(y_cv_pred,y_cv_test),4))
    cv_rmse_mean.append(round(RMSE(y_cv_pred,y_cv_test),4))
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
        

print("total fold mean : ", round(np.mean(cv_rmse),4)) # mean of 50 cv rmse (1회 iter돌 때의 모든 rmse의 평균 (50회의 평균)) 
print("total cv mean : ",round(np.mean(tmp),4)) # mean of cv's mean of 1 iter (각 cv rmse 평균의 평균) # we need var of this value 

[0.7401, 0.7627, 0.8313, 0.6866, 0.65]
('mean : ', 0.73414)
[0.6809, 0.6339, 0.7285, 0.5971, 0.5995]
('mean : ', 0.6479799999999999)
[0.8326, 0.5834, 0.5847, 0.8627, 0.7445]
('mean : ', 0.7215800000000001)
[0.5738, 0.7671, 0.5862, 0.5122, 0.8529]
('mean : ', 0.65844)
[0.5457, 0.622, 0.6515, 0.6079, 0.7839]
('mean : ', 0.6422)
[0.6767, 0.6661, 0.6645, 0.7467, 0.6842]
('mean : ', 0.68764)
[0.6533, 0.5886, 0.7403, 0.5842, 0.753]
('mean : ', 0.66388)
[0.6233, 0.6027, 0.7628, 0.614, 0.7549]
('mean : ', 0.67154)
[0.7843, 0.8002, 0.6866, 0.6713, 0.5514]
('mean : ', 0.69876)
[0.6551, 0.5743, 0.718, 0.6189, 0.712]
('mean : ', 0.6556599999999999)
('total fold mean : ', 0.6782)
('total cv mean : ', 0.6782)


## A.5  predict bioactivities for the test set

We define two functions for validating our predictions, namely: Pearson's correlation coefficient (R2) and root mean squared errro (RMSE)

In [42]:
def Rsquared(pred,true):
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(true,pred)
    return r_value**2

def RMSE(pred,true):
    rmse = np.sqrt(mean_squared_error(true,pred))
    return rmse

In [108]:
RF_preds_hashed = RF_hashed_counts.predict(fps_test.fps_hashed_counts)
MLP_preds_hashed = MLP_hashed_counts.predict(fps_test.fps_hashed_counts)
#KNN

In [109]:
RF_preds_UNhashed = RF_UNhashed_counts.predict(fps_test.fps_unhashed_counts)
MLP_preds_UNhashed = MLP_UNhashed_counts.predict(fps_test.fps_unhashed_counts) 
#KNN 

## A.6 Model validation 

In [110]:
print "RMSE RF hashed fps: ", RMSE(RF_preds_hashed, bio_test)
print "RMSE RF UNhashed fps: ", RMSE(RF_preds_UNhashed, bio_test)
print "RMSE MLP hashed fps: ", RMSE(MLP_preds_hashed, bio_test)
print "RMSE MLP UNhashed fps: ", RMSE(MLP_preds_UNhashed, bio_test)
#print "RMSE knn hashed fps: ", RMSE(preds_knn_hashed, bio_test)
#print "RMSE knn UNhashed fps: ", RMSE(preds_knn_UNhashed, bio_test)

RMSE RF hashed fps:  0.6389450617226295
RMSE RF UNhashed fps:  0.6550376431992654
RMSE MLP hashed fps:  0.6775207323120629
RMSE MLP UNhashed fps:  0.9532095476747148


In [111]:
print "R2 RF hashed fps: ", Rsquared(RF_preds_hashed, bio_test)
print "R2 RF UNhashed fps: ", Rsquared(RF_preds_UNhashed, bio_test)
print "R2 MLP hashed fps: ", Rsquared(MLP_preds_hashed, bio_test)
print "R2 MLP UNhashed fps: ", Rsquared(MLP_preds_UNhashed, bio_test)
#print "R2 knn hashed fps: ", Rsquared(preds_knn_hashed, bio_test)
#print "R2 knn UNhashed fps: ", Rsquared(preds_knn_UNhashed, bio_test)

R2 RF hashed fps:  0.675541585463033
R2 RF UNhashed fps:  0.641275872727945
R2 MLP hashed fps:  0.7049414537947494
R2 MLP UNhashed fps:  0.44078416469585924
