In [55]:
import bioalerts
from bioalerts import LoadMolecules, Alerts, FPCalculator
import sys

In [56]:
import numpy as np 
import scipy
import matplotlib.pyplot as plt

from rdkit.Chem.Draw import IPythonConsole 
from rdkit.Chem import PandasTools 

import pandas as pd 

from sklearn.neighbors import KNeighborsRegressor #knn
from sklearn.ensemble import RandomForestRegressor #RF
from sklearn.svm import SVR #SVR
from sklearn.neural_network import MLPRegressor #MLP 

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [57]:
import sys, numpy as np, scipy as sc, rdkit, matplotlib as pylab, pandas as pd, IPython

from rdkit.Chem import MACCSkeys                       #MACCS Keys
from rdkit import Chem, DataStructs

In [58]:
def fingerprint_bitgenerator(fps):
    np_fps = []
    
    for fp in fps:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp,arr)
        np_fps.append(arr)
        
    return np_fps

In [59]:
proteinTarget = 'mmc1_estrogen'
data_path = './tutorial/datasets/'

In [60]:
mols = bioalerts.LoadMolecules.LoadMolecules(data_path + proteinTarget +".smi",name_field=None) 

In [61]:
mols.ReadMolecules()

Format of the structures file = SMILES
3 molecules (starting at zero) could not be processed.

This information has been saved in the following file: incorrect_molecules.csv

NOTE: the indexes of the molecules start at zero. Thus the first molecule is molecule 0.


In [62]:
mols_bio = np.genfromtxt(data_path + proteinTarget +'.bio.txt',skip_header=0)
print len(mols.mols)
print len(mols_bio)

1275
1278


In [63]:
if len(mols.molserr)!= 0:
    mols_bio = np.delete(mols_bio,mols.molserr)

In [64]:
print len(mols.mols)
print len(mols_bio)

1275
1275


In [65]:
df = pd.DataFrame()
fr = open(data_path + proteinTarget +".smi","r")
lines = fr.readlines()
for i, line in enumerate(lines):
    df.loc[i,'SMILES'] = line 

In [66]:
df = df.drop(mols.molserr,axis=0)

In [67]:
smi = df['SMILES']
sd = [Chem.MolFromSmiles(m) for m in smi]
maccfps = [MACCSkeys.GenMACCSKeys(m) for m in sd]
maccfps_bit = fingerprint_bitgenerator(maccfps)
df_FP = pd.DataFrame(maccfps_bit,columns=list(range(len(maccfps_bit[0]))))

In [68]:
#Remove features with low variance
remove_cols = []
for c in df_FP.columns : 
    if df_FP[c].std() < 0.01:
        remove_cols.append(c)
df_FP = df_FP.drop(remove_cols,axis=1)
corr = df_FP.corr().iloc[0]

In [69]:
#  Reference molecules
reference_molecules = bioalerts.LoadMolecules.GetDataSetInfo(name_field=None)

In [70]:
reference_molecules.extract_substructure_information(radii=[0,1,2],mols=mols.mols)

In [71]:
reference_keys = reference_molecules.substructure_dictionary.keys()
reference_dict = reference_molecules.substructure_dictionary
print len(reference_keys)

4063


In [72]:
toxic = {59582328: [1.0],
 144862996: [0.8333333333333334],
 238332961: [0.875],
 243031988: [0.9],
 248538045: [0.8333333333333334],
 289742792: [0.7142857142857143],
 291625364: [0.8333333333333334],
 293645588: [0.75],
 303065576: [1.0],
 316861206: [0.8461538461538461],
 408935666: [1.0],
 441214936: [0.7142857142857143],
 464679642: [0.8571428571428571],
 487877969: [1.0],
 491206018: [0.7142857142857143],
 528413146: [1.0],
 529228269: [1.0],
 590401100: [0.8333333333333334],
 601301716: [0.7272727272727273],
 665316820: [0.8333333333333334],
 685620941: [0.8571428571428571],
 776025833: [0.875],
 780020950: [0.7142857142857143],
 817010262: [0.8333333333333334],
 856674059: [0.8333333333333334],
 885218005: [0.7272727272727273],
 907194548: [0.7142857142857143],
 973747042: [0.8181818181818182],
 1018193551: [0.9],
 1025187852: [0.8333333333333334],
 1040552245: [0.7142857142857143],
 1072783936: [0.8333333333333334],
 1210732435: [0.8181818181818182],
 1220447184: [1.0],
 1233810244: [0.8333333333333334],
 1269851781: [1.0],
 1299092894: [0.8333333333333334],
 1300724914: [0.8],
 1322529363: [0.8333333333333334],
 1378206585: [1.0],
 1437747071: [0.75],
 1518432449: [0.8461538461538461],
 1609125206: [0.7142857142857143],
 1616758320: [1.0],
 1632286252: [0.8333333333333334],
 1833028715: [0.8333333333333334],
 1837362276: [0.7272727272727273],
 1887419926: [0.8181818181818182],
 1904321503: [0.8333333333333334],
 1906629638: [0.875],
 1914835207: [0.7142857142857143],
 1926414473: [0.8181818181818182],
 2092096443: [0.8333333333333334],
 2096291241: [0.8181818181818182],
 2110713573: [0.8333333333333334],
 2128830929: [0.7142857142857143],
 2292238093L: [0.8333333333333334],
 2295371384L: [0.7777777777777778],
 2396384619L: [0.8333333333333334],
 2441201613L: [0.8333333333333334],
 2455943357L: [0.8333333333333334],
 2486452475L: [0.7272727272727273],
 2547260028L: [0.7142857142857143],
 2586477428L: [0.7692307692307693],
 2632265184L: [0.8461538461538461],
 2666267517L: [0.7142857142857143],
 2676693293L: [0.7142857142857143],
 2685957082L: [0.8571428571428571],
 2700364449L: [0.8571428571428571],
 2730581186L: [0.8571428571428571],
 2854154172L: [0.75],
 2927177866L: [0.8333333333333334],
 2956273779L: [0.8333333333333334],
 2960320475L: [1.0],
 2969511302L: [0.8333333333333334],
 3047048782L: [0.7777777777777778],
 3115087023L: [0.9],
 3432086929L: [0.8333333333333334],
 3445050409L: [0.8333333333333334],
 3479423841L: [0.8571428571428571],
 3491936091L: [0.8181818181818182],
 3663265415L: [0.8333333333333334],
 3669900705L: [1.0],
 3743793454L: [0.75],
 3744626343L: [0.7647058823529411],
 3774802668L: [0.7142857142857143],
 3864161222L: [0.8333333333333334],
 3926336955L: [1.0],
 4098238333L: [0.7142857142857143],
 4158306861L: [0.7727272727272727],
 4236605231L: [0.7777777777777778],
 4242283111L: [0.8333333333333334],
 4246700814L: [0.8333333333333334],
 4275705722L: [0.8333333333333334]}

In [84]:
for substr,indices in reference_dict.items() : 
    if substr in toxic.keys() :
        for idx in indices :
            if 0.7 <= toxic[substr][0] < 0.8 :
                for i in range(8):  
                    df_FP.loc[idx,'significant_feature_{}'.format(i)] = 1.0
                    #df_fps_bin.loc[idx,'significant_feature_{}'.format(i)] = 1.0
            elif 0.8 <= toxic[substr][0] < 0.9 : 
                for i in range(8):
                    df_FP.loc[idx,'significant_feature_{}'.format(i)] = 2.0
                    #df_fps_bin.loc[idx,'significant_feature_{}'.format(i)] = 2.0
            elif 0.9 <= toxic[substr][0] <1.0 : 
                for i in range(8):
                    df_FP.loc[idx,'significant_feature_{}'.format(i)] = 3.0
                    #df_fps_bin.loc[idx,'significant_feature_{}'.format(i)] = 3.0
            else:
                for i in range(8):
                    df_FP.loc[idx,'significant_feature_{}'.format(i)] = 0.0
                    #df_fps_bin.loc[idx,'significant_feature_{}'.format(i)] = 0.0

In [85]:
df_FP = df_FP.replace(np.nan,0.0)

In [86]:
print(len(df_FP[df_FP.significant_feature_1!=0.0]))
print(len(df_FP))

67
1275


In [87]:
# y label
df_bio = pd.DataFrame(mols_bio,columns=None)

In [88]:
test_size = 0.05 
fps_train, fps_test, bio_train, bio_test = train_test_split(df_FP,df_bio,test_size = test_size, shuffle = True)

In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics 
from sklearn.metrics import f1_score

In [90]:
n_estimators = 100
min_samples_split= 2
max_depth = None

n_neighbors = 6 
metric = 'matching'

In [91]:
#RF
seed = 23
RF_hashed_counts = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,min_samples_split=min_samples_split,random_state=seed,n_jobs=2)
RF_hashed_counts.fit(fps_train, bio_train)

# KNN
KNN_hashed_counts = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance', algorithm='auto',metric=metric)
KNN_hashed_counts.fit(fps_train, bio_train)


  after removing the cwd from sys.path.
  


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='matching',
           metric_params=None, n_jobs=None, n_neighbors=6, p=2,
           weights='distance')

In [92]:
#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(fps_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []
AUC = []
f1_List = []
fpr_RF, tpr_RF, thresholds_RF = None, None, None 

#cross-val training
for train_idx, test_idx in kf.split(fps_train):
    x_cv_train , x_cv_test = fps_train.iloc[train_idx], fps_train.iloc[test_idx]
    y_cv_train , y_cv_test = bio_train.iloc[train_idx], bio_train.iloc[test_idx]
    
    RF_hashed_counts.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = RF_hashed_counts.predict(x_cv_test)
    #cv_rmse.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    #cv_rmse_mean.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    cv_rmse.append(accuracy_score(y_cv_test,y_cv_pred))
    cv_rmse_mean.append(accuracy_score(y_cv_test,y_cv_pred))
    
    f1_List.append(f1_score(y_cv_test,y_cv_pred))
    
    fpr_RF, tpr_RF, thresholds_RF = metrics.roc_curve(y_cv_test, y_cv_pred, pos_label=1)
    AUC.append(metrics.auc(fpr_RF, tpr_RF))
    
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
        
print
print ("each CV RMSE average : {}".format(tmp))
print
print "total fold mean : ", round(np.mean(cv_rmse),4)
print ("VARIANCE of RMSE of every folds : {}".format(round(np.var(cv_rmse),4)))
print "F1 SCORE : " , np.mean(f1_List)
print "AUC : ", np.mean(AUC)

#model retrain with all train data  
RF_hashed_counts.fit(fps_train,bio_train) # > model train



[0.9094650205761317, 0.9256198347107438, 0.9380165289256198, 0.9132231404958677, 0.9173553719008265]
('mean : ', 0.9207359793218378)
[0.9423868312757202, 0.9008264462809917, 0.9256198347107438, 0.9090909090909091, 0.9173553719008265]
('mean : ', 0.9190558786518382)
[0.934156378600823, 0.9256198347107438, 0.8925619834710744, 0.9504132231404959, 0.9421487603305785]
('mean : ', 0.9289800360507432)
[0.9300411522633745, 0.8801652892561983, 0.9380165289256198, 0.9504132231404959, 0.9214876033057852]
('mean : ', 0.9240247593782949)
[0.9259259259259259, 0.9297520661157025, 0.9214876033057852, 0.9338842975206612, 0.9173553719008265]
('mean : ', 0.9256810529537802)
[0.9053497942386831, 0.9049586776859504, 0.9297520661157025, 0.9380165289256198, 0.9338842975206612]
('mean : ', 0.9223922728973234)
[0.9423868312757202, 0.9380165289256198, 0.9173553719008265, 0.9462809917355371, 0.9049586776859504]
('mean : ', 0.9297996803047308)
[0.8888888888888888, 0.9545454545454546, 0.9338842975206612, 0.8966942



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=23, verbose=0, warm_start=False)

In [93]:
#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(fps_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []
f1_List = []
AUC = []
fpr_KNN, tpr_KNN, thresholds_KNN = None, None, None 

#cross-val training
for train_idx, test_idx in kf.split(fps_train):
    x_cv_train , x_cv_test = fps_train.iloc[train_idx], fps_train.iloc[test_idx]
    y_cv_train , y_cv_test = bio_train.iloc[train_idx], bio_train.iloc[test_idx]
    
    KNN_hashed_counts.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = KNN_hashed_counts.predict(x_cv_test)
    #cv_rmse.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    #cv_rmse_mean.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    cv_rmse.append(accuracy_score(y_cv_test,y_cv_pred))
    cv_rmse_mean.append(accuracy_score(y_cv_test,y_cv_pred))
    
    f1_List.append(f1_score(y_cv_test,y_cv_pred))
    
    fpr_KNN, tpr_KNN, thresholds_KNN = metrics.roc_curve(y_cv_test, y_cv_pred, pos_label=1)
    AUC.append(metrics.auc(fpr_KNN, tpr_KNN))
    
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
print
print ("each CV RMSE average : {}".format(tmp))
print
print "total fold mean : ", round(np.mean(cv_rmse),4)
print ("VARIANCE of RMSE of every folds : {}".format(round(np.var(cv_rmse),4)))
print "F1 SCORE : " , np.mean(f1_List)
print "AUC : ", np.mean(AUC)

#model retrain with all train data  
KNN_hashed_counts.fit(fps_train,bio_train) # > model train



[0.934156378600823, 0.9173553719008265, 0.9214876033057852, 0.9132231404958677, 0.9049586776859504]
('mean : ', 0.9182362343978505)
[0.9053497942386831, 0.9421487603305785, 0.871900826446281, 0.9297520661157025, 0.9545454545454546]
('mean : ', 0.9207393803353398)
[0.9012345679012346, 0.9297520661157025, 0.9421487603305785, 0.9173553719008265, 0.9214876033057852]
('mean : ', 0.9223956739108253)
[0.9218106995884774, 0.9297520661157025, 0.9214876033057852, 0.8842975206611571, 0.9338842975206612]
('mean : ', 0.9182464374383568)
[0.8888888888888888, 0.9421487603305785, 0.9090909090909091, 0.9132231404958677, 0.9338842975206612]
('mean : ', 0.917447199265381)
[0.9094650205761317, 0.9214876033057852, 0.9214876033057852, 0.9297520661157025, 0.9256198347107438]
('mean : ', 0.9215624256028295)
[0.9382716049382716, 0.9008264462809917, 0.9214876033057852, 0.9462809917355371, 0.9256198347107438]
('mean : ', 0.9264972961942659)
[0.9135802469135802, 0.9380165289256198, 0.9132231404958677, 0.921487603



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='matching',
           metric_params=None, n_jobs=None, n_neighbors=6, p=2,
           weights='distance')

In [94]:
def Rsquared(pred,true):
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(true,pred)
    return r_value**2

def RMSE(pred,true):
    rmse = np.sqrt(mean_squared_error(true,pred))
    return rmse

In [95]:
RF_preds_hashed = RF_hashed_counts.predict(fps_test)
KNN_preds_hashed = KNN_hashed_counts.predict(fps_test)

fpr_RF, tpr_RF, thresholds_RF = metrics.roc_curve(bio_test, RF_preds_hashed, pos_label=1)
fpr_KNN, tpr_KNN, thresholds_KNN = metrics.roc_curve(bio_test, KNN_preds_hashed, pos_label=1)


In [96]:
print "========= COUNTS ========= "
#print "RMSE RF hashed counts fps: ", RMSE(RF_preds_hashed, bio_test)
#print "RMSE KNN hashed counts fps: ", RMSE(KNN_preds_hashed, bio_test)
print "CA RF fps: ",accuracy_score(bio_test, RF_preds_hashed)
print "CA KNN fps: ", accuracy_score(bio_test, KNN_preds_hashed)
print
print "AUC RF fps: ", metrics.auc(fpr_RF, tpr_RF)
print "AUC KNN fps: ", metrics.auc(fpr_KNN, tpr_KNN)

CA RF fps:  0.9375
CA KNN fps:  0.921875

AUC RF fps:  0.8395989974937343
AUC KNN fps:  0.7681704260651628
