In [1]:
import bioalerts
from bioalerts import LoadMolecules, Alerts, FPCalculator
import sys

In [2]:
import numpy as np 
import scipy
import matplotlib.pyplot as plt

from rdkit.Chem.Draw import IPythonConsole 
from rdkit.Chem import PandasTools 

import pandas as pd 

from sklearn.neighbors import KNeighborsRegressor #knn
from sklearn.ensemble import RandomForestRegressor #RF
from sklearn.svm import SVR #SVR
from sklearn.neural_network import MLPRegressor #MLP 

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [3]:
import sys, numpy as np, scipy as sc, rdkit, matplotlib as pylab, pandas as pd, IPython
#print " Python:", sys.version, "\n"
#print " Numpy:", np.__version__
#print " Scipy:", sc.__version__
#print " Rdkit:", rdkit.rdBase.rdkitVersion
#print " Matplotlib:", pylab.__version__
#print " Pandas:", pd.__version__
#print " Ipython:", IPython.__version__
#print " Scikit-Learn:", sklearn.__version__
#print " Scipy:", scipy.__version__

## Extract toxic substructure 

In [4]:
#proteinTarget = 'NR-AR'
read_dir = './tutorial/datasets/tox21/'
smi = 'tox21_wholetraining.smi'
bio = 'tox21_wholetraining.bio'

In [5]:
molecules = bioalerts.LoadMolecules.LoadMolecules(read_dir + smi,name_field=None)
molecules.ReadMolecules()
print "Total number of input molecules: ", len(molecules.mols)

Format of the structures file = SMILES
All molecules in the input file were processed correctly
Total number of input molecules:  8332


In [6]:
print len(molecules.mols)

8332


In [7]:
molecules.mols_ids[0:5]

[0, 1, 2, 3, 4]

In [8]:
training_dataset_info = bioalerts.LoadMolecules.GetDataSetInfo()

In [9]:
training_dataset_info.extract_substructure_information(radii=[2,3,4],mols=molecules.mols)

In [10]:
print len(training_dataset_info.substructure_dictionary.keys())

97905


In [11]:
bioactivities = np.genfromtxt(read_dir + bio,
                              dtype='int',
                              skip_header=0)
arr = np.arange(0,len(bioactivities))
mask = np.ones(arr.shape,dtype=bool)
mask[molecules.molserr]=0
bioactivities =  bioactivities[mask]

In [12]:
print len(bioactivities)
print len(molecules.mols)

8332
8332


In [13]:
Alerts_categorical = bioalerts.Alerts.CalculatePvaluesCategorical(max_radius=4)
toxic, nontoxic = {}, {}

toxic, nontoxic = Alerts_categorical.calculate_toxic_and_nontoxic_substructure(mols=molecules.mols,
                                      substructure_dictionary=training_dataset_info.substructure_dictionary,
                                      bioactivities=bioactivities,
                                      mols_ids=training_dataset_info.mols_ids,
                                      threshold_nb_substructures = 5,
                                      threshold_pvalue = 0.05,
                                      threshold_frequency = 0.7,
                                      Bonferroni=True)

Number of substructures processed:  385705
Significant toxic substructures:  94 substructures
Significant non toxic substructures : 4517 substructures


In [18]:
toxic
#len(toxic) # >>> 94

{59582328: [1.0],
 144862996: [0.8333333333333334],
 238332961: [0.875],
 243031988: [0.9],
 248538045: [0.8333333333333334],
 289742792: [0.7142857142857143],
 291625364: [0.8333333333333334],
 293645588: [0.75],
 303065576: [1.0],
 316861206: [0.8461538461538461],
 408935666: [1.0],
 441214936: [0.7142857142857143],
 464679642: [0.8571428571428571],
 487877969: [1.0],
 491206018: [0.7142857142857143],
 528413146: [1.0],
 529228269: [1.0],
 590401100: [0.8333333333333334],
 601301716: [0.7272727272727273],
 665316820: [0.8333333333333334],
 685620941: [0.8571428571428571],
 776025833: [0.875],
 780020950: [0.7142857142857143],
 817010262: [0.8333333333333334],
 856674059: [0.8333333333333334],
 885218005: [0.7272727272727273],
 907194548: [0.7142857142857143],
 973747042: [0.8181818181818182],
 1018193551: [0.9],
 1025187852: [0.8333333333333334],
 1040552245: [0.7142857142857143],
 1072783936: [0.8333333333333334],
 1210732435: [0.8181818181818182],
 1220447184: [1.0],
 1233810244: [

In [None]:
nontoxic
#len(nontoxic) # >>> 4517

In [106]:
toxic = {59582328: [1.0],
 144862996: [0.8333333333333334],
 238332961: [0.875],
 243031988: [0.9],
 248538045: [0.8333333333333334],
 289742792: [0.7142857142857143],
 291625364: [0.8333333333333334],
 293645588: [0.75],
 303065576: [1.0],
 316861206: [0.8461538461538461],
 408935666: [1.0],
 441214936: [0.7142857142857143],
 464679642: [0.8571428571428571],
 487877969: [1.0],
 491206018: [0.7142857142857143],
 528413146: [1.0],
 529228269: [1.0],
 590401100: [0.8333333333333334],
 601301716: [0.7272727272727273],
 665316820: [0.8333333333333334],
 685620941: [0.8571428571428571],
 776025833: [0.875],
 780020950: [0.7142857142857143],
 817010262: [0.8333333333333334],
 856674059: [0.8333333333333334],
 885218005: [0.7272727272727273],
 907194548: [0.7142857142857143],
 973747042: [0.8181818181818182],
 1018193551: [0.9],
 1025187852: [0.8333333333333334],
 1040552245: [0.7142857142857143],
 1072783936: [0.8333333333333334],
 1210732435: [0.8181818181818182],
 1220447184: [1.0],
 1233810244: [0.8333333333333334],
 1269851781: [1.0],
 1299092894: [0.8333333333333334],
 1300724914: [0.8],
 1322529363: [0.8333333333333334],
 1378206585: [1.0],
 1437747071: [0.75],
 1518432449: [0.8461538461538461],
 1609125206: [0.7142857142857143],
 1616758320: [1.0],
 1632286252: [0.8333333333333334],
 1833028715: [0.8333333333333334],
 1837362276: [0.7272727272727273],
 1887419926: [0.8181818181818182],
 1904321503: [0.8333333333333334],
 1906629638: [0.875],
 1914835207: [0.7142857142857143],
 1926414473: [0.8181818181818182],
 2092096443: [0.8333333333333334],
 2096291241: [0.8181818181818182],
 2110713573: [0.8333333333333334],
 2128830929: [0.7142857142857143],
 2292238093L: [0.8333333333333334],
 2295371384L: [0.7777777777777778],
 2396384619L: [0.8333333333333334],
 2441201613L: [0.8333333333333334],
 2455943357L: [0.8333333333333334],
 2486452475L: [0.7272727272727273],
 2547260028L: [0.7142857142857143],
 2586477428L: [0.7692307692307693],
 2632265184L: [0.8461538461538461],
 2666267517L: [0.7142857142857143],
 2676693293L: [0.7142857142857143],
 2685957082L: [0.8571428571428571],
 2700364449L: [0.8571428571428571],
 2730581186L: [0.8571428571428571],
 2854154172L: [0.75],
 2927177866L: [0.8333333333333334],
 2956273779L: [0.8333333333333334],
 2960320475L: [1.0],
 2969511302L: [0.8333333333333334],
 3047048782L: [0.7777777777777778],
 3115087023L: [0.9],
 3432086929L: [0.8333333333333334],
 3445050409L: [0.8333333333333334],
 3479423841L: [0.8571428571428571],
 3491936091L: [0.8181818181818182],
 3663265415L: [0.8333333333333334],
 3669900705L: [1.0],
 3743793454L: [0.75],
 3744626343L: [0.7647058823529411],
 3774802668L: [0.7142857142857143],
 3864161222L: [0.8333333333333334],
 3926336955L: [1.0],
 4098238333L: [0.7142857142857143],
 4158306861L: [0.7727272727272727],
 4236605231L: [0.7777777777777778],
 4242283111L: [0.8333333333333334],
 4246700814L: [0.8333333333333334],
 4275705722L: [0.8333333333333334]}

##  Calculating Morgan fingerprints

In [79]:
proteinTarget = 'mmc1_estrogen'

In [80]:
AR_mols = bioalerts.LoadMolecules.LoadMolecules("./tutorial/datasets/"+ proteinTarget +".smi",name_field=None) 
#COX2.smi is a set of only SMILES data

In [81]:
AR_mols.ReadMolecules()

Format of the structures file = SMILES
3 molecules (starting at zero) could not be processed.

This information has been saved in the following file: incorrect_molecules.csv

NOTE: the indexes of the molecules start at zero. Thus the first molecule is molecule 0.


In [82]:
AR_bio = np.genfromtxt('./tutorial/datasets/'+ proteinTarget +'.bio.txt',skip_header=0)
print len(AR_mols.mols)
print len(AR_bio)

1275
1278


In [83]:
if len(AR_mols.molserr)!= 0:
    AR_bio = np.delete(AR_bio,AR_mols.molserr)

In [84]:
print len(AR_mols.mols)
print len(AR_bio)

1275
1275


## Input data preparation (feature insertion)

In [85]:
#  Reference molecules
reference_molecules = bioalerts.LoadMolecules.GetDataSetInfo(name_field=None)

In [86]:
reference_molecules.extract_substructure_information(radii=[0,1,2],mols=AR_mols.mols)

In [87]:
reference_keys = reference_molecules.substructure_dictionary.keys()
reference_dict = reference_molecules.substructure_dictionary
print len(reference_keys)

4063


In [88]:
fps_experiments = bioalerts.FPCalculator.CalculateFPs(radii=[0,1,2],mols=AR_mols.mols)

In [89]:
fps_experiments.calculate_hashed_fps(nBits=128)

In [90]:
print fps_experiments.fps_hashed_counts.shape
print fps_experiments.fps_hashed_binary.shape 

(1275L, 128L)
(1275L, 128L)


In [91]:
fps_exp_cnt = fps_experiments.fps_hashed_counts
fps_exp_bin = fps_experiments.fps_hashed_binary

In [92]:
df_fps_cnt = pd.DataFrame(fps_exp_cnt,columns=list(range(len(fps_exp_cnt[0]))))
df_fps_bin = pd.DataFrame(fps_exp_bin,columns=list(range(len(fps_exp_bin[0]))))

In [110]:
for substr,indices in reference_dict.items() : 
    if substr in toxic.keys() :
        for idx in indices :
            if 0.7 <= toxic[substr][0] < 0.8 :
                for i in range(8):  
                    df_fps_cnt.loc[idx,'significant_feature_{}'.format(i)] = 1.0
                    df_fps_bin.loc[idx,'significant_feature_{}'.format(i)] = 1.0
            elif 0.8 <= toxic[substr][0] < 0.9 : 
                for i in range(8):
                    df_fps_cnt.loc[idx,'significant_feature_{}'.format(i)] = 2.0
                    df_fps_bin.loc[idx,'significant_feature_{}'.format(i)] = 2.0
            elif 0.9 <= toxic[substr][0] <1.0 : 
                for i in range(8):
                    df_fps_cnt.loc[idx,'significant_feature_{}'.format(i)] = 3.0
                    df_fps_bin.loc[idx,'significant_feature_{}'.format(i)] = 3.0
            else:
                for i in range(8):
                    df_fps_cnt.loc[idx,'significant_feature_{}'.format(i)] = 0.0
                    df_fps_bin.loc[idx,'significant_feature_{}'.format(i)] = 0.0

In [111]:
df_fps_cnt = df_fps_cnt.replace(np.nan,0.0)
df_fps_bin = df_fps_bin.replace(np.nan,0.0)

In [112]:
print(len(df_fps_cnt[df_fps_cnt.significant_feature_1!=0.0]))
print(len(df_fps_cnt))
print
print(len(df_fps_bin[df_fps_bin.significant_feature_1!=0.0]))
print(len(df_fps_bin))

67
1275

67
1275


In [115]:
# y label
df_bio = pd.DataFrame(AR_bio,columns=None)

## Data Split

In [65]:
#if you wanna train models without additional features, 
#do not run above codes which insert features

In [116]:
test_size = 0.05 
fps_cnt_train, fps_cnt_test, bio_cnt_train, bio_cnt_test = train_test_split(df_fps_cnt,df_bio,test_size = test_size, shuffle = True)

In [117]:
test_size = 0.05 
fps_bin_train, fps_bin_test, bio_bin_train, bio_bin_test = train_test_split(df_fps_bin,df_bio,test_size = test_size, shuffle = True)

## Train models 

In [118]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics 
from sklearn.metrics import f1_score

In [119]:
n_estimators = 100
min_samples_split= 2
max_depth = None

n_neighbors = 6 
metric = 'matching'

In [120]:
#RF
seed = 23
RF_hashed_counts = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,min_samples_split=min_samples_split,random_state=seed,n_jobs=2)
RF_hashed_counts.fit(fps_cnt_train, bio_cnt_train)

# RF binary 
seed = 23 
RF_hashed_binary = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth, min_samples_split=min_samples_split,random_state=seed,n_jobs=2)
RF_hashed_binary.fit(fps_bin_train, bio_bin_train)

  after removing the cwd from sys.path.
  if __name__ == '__main__':


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=23, verbose=0, warm_start=False)

In [121]:
# KNN
KNN_hashed_counts = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance', algorithm='auto',metric=metric)
KNN_hashed_counts.fit(fps_cnt_train, bio_cnt_train)

#KNN binary
KNN_hashed_binary = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance', algorithm='auto',metric=metric)
KNN_hashed_binary.fit(fps_bin_train, bio_bin_train)

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='matching',
           metric_params=None, n_jobs=None, n_neighbors=6, p=2,
           weights='distance')

#### counts

In [122]:
#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(fps_cnt_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []
AUC = []
f1_List = []
fpr_RF, tpr_RF, thresholds_RF = None, None, None 

#cross-val training
for train_idx, test_idx in kf.split(fps_cnt_train):
    x_cv_train , x_cv_test = fps_cnt_train.iloc[train_idx], fps_cnt_train.iloc[test_idx]
    y_cv_train , y_cv_test = bio_cnt_train.iloc[train_idx], bio_cnt_train.iloc[test_idx]
    
    RF_hashed_counts.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = RF_hashed_counts.predict(x_cv_test)
    #cv_rmse.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    #cv_rmse_mean.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    cv_rmse.append(accuracy_score(y_cv_test,y_cv_pred))
    cv_rmse_mean.append(accuracy_score(y_cv_test,y_cv_pred))
    
    f1_List.append(f1_score(y_cv_test,y_cv_pred))
    
    fpr_RF, tpr_RF, thresholds_RF = metrics.roc_curve(y_cv_test, y_cv_pred, pos_label=1)
    AUC.append(metrics.auc(fpr_RF, tpr_RF))
    
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
        
print
print ("each CV RMSE average : {}".format(tmp))
print
print "total fold mean : ", round(np.mean(cv_rmse),4)
print ("VARIANCE of RMSE of every folds : {}".format(round(np.var(cv_rmse),4)))
print "F1 SCORE : " , np.mean(f1_List)
print "AUC : ", np.mean(AUC)

#model retrain with all train data  
RF_hashed_counts.fit(fps_cnt_train,bio_cnt_train) # > model train



[0.9053497942386831, 0.9421487603305785, 0.9545454545454546, 0.9504132231404959, 0.9380165289256198]
('mean : ', 0.9380947522361665)
[0.9465020576131687, 0.9586776859504132, 0.9338842975206612, 0.9090909090909091, 0.9338842975206612]
('mean : ', 0.9364078495391628)
[0.9135802469135802, 0.9421487603305785, 0.9256198347107438, 0.9297520661157025, 0.9628099173553719]
('mean : ', 0.9347821650851953)
[0.9629629629629629, 0.9008264462809917, 0.9504132231404959, 0.9256198347107438, 0.9421487603305785]
('mean : ', 0.9363942454851546)
[0.934156378600823, 0.9338842975206612, 0.9628099173553719, 0.9421487603305785, 0.9173553719008265]
('mean : ', 0.9380709451416521)
[0.9382716049382716, 0.9173553719008265, 0.9338842975206612, 0.9462809917355371, 0.9214876033057852]
('mean : ', 0.9314559738802162)
[0.8888888888888888, 0.9297520661157025, 0.9338842975206612, 0.9380165289256198, 0.9380165289256198]
('mean : ', 0.9257116620752985)
[0.9465020576131687, 0.9297520661157025, 0.9462809917355371, 0.9545454



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=23, verbose=0, warm_start=False)

In [123]:
#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(fps_cnt_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []
f1_List = []
AUC = []
fpr_KNN, tpr_KNN, thresholds_KNN = None, None, None 

#cross-val training
for train_idx, test_idx in kf.split(fps_cnt_train):
    x_cv_train , x_cv_test = fps_cnt_train.iloc[train_idx], fps_cnt_train.iloc[test_idx]
    y_cv_train , y_cv_test = bio_cnt_train.iloc[train_idx], bio_cnt_train.iloc[test_idx]
    
    KNN_hashed_counts.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = KNN_hashed_counts.predict(x_cv_test)
    #cv_rmse.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    #cv_rmse_mean.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    cv_rmse.append(accuracy_score(y_cv_test,y_cv_pred))
    
    cv_rmse_mean.append(accuracy_score(y_cv_test,y_cv_pred))
    
    f1_List.append(f1_score(y_cv_test,y_cv_pred))
    
    fpr_KNN, tpr_KNN, thresholds_KNN = metrics.roc_curve(y_cv_test, y_cv_pred, pos_label=1)
    AUC.append(metrics.auc(fpr_KNN, tpr_KNN))
    
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
        

#print("total fold mean : ", round(np.mean(cv_rmse),4)) # mean of 50 cv rmse (1회 iter돌 때의 모든 rmse의 평균 (50회의 평균)) 
#print("total cv mean : ",round(np.mean(tmp),4)) # mean of cv's mean of 1 iter (각 cv rmse 평균의 평균) # we need var of this value 
print
print ("each CV RMSE average : {}".format(tmp))
print
print "total fold mean : ", round(np.mean(cv_rmse),4)
print ("VARIANCE of RMSE of every folds : {}".format(round(np.var(cv_rmse),4)))
print "F1 SCORE : " , np.mean(f1_List)
print "AUC : ", np.mean(AUC)

#model retrain with all train data  
KNN_hashed_counts.fit(fps_cnt_train,bio_cnt_train) # > model train



[0.9176954732510288, 0.9297520661157025, 0.9297520661157025, 0.9173553719008265, 0.9462809917355371]
('mean : ', 0.9281671938237596)
[0.9053497942386831, 0.9380165289256198, 0.9297520661157025, 0.9380165289256198, 0.9297520661157025]
('mean : ', 0.9281773968642655)
[0.9711934156378601, 0.9008264462809917, 0.9008264462809917, 0.9380165289256198, 0.9256198347107438]
('mean : ', 0.9272965343672415)
[0.9218106995884774, 0.9049586776859504, 0.9462809917355371, 0.9338842975206612, 0.9090909090909091]
('mean : ', 0.923205115124307)
[0.9176954732510288, 0.9256198347107438, 0.9132231404958677, 0.9504132231404959, 0.9256198347107438]
('mean : ', 0.9265143012617759)
[0.9423868312757202, 0.9132231404958677, 0.9132231404958677, 0.9380165289256198, 0.9214876033057852]
('mean : ', 0.925667448899772)
[0.9259259259259259, 0.9338842975206612, 0.9338842975206612, 0.9297520661157025, 0.8966942148760331]
('mean : ', 0.9240281603917968)
[0.9053497942386831, 0.9462809917355371, 0.9173553719008265, 0.92975206



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='matching',
           metric_params=None, n_jobs=None, n_neighbors=6, p=2,
           weights='distance')

#### binary 


In [124]:
#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(fps_bin_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []
AUC = []
f1_List = []
fpr_RF, tpr_RF, thresholds_RF = None, None, None 

#cross-val training
for train_idx, test_idx in kf.split(fps_bin_train):
    x_cv_train , x_cv_test = fps_bin_train.iloc[train_idx], fps_bin_train.iloc[test_idx]
    y_cv_train , y_cv_test = bio_bin_train.iloc[train_idx], bio_bin_train.iloc[test_idx]
    
    RF_hashed_binary.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = RF_hashed_binary.predict(x_cv_test)
    #cv_rmse.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    #cv_rmse_mean.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    cv_rmse.append(accuracy_score(y_cv_test,y_cv_pred))
    cv_rmse_mean.append(accuracy_score(y_cv_test,y_cv_pred))
    
    f1_List.append(f1_score(y_cv_test,y_cv_pred))
    
    fpr_RF, tpr_RF, thresholds_RF = metrics.roc_curve(y_cv_test, y_cv_pred, pos_label=1)
    AUC.append(metrics.auc(fpr_RF, tpr_RF))
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
        
print
print ("each CV RMSE average : {}".format(tmp))
print
print "total fold mean : ", round(np.mean(cv_rmse),4)
print ("VARIANCE of RMSE of every folds : {}".format(round(np.var(cv_rmse),4)))
print "F1 SCORE : " , np.mean(f1_List)
print "AUC : ", np.mean(AUC)
#model retrain with all train data  
RF_hashed_binary.fit(fps_bin_train,bio_bin_train) # > model train



[0.9218106995884774, 0.9421487603305785, 0.9504132231404959, 0.9462809917355371, 0.9173553719008265]
('mean : ', 0.935601809339183)
[0.9382716049382716, 0.9256198347107438, 0.9173553719008265, 0.9256198347107438, 0.9214876033057852]
('mean : ', 0.9256708499132742)
[0.9094650205761317, 0.9462809917355371, 0.9338842975206612, 0.9214876033057852, 0.9545454545454546]
('mean : ', 0.933132673536714)
[0.9218106995884774, 0.9214876033057852, 0.9338842975206612, 0.9545454545454546, 0.9338842975206612]
('mean : ', 0.9331224704962079)
[0.9218106995884774, 0.9338842975206612, 0.9421487603305785, 0.9338842975206612, 0.9256198347107438]
('mean : ', 0.9314695779342245)
[0.9423868312757202, 0.9132231404958677, 0.9214876033057852, 0.9297520661157025, 0.9462809917355371]
('mean : ', 0.9306261265857225)
[0.9382716049382716, 0.9338842975206612, 0.9132231404958677, 0.8884297520661157, 0.9421487603305785]
('mean : ', 0.9231915110702988)
[0.9176954732510288, 0.9132231404958677, 0.9710743801652892, 0.94214876



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
            oob_score=False, random_state=23, verbose=0, warm_start=False)

In [125]:
#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(fps_bin_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []
f1_List = []
AUC = []
fpr_KNN, tpr_KNN, thresholds_KNN = None, None, None 

#cross-val training
for train_idx, test_idx in kf.split(fps_bin_train):
    x_cv_train , x_cv_test = fps_bin_train.iloc[train_idx], fps_bin_train.iloc[test_idx]
    y_cv_train , y_cv_test = bio_bin_train.iloc[train_idx], bio_bin_train.iloc[test_idx]
    
    KNN_hashed_binary.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = KNN_hashed_binary.predict(x_cv_test)
    #cv_rmse.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    #cv_rmse_mean.append(round(np.sqrt(mean_squared_error(y_cv_test,y_cv_pred)),4))
    cv_rmse.append(accuracy_score(y_cv_test,y_cv_pred))
    cv_rmse_mean.append(accuracy_score(y_cv_test,y_cv_pred))
    
    f1_List.append(f1_score(y_cv_test,y_cv_pred))
    
    fpr_KNN, tpr_KNN, thresholds_KNN = metrics.roc_curve(y_cv_test, y_cv_pred, pos_label=1)
    AUC.append(metrics.auc(fpr_KNN, tpr_KNN))
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
        
print
print ("each CV RMSE average : {}".format(tmp))
print
print "total fold mean : ", round(np.mean(cv_rmse),4)
print ("VARIANCE of RMSE of every folds : {}".format(round(np.var(cv_rmse),4)))
print "F1 SCORE : " , np.mean(f1_List)
print "AUC : ", np.mean(AUC)
#model retrain with all train data  
KNN_hashed_binary.fit(fps_bin_train,bio_bin_train) # > model train



[0.9094650205761317, 0.9132231404958677, 0.9132231404958677, 0.9752066115702479, 0.9214876033057852]
('mean : ', 0.92652110328878)
[0.9218106995884774, 0.9049586776859504, 0.9090909090909091, 0.9297520661157025, 0.9545454545454546]
('mean : ', 0.9240315614052989)
[0.9382716049382716, 0.9256198347107438, 0.9132231404958677, 0.9338842975206612, 0.9173553719008265]
('mean : ', 0.9256708499132742)
[0.9135802469135802, 0.9132231404958677, 0.9297520661157025, 0.9421487603305785, 0.9380165289256198]
('mean : ', 0.9273441485562698)
[0.9259259259259259, 0.9338842975206612, 0.9214876033057852, 0.8966942148760331, 0.9338842975206612]
('mean : ', 0.9223752678298134)
[0.9053497942386831, 0.9256198347107438, 0.8925619834710744, 0.9421487603305785, 0.9380165289256198]
('mean : ', 0.92073938033534)
[0.9176954732510288, 0.9380165289256198, 0.9173553719008265, 0.9380165289256198, 0.9173553719008265]
('mean : ', 0.9256878549807842)
[0.9176954732510288, 0.9338842975206612, 0.9297520661157025, 0.9173553719



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='matching',
           metric_params=None, n_jobs=None, n_neighbors=6, p=2,
           weights='distance')

##   predict bioactivities for the test set

In [126]:
def Rsquared(pred,true):
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(true,pred)
    return r_value**2

def RMSE(pred,true):
    rmse = np.sqrt(mean_squared_error(true,pred))
    return rmse

In [127]:
RF_preds_hashed = RF_hashed_counts.predict(fps_cnt_test)
KNN_preds_hashed = KNN_hashed_counts.predict(fps_cnt_test)

RF_preds_hashed_bin = RF_hashed_binary.predict(fps_bin_test)
KNN_preds_hashed_bin = KNN_hashed_binary.predict(fps_bin_test)

In [128]:
print "========= COUNTS ========= "
#print "RMSE RF hashed counts fps: ", RMSE(RF_preds_hashed, bio_cnt_test)
#print "RMSE KNN hashed counts fps: ", RMSE(KNN_preds_hashed, bio_cnt_test)
print "CA RF hashed counts fps: ",accuracy_score(bio_cnt_test, RF_preds_hashed)
print "CA KNN hashed counts fps: ", accuracy_score(bio_cnt_test, KNN_preds_hashed)
print
print "========= BINARY ========= "
#print "RMSE RF hashed binary fps: ", RMSE(RF_preds_hashed_bin, bio_bin_test)
#print "RMSE KNN hashed binary fps: ", RMSE(KNN_preds_hashed_bin, bio_bin_test)
print "RMSE RF hashed binary fps: ", accuracy_score(bio_bin_test, RF_preds_hashed_bin)
print "RMSE KNN hashed binary fps: ", accuracy_score(bio_bin_test,KNN_preds_hashed_bin)
print


CA RF hashed counts fps:  0.90625
CA KNN hashed counts fps:  0.859375

RMSE RF hashed binary fps:  0.953125
RMSE KNN hashed binary fps:  0.90625

