In [1]:
import bioalerts
from bioalerts import LoadMolecules, Alerts, FPCalculator
import sys

In [2]:
import numpy as np 
import scipy
import matplotlib.pyplot as plt

from rdkit.Chem.Draw import IPythonConsole 
from rdkit.Chem import PandasTools 

import pandas as pd 

from sklearn.neighbors import KNeighborsRegressor #knn
from sklearn.ensemble import RandomForestRegressor #RF
from sklearn.svm import SVR #SVR
from sklearn.neural_network import MLPRegressor #MLP 

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [3]:
import sys, numpy as np, scipy as sc, rdkit, matplotlib as pylab, pandas as pd, IPython
#print " Python:", sys.version, "\n"
#print " Numpy:", np.__version__
#print " Scipy:", sc.__version__
#print " Rdkit:", rdkit.rdBase.rdkitVersion
#print " Matplotlib:", pylab.__version__
#print " Pandas:", pd.__version__
#print " Ipython:", IPython.__version__
#print " Scikit-Learn:", sklearn.__version__
#print " Scipy:", scipy.__version__

## A. Calculating Morgan fingerprints

In this section, we will see how to calculate hashed and unhashed Morgan fingerprints for a set of molecules, and will train some models. 

In [4]:
AR_mols = bioalerts.LoadMolecules.LoadMolecules("./tutorial/datasets/5AR.smi",name_field=None) 
#COX2.smi is a set of only SMILES data

In [5]:
AR_mols.ReadMolecules()

Format of the structures file = SMILES
All molecules in the input file were processed correctly




In [6]:
AR_bio = np.genfromtxt('./tutorial/datasets/5AR.bio.txt',skip_header=0) 
print len(AR_mols.mols)
print len(AR_bio)

278
278


In [8]:
stride = int(len(AR_mols.mols) * 0.9)
training = AR_mols.mols[0:stride]
test = AR_mols.mols[stride:len(AR_mols.mols)]
training_mols_ids = AR_mols.mols_ids[0:stride] # for sig-sub-extract
print len(AR_mols.mols), len(test), len(training)

bio_training = AR_bio[0:stride]
bio_test = AR_bio[stride:len(AR_mols.mols)]
print len(AR_bio), len(bio_test), len(bio_training)

278 28 250
278 28 250


## A.0 substructure extraction

The length of the substructure dictionary indicates the total number of distinct substructures present in the training set **satisfying the bond radii indicated by the user**, whereas the number of keys for each dictionary element indicates the number of distinct molecules in which the substructure is present.

### A.0.1 whole substructure extraction

In [9]:
#  Reference molecules
reference_molecules = bioalerts.LoadMolecules.GetDataSetInfo(name_field=None)

In [None]:
#fps_training.substructure_ids

In [10]:
reference_molecules.extract_substructure_information(radii=[0,1,2],mols=training)

In [11]:
reference_keys = reference_molecules.substructure_dictionary.keys()
print len(reference_keys)

1205


### A.0.2 significant substructure extraction

In [12]:
significant_reference_molecules = bioalerts.LoadMolecules.GetDataSetInfo(name_field=None)
significant_reference_molecules.extract_substructure_information(radii=[0,1,2],mols=training)

In [13]:
Alerts_continuous_high_activity = bioalerts.Alerts.CalculatePvaluesContinuous(radii_ext=[0,1,2])

In [14]:
def calculate_bioactivity_threshold(bio,threshold):
    # a pIC50 is smaller, higher activity 
    
    sorted_bio = sorted(bio)
    threshold_bio_value = sorted_bio[int(len(sorted_bio)*threshold)]
    
    print("activity ratio is {}".format(1-threshold))
    print("threshold value is {}".format(threshold_bio_value))
    print("actual bioactivity value is {}".format(np.power(10,9 - threshold_bio_value)))
    return threshold_bio_value

In [15]:
threshold_= 0.33
threshold_bio_value = calculate_bioactivity_threshold(AR_bio,threshold_)

threshold_nb_substructures = 10
threshold_ratio = 0.3
threshold_high_act_nb_substructures = 3 #5
threshold_high_act_ratio = 0.15 #0.5                                                                  

# maybe no necessary to change
threshold_pvalue = 0.05 

activity ratio is 0.67
threshold value is 6.525783736
actual bioactivity value is 297.999999948


In [29]:
significant_substructure = {}
significant_substructure = Alerts_continuous_high_activity.get_significant_substructure_with_high_bioactivity(
mols = training,
     substructure_dictionary = significant_reference_molecules.substructure_dictionary,
     bioactivities = bio_training,
     mols_ids = training_mols_ids[:],
     threshold_nb_substructures = threshold_nb_substructures,
     threshold_pvalue = threshold_pvalue,
     threshold_ratio=threshold_ratio,
     threshold_high_act_nb_substructures=threshold_high_act_nb_substructures,
     threshold_high_act_ratio = threshold_high_act_ratio,                                                                        
     threshold_bioactivity=threshold_bio_value)

In [30]:
significant_reference_keys = significant_substructure.keys()
len(significant_reference_keys)

31

In [31]:
mol_id_set = set()
for k,v in significant_substructure.items():
    for mol_id in v: 
        mol_id_set.add(mol_id)

In [32]:
print('number of total substructure : {}'.format(len(significant_reference_molecules.substructure_dictionary)))
print('number of extracted significant substructure : {}'.format(len(significant_substructure)))
print('{} out of {} have the label which represent remaining significant substructure\n'.format(len(mol_id_set),len(bio_training)))

print ('THRESHOLD - \n\tnumber of substructures : {}\n\tp value : {}\n\tratio : {}\n\tnumber of high activity threshold : {}\n\thigh activity ratio : {}\n\thigh bioactivity threshold : {}\n'
       .format(threshold_nb_substructures,threshold_pvalue,threshold_ratio,threshold_high_act_nb_substructures,threshold_high_act_ratio,threshold_ ))

number of total substructure : 1205
number of extracted significant substructure : 31
250 out of 250 have the label which represent remaining significant substructure

THRESHOLD - 
	number of substructures : 10
	p value : 0.05
	ratio : 0.3
	number of high activity threshold : 3
	high activity ratio : 0.15
	high bioactivity threshold : 0.33



In [28]:
# run the code when you repair hyperparameter
if len(significant_substructure) is not 0 :
    significant_substructure.clear()
if len(mol_id_set) is not 0 :
    mol_id_set.clear()
if len(significant_reference_keys) is not 0:
    significant_reference_keys = []

### A.1 Computating HASHED Morgan fingerprints for the TRAINING set

We first initialize the class bioalerts.FPCalculator.CalculateFPs()

In [33]:
fps_training = bioalerts.FPCalculator.CalculateFPs(radii=[0,1,2],mols=training)
fps_training_with_refer = bioalerts.FPCalculator.CalculateFPs(radii=[0,1,2],mols=training,reference_substructure_keys=reference_keys)
fps_training_with_sig_refer = bioalerts.FPCalculator.CalculateFPs(radii=[2,3,4,5],mols=training,reference_substructure_keys=significant_reference_keys)

# reference keys have no effect to hashed, only to unhashed 

In [34]:
fps_training.calculate_hashed_fps_binary_quick(nBits=128)

In [35]:
print fps_training.fps_hashed_binary_quick.shape

(250, 128)


The method "calculate_hashed_fps_binary_quick()" considers all substructures with a bond radius smaller or equal to the maximum value of the argument radii. In this example, 2. Thus, in this case this method will consider all substructures with a bond radius of 0, 1 and 2.

In [36]:
fps_training.calculate_hashed_fps(nBits=128)

In [37]:
print fps_training.fps_hashed_binary.shape
print fps_training.fps_hashed_counts.shape

(250, 128)
(250, 128)


The method **"calculate_hashed_fps()"** computes hashed fingerpints in **(i) count** and **(ii) also binary format**. The difference of the method *calculate_hashed_fps()* with respect to the method *calculate_hashed_fps_binary_quick()* is that the **former only consider the substructures with a bond radius in the list input** to the argument radii when instantiating the class bioalerts.FPCalculator.CalculateFPs(). In the example above: [1,2].

Therefore, the hashed fingerprints in binary format calculated by these two methods will be *identical* if the bond radii especified in the aforesaid argument radii comprise all natural number from 0 to the maximum bond radius. In this example, this would be correspond to setting the value of the argument radii to [0,1,2].

In [38]:
print (fps_training.fps_hashed_binary_quick == fps_training.fps_hashed_binary).all()

True


In [39]:
toto = fps_training.fps_hashed_counts
toto
#toto.shape #(250L, 128L)

array([[0, 1, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [40]:
fps_training.fps_hashed_binary[1:10,1:10]

array([[1, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 1, 1, 0]])

In [41]:
fps_training.fps_hashed_binary_quick[1:10,1:10]

array([[1, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 1, 1, 0]])

### A.2 Computating UNHASHED Morgan fingerprints for the TRAINING set

#### A.2.1 UNHASHED without reference dict

In [42]:
fps_training.calculate_unhashed_fps(draw_substructures=True)

No input set of keys for the substructures. 
Thus, the substructures present in the input molecules will be considered for the calculation of unhashed fingerprints.


In [43]:
fps_training.fps_unhashed_binary.shape

(250, 1205)

#### A.2.2 UNHASHED with reference dict

In [44]:
fps_training_with_refer.calculate_unhashed_fps(draw_substructures=True)

In [45]:
fps_training_with_refer.fps_unhashed_counts.shape

(250, 1205)

check whether unhashed counts are same value or not between fps without refer and fps with refer

In [47]:
print (fps_training.fps_unhashed_counts == fps_training_with_refer.fps_unhashed_counts).all()

True


#### A.2.3 UNHASHED with significant reference dict

In [46]:
fps_training_with_sig_refer.calculate_unhashed_fps(draw_substructures=True)
fps_training_with_sig_refer.fps_unhashed_counts.shape

(250, 31)

### A.3 Computating Morgan fingerprints for the TEST set 

#### A.3.1 Computating UNHASHED Morgan fingerprints for the test set using a different set of molecules:

Since the positions of the substructures in the unhashed fingerprints depend on the training set, the method calculate_unhashed_fps() allows the computation of unhashed fingerprints for new compounds using a basis defined by the substructures present in the training set. This basis is defined by the keys of the substructure dictionary calculated for the molecules from the training set. This reference substructure dictionary is input to the class CalculateFPs() using the argument reference_substucture_keys.

*This ensures that substructures in new compounds map to the same locations on the fingerprint and allows enhanced model interpretation by noting which exact substructures are deemed important by the learning algorithm.*

In [136]:
#  Reference molecules
#reference_molecules = bioalerts.LoadMolecules.GetDataSetInfo(name_field=None)

In [137]:
#fps_training.substructure_ids

array([ 218993025, 3334525955,  757425542, ..., 1842898132, 1861965050,
       1101907775], dtype=int64)

array([ 218993025, 3334525955,  757425542, ..., 1842898132, 1861965050,
       1101907775], dtype=int64)

In [138]:
#reference_molecules.extract_substructure_information(radii=[0,1,2],mols=training)

In [139]:
#reference_keys = reference_molecules.substructure_dictionary.keys()
#print len(reference_keys)


1205
1205


In [48]:
#initialization 
fps_test = bioalerts.FPCalculator.CalculateFPs(radii=[0,1,2],
                                         mols=test,
                                         reference_substructure_keys=reference_keys)

fps_test_with_sig_refer = bioalerts.FPCalculator.CalculateFPs(radii=[2,3,4,5],
                                                             mols=test,
                                                             reference_substructure_keys=significant_reference_keys)

In [49]:
fps_test.calculate_unhashed_fps()
fps_test_with_sig_refer.calculate_unhashed_fps()

In [50]:
print fps_test.fps_unhashed_binary.shape
print fps_test_with_sig_refer.fps_unhashed_binary.shape

(28, 1205)
(28, 31)


In [51]:
len(fps_training.columns_unhashed)

1205

In [52]:
fps_training.fps_hashed_counts.shape

(250, 128)

#### A.3.2 Computating HASHED Morgan fingerprints for the test set (e.g. using a different set of molecules)

In [53]:
fps_test.calculate_hashed_fps(nBits=128)

### A.4 Training models 

#### A.4.1 Using hashed fps

In [65]:
#RF
seed = 23
RF_hashed_counts = RandomForestRegressor(n_estimators=100,random_state=seed,n_jobs=2)
RF_hashed_counts.fit(fps_training.fps_hashed_counts,bio_training)

# RF binary 
seed = 23 
RF_hashed_binary = RandomForestRegressor(n_estimators=100,random_state=seed,n_jobs=2)
RF_hashed_binary.fit(fps_training.fps_hashed_binary,bio_training)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
           oob_score=False, random_state=23, verbose=0, warm_start=False)

In [66]:
# KNN
KNN_hashed_counts = KNeighborsRegressor(n_neighbors=5, weights='distance', algorithm='auto',metric='jaccard')
KNN_hashed_counts.fit(fps_training.fps_hashed_counts,bio_training)

#KNN binary
KNN_hashed_binary = KNeighborsRegressor(n_neighbors=5, weights='distance', algorithm='auto',metric='jaccard')
KNN_hashed_binary.fit(fps_training.fps_hashed_binary,bio_training)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='jaccard',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='distance')

In [67]:
# MLP 
seed = 23
MLP_hashed_counts = MLPRegressor(hidden_layer_sizes=(50,25,10),activation='relu', solver='adam',learning_rate='adaptive', random_state = seed)
MLP_hashed_counts.fit(fps_training.fps_hashed_counts,bio_training)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 25, 10), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=23, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

#### A.4.2 Using unhashed fps

In [68]:
#RF
seed = 23
RF_UNhashed_counts = RandomForestRegressor(n_estimators=100,random_state=seed,n_jobs=2)
RF_UNhashed_counts.fit(fps_training.fps_unhashed_counts,bio_training)

RF_UNhashed_counts_with_sig_refer = RandomForestRegressor(n_estimators=100,random_state=seed,n_jobs=2)
RF_UNhashed_counts_with_sig_refer.fit(fps_training_with_sig_refer.fps_unhashed_counts,bio_training)

# RF binary 
seed = 23 
RF_UNhashed_binary = RandomForestRegressor(n_estimators=100,random_state=seed,n_jobs=2)
RF_UNhashed_binary.fit(fps_training.fps_unhashed_binary,bio_training)

RF_UNhashed_binary_with_sig_refer = RandomForestRegressor(n_estimators=100,random_state=seed,n_jobs=2)
RF_UNhashed_binary_with_sig_refer.fit(fps_training_with_sig_refer.fps_unhashed_binary,bio_training)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
           oob_score=False, random_state=23, verbose=0, warm_start=False)

In [69]:
# KNN
KNN_UNhashed_counts = KNeighborsRegressor(n_neighbors=5, weights='distance', algorithm='auto',metric='jaccard')
KNN_UNhashed_counts.fit(fps_training.fps_unhashed_counts,bio_training)

KNN_UNhashed_counts_with_sig_refer = KNeighborsRegressor(n_neighbors=5, weights='distance', algorithm='auto',metric='jaccard')
KNN_UNhashed_counts_with_sig_refer.fit(fps_training_with_sig_refer.fps_unhashed_counts,bio_training)

#KNN binary 
KNN_UNhashed_binary = KNeighborsRegressor(n_neighbors=5, weights='distance', algorithm='auto',metric='jaccard')
KNN_UNhashed_binary.fit(fps_training.fps_unhashed_binary,bio_training)

KNN_UNhashed_binary_with_sig_refer = KNeighborsRegressor(n_neighbors=5, weights='distance', algorithm='auto',metric='jaccard')
KNN_UNhashed_binary_with_sig_refer.fit(fps_training_with_sig_refer.fps_unhashed_binary,bio_training)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='jaccard',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='distance')

In [70]:
#MLP
seed = 23 
MLP_UNhashed_counts = MLPRegressor(hidden_layer_sizes=(50,25,10),activation='relu', solver='adam',learning_rate='adaptive', random_state = seed)
MLP_UNhashed_counts.fit(fps_training.fps_unhashed_counts,bio_training)                                

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 25, 10), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=23, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

## A.5  predict bioactivities for the test set

We define two functions for validating our predictions, namely: Pearson's correlation coefficient (R2) and root mean squared errro (RMSE)

In [71]:
def Rsquared(pred,true):
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(true,pred)
    return r_value**2

def RMSE(pred,true):
    rmse = np.sqrt(mean_squared_error(true,pred))
    return rmse

In [72]:
RF_preds_hashed = RF_hashed_counts.predict(fps_test.fps_hashed_counts)
MLP_preds_hashed = MLP_hashed_counts.predict(fps_test.fps_hashed_counts)
KNN_preds_hashed = KNN_hashed_counts.predict(fps_test.fps_hashed_counts)

RF_preds_hashed_bin = RF_hashed_binary.predict(fps_test.fps_hashed_binary)
KNN_preds_hashed_bin = KNN_hashed_binary.predict(fps_test.fps_hashed_binary)

In [73]:
RF_preds_UNhashed = RF_UNhashed_counts.predict(fps_test.fps_unhashed_counts)
KNN_preds_UNhashed = KNN_UNhashed_counts.predict(fps_test.fps_unhashed_counts)
MLP_preds_UNhashed = MLP_UNhashed_counts.predict(fps_test.fps_unhashed_counts)

RF_preds_UNhashed_bin = RF_UNhashed_binary.predict(fps_test.fps_unhashed_binary)
KNN_preds_UNhashed_bin = KNN_UNhashed_binary.predict(fps_test.fps_unhashed_binary)

RF_preds_UNhashed_with_sig_refer = RF_UNhashed_counts_with_sig_refer.predict(fps_test_with_sig_refer.fps_unhashed_counts)
KNN_preds_UNhashed_with_sig_refer = KNN_UNhashed_counts_with_sig_refer.predict(fps_test_with_sig_refer.fps_unhashed_counts)

RF_preds_UNhashed_with_sig_refer_bin = RF_UNhashed_binary_with_sig_refer.predict(fps_test_with_sig_refer.fps_unhashed_binary)
KNN_preds_UNhashed_with_sig_refer_bin = KNN_UNhashed_binary_with_sig_refer.predict(fps_test_with_sig_refer.fps_unhashed_binary)

## A.6 Model validation 

In [76]:
print "========= COUNTS ========= "
print "RMSE RF hashed counts fps: ", RMSE(RF_preds_hashed, bio_test)
print "RMSE KNN hashed counts fps: ", RMSE(KNN_preds_hashed, bio_test)
print "RMSE MLP hashed counts fps: ", RMSE(MLP_preds_hashed, bio_test)
print
print "RMSE RF UNhashed counts fps: ", RMSE(RF_preds_UNhashed, bio_test)
print "RMSE KNN UNhashed counts fps: ", RMSE(KNN_preds_UNhashed, bio_test)
print "RMSE MLP UNhashed counts fps: ", RMSE(MLP_preds_UNhashed, bio_test)
print "RMSE RF UNhashed counts fps with sig refer", RMSE(RF_preds_UNhashed_with_sig_refer,bio_test)
print "RMSE KNN UNhashed counts fps with sig refer", RMSE(KNN_preds_UNhashed_with_sig_refer,bio_test)
print
print "========= BINARY ========= "
print "RMSE RF hashed binary fps: ", RMSE(RF_preds_hashed_bin, bio_test)
print "RMSE KNN hashed binary fps: ", RMSE(KNN_preds_hashed_bin, bio_test)
print
print "RMSE RF UNhashed binary fps: ", RMSE(RF_preds_UNhashed_bin, bio_test)
print "RMSE KNN UNhashed binary fps: ", RMSE(KNN_preds_UNhashed_bin, bio_test)
print "RMSE RF UNhashed binary fps with sig refer", RMSE(RF_preds_UNhashed_with_sig_refer_bin,bio_test)
print "RMSE KNN UNhashed binary fps with sig refer", RMSE(KNN_preds_UNhashed_with_sig_refer_bin,bio_test)

RMSE RF hashed counts fps:  0.6349067825593026
RMSE KNN hashed counts fps:  0.5902037552372623
RMSE MLP hashed counts fps:  0.6775207323120627

RMSE RF UNhashed counts fps:  0.644878359553079
RMSE KNN UNhashed counts fps:  0.6843179083251608
RMSE MLP UNhashed counts fps:  0.9994362650390597
RMSE RF UNhashed counts fps with sig refer 0.8472336394014165
RMSE KNN UNhashed counts fps with sig refer 0.8744071498994512

RMSE RF hashed binary fps:  0.6864513272611216
RMSE KNN hashed binary fps:  0.6005301506526197

RMSE RF UNhashed binary fps:  0.6070642459961657
RMSE KNN UNhashed binary fps:  0.6736750332676493
RMSE RF UNhashed binary fps with sig refer 0.8456577374310799
RMSE KNN UNhashed binary fps with sig refer 0.8912418212730204


In [80]:
print "========= COUNTS ========= "
print "R2 RF hashed fps: ", Rsquared(RF_preds_hashed, bio_test)
print "R2 KNN hashed fps: ", Rsquared(KNN_preds_hashed, bio_test)
print "R2 MLP hashed fps: ", Rsquared(MLP_preds_hashed, bio_test)
print
print "R2 RF UNhashed fps: ", Rsquared(RF_preds_UNhashed, bio_test)
print "R2 KNN UNhashed fps: ", Rsquared(KNN_preds_UNhashed, bio_test)
print "R2 MLP UNhashed fps: ", Rsquared(MLP_preds_UNhashed, bio_test)
print "R2 RF Unhashed fps with sig refer", Rsquared(RF_preds_UNhashed_with_sig_refer, bio_test)
print "R2 KNN Unhashed fps with sig refer", Rsquared(KNN_preds_UNhashed_with_sig_refer, bio_test)
print
print "========= BINARY ========= "
print "R2 RF hashed binary fps: ", Rsquared(RF_preds_hashed_bin, bio_test)
print "R2 KNN hashed binary fps: ", Rsquared(KNN_preds_hashed_bin, bio_test)
print
print "R2 RF UNhashed binary fps: ", Rsquared(RF_preds_UNhashed_bin, bio_test)
print "R2 KNN UNhashed binary fps: ", Rsquared(KNN_preds_UNhashed_bin, bio_test)
print "R2 RF Unhashed bonary fps with sig refer", Rsquared(RF_preds_UNhashed_with_sig_refer_bin, bio_test)
print "R2 KNN Unhashed binary fps with sig refer", Rsquared(KNN_preds_UNhashed_with_sig_refer_bin, bio_test)

R2 RF hashed fps:  0.6544171312924008
R2 KNN hashed fps:  0.7831289073740177
R2 MLP hashed fps:  0.7049414537947494

R2 RF UNhashed fps:  0.6496775327858499
R2 KNN UNhashed fps:  0.6814441494073264
R2 MLP UNhashed fps:  0.43055998155483677
R2 RF Unhashed fps with sig refer 0.34041634598514803
R2 KNN Unhashed fps with sig refer 0.3530713140574211

R2 RF hashed binary fps:  0.5932456217916399
R2 KNN hashed binary fps:  0.7736173616322083

R2 RF UNhashed binary fps:  0.6895628319078321
R2 KNN UNhashed binary fps:  0.6888883921089402
R2 RF Unhashed bonary fps with sig refer 0.34065576463664526
R2 KNN Unhashed binary fps with sig refer 0.3411916317153259


### (optional) A.4.3 10 samplings of 5 fold cross validation 

In [None]:
### (optional) A.4.3 10 samplings of 5 fold cross validation 

########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ##########  

def Rsquared(pred,true):
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(true,pred)
    return r_value**2

def RMSE(pred,true):
    rmse = np.sqrt(mean_squared_error(true,pred))
    return rmse

########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ##########  

test_size = 0
x_train, x_test, y_train, y_test = train_test_split(fps_training.fps_unhashed_counts,bio_training,test_size = test_size, shuffle = True)

########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ########## ##########  

#fold setting
kf = RepeatedKFold(n_splits = 5,n_repeats=10)
kf.get_n_splits(x_train)
#kf
#>>> KFold(n_splits=5, random_state=None, shuffle=False)

#for numbering k folds
k=0
cv_rmse = []
cv_rmse_mean = []
tmp = []

#cross-val training
for train_idx, test_idx in kf.split(x_train):
    #print("iteration num : {}".format(k))
    #print("train : ", train_idx, "\ntest : ", test_idx)
    x_cv_train , x_cv_test = x_train[train_idx], x_train[test_idx]
    y_cv_train , y_cv_test = y_train[train_idx], y_train[test_idx]
    
    RF_UNhashed_counts.fit(x_cv_train,y_cv_train)
    
    y_cv_testidx = test_idx
    y_cv_pred = RF_UNhashed_counts.predict(x_cv_test)
    cv_rmse.append(round(RMSE(y_cv_pred,y_cv_test),4))
    cv_rmse_mean.append(round(RMSE(y_cv_pred,y_cv_test),4))
    k += 1 
    
    if k % 5 == 0 : 
        tmp.append(np.mean(cv_rmse_mean))
        print(cv_rmse_mean) # per fold rmse value in one cv 
        print("mean : ",np.mean(cv_rmse_mean)) # RMSE mean value per iteration of cv  
        del cv_rmse_mean[:]
        

print("total fold mean : ", round(np.mean(cv_rmse),4)) # mean of 50 cv rmse (1회 iter돌 때의 모든 rmse의 평균 (50회의 평균)) 
print("total cv mean : ",round(np.mean(tmp),4)) # mean of cv's mean of 1 iter (각 cv rmse 평균의 평균) # we need var of this value 