In [1]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import auc as calculate_auc
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from tqdm import tqdm

from sklearn.utils import shuffle 
from joblib import load, dump
import numpy as np
import pandas as pd
import os

In [2]:
from chembench import load_data, dataset
from molmap import feature

In [3]:
# Read the fingerprint bit-info object exposed by molmap's fingerprint
# Extraction and collect the distinct values of its `Subtypes` field.
# The loop further down iterates over `fp_types` one entry at a time.
bitsinfo = feature.fingerprint.Extraction().bitsinfo
fp_types = bitsinfo.Subtypes.unique()
fp_types  # bare expression: shown as the cell output

array(['MorganFP', 'RDkitFP', 'AtomPairFP', 'TorsionFP', 'AvalonFP',
       'EstateFP', 'MACCSFP', 'PharmacoErGFP', 'PharmacoPFP', 'PubChemFP',
       'MHFP6', 'MAP4'], dtype=object)

In [4]:
from scipy.stats import pearsonr  # public namespace instead of scipy.stats.stats
def r2(y_true, y_pred):
    """Return the squared Pearson correlation between targets and predictions.

    Note that this is the square of the Pearson coefficient, not the
    coefficient of determination.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values, given either as 1-D sequences of
        length n or as column vectors of shape (n, 1). Both are flattened
        before the correlation is computed.

    Returns
    -------
    float
        ``pearsonr(y_true, y_pred)[0] ** 2``.
    """
    # Flatten first so pearsonr always receives 1-D input and hands back a
    # scalar coefficient. Indexing the coefficient with [0] (as was done
    # before) only works when an array comes back and fails on a scalar.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    pcc, _ = pearsonr(y_true, y_pred)
    return pcc ** 2

def rmse(y_true, y_pred):
    """Root-mean-squared error: the square root of mean_squared_error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def PRC_AUC(y_true, y_score):
    """Area under the precision-recall curve for the given labels and scores."""
    prec, rec, _ = precision_recall_curve(y_true, y_score)
    # Recall is passed as the x coordinate, precision as the y coordinate.
    return calculate_auc(rec, prec)

def ROC_AUC(y_true, y_score):
    """Area under the ROC curve; thin wrapper around roc_auc_score."""
    return roc_auc_score(y_true, y_score)

In [5]:
# Load the benchmark datasets through chembench's `dataset` loaders.
# The cell output shows one "total samples" line per loader call.
esol = dataset.load_ESOL()
lipop = dataset.load_Lipop()
FreeSolv = dataset.load_FreeSolv()
PDBF = dataset.load_PDBF()  # loaded here but not part of `datasets` below

# Datasets that the KNN regression loop iterates over.
datasets = [esol, lipop, FreeSolv] #malaria

total samples: 1128
total samples: 4200
total samples: 642
total samples: 9880


In [6]:
# Location of the pre-computed feature arrays, filled with (task_name, fp_type).
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
X2_PATH_TEMPLATE = '/raid/shenwanxiang/10_FP_effect/tempignore/X2_%s_%s.data'

# Hyper-parameter grid for the KNN regressor; defined once, it never changes
# between datasets, fingerprints or splits.
n_neighbors_list = np.arange(1, 15, 2)
weights_list = ['uniform', 'distance']

# One record per (dataset, fingerprint type, split) with the test metrics.
performance = []

for data in datasets:
    task_name = data.task_name
    Y = data.y
    for fp_type in fp_types:
        print(task_name, fp_type)
        # NOTE(review): load_data takes only task_name, so it looks hoistable;
        # it is kept here to preserve the original call pattern and its output.
        df, induces = load_data(task_name)

        # joblib.load unpickles the file — only use it on trusted data.
        X2 = load(X2_PATH_TEMPLATE % (task_name, fp_type))
        # Flatten every sample into a single feature vector. Reading the shape
        # directly avoids summing the whole array just to learn its dimensions.
        # assumes a trailing axis (if any) has length 1 — TODO confirm
        n, w, c = X2.shape[:3]
        X2 = X2.reshape(n, w * c)

        for i, idx in enumerate(induces):
            train_idx, valid_idx, test_idx = idx

            X, y = X2[train_idx], Y[train_idx]
            X_valid, y_valid = X2[valid_idx], Y[valid_idx]
            X_test, y_test = X2[test_idx], Y[test_idx]

            # Grid search: fit on the training split and rate each candidate
            # with the estimator's own .score on the validation split.
            res = []
            for n_neighbors in tqdm(n_neighbors_list, ascii=True):
                for weights in weights_list:
                    clf = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
                    clf.fit(X, y)
                    score = clf.score(X_valid, y_valid)
                    res.append([n_neighbors, weights, score])

            dfr = pd.DataFrame(res, columns=['n_neighbors', 'weights', 'score'])
            # idxmax returns an index *label*, so the row is selected with .loc.
            best_row = dfr.loc[dfr['score'].idxmax()]
            # Build the parameters explicitly so n_neighbors is always a plain
            # int, independent of how pandas boxes a mixed-dtype row.
            best_params = {
                'n_neighbors': int(best_row['n_neighbors']),
                'weights': best_row['weights'],
            }

            # Refit the selected configuration on the training split.
            clf = KNeighborsRegressor(**best_params)
            clf.fit(X, y)

            # Predict once and reuse the predictions for both metrics.
            y_test_pred = clf.predict(X_test)
            results = {
                "task_name": task_name,
                'fp_type': fp_type,
                "split-time": i,
                "test_rmse": rmse(y_test, y_test_pred),
                "test_r2": r2(y_test, y_test_pred),
            }

            performance.append(results)

  0%|          | 0/14 [00:00<?, ?it/s]

ESOL MorganFP
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:08<00:00,  1.59it/s]
100%|##########| 14/14 [00:08<00:00,  1.59it/s]
100%|##########| 14/14 [00:08<00:00,  1.59it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

ESOL RDkitFP
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:08<00:00,  1.59it/s]
100%|##########| 14/14 [00:08<00:00,  1.59it/s]
100%|##########| 14/14 [00:08<00:00,  1.59it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

ESOL AtomPairFP
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:08<00:00,  1.59it/s]
100%|##########| 14/14 [00:08<00:00,  1.59it/s]
100%|##########| 14/14 [00:08<00:00,  1.59it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

ESOL TorsionFP
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:08<00:00,  1.71it/s]
100%|##########| 14/14 [00:08<00:00,  1.71it/s]
100%|##########| 14/14 [00:08<00:00,  1.71it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

ESOL AvalonFP
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:08<00:00,  1.60it/s]
100%|##########| 14/14 [00:08<00:00,  1.60it/s]
100%|##########| 14/14 [00:08<00:00,  1.62it/s]
 36%|###5      | 5/14 [00:00<00:00, 49.14it/s]

ESOL EstateFP
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:00<00:00, 48.14it/s]
100%|##########| 14/14 [00:00<00:00, 50.20it/s]
100%|##########| 14/14 [00:00<00:00, 49.75it/s]
 14%|#4        | 2/14 [00:00<00:00, 19.19it/s]

ESOL MACCSFP
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:00<00:00, 18.38it/s]
100%|##########| 14/14 [00:00<00:00, 18.46it/s]
100%|##########| 14/14 [00:00<00:00, 18.43it/s]
  7%|7         | 1/14 [00:00<00:01,  7.43it/s]

ESOL PharmacoErGFP
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:01<00:00,  7.30it/s]
100%|##########| 14/14 [00:01<00:00,  7.28it/s]
100%|##########| 14/14 [00:01<00:00,  7.30it/s]
 14%|#4        | 2/14 [00:00<00:00, 12.09it/s]

ESOL PharmacoPFP
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:01<00:00, 11.58it/s]
100%|##########| 14/14 [00:01<00:00, 11.49it/s]
100%|##########| 14/14 [00:01<00:00, 11.55it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

ESOL PubChemFP
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:03<00:00,  4.41it/s]
100%|##########| 14/14 [00:03<00:00,  4.44it/s]
100%|##########| 14/14 [00:03<00:00,  4.43it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

ESOL MHFP6
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:08<00:00,  1.59it/s]
100%|##########| 14/14 [00:08<00:00,  1.58it/s]
100%|##########| 14/14 [00:08<00:00,  1.59it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

ESOL MAP4
loading dataset: ESOL number of split times: 3


100%|##########| 14/14 [00:08<00:00,  1.59it/s]
100%|##########| 14/14 [00:08<00:00,  1.59it/s]
100%|##########| 14/14 [00:08<00:00,  1.59it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop MorganFP
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [01:53<00:00,  8.16s/it]
100%|##########| 14/14 [01:54<00:00,  8.22s/it]
100%|##########| 14/14 [01:53<00:00,  8.17s/it]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop RDkitFP
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [01:53<00:00,  8.12s/it]
100%|##########| 14/14 [01:53<00:00,  8.15s/it]
100%|##########| 14/14 [01:53<00:00,  8.12s/it]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop AtomPairFP
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [01:54<00:00,  8.20s/it]
100%|##########| 14/14 [01:54<00:00,  8.22s/it]
100%|##########| 14/14 [01:54<00:00,  8.24s/it]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop TorsionFP
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [01:46<00:00,  7.72s/it]
100%|##########| 14/14 [01:55<00:00,  8.44s/it]
100%|##########| 14/14 [02:09<00:00,  9.73s/it]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop AvalonFP
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [02:51<00:00, 12.90s/it]
100%|##########| 14/14 [02:59<00:00, 13.11s/it]
100%|##########| 14/14 [03:01<00:00, 12.94s/it]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop EstateFP
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [00:04<00:00,  3.00it/s]
100%|##########| 14/14 [00:04<00:00,  3.07it/s]
100%|##########| 14/14 [00:04<00:00,  3.11it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop MACCSFP
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [00:13<00:00,  1.02it/s]
100%|##########| 14/14 [00:13<00:00,  1.04it/s]
100%|##########| 14/14 [00:13<00:00,  1.03it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop PharmacoErGFP
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [00:36<00:00,  2.50s/it]
100%|##########| 14/14 [00:34<00:00,  2.42s/it]
100%|##########| 14/14 [00:32<00:00,  2.38s/it]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop PharmacoPFP
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [00:18<00:00,  1.40s/it]
100%|##########| 14/14 [00:18<00:00,  1.39s/it]
100%|##########| 14/14 [00:18<00:00,  1.40s/it]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop PubChemFP
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [00:54<00:00,  3.86s/it]
100%|##########| 14/14 [00:45<00:00,  3.16s/it]
100%|##########| 14/14 [00:44<00:00,  3.14s/it]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop MHFP6
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [01:55<00:00,  8.14s/it]
100%|##########| 14/14 [01:53<00:00,  8.14s/it]
100%|##########| 14/14 [01:54<00:00,  8.15s/it]
  0%|          | 0/14 [00:00<?, ?it/s]

Lipop MAP4
loading dataset: Lipop number of split times: 3


100%|##########| 14/14 [01:53<00:00,  8.08s/it]
100%|##########| 14/14 [01:53<00:00,  8.10s/it]
100%|##########| 14/14 [01:57<00:00,  8.83s/it]
  0%|          | 0/14 [00:00<?, ?it/s]

FreeSolv MorganFP
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:04<00:00,  3.45it/s]
100%|##########| 14/14 [00:03<00:00,  3.54it/s]
100%|##########| 14/14 [00:04<00:00,  3.45it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

FreeSolv RDkitFP
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:04<00:00,  3.40it/s]
100%|##########| 14/14 [00:04<00:00,  3.31it/s]
100%|##########| 14/14 [00:04<00:00,  3.38it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

FreeSolv AtomPairFP
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:04<00:00,  3.40it/s]
100%|##########| 14/14 [00:04<00:00,  3.36it/s]
100%|##########| 14/14 [00:04<00:00,  3.44it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

FreeSolv TorsionFP
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:03<00:00,  3.55it/s]
100%|##########| 14/14 [00:04<00:00,  3.68it/s]
100%|##########| 14/14 [00:04<00:00,  3.51it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

FreeSolv AvalonFP
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:04<00:00,  3.34it/s]
100%|##########| 14/14 [00:04<00:00,  3.36it/s]
100%|##########| 14/14 [00:04<00:00,  3.38it/s]
100%|##########| 14/14 [00:00<00:00, 83.21it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

FreeSolv EstateFP
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:00<00:00, 95.45it/s]
100%|##########| 14/14 [00:00<00:00, 84.44it/s]
 36%|###5      | 5/14 [00:00<00:00, 39.85it/s]

FreeSolv MACCSFP
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:00<00:00, 38.17it/s]
100%|##########| 14/14 [00:00<00:00, 36.53it/s]
100%|##########| 14/14 [00:00<00:00, 36.34it/s]
 14%|#4        | 2/14 [00:00<00:00, 16.03it/s]

FreeSolv PharmacoErGFP
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:00<00:00, 15.20it/s]
100%|##########| 14/14 [00:00<00:00, 15.20it/s]
100%|##########| 14/14 [00:00<00:00, 15.19it/s]
 21%|##1       | 3/14 [00:00<00:00, 23.17it/s]

FreeSolv PharmacoPFP
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:00<00:00, 21.82it/s]
100%|##########| 14/14 [00:00<00:00, 22.53it/s]
100%|##########| 14/14 [00:00<00:00, 22.72it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

FreeSolv PubChemFP
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:01<00:00,  8.72it/s]
100%|##########| 14/14 [00:01<00:00,  8.49it/s]
100%|##########| 14/14 [00:01<00:00,  8.64it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

FreeSolv MHFP6
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:04<00:00,  3.12it/s]
100%|##########| 14/14 [00:04<00:00,  3.32it/s]
100%|##########| 14/14 [00:04<00:00,  3.24it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

FreeSolv MAP4
loading dataset: FreeSolv number of split times: 3


100%|##########| 14/14 [00:04<00:00,  3.25it/s]
100%|##########| 14/14 [00:04<00:00,  3.23it/s]
100%|##########| 14/14 [00:04<00:00,  3.20it/s]


In [8]:
# Persist the collected per-split results as a CSV file next to the notebook.
performance_table = pd.DataFrame(performance)
performance_table.to_csv('./knn_regression.csv')

In [16]:
# Mean test_r2 for every (task_name, fp_type) pair, averaged over the splits.
# The built-in group mean replaces the per-group lambda.
(
    pd.DataFrame(performance)
    .groupby(['task_name', 'fp_type'])['test_r2']
    .mean()
)

task_name  fp_type      
ESOL       AtomPairFP       0.723782
           AvalonFP         0.703896
           EstateFP         0.518462
           MACCSFP          0.703028
           MAP4             0.570213
           MHFP6            0.642149
           MorganFP         0.524776
           PharmacoErGFP    0.560204
           PharmacoPFP      0.695061
           PubChemFP        0.730567
           RDkitFP          0.676306
           TorsionFP        0.508625
FreeSolv   AtomPairFP       0.684389
           AvalonFP         0.805264
           EstateFP         0.781209
           MACCSFP          0.818480
           MAP4             0.352572
           MHFP6            0.745899
           MorganFP         0.501347
           PharmacoErGFP    0.767449
           PharmacoPFP      0.608869
           PubChemFP        0.806931
           RDkitFP          0.761272
           TorsionFP        0.398211
Lipop      AtomPairFP       0.436795
           AvalonFP         0.390984
           Es