In [62]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from utils.baggingPU import BaggingClassifierPU
from sklearn.tree import DecisionTreeClassifier

In [63]:
# Load the three input tables (paths are relative to the notebook's directory).
# Ligands: only the "id" and "SMILES" columns are read; "id" becomes the index.
df_ligands = pd.read_csv(
    "../data/ligands.csv",
    usecols=["id", "SMILES"],
    index_col="id",
)
# Sequences: the first CSV column becomes the index.
df_sequences = pd.read_csv(
    "../data/sequences.csv",
    index_col=0,
)
# Ligand/sequence pairs; the "lig" and "seq" columns are read further down.
df_binding = pd.read_csv("../data/lig2seq.csv")

In [64]:
# Settings shared by the two TF-IDF vectorizers fitted in the following cells.
# Passed as min_df.
min_f_count = 2
# Upper bound of ngram_range; the lower bound is fixed at 1.
ngram_max = 8
# Passed as max_features to each vectorizer.
max_features = 1000

In [65]:
# Character n-gram TF-IDF vectorizer for the "sequence" column.
# Case is preserved (lowercase=False) and no stop-word list is applied.
sequence_tfidf_options = dict(
    analyzer='char',
    ngram_range=(1, ngram_max),
    min_df=min_f_count,
    max_features=max_features,
    lowercase=False,
    stop_words=None,
)
tfidf_sequence = TfidfVectorizer(**sequence_tfidf_options)
# Learn vocabulary and IDF weights from every sequence in the table.
tfidf_sequence.fit(df_sequences["sequence"].values)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=1000, min_df=2,
        ngram_range=(1, 8), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [66]:
# Character n-gram TF-IDF vectorizer for the "SMILES" column.
# Case is preserved (lowercase=False) and no stop-word list is applied.
ligand_tfidf_options = dict(
    analyzer='char',
    ngram_range=(1, ngram_max),
    min_df=min_f_count,
    max_features=max_features,
    lowercase=False,
    stop_words=None,
)
tfidf_ligand = TfidfVectorizer(**ligand_tfidf_options)
# Learn vocabulary and IDF weights from every SMILES string in the table.
tfidf_ligand.fit(df_ligands["SMILES"].values)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=1000, min_df=2,
        ngram_range=(1, 8), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [67]:
# Experiment settings.
# NOTE(review): lig_num and sample_size are not referenced by any cell visible
# here - confirm they are still needed.
lig_num = 100
sample_size = 1000
# Number of base estimators in the bagging ensemble (passed as n_estimators).
estimators = 100

In [68]:
# Assemble the positive-unlabeled (PU) data set.
#
#   data  - 10000 unique (lig_id, seq_id) rows. Rows 0-999 are pairs sampled from
#           df_binding; the remaining rows are random ligand/sequence pairings.
#           Only rows 0-749 are labelled binding=1, every other row is labelled 0.
#   data2 - 1000 further pairs sampled from df_binding (no overlap with rows
#           0-999 of data), set aside for evaluation.
#
# NOTE(review): random pairings are de-duplicated against `data` only, so one of
# them may coincide with a pair that is held out in data2.
pu_columns = ["lig_id", "SMILES", "seq_id", "sequence", "binding"]
n_known = 1000     # df_binding pairs kept in `data`
n_holdout = 1000   # df_binding pairs kept in `data2`
n_labeled = 750    # leading rows of `data` that receive binding=1
n_rows = 10000     # final number of rows in `data`

# Seeded generator so that re-running the cell rebuilds the same frames.
pu_rng = np.random.RandomState(0)


def _id_frame(lig_ids, seq_ids):
    """Return a frame with the five data columns, filled only with the given ids."""
    return pd.DataFrame({"lig_id": lig_ids, "seq_id": seq_ids}).reindex(columns=pu_columns)


# Fail fast instead of looping forever when there are not enough unique pairs.
n_unique_known = df_binding[["lig", "seq"]].drop_duplicates().shape[0]
if n_unique_known < n_known + n_holdout:
    raise ValueError(
        "df_binding has only {} unique (lig, seq) pairs; {} are required".format(
            n_unique_known, n_known + n_holdout
        )
    )
max_unique_rows = n_known + df_ligands.index.nunique() * df_sequences.index.nunique()
if max_unique_rows < n_rows:
    raise ValueError(
        "at most {} unique rows can be generated; {} are required".format(
            max_unique_rows, n_rows
        )
    )

df_binding_temp = df_binding.sample(n_known, random_state=pu_rng)
data = _id_frame(df_binding_temp["lig"].values, df_binding_temp["seq"].values)

# Top up with further df_binding samples until 2000 distinct pairs are collected.
while data.shape[0] < n_known + n_holdout:
    df_binding_temp = df_binding.sample(
        n_known + n_holdout - data.shape[0], random_state=pu_rng
    )
    d_temp = _id_frame(df_binding_temp["lig"].values, df_binding_temp["seq"].values)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data = (
        pd.concat([data, d_temp], ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )
print(data.shape)

# Copies, so that later column assignments do not write into a slice.
data2 = data.iloc[n_known:, :].copy()
data = data.iloc[:n_known, :].copy()
print(data.shape)

# Pad `data` with random ligand/sequence pairings. drop_duplicates keeps the
# first occurrence, so the df_binding rows stay at positions 0-999.
i = 0
while data.shape[0] < n_rows:
    print(i, data.shape)
    n_missing = n_rows - data.shape[0]
    lig_ids = pu_rng.choice(df_ligands.index.values, n_missing, replace=True)
    seq_ids = pu_rng.choice(df_sequences.index.values, n_missing, replace=True)
    d_temp = _id_frame(lig_ids, seq_ids)
    data = (
        pd.concat([data, d_temp], ignore_index=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    i += 1
print(i, data.shape)

# Integer labels: 1 for the first 750 rows, 0 for everything else (the other
# 250 df_binding rows are deliberately left unlabelled).
data["binding"] = (np.arange(data.shape[0]) < n_labeled).astype(int)

(2000, 5)
(1000, 5)
0 (1000, 5)
1 (10000, 5)


In [69]:
def _lookup_by_id(ids, table, column):
    """Return ``table[column]`` for every id in ``ids``, aligned to ``ids``.

    When an id occurs several times in ``table.index`` the first row is used,
    matching the previous ``.values[0]`` lookup.

    Raises:
        KeyError: if any id is absent from ``table.index``.
    """
    values = table[column]
    values = values[~values.index.duplicated(keep="first")]
    known = ids.isin(values.index)
    if not known.all():
        raise KeyError(
            "{} id(s) not found in lookup table, e.g. {!r}".format(
                int((~known).sum()), ids[~known].iloc[0]
            )
        )
    # One index-aligned lookup instead of a boolean-mask scan per row.
    return ids.map(values)


# Attach the raw strings to both frames. assign() returns new frames, so
# nothing is written into a slice of another DataFrame.
data = data.assign(
    SMILES=_lookup_by_id(data["lig_id"], df_ligands, "SMILES"),
    sequence=_lookup_by_id(data["seq_id"], df_sequences, "sequence"),
)
data2 = data2.assign(
    SMILES=_lookup_by_id(data2["lig_id"], df_ligands, "SMILES"),
    sequence=_lookup_by_id(data2["seq_id"], df_sequences, "sequence"),
)

# Dense TF-IDF blocks: *_a from the ligand vectorizer, *_b from the sequence one.
X_a = tfidf_ligand.transform(data["SMILES"].values).toarray()
X_b = tfidf_sequence.transform(data["sequence"].values).toarray()

X_test_a = tfidf_ligand.transform(data2["SMILES"].values).toarray()
X_test_b = tfidf_sequence.transform(data2["sequence"].values).toarray()

# Ligand columns first, sequence columns second.
X = np.concatenate((X_a, X_b), axis=1)
X_test = np.concatenate((X_test_a, X_test_b), axis=1)
# Force an integer label array; the column can be object dtype upstream.
y = data["binding"].values.astype(int)

In [70]:
# PU bagging over decision trees; max_samples is set to the number of rows
# labelled 1 in y.
n_labeled_positive = sum(y)
bc = BaggingClassifierPU(
    DecisionTreeClassifier(),
    n_estimators=estimators,
    max_samples=n_labeled_positive,
    n_jobs=-1,
    verbose=1,
)
bc.fit(X, y)

[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:   10.3s remaining:  1.2min
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:   13.2s finished


BaggingClassifierPU(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          bootstrap=True, bootstrap_features=False, max_features=1.0,
          max_samples=750, n_estimators=100, n_jobs=-1, oob_score=True,
          random_state=None, verbose=1, warm_start=False)

In [71]:
# X_test is built from data2, whose rows were all sampled from df_binding.
# First the mean predicted label, then the mean class-1 probability.
holdout_labels = bc.predict(X_test)
print(holdout_labels.mean())
holdout_scores = bc.predict_proba(X_test)[:, 1]
print(holdout_scores.mean())

[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.2s remaining:    1.4s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.6s finished


0.944


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.2s remaining:    1.4s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.6s finished


0.87595


In [72]:
# Collect feature importances: one row per fitted tree, one column per feature of X.
importance_rows = []
for fitted_tree in bc.estimators_:
    importance_rows.append(fitted_tree.feature_importances_)
fi_df = pd.DataFrame(importance_rows)
fi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.010519,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.004571,0.000000,0.00237,0.000000,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.004839,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.018523,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0
5,0.000000,0.000000,0.000000,0.000000,0.044986,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0
6,0.000000,0.000000,0.000000,0.002000,0.000000,0.000000,0.000000,0.0,0.002222,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0
9,0.000000,0.000000,0.010112,0.005035,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.002519,0.00000,0.000000,0.0,0.0


In [73]:
# Per-feature importance averaged over all trees (despite the "_sum" name this
# holds the mean).
fi_df_sum = fi_df.mean(axis=0)
# Column positions of X ordered from most to least important.
ranked_importances = fi_df_sum.sort_values(ascending=False)
fi_list = list(ranked_importances.index)

In [74]:
# Refit the ensemble on the top-i features (ranked by fi_list) and print, for
# X_test: i, the mean predicted label and the mean class-1 probability.
#
# The sweep is derived from the actual width of X instead of a hard-coded 2000,
# so a smaller vocabulary no longer causes repeated fits on identical columns.
# Note that `bc` is refit in place: after the loop it holds the model trained on
# the last subset, with columns ordered as in fi_list.
sweep_step = 100
n_features_total = X.shape[1]
feature_counts = list(range(sweep_step, n_features_total + 1, sweep_step))
if not feature_counts or feature_counts[-1] != n_features_total:
    # Always finish with the full feature set.
    feature_counts.append(n_features_total)

for i in feature_counts:
    top_columns = fi_list[:i]
    # Slice once per iteration; fancy indexing copies the selected columns.
    X_top = X[:, top_columns]
    X_test_top = X_test[:, top_columns]
    bc.fit(X_top, y)
    print(i, bc.predict(X_test_top).mean(), bc.predict_proba(X_test_top)[:, 1].mean())

[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.7s remaining:    4.7s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.8s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished


100 0.934 0.8909300000000001


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    1.2s remaining:    8.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    1.5s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished


200 0.937 0.8933900000000001


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    1.7s remaining:   11.9s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    2.1s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished


300 0.943 0.8853300000000001


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    2.2s remaining:   15.2s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    2.8s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished


400 0.937 0.8847200000000001


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    2.8s remaining:   19.4s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    3.7s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished


500 0.946 0.88151


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    3.4s remaining:   23.8s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    4.3s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.2s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.2s finished


600 0.941 0.8791400000000001


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    3.8s remaining:   26.4s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    5.0s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.2s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.2s finished


700 0.941 0.87698


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    4.3s remaining:   30.2s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    5.5s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.4s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.2s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.4s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.2s finished


800 0.943 0.8758800000000001


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    4.8s remaining:   33.8s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    6.0s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.4s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.2s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.4s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.2s finished


900 0.941 0.8788599999999999


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    5.4s remaining:   37.6s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    6.9s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.3s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.3s finished


1000 0.941 0.87757


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    5.8s remaining:   40.9s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    7.0s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.3s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.3s finished


1100 0.944 0.87346


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    5.9s remaining:   41.0s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    7.6s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.3s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.3s finished


1200 0.947 0.87929


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    6.2s remaining:   43.7s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    8.3s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.6s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.3s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.6s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.3s finished


1300 0.944 0.87685


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    6.4s remaining:   45.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    8.8s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.6s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.4s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.6s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.4s finished


1400 0.944 0.87337


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    7.0s remaining:   49.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    9.4s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.7s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.4s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.7s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.4s finished


1500 0.945 0.87346


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    7.0s remaining:   48.8s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    9.4s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.7s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.4s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.7s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.4s finished


1600 0.944 0.87254


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    7.7s remaining:   54.2s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    9.9s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.7s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.4s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.7s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.4s finished


1700 0.944 0.87021


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    7.9s remaining:   55.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:   10.7s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.8s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.5s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.8s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.5s finished


1800 0.946 0.8728199999999999


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    8.3s remaining:   58.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:   10.8s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.8s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.5s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.9s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.5s finished


1900 0.944 0.8719


[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    8.5s remaining:   59.2s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:   11.8s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.9s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.5s finished
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.1s remaining:    0.9s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.5s finished


2000 0.945 0.87461
