In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from baggingPU import BaggingClassifierPU
from sklearn.tree import DecisionTreeClassifier
from utils import Spinner
import sys
import pickle
import random
from scipy.sparse import hstack

# dataframes
df_ligands = pd.read_csv("../ligands.csv", index_col="id", usecols=["id", "SMILES"])
df_sequences = pd.read_csv("../sequences.csv", index_col=0)
df_binding = pd.read_csv("../lig2seq.csv")

spinner=Spinner()
# hyperparams
min_f_count = 4
ngram_max = 3 # put this >8 when you have more resources
max_features = 1000

lig_num = 100
sample_size = 1000
estimators = 100


In [9]:
tfidf_sequence = TfidfVectorizer(
    lowercase=False,
    analyzer='char',
    stop_words=None,
    ngram_range=(1,ngram_max),
    min_df=min_f_count,
    max_features=max_features
)

tfidf_ligand = TfidfVectorizer(
    lowercase=False,
    analyzer='char',
    stop_words=None,
    ngram_range=(1,ngram_max),
    min_df=min_f_count,
    max_features=max_features
)

spinner.start()
tfidf_sequence.fit(df_sequences["sequence"].values)
print("sequences trained tfidf success")
sys.stdout.write('\r')
tfidf_ligand.fit(df_ligands["SMILES"].values)
print("ligands trained tfidf success")
spinner.stop()

.............................................................................................................................................................................................................|.............................................................................................................................................................................................................................................

In [10]:
def train_test(p: int=10000, q: int=2000):
    data = pd.DataFrame(columns=["lig_id", "SMILES", "seq_id", "sequence", "binding"])
    df_binding_temp = df_binding.sample(1000)
    data["lig_id"] = df_binding_temp["lig"].values
    data["seq_id"] = df_binding_temp["seq"].values

    while data.shape[0] < q:
        d_temp = pd.DataFrame(columns=["lig_id", "SMILES", "seq_id", "sequence", "binding"])
        df_binding_temp = df_binding.sample(q-data.shape[0])
        d_temp["lig_id"] = df_binding_temp["lig"].values
        d_temp["seq_id"] = df_binding_temp["seq"].values
        data = data.append(d_temp, ignore_index=True).drop_duplicates()
    print(data.shape)

    data2 = data.loc[1000:,:]
    data = data.loc[:999,:]
    print(data.shape)

    i = 0
    while data.shape[0] < p:
        print(i, data.shape)
        d_temp = pd.DataFrame(columns=["lig_id", "SMILES", "seq_id", "sequence", "binding"])
        lig_ids = np.random.choice(df_ligands.index.values, p-data.shape[0], replace=True)
        seq_ids = np.random.choice(df_sequences.index.values, p-data.shape[0], replace=True)
        d_temp["lig_id"] = lig_ids
        d_temp["seq_id"] = seq_ids
        data = data.append(d_temp, ignore_index=True).drop_duplicates()
        i += 1
    print(i, data.shape)

    data.loc[:,"binding"] = 0
    data.loc[:749,"binding"] = 1
    
    data["SMILES"] = data["lig_id"].apply(lambda x: df_ligands.loc[df_ligands.index==x, "SMILES"].values[0])
    data["sequence"] = data["seq_id"].apply(lambda x: df_sequences.loc[df_sequences.index==x, "sequence"].values[0])

    data2["SMILES"] = data2["lig_id"].apply(lambda x: df_ligands.loc[df_ligands.index==x, "SMILES"].values[0])
    data2["sequence"] = data2["seq_id"].apply(lambda x: df_sequences.loc[df_sequences.index==x, "sequence"].values[0])

    return data, data2

In [11]:
train, test = train_test()

X_a = tfidf_ligand.transform(train["SMILES"].values).toarray()
X_b = tfidf_sequence.transform(train["sequence"].values).toarray()

X_test_a = tfidf_ligand.transform(test["SMILES"].values).toarray()
X_test_b = tfidf_sequence.transform(test["sequence"].values).toarray()

X = np.concatenate((X_a, X_b), axis=1)
X_test = np.concatenate((X_test_a, X_test_b), axis=1)
y = train["binding"].values

(2000, 5)
(1000, 5)
0 (1000, 5)
1 (10000, 5)


In [12]:
bc = BaggingClassifierPU(DecisionTreeClassifier(), n_estimators=estimators, n_jobs=3, max_samples=sum(y), verbose=1)
spinner.start()
bc.fit(X, y)
spinner.stop()

...|

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................-

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  1.5min remaining:  1.5min


................-

[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.6min finished


.................................................................................................................................................................................................................................................................................................................................................................................................................

  predictions.sum(axis=1)[:, np.newaxis])


In [21]:
%%time

def predict(ligid: int, seqid: int) -> float: 
    lig_vec = tfidf_sequence.transform([df_sequences.iloc[5].sequence])
    
    return bc.predict_proba(
        tfidf_ligand.transform(
            df_ligands.SMILES), 
        tfidf_sequence.transform(
            df_sequences.sequence))

predict(0,1)

TypeError: predict_proba() takes 2 positional arguments but 3 were given

In [62]:
# tfidf_ligand.transform(df_ligands.iloc[0].SMILES)

# tfidf_ligands.transform(df_ligands.SMILES)

#tfidf_sequence.transform(df_sequences.sequence)

#help(bc.predict_proba)
xx = hstack([tfidf_sequence.transform([df_sequences.iloc[5].sequence]), 
               tfidf_ligand.transform([df_ligands.iloc[5].SMILES])], 
              ).toarray()

bc.predict_proba(xx)[0][0]

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


0.49

In [66]:
def predict(ligid: int, seqid: int) -> float: 
    xx = hstack([tfidf_sequence.transform([df_sequences.iloc[seqid].sequence]), 
               tfidf_ligand.transform([df_ligands.iloc[ligid].SMILES])], 
              ).toarray()

    return bc.predict_proba(xx)[0][0]

predict(54325,5425)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


0.42

In [68]:

with open('predict_PROTOTYPE.pickle', 'wb') as pPp: 
    pickle.dump(predict, pPp)
    
!wc -c predict_PROTOTYPE.pickle

23 predict_PROTOTYPE.pickle
