In [63]:
import itertools
import pickle

import dill
import ipywidgets as widgets
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier

from baggingPU import BaggingClassifierPU

In [64]:
# Default hyperparameters used when the pipeline object is first constructed.
# Note: the TF-IDF min_df / max_features values are overridden later through
# pipeline.set_params(...) before the model is fitted.

# min_df passed to both TfidfVectorizer steps.
min_f_count = 2
# Upper bound of the character n-gram range, i.e. ngram_range=(1, ngram_max).
ngram_max = 8
# max_features passed to both TfidfVectorizer steps.
max_features = 500
# n_estimators passed to BaggingClassifierPU.
estimators = 100

In [65]:
class SelectColumn:
    """Pick a single column out of a table of samples.

    Instances are meant to be wrapped in a ``FunctionTransformer`` (via the
    bound ``fit`` method) so each branch of a ``FeatureUnion`` sees only the
    column it should vectorize.

    Parameters
    ----------
    col : int
        Positional index of the column to extract.
    """

    def __init__(self, col):
        self.col = col

    def fit(self, X):
        """Return column ``self.col`` of ``X``.

        Despite the name, nothing is learned here: this is a stateless
        selection. The name is kept because callers pass ``instance.fit``
        as the transformer function.

        Parameters
        ----------
        X : array-like
            Either a 2-D table of shape (n_samples, n_columns) or a single
            1-D sample of shape (n_columns,). Nested lists are accepted.

        Returns
        -------
        numpy.ndarray
            1-D array with one entry per sample.
        """
        X = np.asarray(X)
        if X.ndim == 1:
            # A lone sample (e.g. X_test[0]) is treated as a one-row table so
            # that single-sample prediction does not raise an IndexError.
            X = X[np.newaxis, :]
        return X[:, self.col]
    
# Column selectors for the two text inputs: column 0 feeds the ligand branch,
# column 1 feeds the sequence branch.
select_lig = SelectColumn(0)
select_seq = SelectColumn(1)

# Both branches use the same character n-gram TF-IDF configuration.
tfidf_settings = dict(
    lowercase=False,
    analyzer='char',
    stop_words=None,
    ngram_range=(1, ngram_max),
    min_df=min_f_count,
    max_features=max_features,
)

ligand_branch = Pipeline([
    ('select_ligand', FunctionTransformer(select_lig.fit, validate=False)),
    ('tfidf_ligand', TfidfVectorizer(**tfidf_settings)),
])

sequence_branch = Pipeline([
    ('select_sequence', FunctionTransformer(select_seq.fit, validate=False)),
    ('tfidf_sequence', TfidfVectorizer(**tfidf_settings)),
])

# Concatenate the ligand and sequence TF-IDF features side by side.
feature_union = FeatureUnion([
    ('preprocess_ligands', ligand_branch),
    ('preprocess_sequences', sequence_branch),
])

# Bagged decision trees from the baggingPU module
# (presumably a positive-unlabeled learner -- confirm against that module).
pu_classifier = BaggingClassifierPU(
    DecisionTreeClassifier(),
    n_estimators=estimators,
    n_jobs=-1,
    max_samples=75,
    verbose=0,
)

pipeline = Pipeline([
    ('features', feature_union),
    ('PUC', pu_classifier),
])


In [66]:
# Ligand table: indexed by "id", keeping only the SMILES string column.
df_ligands = pd.read_csv(
    "../data/ligands.csv",
    usecols=["id", "SMILES"],
    index_col="id",
)

# Sequence table: indexed by its first column.
df_sequences = pd.read_csv(
    "../data/sequences.csv",
    index_col=0,
)

# Ligand-to-sequence pairs (the "lig" and "seq" columns are used downstream).
df_binding = pd.read_csv("../data/lig2seq.csv")

In [67]:
# Build the training frame (`data`) and a held-out frame of known pairs (`data2`).
#
# * 2000 known (ligand, sequence) pairs are sampled from df_binding.
# * The first 1000 go into `data`; the remaining 1000 become `data2`.
# * `data` is padded with randomly drawn (ligand, sequence) pairs until it
#   holds 10000 unique pairs.
# * Only the first 750 rows of `data` are labelled 1; every other row is 0.

columns = ["lig_id", "SMILES", "seq_id", "sequence", "binding"]
pair_columns = ["lig_id", "seq_id"]

n_known = 2000          # known pairs sampled from df_binding
n_train_known = 1000    # known pairs placed in the training frame
n_train_total = 10000   # size of the training frame after padding
n_labelled = 750        # leading training rows labelled as binding (1)

# One seeded generator drives every random draw in this cell so that the
# sampled data is the same on each run. (The classifier itself is not seeded
# here, so fitted scores can still vary between runs.)
rng = np.random.RandomState(0)

df_binding_temp = df_binding.sample(n_known, random_state=rng)
known_pairs = pd.DataFrame({
    "lig_id": df_binding_temp["lig"].values,
    "seq_id": df_binding_temp["seq"].values,
})

# .copy() so that the column assignments below do not write into a slice of
# another frame (avoids SettingWithCopyWarning / silent no-ops).
data2 = known_pairs.iloc[n_train_known:].copy()
data = known_pairs.iloc[:n_train_known].copy()

# Pad with random pairs; duplicates are dropped, so keep drawing until full.
while data.shape[0] < n_train_total:
    n_missing = n_train_total - data.shape[0]
    d_temp = pd.DataFrame({
        "lig_id": rng.choice(df_ligands.index.values, n_missing, replace=True),
        "seq_id": rng.choice(df_sequences.index.values, n_missing, replace=True),
    })
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data = (
        pd.concat([data, d_temp], ignore_index=True)
        .drop_duplicates(subset=pair_columns)
    )

# drop_duplicates can leave gaps in the index; make it 0..n-1 again so that
# the label-based slice used for labelling covers exactly the leading rows.
data = data.reset_index(drop=True)

# Look up the raw strings for every pair (vectorized; raises KeyError on an
# unknown id, as the original per-row lookup did).
data["SMILES"] = df_ligands.loc[data["lig_id"].values, "SMILES"].values
data["sequence"] = df_sequences.loc[data["seq_id"].values, "sequence"].values

data2["SMILES"] = df_ligands.loc[data2["lig_id"].values, "SMILES"].values
data2["sequence"] = df_sequences.loc[data2["seq_id"].values, "sequence"].values

data["binding"] = 0
data.loc[:n_labelled - 1, "binding"] = 1

# Restore the original column layout; `binding` stays NaN for the held-out frame.
data = data[columns]
data2 = data2.reindex(columns=columns)

# Model inputs are two string columns in this order: SMILES, sequence.
X = data.drop(columns=["seq_id", "lig_id", "binding"]).values
X_test = data2.drop(columns=["seq_id", "lig_id", "binding"]).values
y = data["binding"]

In [68]:
# Override the TF-IDF vocabulary settings and the ensemble size, then fit.
params = dict(
    features__preprocess_ligands__tfidf_ligand__min_df=16,
    features__preprocess_ligands__tfidf_ligand__max_features=1000,
    features__preprocess_sequences__tfidf_sequence__min_df=16,
    features__preprocess_sequences__tfidf_sequence__max_features=1000,
    PUC__n_estimators=100,
)
pipeline.set_params(**params).fit(X, y)

# Mean predicted label over the held-out pairs.
pipeline.predict(X_test).mean()

0.996

In [69]:
# Mean predicted probability of class 1 (second probability column) over X_test.
positive_proba = pipeline.predict_proba(X_test)[:, 1]
positive_proba.mean()

0.95008

In [70]:
# Persist the fitted pipeline to disk with dill.
# (dill is imported in the notebook's import cell; presumably it is used
# instead of pickle because the pipeline holds bound methods of a class
# defined in this notebook -- confirm.)
with open("../models/predict.pk", "wb") as fp:
    dill.dump(pipeline, fp)

In [71]:
# Round-trip check: reload the saved pipeline and score the held-out pairs again.
# NOTE: dill.load can execute arbitrary code -- only load files you trust.
with open("../models/predict.pk", "rb") as fp:
    predict = dill.load(fp)

# The file is no longer needed once the object is loaded.
print(predict.predict_proba(X_test)[:,1].mean())

0.95008


In [74]:
# Inspect the first held-out sample: a 1-D object array holding the
# SMILES string followed by the sequence string.
X_test[0]

array(['CC1CCC(CC1)CN2C3=C(N=C(N=C3N[C@H](C)C4CCC4)C5NC(=O)ON5)N=C2NC(C)C6=CSC(=N6)C',
       'SNASGQQVHRLLGNKLELASTGQTIYHQDINLNNHPWIGDHRVYDTPVIPGVSYIAMTLAAVGVPAAVEDINFQQPLFLAESNTTRETQLMLHTADNVGKQFVEVFSRDGAKQEEWQQHASMSVSENPPPPPTLSVDIPALCEQLRPLDTDTLTEIYASISLVYGPMLQAVRQAWIGEETSLLEIEVPKALAFQLAGEPIHPVLIDACTRLTPDLFDFSSDSGVFWAPWRVKEMTLSHPTPSRFYAYVEEPSRVNEQLQTRSYDIQLLDETGQAFGRINGFTVKRAPSQLFLK'],
      dtype=object)

In [75]:
# Score only the first held-out pair. Indexing with [[0], :] keeps the row
# two-dimensional, which the column selectors in the pipeline index as X[:, col].
first_pair = X_test[[0], :]
pipeline.predict_proba(first_pair)[:, 1].mean()

0.99