In [None]:
#@title
'''make vectorizer from sequences csv'''
# from pandarallel import pandarallel
# pandarallel.initialize()
from sklearn.feature_extraction.text import TfidfVectorizer  # type: ignore
from sklearn.tree import DecisionTreeClassifier # type: ignore
import pandas as pd  # type: ignore
import pickle
import numpy as np  # type: ignore
import argparse
from subprocess import call
from typing import Tuple, List
from functools import reduce

from baggingPU import BaggingClassifierPU
from tfidf import seq_vectorizer, raw_prefix

raw_prefix = 'https://raw.githubusercontent.com/prescriptive-possibilities-april-15-19/mocking/master/'

seed = 42
np.random.seed(seed)
lig_num = 1000
sample_size = 200
estimators = 100
s_ = sample_size//2 - 2

downsample_binding=230000
downsample_sequences=20000

tfidf = seq_vectorizer(ngram_max=4, downsample=30000)

tfidf

training data:  (30000, 2)
fitting........
all trained up!


TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=10000, min_df=10,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [None]:
def init_dat(lig_num: int = lig_num, 
             sample_size: int = sample_size,
             estimators: int = estimators, 
             downsample_binding: int = -1, 
             downsample_sequences: int = -1) -> Tuple[pd.DataFrame, List[int], pd.DataFrame]:

    def lig2seq_(): 
        return (pd.read_csv(raw_prefix+'lig2seq.csv')
                .rename({'lig': 'lig_idx','seq': 'seq_idx'}, axis=1))
  
    def sequences_(): 
        return pd.read_csv(raw_prefix+'sequences.csv',
                           index_col=0
                          )
  
    if downsample_binding < 16:
        lig2seq = lig2seq_()
    else:
        lig2seq = lig2seq_().sample(downsample_binding)

    if downsample_sequences < 16:
        sequences = sequences_()
    else:
        sequences = sequences_.sample(downsample_sequences)

    lig_id_vals = list(np.random.choice(lig2seq.lig_idx.unique(), size=lig_num))
    binding = lig2seq.loc[lig2seq.lig_idx.isin(lig_id_vals)]

    return sequences, lig_id_vals, binding


sequences, lig_id_vals, binding = init_dat(
    downsample_binding=downsample_binding, 
    #downsample_sequences=downsample_sequences
)

print(sequences.shape, binding.shape, len(lig_id_vals))

binding.head()


(105802, 1) (1913, 2) 1000


Unnamed: 0,lig_idx,seq_idx
1338243,509775,74779
1865254,713452,53558
704701,213454,73882
206089,25890,57601
1488210,556204,26289


In [10]:
def fitter_df_maker(lig_id: int) -> Tuple[pd.DataFrame, pd.DataFrame]: 
  
    def labeled_(psqs): 
        labeled_seqs = sequences.loc[sequences.index.isin(psqs)] # hidden_0
    
        labeled_seqs_known = labeled_seqs.sample(frac=0.75)
    
        labeled_seqs_hidden = labeled_seqs.loc[~labeled_seqs.index.isin(labeled_seqs_known.index)]
        return labeled_seqs_known, labeled_seqs_hidden
  
    positive_seq_ids = binding.loc[binding.lig_idx==lig_id, 'seq_idx'].values
    
    if len(positive_seq_ids) > s_: # we want unlabeled to be dominant
        positive_seq_ids = np.choice(positive_seq_ids, s_)
    if len(positive_seq_ids) > 5: 
        unlabeled_seqs = (sequences.loc[~sequences.index.isin(positive_seq_ids)]
                          .sample(n = sample_size-len(positive_seq_ids))) # 
    
        labeled_seqs_known, labeled_seqs_hidden = labeled_(positive_seq_ids)
    
        unlabeled_seqs['bind'] = np.zeros(unlabeled_seqs.shape[0]) # equiv to df_seq_sub_neg.loc[:,"bind"] = 0
    
        labeled_seqs_known['bind'] = np.ones(labeled_seqs_known.shape[0])
    
        labeled_seqs_hidden['bind'] = np.zeros(labeled_seqs_hidden.shape[0])
    
        df_fitter = pd.concat([unlabeled_seqs, labeled_seqs_known, labeled_seqs_hidden])
    
        X = pd.DataFrame(tfidf.transform(df_fitter.sequence.values).toarray(), columns=tfidf.get_feature_names(), index=df_fitter.index)
    
        y = df_fitter.bind
        print(lig_id, X.shape, y.shape)
        return X,y
    else: 
        raise Exception


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 19)

In [None]:
models = {}

for lig_id in lig_id_vals: 
    try: 
        X, y = fitter_df_maker(lig_id)
        bc = BaggingClassifierPU(DecisionTreeClassifier(),
                                 n_estimators=estimators, 
                                 #n_jobs=-1, 
                                 max_samples=int(sum(y.values)))

        bc.fit(X,y)
        models[lig_id] = bc
    except: 
        pass

with open('models.pickle', 'wb') as mp: 
    pickle.dump(models, mp)

(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)
(200, 10000) (200,)


# begin LSTM stuff

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Concatenate
from keras.layers import LSTM
import numpy as np
import pandas as pd
from functools import partial, reduce
from sklearn.model_selection import train_test_split

seed = 42
np.random.seed(seed)
rnn_num_samples=100
lstm_size = 128
dropout = 0.2
recurrent_dropout = 0.2
epochs = 20
batch_size = 128 # number of datapoints its fed each time it fits
pad_length_lig = 1000 # max size of smile string
pad_length_seq = 1000 # max size of sequence
num_puc_models = 30
num_bindings = 300

seq_vec_length = len(tfidf.vocabulary_)


In [None]:

def ligands_(dat: pd.DataFrame) -> pd.DataFrame: 
  ''''''
  return dat.rename({'SMILES': 'smiles'}, axis=1).assign() 


ligands = (pd.read_csv(raw_prefix+'ligands.csv', 
                       index_col='id', 
                       usecols=['id', 'SMILES'])
           .rename({'SMILES': 'smile'}, axis=1))


sequences = pd.read_csv(raw_prefix+'sequences.csv', index_col=0)#.rename({'Unnamed: 0': 'seq_id'}, axis=1)

ligands['smile_length'] = ligands.smile.apply(lambda x: len(x))
ligands = ligands.loc[ligands.smile_length<=pad_length_lig].loc[ligands.smile_length>0]

sequences['seq_length'] = sequences.sequence.apply(lambda x: len(x))
sequences = sequences.loc[sequences.seq_length<=pad_length_seq].loc[sequences.seq_length>0]

PUC_models_list = np.random.choice([k for k,v 
                                    in models.items() 
                                    if k in ligands.index 
                                    and k in sequences.index], num_puc_models)

lig2seq = pd.read_csv(raw_prefix+'lig2seq.csv').rename({'lig': 'lig_idx', 'seq': 'seq_idx'}, axis=1)

lig2seq = lig2seq.loc[lig2seq.lig_idx.isin(ligands.index)]
lig2seq = lig2seq.loc[lig2seq.seq_idx.isin(sequences.index)]

lig2seq = lig2seq.loc[~lig2seq.lig_idx.isin(PUC_models_list)].sample(n=num_bindings)

print(ligands.shape, sequences.shape, lig2seq.shape)

!ls

(499997, 2) (104874, 2) (300, 2)
models.pickle  sample_data


In [None]:
print(ligands.shape)
ligands.head()

(499997, 2)


Unnamed: 0_level_0,smile,smile_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CN1CCC(CC1)COC2=C(C=C3C(=C2)N=CN=C3NC4=C(C=C(C...,57
1,C1=CC=C(C=C1)C[C@H]2N(C(=O)N([C@@H]([C@@H]([C@...,97
2,C1CC1CN2[C@@H]([C@@H]([C@H]([C@H](N(C2=O)C/C=C...,86
3,C1CC1CN2[C@@H]([C@@H]([C@H]([C@H](N(C2=O)CCCCC...,78
4,C1CC1CN2[C@@H]([C@@H]([C@H]([C@H](N(C2=O)CCCCC...,77


In [None]:
print(sequences.shape)
sequences.head()

(104874, 2)


Unnamed: 0,sequence,seq_length
0,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,154
1,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...,165
2,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,154
3,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...,167
4,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAA...,166


In [None]:
def ascii_onehot(length: int, s: str) -> np.ndarray: 
  ''' like a one-hot with an ascii-table as features '''
  vec = np.zeros((length, 128))
  ind = [ord(x) for x in s]
  for i, cod in enumerate(ind): 
    vec[i, cod] = 1.
  return vec

In [None]:
PUC_models_list

array([77193, 15804, 77193, 77193, 40961, 15804, 15804, 15804, 77193,
       40917, 15804, 77193, 40961, 40917, 40961, 40917, 77193, 11998,
       40917, 40961, 77193, 40917, 11998, 11998, 15804, 15804, 40961,
       40917, 40917, 15804])

In [None]:
def df_temp_(i: int) -> pd.DataFrame:
  # def dftemp_(i, mod)
  dftemp_cols = [
      'sequence', 'smile', 
      'seq_onehot', 'smile_onehot', 
      'pred_binding', 'proba_pred_binding'
  ]

  k=PUC_models_list[i]

  df_temp = pd.DataFrame(columns=dftemp_cols)
  bc_ = models[k]

  smile = ligands.loc[k, 'smile']

  seq_samp = sequences.sample(rnn_num_samples)
  #df_temp.seq_idx = seq_samp.index.values

  df_temp.sequence = seq_samp.sequence.values
  df_temp.seq_onehot = df_temp.sequence.apply(lambda x: ascii_onehot(length=pad_length_seq, s=x))

  df_temp.smile = [smile for _ in range(rnn_num_samples)]#ligands.loc[seq_samp.index].smile

  df_temp.smile_onehot = [ascii_onehot(length=pad_length_lig, s=smile) for _ in range(rnn_num_samples)]


  df_temp.pred_binding = bc_.predict(tfidf.transform(df_temp.sequence.values).toarray())
  df_temp.proba_pred_binding = bc_.predict_proba(tfidf.transform(df_temp.sequence.values).toarray())[:, 1]

  assert df_temp.pred_binding.sum() > 1
  print("how many positive? ", df_temp.pred_binding.sum())
  print(df_temp.shape)

  return df_temp

df_total = reduce(lambda D, E: pd.concat([D, E], ignore_index=True), [df_temp_(i) for i,_ in enumerate(PUC_models_list)])

# df_total = pd.concat([df_temp_(i) for i,_ in enumerate(PUC_models_list)])

print(df_total.shape)

df_total.head()

how many positive?  11.0
(100, 6)
how many positive?  21.0
(100, 6)
how many positive?  35.0
(100, 6)
how many positive?  17.0
(100, 6)
how many positive?  28.0
(100, 6)
how many positive?  25.0
(100, 6)
how many positive?  27.0
(100, 6)
how many positive?  63.0
(100, 6)
how many positive?  53.0
(100, 6)
how many positive?  18.0
(100, 6)
how many positive?  4.0
(100, 6)
how many positive?  24.0
(100, 6)
how many positive?  24.0
(100, 6)


OSError: ignored

In [None]:


def train_test_val_split(Dat: pd.DataFrame, x: float, y: float, z: float): 
  '''dat: data
  x: train size
  y: validation size
  z: test size'''
  y_ = y / (1-x)
  z_ = z / (1-x)
  np.testing.assert_almost_equal(x+y+z, 1)
  np.testing.assert_almost_equal(y_+z_, 1)
  train, valtest = train_test_split(Dat, train_size=x, test_size=y+z)
  val, test = train_test_split(valtest, train_size=y_, test_size=z_)
  return train, val, test

train, val, test = train_test_val_split(df_total, 0.5, 0.2, 0.3)

print(train.shape, val.shape, test.shape)

(1500, 6) (600, 6) (900, 6)


In [None]:
def X(dat: pd.DataFrame): 
  return dat.smile_onehot, dat.seq_onehot

def y(dat: pd.DataFrame): 
  return dat.pred_binding

X_train_A, X_train_B = X(train)

X_val_A, X_val_B = X(val)

X_test_A, X_test_B = X(test)

y_train = y(train)
y_val = y(val)
y_test = y(test)

print([d.shape for d in [X_train_A, X_train_B, X_val_A, X_val_B, X_test_A, X_test_B]])

[(1500,), (1500,), (600,), (600,), (900,), (900,)]
