In [None]:
%%time
from tfidf import raw_prefix
from puc import models, tfidf
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Concatenate
from keras.layers import LSTM
import numpy as np
import pandas as pd
from functools import partial, reduce
from sklearn.model_selection import train_test_split
from utils import Spinner
import sys 
import pickle

# Progress indicator; df_temp_ starts and stops it around each per-ligand build.
spinner = Spinner()
# Seed NumPy's global RNG so the np.random.choice draw below is repeatable
# when the notebook is run top to bottom.
seed = 42
np.random.seed(seed)
rnn_num_samples=100 # number of sequences sampled for each selected ligand model
# The next four names are not used in the cells visible here; presumably
# hyperparameters for the Keras LSTM model built later -- TODO confirm.
lstm_size = 128
dropout = 0.2
recurrent_dropout = 0.2
epochs = 20
batch_size = 128 # number of datapoints it is fed each time it fits
pad_length_lig = 1000 # max size of smile string
pad_length_seq = 1000 # max size of sequence
num_puc_models = 30 # number of fitted models drawn from `models`
num_bindings = 300 # number of lig2seq rows sampled

# Quick look at what the `puc` import produced: how many models there are...
print(len(models.keys()))

# ...and the first three (key, model) pairs.
print(list(models.items())[:3])

FOR TF-IDF: training TF-IDF with dims (40000, 2)	fitting.......................................................................................................................................................................................................................................................................................................................................................................................................................TF-IDF is all trained up!
FOR BaggingClassifierPU MODELS:	 Sequences: (105802, 1);	 Binding: (1913, 2);	 Number of ligand id values: 1000. 
Models dict now populated to length 4; next, fitting on ligand #705642........................................\Models dict now populated to length 4; next, fitting on ligand #509764........................../

In [3]:
#with open('')
!wc -c models.pickle

# Bundle the models dict and the TF-IDF estimator into a single tuple and
# persist them together in one pickle file.
T = (models, tfidf)  # Tuple[Dict[int, Estimator], Estimator]
with open('everything.pickle', 'wb') as ep:
    pickle.dump(T, ep)

247010621 models.pickle


In [2]:

# --- Load the raw tables ------------------------------------------------------
ligands = (pd.read_csv(raw_prefix+'ligands.csv',
                       index_col='id',
                       usecols=['id', 'SMILES'])
           .rename(columns={'SMILES': 'smile'}))

sequences = pd.read_csv(raw_prefix+'sequences.csv', index_col=0)

# --- Keep only strings that fit the padded one-hot encodings -------------------
# .str.len() gives NaN for a missing string (where len() would raise), and NaN
# fails the between() test, so such rows are dropped together with the empty
# and over-long ones. A single mask also avoids chaining a second .loc with a
# boolean Series that was built on the unfiltered frame.
ligands['smile_length'] = ligands.smile.str.len()
ligands = ligands.loc[ligands.smile_length.between(1, pad_length_lig)]

sequences['seq_length'] = sequences.sequence.str.len()
sequences = sequences.loc[sequences.seq_length.between(1, pad_length_seq)]

# --- Pick which fitted models to use -------------------------------------------
# NOTE(review): `k` is a key of `models` and is looked up in `ligands` later on;
# also requiring it to be in `sequences.index` looks suspicious -- confirm this
# condition is intended. It is kept unchanged here.
puc_candidates = [k for k in models
                  if k in ligands.index
                  and k in sequences.index]

# Draw distinct models whenever enough candidates exist, so that no ligand is
# selected (and later duplicated) more than once. Only when the pool is smaller
# than num_puc_models do we fall back to sampling with replacement, which is
# what np.random.choice does by default.
PUC_models_list = np.random.choice(puc_candidates,
                                   num_puc_models,
                                   replace=len(puc_candidates) < num_puc_models)

# --- Known ligand/sequence pairs ------------------------------------------------
lig2seq = (pd.read_csv(raw_prefix+'lig2seq.csv')
           .rename(columns={'lig': 'lig_idx', 'seq': 'seq_idx'}))

# Keep pairs whose ligand and sequence both survived the length filter and whose
# ligand was not selected above, then sample num_bindings of them.
keep_pair = (lig2seq.lig_idx.isin(ligands.index)
             & lig2seq.seq_idx.isin(sequences.index)
             & ~lig2seq.lig_idx.isin(PUC_models_list))
lig2seq = lig2seq.loc[keep_pair].sample(n=num_bindings)

print(ligands.shape, sequences.shape, lig2seq.shape)


(499997, 2) (104874, 2) (300, 2)


In [3]:
# Inspect the ligand table: print its shape, then display its first rows.
print(ligands.shape)
ligands.head()

(499997, 2)


Unnamed: 0_level_0,smile,smile_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CN1CCC(CC1)COC2=C(C=C3C(=C2)N=CN=C3NC4=C(C=C(C...,57
1,C1=CC=C(C=C1)C[C@H]2N(C(=O)N([C@@H]([C@@H]([C@...,97
2,C1CC1CN2[C@@H]([C@@H]([C@H]([C@H](N(C2=O)C/C=C...,86
3,C1CC1CN2[C@@H]([C@@H]([C@H]([C@H](N(C2=O)CCCCC...,78
4,C1CC1CN2[C@@H]([C@@H]([C@H]([C@H](N(C2=O)CCCCC...,77


In [4]:
# Inspect the sequence table: print its shape, then display its first rows.
print(sequences.shape)
sequences.head()

(104874, 2)


Unnamed: 0,sequence,seq_length
0,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,154
1,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...,165
2,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,154
3,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...,167
4,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAA...,166


In [5]:
def ascii_onehot(length: int, s: str) -> np.ndarray:
    '''Encode ``s`` as a ``(length, 128)`` one-hot matrix over ASCII codes.

    Row ``i`` holds a single 1.0 in column ``ord(s[i])``; rows past the end
    of ``s`` stay all-zero (padding).

    Raises IndexError if ``s`` is longer than ``length`` or contains a
    character whose code point is 128 or above.
    '''
    onehot = np.zeros((length, 128))
    codes = np.array(list(map(ord, s)), dtype=np.intp)
    rows = np.arange(codes.size, dtype=np.intp)
    # One vectorised assignment: position i -> column ord(s[i]).
    onehot[rows, codes] = 1.
    return onehot


In [6]:
def df_temp_(i: int) -> pd.DataFrame:
    '''Build the sample frame for the ``i``-th entry of ``PUC_models_list``.

    Draws ``rnn_num_samples`` random rows from ``sequences``, pairs each with
    the SMILES string of the selected ligand, one-hot encodes both strings and
    labels every pair with the selected model's ``predict`` output and the
    second column of its ``predict_proba`` output.

    Parameters
    ----------
    i : int
        Position in ``PUC_models_list`` of the model/ligand key to use.

    Returns
    -------
    pd.DataFrame
        ``rnn_num_samples`` rows with columns ``sequence``, ``smile``,
        ``seq_onehot``, ``smile_onehot``, ``pred_binding`` and
        ``proba_pred_binding``.

    Raises
    ------
    AssertionError
        If the sum of ``pred_binding`` is not greater than 1, or the frame
        does not have the expected shape.
    '''
    dftemp_cols = [
      'sequence', 'smile',
      'seq_onehot', 'smile_onehot',
      'pred_binding', 'proba_pred_binding'
    ]

    spinner.start()
    try:
        k = PUC_models_list[i]
        bc_ = models[k]
        smile = ligands.loc[k, 'smile']

        seq_samp = sequences.sample(rnn_num_samples)

        df_temp = pd.DataFrame(columns=dftemp_cols)
        df_temp['sequence'] = seq_samp.sequence.values
        df_temp['seq_onehot'] = df_temp['sequence'].apply(
            lambda x: ascii_onehot(length=pad_length_seq, s=x))

        df_temp['smile'] = [smile] * rnn_num_samples
        # Every row uses the same ligand: encode it once and give each row its
        # own copy, rather than re-encoding the string rnn_num_samples times.
        smile_onehot = ascii_onehot(length=pad_length_lig, s=smile)
        df_temp['smile_onehot'] = [smile_onehot.copy() for _ in range(rnn_num_samples)]

        # Vectorise the sequences once and reuse the matrix for both calls.
        features = tfidf.transform(df_temp['sequence'].values).toarray()
        df_temp['pred_binding'] = bc_.predict(features)
        df_temp['proba_pred_binding'] = bc_.predict_proba(features)[:, 1]

        n_positive = df_temp['pred_binding'].sum()
        assert n_positive > 1
        sys.stdout.write(f"with {n_positive} positive vals")
        assert df_temp.shape==(rnn_num_samples, len(dftemp_cols))
    finally:
        # Always stop the spinner, including when an assertion or any other
        # error aborts the build; otherwise it is left running.
        spinner.stop()
    sys.stdout.write('\r')
    return df_temp


In [7]:
# Build one frame per selected model and stack them with a single concat.
# (Folding pairwise pd.concat calls with reduce re-copies the growing frame on
# every step; one call over the whole list yields the same rows with a fresh
# 0..n-1 index.)
df_total = pd.concat([df_temp_(i) for i in range(len(PUC_models_list))],
                     ignore_index=True)

print(df_total.shape)

df_total.head()

.................|

TypeError: write() takes 2 positional arguments but 3 were given