## Translating Amino Acid Sequences to Vectors

In [71]:
import numpy as np
import pandas as pd
import matplotlib as plt
import keras

In [None]:
# Hands the bare command `PATH` to the system shell via IPython's `!` escape.
# NOTE(review): presumably an environment check meant to show the executable
# search path (the bare `PATH` command does that in Windows cmd) — confirm.
!PATH

In [72]:
# Load the metal-binding table and print a two-part text report:
# the column dtypes, then the distinct ligand identifiers.
df = pd.read_parquet('Metal_all_20180116.snappy.parquet')
print('\n'.join([
    '***** Data Types *****',
    str(df.dtypes),
    '',  # empty element yields the blank line between the two sections
    '***** Unique Ligands *****',
    str(df.ligandId.unique()),
]))

***** Data Types *****
structureChainId      object
ligandId              object
fingerprint           object
groupNumber           object
sequence              object
interactingChains      int32
clusterNumber30      float64
clusterNumber40      float64
clusterNumber50      float64
clusterNumber70      float64
clusterNumber90      float64
clusterNumber95      float64
clusterNumber100     float64
dtype: object

***** Unique Ligands *****
['MN' 'CA' 'MG' 'ZN' 'CU' 'FE' 'CO' 'FE2' 'NI' 'CU1' '3CO' '3NI' 'MN3']


In [73]:
# Keep only the rows whose ligandId is 'ZN' ...
df_zn = df.loc[df['ligandId'] == 'ZN']
# ... and, of those, only the rows where interactingChains equals 1.
df_zn_single = df_zn.loc[df_zn['interactingChains'] == 1]
# Model inputs are the sequence column; the "teacher" (target) signal is the
# fingerprint column, which toOnehot later consumes as index positions.
seqs = np.array(df_zn_single.sequence)
teacher = np.array(df_zn_single.fingerprint)
print (seqs.shape)

# Optional clean-up of the intermediate frames, left disabled.
# Note that the FASTA-export cell further down still reads `df.sequence`,
# so enabling `del df` would break that cell.
# del df
# del df_zn
# del df_zn_single

(22823,)


### BioVec Embedding using gensim

#### https://arxiv.org/pdf/1310.4546.pdf , https://github.com/kyu999/biovec ProtVec module

#### Additional ref: http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0141287 , https://github.com/ehsanasgari/Deep-Proteomics  sample dataset


In [74]:
import biovec

# A very simple protein sequences to fasta file conversion
def convertSeqsToFasta (seqs, filename):
    """Write protein sequences to ``filename`` in a minimal FASTA layout.

    Every sequence becomes one two-line record: a header line consisting of
    ``>`` followed by the sequence's zero-based position in ``seqs``, then the
    sequence itself on a single line.

    Parameters
    ----------
    seqs : iterable of str
        Sequences to export (NumPy array, pandas Series, list, ...). Only
        iteration is required, so plain Python lists are accepted as well.
    filename : str
        Destination path. An existing file is overwritten.

    Returns
    -------
    None
    """
    # The context manager guarantees the handle is flushed and closed even if
    # a write fails, so a reader opening the file right afterwards (e.g. a
    # model constructor) never sees a partially written file.
    with open(filename, 'w') as fasta:
        for i, seq in enumerate(seqs):
            fasta.write('>' + str(i) + '\n')
            fasta.write(seq + '\n')

In [None]:
# Export every sequence of the full table (`df`, not only the ZN subset) to a
# FASTA file and construct a ProtVec model from that file.
# NOTE(review): presumably this trains the embedding from scratch and can be
# slow — confirm against the biovec documentation.
filename = 'all.fasta'
convertSeqsToFasta(seqs=df.sequence, filename=filename)
pv = biovec.models.ProtVec(filename)

In [75]:
# Load a ProtVec model from the local path './modelforallwords' and bind it to
# `pv` (replacing any model built in the previous cell).
pv = biovec.models.load_protvec('./modelforallwords')

In [None]:
# The output has 3 vectors, each of dimension 100.
# vector_2 -> corpus built after skipping the first letter
# vector_3 -> corpus built after skipping the first and second letters

In [76]:
def prepareFeatureVector(inputSeq, teacherSeq, pv):
    """Build aligned feature and target lists from sequences and their labels.

    For every sample ``i`` the vectors returned by ``pv.to_vecs(inputSeq[i])``
    are appended to ``X`` and the matching label ``teacherSeq[i]`` is appended
    to ``T`` once per vector, so ``X[k]`` and ``T[k]`` always describe the
    same sample.

    Parameters
    ----------
    inputSeq : indexable
        Sequences, addressed as ``inputSeq[i]``.
    teacherSeq : array-like with ``.shape``
        One label per sequence; ``teacherSeq.shape[0]`` sets the sample count.
    pv : object
        Embedding model exposing ``to_vecs(sequence)``.

    Returns
    -------
    (list, list)
        ``X`` (embedding vectors) and ``T`` (labels), of equal length.
    """
    X = []
    T = []
    for i in range(teacherSeq.shape[0]):
        vecs = list(pv.to_vecs(inputSeq[i]))
        X.extend(vecs)
        # Repeat this sample's own label once per vector. Appending the whole
        # teacherSeq array here would pair every vector with all labels.
        T.extend([teacherSeq[i]] * len(vecs))
    return X, T

In [None]:
import json

def toVector(seqs, pv):
    """Embed every sequence and persist the three embedding tables as JSON.

    ``pv.to_vecs(seqs[i])`` is called for each index ``i``; its first three
    results are converted with ``.tolist()`` and stored under key ``i`` in
    three separate dicts. The dicts are then written to the files ``X1``,
    ``X2`` and ``X3`` in the current working directory.

    Note that JSON serialisation turns the integer keys into strings, so a
    dict reloaded from one of these files is keyed by ``str(i)``, whereas the
    dicts returned here are keyed by ``int``.

    Parameters
    ----------
    seqs : array-like with ``.shape``
        Sequences to embed; ``seqs.shape[0]`` sets the number of entries.
    pv : object
        Embedding model exposing ``to_vecs(sequence)``.

    Returns
    -------
    tuple of three dict
        ``(X1, X2, X3)``, each mapping ``i`` to a list of floats.
    """
    file_names = ('X1', 'X2', 'X3')
    tables = ({}, {}, {})

    for idx in range(seqs.shape[0]):
        vecs = pv.to_vecs(seqs[idx])
        for slot, table in enumerate(tables):
            table[idx] = vecs[slot].tolist()

    for name, table in zip(file_names, tables):
        with open(name, 'w') as out:
            json.dump(table, out)

    return tables
        
# Embed the sequences in `seqs`; as a side effect toVector also writes the
# JSON files 'X1', 'X2' and 'X3' to the working directory.
X1,X2,X3 = toVector(seqs, pv)

In [78]:
# Sanity check: number of entries held in X1.
print (len(X1))

22823


In [77]:
# Reload the first embedding table from the JSON file 'X1'.
# JSON object keys are always strings, so the reloaded dict must be indexed
# with str(i) rather than i.
with open('X1', 'r') as fp:
    X1 = json.load(fp)

In [None]:
# Reload the second and third embedding tables from the JSON files 'X2' and
# 'X3'; as with any JSON object, the resulting dict keys are strings.
with open('X2', 'r') as fp:
    X2 = json.load(fp)
        
with open('X3', 'r') as fp:
    X3 = json.load(fp)

In [79]:
def toOnehot(inputSeq, teachingSeq):
    """Turn per-sample index lists into 0/1 indicator lists.

    For each sample ``i`` a list of zeros is created and the positions listed
    in ``teachingSeq[i]`` are set to 1.

    NOTE(review): ``inputSeq`` is not read. The length of each indicator list
    is ``len(seqs[i])``, where ``seqs`` is the notebook-level variable, so the
    function only works while that global is defined and aligned with
    ``teachingSeq`` — confirm whether ``inputSeq`` was meant to supply it.

    Parameters
    ----------
    inputSeq : any
        Currently unused.
    teachingSeq : array-like with ``.shape``
        ``teachingSeq[i]`` is an iterable of integer positions to mark.

    Returns
    -------
    list of list of int
        One indicator list per sample.
    """
    n_samples = teachingSeq.shape[0]
    indicator_rows = []
    for row in range(n_samples):
        indicator = [0] * len(seqs[row])
        for position in teachingSeq[row]:
            indicator[position] = 1
        indicator_rows.append(indicator)
    return indicator_rows

In [80]:
# Build one 0/1 target list per sequence from the fingerprint indices.
# NOTE(review): toOnehot never reads `inputSeq`; the list lengths come from the
# global `seqs`, so passing X1 here has no effect — confirm the intended input.
T_onehot = toOnehot(inputSeq=X1, teachingSeq=teacher)

## SVM and RNN

In [81]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import TimeDistributed
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [None]:
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# Toy example with a single sample: the target is the input shifted left by
# one position.
data = [[0,0,0,0,0,0,0,0,0,2,1]]
data = np.array(data, dtype=float)
target = [0,0,0,0,0,0,0,0,2,1,0]
target = np.array(target, dtype=float)

data = data.reshape((1, 1, 11)) # One batch entry, 1 time step, 11 features
target = target.reshape((-1, 11)) # Row count inferred; 11 columns, i.e. shape (None, 11)

In [103]:
# Build the model: a sequence-to-sequence binary classifier.
model = Sequential()  
# LSTM with 120 units over sequences of any length (time dimension None) with
# 100 features per step; return_sequences=True emits an output at every step.
model.add(LSTM(120, input_shape=(None,100), return_sequences=True))
# Apply the same 1-unit sigmoid Dense layer to each time step, giving one
# value in (0, 1) per step.
model.add(TimeDistributed(Dense(1, activation='sigmoid')))
# Binary cross-entropy loss, Adam optimizer, accuracy reported as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [104]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_24 (LSTM)               (None, None, 120)         106080    
_________________________________________________________________
time_distributed_17 (TimeDis (None, None, 1)           121       
Total params: 106,201
Trainable params: 106,201
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Do the output values match the target values?
# print() is called as a function: the rest of this notebook runs under
# Python 3, where the `print expr` statement form is a SyntaxError.
# NOTE(review): the toy `data` array has shape (1, 1, 11) while the model built
# above declares 100 features per step — confirm which model this cell targets.
predict = model.predict(data)
print(repr(data))
print(repr(predict))

In [105]:
# Online training: one fit() call (1 epoch, batch size 1) per sequence, for the
# first `maxIter` sequences only.
maxIter = 10
for i in range(maxIter):
    # Length of the i-th sequence = number of target time steps.
    timespan = len(seqs[i])
    # Input: the X1 embedding of sequence i as ONE time step of 100 features.
    # X1 is indexed with str(i) because it was reloaded from JSON.
    x = np.array(X1[str(i)]).reshape(1,1,100)
    # Target: one 0/1 label per position, shaped (1, timespan, 1).
    t = np.array(T_onehot[i]).reshape(1,timespan,1)
    # NOTE(review): x carries 1 time step while t carries `timespan` steps (see
    # the shapes printed below) — confirm this mismatch is intended.
    model.fit(x, t, epochs=1, batch_size=1, verbose=2)
    print (x.shape, t.shape)

Epoch 1/1
 - 1s - loss: 1.4170 - acc: 0.0089
(1, 1, 100) (1, 449, 1)
Epoch 1/1
 - 0s - loss: 0.9114 - acc: 0.0067
(1, 1, 100) (1, 449, 1)
Epoch 1/1
 - 0s - loss: 0.8377 - acc: 0.0102
(1, 1, 100) (1, 294, 1)
Epoch 1/1
 - 0s - loss: 0.4695 - acc: 0.9924
(1, 1, 100) (1, 393, 1)
Epoch 1/1
 - 0s - loss: 0.3102 - acc: 0.9924
(1, 1, 100) (1, 393, 1)
Epoch 1/1
 - 0s - loss: 0.3346 - acc: 0.9874
(1, 1, 100) (1, 239, 1)
Epoch 1/1
 - 0s - loss: 0.3382 - acc: 0.9718
(1, 1, 100) (1, 142, 1)
Epoch 1/1
 - 0s - loss: 0.2836 - acc: 0.9851
(1, 1, 100) (1, 202, 1)
Epoch 1/1
 - 0s - loss: 0.1832 - acc: 0.9755
(1, 1, 100) (1, 163, 1)
Epoch 1/1
 - 0s - loss: 0.1811 - acc: 0.9744
(1, 1, 100) (1, 156, 1)


In [106]:
# Predict for sequence 0, feeding its X1 embedding as a single time step of
# 100 features (the same input shape used in the training loop).
predict = model.predict(np.array(X1[str(0)]).reshape(1,1,100))

##### Problems

When doing binary classification site by site:

1. Variable target length
2. Overfitting
3. Abundance of negative labels (class imbalance)
4. Too many similar sequences
5. Embedded vectors scale with input length

In [107]:
# Display the raw prediction array for sequence 0.
print (predict)

[[[ 0.10553339]]]


In [None]:
# Unfinished draft of a custom Keras layer. As written this cell does not
# parse: build() and forward() have no bodies.
class frogNet (keras.layers.Layer):
    def __init__(self, units, **kwargs):
        # Remember the layer width; state_size is set to the same value.
        self.units = units
        self.state_size = units
        super(frogNet, self).__init__(**kwargs)
        
    # TODO: body missing.
    def build(self, input_shape):
             
    # TODO: body missing.
    def forward(self, inputs, states):
          
    # NOTE(review): the statements below sit at class-body level, so they would
    # run when the class is defined. They reference look_back, trainX and
    # trainY, none of which is defined in the visible part of the notebook —
    # presumably a pasted LSTM regression example kept as a reference; confirm.
    model = Sequential()
    model.add(LSTM(4, input_shape=(1, look_back)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
        