## Translating Amino Acid Sequences to Vectors

In [1]:
import numpy as np
import pandas as pd
# import matplotlib as plt
# import keras

In [2]:
# Load the metal-binding dataset and summarise its columns and ligand ids.
df = pd.read_parquet('Metal_all_20180116.snappy.parquet')
summary_lines = [
    '***** Data Types *****',
    str(df.dtypes),
    '',  # empty entry reproduces the blank line between the two sections
    '***** Unique Ligands *****',
    str(df['ligandId'].unique()),
]
print('\n'.join(summary_lines))

***** Data Types *****
structureChainId      object
ligandId              object
fingerprint           object
groupNumber           object
sequence              object
interactingChains      int32
clusterNumber30      float64
clusterNumber40      float64
clusterNumber50      float64
clusterNumber70      float64
clusterNumber90      float64
clusterNumber95      float64
clusterNumber100     float64
dtype: object

***** Unique Ligands *****
['MN' 'CA' 'MG' 'ZN' 'CU' 'FE' 'CO' 'FE2' 'NI' 'CU1' '3CO' '3NI' 'MN3']


### Use entries with single-chain, zinc-binding sequences

In [3]:
# Keep only the zinc ('ZN') rows, then only those with exactly one
# interacting chain, and pull their sequences out as a NumPy array.
zn_mask = df['ligandId'] == 'ZN'
df_zn = df[zn_mask]
single_chain_mask = df_zn['interactingChains'] == 1
df_zn_single = df_zn[single_chain_mask]
seqs = np.array(df_zn_single['sequence'])
print(seqs.shape)

(22823,)


### BioVec Embedding using gensim

#### https://arxiv.org/pdf/1310.4546.pdf , https://github.com/kyu999/biovec ProtVec module

#### Additional ref: http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0141287 , https://github.com/ehsanasgari/Deep-Proteomics  sample dataset


In [4]:
import biovec

# A very simple protein sequences to fasta file conversion
def convertSeqsToFasta(seqs, filename):
    """Write sequences to ``filename`` in a minimal FASTA layout.

    Every sequence produces two lines: a header ``>i`` (``i`` is the
    0-based position of the sequence in ``seqs``) followed by the sequence
    text itself.

    Parameters
    ----------
    seqs : iterable of str
        The sequences to write, e.g. a 1-D NumPy array or a list of strings.
    filename : str
        Path of the output file. It is opened in ``'w'`` mode, so an
        existing file is overwritten.
    """
    # The context manager flushes and closes the handle even if a write
    # raises, instead of relying on garbage collection to do so.
    with open(filename, 'w') as fasta:
        for i, seq in enumerate(seqs):
            fasta.write('>' + str(i) + '\n')
            fasta.write(seq + '\n')



In [6]:
# Write all selected sequences to a FASTA file and hand that file to
# ProtVec (the run logs "Generate Corpus file from fasta file...").
# NOTE(review): `out` presumably names the generated corpus file - confirm
# against the biovec documentation.
filename = 'seqs.fasta'
convertSeqsToFasta(seqs, filename)
pv = biovec.models.ProtVec(filename, out="corpus.txt")

Generate Corpus file from fasta file...


........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [7]:
# Look up the model's vector for the 3-letter word "TPE" and show it.
tpe_vector = pv["TPE"]
print(tpe_vector)

[-0.20085467  0.46583471  0.67636544  0.498072    0.24153344 -0.17534502
  0.1848436  -0.18777981  0.11753874  0.57113546  0.26465893 -0.07601485
  0.17453228 -0.47819728  0.18457411 -0.12865703  0.03914425 -0.01401101
 -0.34250784 -0.24357861  0.02143645  0.41397545 -0.3279469  -0.27439451
 -0.23357424 -0.06622382 -0.05940906  0.21137157  0.31850484 -0.21036439
 -0.19450538  0.20496079 -0.07303259 -0.09646235 -0.00409356 -0.0400577
  0.1750669  -0.3919206  -0.04574221 -0.14515442 -0.29257917 -0.13053426
  0.1494278   0.20136715  0.14214486 -0.09993107 -0.06590401  0.40391287
 -0.09959547 -0.03526699 -0.29182854 -0.3125706   0.49385861 -0.02200506
 -0.26043606 -0.0072023  -0.18479767  0.07377576 -0.34786209 -0.16034891
 -0.08529822 -0.18368536 -0.32795399  0.28478763 -0.13469274  0.01643045
 -0.48008612  0.49897408  0.05230757 -0.2663542  -0.17242475  0.15964168
 -0.32219282 -0.14431168  0.19570076  0.01339126 -0.35851729  0.1412496
  0.25280127  0.05905769 -0.14834498  0.18064813 -0.2

In [8]:
# Show the first selected sequence in full.
first_seq = seqs[0]
print(first_seq)

TPEMPVLENRAAQGDITAPGGARRLTGDQTAALRDSLSDKPAKNIILLIGDGMGDSEITAARNYAEGAGGFFKGIDALPLTGQYTHYALNKKTGKPDYVTDSAASATAWSTGVKTYNGALGVDIHEKDHPTILEMAKAAGLATGNVSTAELQGATPAALVAHVTSRKCYGPSATSEKCPGNALEKGGKGSITEQLLNARADVTLGGGAKTFAETATAGEWQGKTLREQAEARGYQLVSDAASLNSVTEANQQKPLLGLFADGNMPVRWLGPKATYHGNIDKPAVTCTPNPQRNDSVPTLAQMTDKAIELLSKNEKGFFLQVEGASIDKQDHAANPCGQIGETVDLDEAVQRALEFAKKEGNTLVIVTADHAHASQIVAPDTKAPGLTQALNTKDGAVMVMSYGNSEEDSQEHTGSQLRIAAYGPHAANVVGLTDQTDLFYTMKAALGLK


In [10]:
# pv.to_vecs gives 3 vectors, each of dimension 100, for one sequence.
#   vector_2 -> corpus made after skipping the first letter
#   vector_3 -> corpus made after skipping the first and second letters
first_seq_vecs = pv.to_vecs(seqs[0])
print(first_seq_vecs)

[array([  6.03224134,   5.05971718,  33.83206558,  -9.18672371,
        26.9513588 , -41.17642212,  32.32953644,   5.79460812,
       -17.66915512,  23.44058418,   4.96797419,   8.66376305,
        21.18340111, -10.09245014,  11.92599392,  -5.94735241,
       -22.86287689,  35.16371536,  34.86291122, -15.44340229,
       -11.85766602,  20.34383774,  -8.11576939, -26.85851288,
         5.65278673,  15.26967335, -34.42901993,  36.91520309,
        35.93103027, -42.7673645 ,  -7.73987627,   1.45264971,
        21.13653564, -11.14747143,  14.84906769, -26.54049873,
        48.55208588, -25.39860535,   1.37245882,  49.30249023,
         6.68801737,  -0.36679772,  24.8449192 ,   5.53102446,
        15.29855824,  17.22402   ,   2.26302481,  11.69782257,
         0.2813167 , -39.85539627,   1.60654569, -38.69297028,
        30.77915764,   6.30818272, -13.64509869,  11.51874828,
        12.80219555,  17.10786247, -27.1062603 ,  -3.58662581,
        -5.71256399,  -5.55887651,  -3.99650288,  26.4

#### Playing with protVec  
##### Increase the dissimilarity between two sequences by randomly replacing letters in a copy of the sequence
##### Expect the standard deviation of the difference between the two sequence vectors to increase

In [12]:
# Randomly mutate the copy and check difference in standard deviation.
#
# seq2 is the untouched reference; seq1 starts as the same string and gets
# one additional substituted letter per step. After each step the standard
# deviation of (vectors of seq1 - vectors of seq2) is printed.
rng = np.random.RandomState(0)  # private seeded generator: reproducible, leaves global np.random state alone
seq1 = seq2 = seqs[1]           # strings are immutable, so rebinding seq1 never alters seq2
ref_vecs = np.array(pv.to_vecs(seq2))  # loop-invariant, so computed once

# Draw the positions without replacement so that every step changes a
# position that has not been changed before (a repeated position would make
# the step a silent no-op and print the same value twice).
target_positions = rng.permutation(len(seq2))[:10]
for target_idx in target_positions:
    # Replacement letters come from the reference sequence itself, limited to
    # letters that differ from the one currently at the target position.
    candidates = [letter for letter in seq2 if letter != seq2[target_idx]]
    if not candidates:
        # The sequence consists of one repeated letter: no differing
        # replacement exists, so stop instead of looping forever.
        break
    s = list(seq1)
    s[target_idx] = candidates[rng.randint(len(candidates))]
    seq1 = "".join(s)
    print(np.std(np.array(pv.to_vecs(seq1)) - ref_vecs))

0.465471
0.712111
0.928998
1.00264
1.06345
1.13109
1.2725
1.32809
1.49363
1.59698


##### Test missing corpus duplicates in output.txt

In [13]:
# Baseline: build a ProtVec model from the first two sequences only and
# display its vector for the 3-letter word "NPQ".
filename = 'testMissingDuplicates_0.fasta'
convertSeqsToFasta(seqs[:2], filename)
pv_0 = biovec.models.ProtVec(filename)
pv_0["NPQ"]

Generate Corpus file from fasta file...
..

array([-0.03642675,  0.06321725,  0.12695576, -0.12131022,  0.11035305,
       -0.03207188,  0.1317859 ,  0.10986187,  0.10576672,  0.00571126,
        0.10223284,  0.11277767,  0.16092938, -0.06686632,  0.26335388,
       -0.15971787,  0.23755193,  0.09742025, -0.18554738,  0.08825751,
       -0.21210045,  0.03249475, -0.06986614, -0.06771938, -0.06051311,
        0.10819146, -0.07332271,  0.01906307,  0.14862406,  0.12978467,
       -0.14219448,  0.03657615,  0.21255527, -0.12997495,  0.10255323,
       -0.24640146,  0.10983435, -0.11481737, -0.06367569,  0.03849897,
       -0.03279691,  0.03921964, -0.03765773,  0.17073365,  0.02846995,
        0.06542655, -0.22108281, -0.01697577, -0.07214518,  0.0035334 ,
       -0.1221192 ,  0.04710317, -0.00510394, -0.1097883 , -0.17736433,
       -0.12891047,  0.32980332,  0.05192059, -0.04775158,  0.08287971,
       -0.1058209 , -0.1243704 ,  0.0929676 ,  0.08649253,  0.15910184,
        0.13226686, -0.05575509,  0.0447101 ,  0.07344429, -0.04

In [14]:
# Overwrite the first "NPQ" in seqs[0] with "TTT", then rebuild a ProtVec
# model from the first two sequences.
idx = seqs[0].find("NPQ")
if idx != -1:
    # NOTE: this edits `seqs` in place, so every later cell sees the
    # modified first sequence.
    seqs[0] = seqs[0][:idx] + 'TTT' + seqs[0][idx + 3:]
else:
    # str.find returns -1 when the motif is absent (e.g. when this cell is
    # run a second time). Indexing with -1 would silently overwrite the
    # last and the first two letters, so leave the sequence alone instead.
    print('"NPQ" not found in seqs[0]; sequence left unchanged')
filename = 'testMissingDuplicates_1.fasta'
convertSeqsToFasta(seqs=seqs[:2], filename=filename)
pv_1 = biovec.models.ProtVec(filename)

Generate Corpus file from fasta file...
..

In [15]:
# Index of the first "NPQ" in the second sequence (str.find gives -1 if absent).
npq_pos_in_second = seqs[1].find("NPQ")
npq_pos_in_second

288