# MSA-VAE for TF2.0

In [1]:
from models.vaes import MSAVAE
from models.protcnn import BaseProtVAE
from utils.io import load_gzdata
from utils.data_loaders import one_hot_generator

import tensorflow.keras as keras

import numpy as np
import pandas as pd

2022-05-21 03:39:39.692080: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-21 03:39:39.692116: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


This is the data used in the original paper: **Generating novel protein variants with variational autoencoders** by Alex Hawkins-Hooker et al., *PLOS Computational Biology*, Feb 2021.

In [2]:
# Gzipped FASTA training files: `ll_train` feeds raw_seqs and
# `luxafilt_llmsa_train` feeds msa_seqs.
raw_path = 'data/training_data/ll_train.fa.gz'
msa_path = 'data/training_data/luxafilt_llmsa_train.fa.gz'

# load_gzdata returns a pair; only the second element (the sequences) is used
# below. one_hot=False is passed and the sequences print as plain strings.
a, raw_seqs = load_gzdata(raw_path, one_hot=False)
b, msa_seqs = load_gzdata(msa_path, one_hot=False)

Let us see what is inside the data.

In [3]:
# Quick look at both datasets: how many sequences each holds, a few examples,
# and summary statistics of the per-sequence lengths.
print('raw data')
# The original bare `len(...), len(...)` expression sat mid-cell, where Jupyter
# does not display it; print the counts explicitly so they show up.
print('number of raw / msa sequences:', len(raw_seqs), len(msa_seqs))
print(raw_seqs[:5])
lraw = np.array([len(s) for s in raw_seqs])
print('mean, std, max and min:', lraw.mean(), lraw.std(), lraw.max(), lraw.min())
print()

print('msa data')
# First, second and last entries of msa_seqs.
print(msa_seqs[0])
print(msa_seqs[1])
print(msa_seqs[-1])
lmsa = np.array([len(s) for s in msa_seqs])
print('mean, std, max and min:', lmsa.mean(), lmsa.std(), lmsa.max(), lmsa.min())

raw data
['MAELKLGYKASAEQFAPRELVELAVAAEAHGMDSATVSDHFQPWRHEGGHAPFSLAWMTAVGERTTRITLGTSVLTPTFRYNPAVVAQAFATMACLYPGRIFLGVGTGEALNEIATGYQGEWPEFKERFARLRESVRLMRELWRGDRVDFDGEYYRLKGASIYDVPDGGVPIYIAAGGPAVAKYAGRAGDGFICTSGKGEELYKDKLIPAVKEGAAINDRNVDDIDKMIEIKISYDPDPELALENTRFWAPLSLTAEQKHSIDDPIEMEKAADALPIEQVAKRWIVASDPDEAVAKVKDYVDWGLNHLVFHAPGHDQRRFLELFEKDLAPRLRRLG', 'MAELKLGYKASAEQFAPRELVELAVLAESAGMDSATVSDHFQPWRHEGGHAPFSLAWMTAVGERTKNLVLGTSVLTPTFRYNPAVIAQAFATMGCLYPGRIFLGVGTGEALNEIATGYAGEWPEFKERFARLRESVRLMRELWLGDRVDFDGEYYRTKGASIYDVPEGGIPVYIAAGGPVVAKYAGRAGDGFICTSGKGEELYAEKLIPAVKEGAAAADRDADAIDRMIEIKISYDTDPELALENTRFWAPLSLTAEQKHSIDDPIEMEKAADALPIEQVAKRWIVASDPDEAVEKVGQYVKWGLNHLVFHAPGHDQRRFLELFKRDLEPRLRKLA', 'MSLNMFWFLPTHGDGHYLGTEEGSRPVDHGYLQQIAQAADRLGYTGVLIPTGRSCEDAWLVAASMIPVTQRLKFLVALRPSVTSPTVAARQAATLDRLSNGRALFNLVTGSDPQELAGDGVFLDHSERYEASAEFTQVWRRLLLGETVDFNGKHIHVRGAKLLFPPIQQPYPPLYFGGSSDVAQELAAEQVDLYLTWGEPPELVKEKIEHVRAKAAAHGRKIRFGVRLHVIVRETNDEAWQAAERLISRLDDETIAKAQAAFARTDSVGQQRMAALHNGKRDNLEISPNLWAGVGLVRGGAGTALVGDG

And this is the data generator used by the original authors.

In [4]:
# Batch generator over the MSA sequences; padding=None is passed through as-is.
train_gen = one_hot_generator(msa_seqs, padding=None)

# Pull a single batch to inspect it. The None default mirrors a for-loop that
# simply does nothing when the generator yields no batches.
first_batch = next(iter(train_gen), None)
if first_batch is not None:
    x, y = first_batch
    # Shapes of the input and target arrays.
    print(x.shape, y.shape)
    # Vector at the first position of the first sequence, for input and target.
    print(x[0, 0,:], y[0, 0,:])

(32, 360, 21) (32, 360, 21)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


## Model & training

In [5]:
# Build the MSA-VAE with its default constructor arguments.
model = MSAVAE()

2022-05-21 03:39:43.031693: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-21 03:39:43.031733: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-21 03:39:43.031757: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (codespaces-b9df07): /proc/driver/nvidia/version does not exist
2022-05-21 03:39:43.032030: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Compile with Adam at its default settings. No loss is passed to compile();
# presumably MSAVAE defines its own training objective — verify in models.vaes.
model.compile(optimizer=keras.optimizers.Adam())

# Short demonstration run: 5 epochs of 20 generator batches each.
# Binding the returned History object to a name keeps the per-epoch logs
# available and stops its repr from being echoed as the cell output.
history = model.fit(train_gen, epochs=5,  steps_per_epoch=20)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f55a2a78ee0>

In [8]:
# Print the model's sub-models with their output shapes and parameter counts.
model.summary()

Model: "msavae_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (Functional)          [(None, 10),              2006548   
                              (None, 10)]                        
                                                                 
 model_1 (Functional)        (None, 360, 21)           2011528   
                                                                 
Total params: 4,018,084
Trainable params: 4,018,076
Non-trainable params: 8
_________________________________________________________________


## Generate luxA variants

Following the original repo, we can either remove or keep the gap characters (`-`) in the sequences generated by the MSA-VAE model.

In [9]:
# Generate a handful of luxA variants with the method's default options; the
# printed sequences below contain no '-' gap characters.
n_variants = 5
xx = model.generate_variants_luxA(n_variants)
print(xx)

(5, 10)
['MKFGLFGAELVELAKLAEELGFDSAWVGEHHFPSPFTVLAAAAARTKRIRLGTAVTVLPHPVRVAQDAATLDHLSGRELGIGRGEFAFRGARFEEALELLRRLWTFEGKFVPRPQHPPIWVAGGSLERAGRYGDGLLEFALVDLYRAAGFVADTDAAYGPVIEKLLDRLSEFAEEVAPLR', 'FGLFLLELVELAKLAEAGFDGVFVAEHHFPDPLTLLAALAAAERIGLGTTLSHPALVARRFATLDHLSGRLNIGTGERYERAEEFLEVLRKLWEGFPIWIAEAGRYGDGVYRARLLGEKAEPLR', 'MKFGLFGRERELVELAKLAEELGFDVAWVGEHHFSSPFTVLAAAAARTKRIRLGTAVTVLSDPVRVAEDFATLDHLSGRELGIGRGEFPLFYDYRELFEEKLELLRKLWRVTWEGKFVYPRPQGPPIWVAGGESAERAGRLGLGLLLAPEFALVDLYREAAAGHFVADDAAERYFGPEVIEKILEADRELRSIELAEEVAPLR', 'MKFGLFGAELVELAKLAEELGFDGAWVGEHHFPSPFTVLAAAAARTKRIRLGTAVTVLPHPVRVAEDFATLDHLSGRELGIGRGEFRGALFEEALELLRALWTFEGKFVPRPPPIWIAGGSAERAAEYGDGLAEFALVDLYREAAGFVADDAAYGPVIEKLLDRLEFAEEVAPLR', 'MKFGLFGRELVELAKLAEELGFDSFWVGEHHFPSPFTVLAAAAARTKRIRLGTAVTVLSDPVRVAEDAATLDLLSGRELGIGRGEFLFYRGALFREALELLRALWTVTFEGKFVPRPQHPPIWVAGGSAERAGRLGDGLLAPEFALVDLYREAAGFVADDAARYGPVIEKLLDRLSLELAEEVAPLR']


In [10]:
# Generate the same number of variants, this time with remove_gaps=False so
# the '-' gap characters stay in the printed sequences.
n_variants = 5
xx = model.generate_variants_luxA(n_variants, remove_gaps=False)
print(xx)

(5, 10)
['MKFGLF--G--------RERI-ELVEEAVLAEELGFDVAWVGEHHF------SSPFTVLAAAAARTKRIRLGTAVTVLS--DPVRVAEDFATLDHLS-GR-ELGIGRGF---EFPLF-YD---YRELFEEKLDLLRKLWR---VTWEGK-F------VYPRP-Q--GPPIWVAGG--ESAERAGRLGLGLLLAI--P--EFA-LVDLYREAAA--GH-----------H-FVAD-D--A-AE--YY-------F-----------------------------------------------G-P--VIEKLL---EA---DR--L---G-----E-LRSIE-LAEEVAP-LR---A----', 'MEFGLF--G--------RER---LVEEAVLAEELGFDVAWVGEHHF------SSPFTVLAAAAARTKRIRLGTAVTVLS--DPVRVAEDAATLDLLS-GR-ELGIGRGF---EFPLF-YD---YRELFEEKLELLRRLWT---VTWEGK-F------VYPRPVQ--GPPIWVAGG--ESVERAGRLGLPLLLA---PP-EFA-LVDLYREAAA--GH-----------HVFVADTD--A-AE-RRY-------F----------------------------------------A------G-PDEVIEKILRL-EA---DR----Q-------E-LRSIE-FAEEVAP-LR---A----', 'MKFGLF--G--------AE----LVELAKLAEELGFDSAWVGEHHF------PSPFVVLAAAAARTKRIRLGTAVTVLP--HPVRVAEDFATLDHLS-GR-ELGVGRG----EF--F------RGELFEEALELLRALWT-----FEGK-F------V-PRP-Q--HPPIWVAGG---SAERAGRLGDGLL-A------EFA-LVDLYREAA---G--------------FVAD-D--A-A----Y----------------