# Creating and searching against vector databases with TM-Vec

To form protein databases that are easily stored using vector embeddings, we will:
1. Feed ProtTrans embeddings into our TM-Vec model
2. Generate a DB of protein vectors
3. Search against our DB and plot the results


__Refer to protein_embedding_datasets.ipynb for further explanation of functions in utils.py__

In [None]:
from utils import load_protein_t5_embedding, read_fasta_file
from tqdm import tqdm
import argparse
import torch

In [None]:
# Embed a protein using tm_vec (takes as input a prottrans embedding)
def embed_tm_vec(prottrans_embedding, model_deep, device):
    """Run a ProtTrans embedding through the TM-Vec model and return a NumPy array.

    Args:
        prottrans_embedding: torch tensor with at least two dimensions; its
            first two dimensions size the padding mask
            (presumably (batch, sequence_length, ...) -- TODO confirm with caller).
        model_deep: callable invoked as
            ``model_deep(embedding, src_mask=None, src_key_padding_mask=mask)``.
        device: torch device on which the padding mask is allocated. It should
            be the device holding ``prottrans_embedding`` and ``model_deep``;
            ``None`` falls back to torch's default device.

    Returns:
        The model output, detached from the autograd graph, moved to the CPU
        and converted to a ``numpy.ndarray``.
    """
    # All-False mask: no position is flagged as padding. It is created
    # directly on `device` so it lives alongside the embedding and the model
    # instead of always being allocated on the CPU.
    padding = torch.zeros(
        prottrans_embedding.shape[0:2], dtype=torch.bool, device=device
    )
    tm_vec_embedding = model_deep(
        prottrans_embedding, src_mask=None, src_key_padding_mask=padding
    )

    return tm_vec_embedding.detach().cpu().numpy()

In [None]:
# Notebook configuration, expressed as argparse options.
# Edit the `default=` values below to change the run settings.
parser = argparse.ArgumentParser()
parser.add_argument("--n_sequences", default=20, type=int)
parser.add_argument(
    "--model_name", default="Rostlab/prot_t5_xl_uniref50", type=str
)
parser.add_argument(
    "--tokenizer_name", default="Rostlab/prot_t5_xl_uniref50", type=str
)
# An empty argument list means every option takes its default value.
args = parser.parse_args([])

# Read sequences from bagel.fa; args.n_sequences is passed to the reader.
sequence_list = read_fasta_file("bagel.fa", args.n_sequences)
print("Accepted protein sequences: ", sequence_list, "\n")

# One entry per sequence, in the order of sequence_list.
embed_list = []

# Embed each sequence in turn.
for sequence in tqdm(sequence_list):
    print("Accepted protein sequence: ", sequence, "\n")
    test_embed = load_protein_t5_embedding(
        sequence,
        args.model_name,
        args.tokenizer_name,
    ).numpy()
    # reshape embeddings to fit the skbio format:
    # keep the first axis and flatten all remaining axes into one
    leading_dim = test_embed.shape[0]
    embed_list.append(test_embed.reshape(leading_dim, -1))