In [28]:

from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import os

import numpy as np
import torch
import numpy.typing as npt

from tqdm import tqdm
from transformers import BatchEncoding, PreTrainedTokenizerFast, BertForMaskedLM

from dataset import FastaDataset

In [49]:

from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import os

import numpy as np
import torch
import numpy.typing as npt

from tqdm import tqdm
from transformers import BatchEncoding, PreTrainedTokenizerFast, BertForMaskedLM

from dataset import FastaDataset

CODON_TO_CHAR = {
    "TCG": "A",
    "GCA": "B",
    "CTT": "C",
    "ATT": "D",
    "TTA": "E",
    "GGG": "F",
    "CGT": "G",
    "TAA": "H",
    "AAA": "I",
    "CTC": "J",
    "AGT": "K",
    "CCA": "L",
    "TGT": "M",
    "GCC": "N",
    "GTT": "O",
    "ATA": "P",
    "TAC": "Q",
    "TTT": "R",
    "TGC": "S",
    "CAC": "T",
    "ACG": "U",
    "CCC": "V",
    "ATC": "W",
    "CAT": "X",
    "AGA": "Y",
    "GAG": "Z",
    "GTG": "a",
    "GGT": "b",
    "GCT": "c",
    "TTC": "d",
    "AAC": "e",
    "TAT": "f",
    "GTA": "g",
    "CCG": "h",
    "ACA": "i",
    "CGA": "j",
    "TAG": "k",
    "CTG": "l",
    "GGA": "m",
    "ATG": "n",
    "TCT": "o",
    "CGG": "p",
    "GAT": "q",
    "ACC": "r",
    "GAC": "s",
    "GTC": "t",
    "TGG": "u",
    "CCT": "v",
    "GAA": "w",
    "TCA": "x",
    "CAA": "y",
    "AAT": "z",
    "ACT": "0",
    "GCG": "1",
    "GGC": "2",
    "CTA": "3",
    "AAG": "4",
    "AGG": "5",
    "CAG": "6",
    "AGC": "7",
    "CGC": "8",
    "TTG": "9",
    "TCC": "!",
    "TGA": "@",
    "XXX": "*",
}

# enter the fasta filepath to a fasta path:
fasta_path = "../data/datasets/sample_fasta.fasta"

# enter the checkpoint to the tokenizer:
tokenizer_path = "cpe_tokenizer_retrained_3000"


model_checkpoint = "checkpoints/bpe/cpe_tokenizer/bert/checkpoint-34000"

# ImportError: cannot import name 'GenSLMColatorForLanguageModeling' from 'dataset' (/home/couchbucks/Documents/saketh/cpe/cpe/dataset.py)
from transformers import BatchEncoding, DataCollatorForLanguageModeling
class GenSLMColatorForLanguageModeling(DataCollatorForLanguageModeling):
    """Augment the underlying DataCollatorForLanguageModeling to handle
    multiple batch encoding inputs."""

    def __init__(self, train_mode: bool = False, **kwargs) -> None:
        self.train_mode = train_mode
        super().__init__(**kwargs)

    def tokenize(self, sequences: List[str]) -> BatchEncoding:
        return self.tokenizer(
            sequences,
            return_tensors="pt",
            truncation=True,
            padding='max_length',
            return_special_tokens_mask=self.train_mode and self.mlm,
            max_length=1024)

    def torch_call(self, examples: List[str]) -> Dict[str, Any]:
        # First, tokenize the batch
        batch = self.tokenize(examples)
        
        # We only need to mask tokens if we are training
        if not self.train_mode:
            return batch

        if self.mlm:
            # If special token mask has been preprocessed, pop it from the dict.
            batch["input_ids"], batch["labels"] = self.torch_mask_tokens(
                batch["input_ids"],
                special_tokens_mask=batch.pop("special_tokens_mask", None),
            )
        else:
            labels = batch["input_ids"].clone()
            if self.tokenizer.pad_token_id is not None:
                labels[labels == self.tokenizer.pad_token_id] = -100
            batch["labels"] = labels
        return batch



In [32]:
from torch.utils.data import DataLoader

In [50]:


def generate_embeddings_and_logits(model, dataloader):
    embeddings = []
    lsoftmax = torch.nn.LogSoftmax(dim=1)
    with torch.no_grad():
        for batch in tqdm(dataloader):
            print(batch)
            batch = batch.to(model.device)
            outputs = model(**batch, output_hidden_states=True)
            last_hidden_states = outputs.hidden_states[-1].cpu().numpy()
        #    seq_lengths = batch.attention_mask.sum(axis=1)
            embeddings.append(last_hidden_states)
        
    return embeddings



def llm_inference(
    tokenizer_path: Path,
    model_path: Path,
    fasta_path: Path,
    return_codon: bool,
    return_aminoacid: bool,
    batch_size: int,
    fasta_contains_aminoacid: bool = False,
) -> Tuple[npt.ArrayLike, npt.ArrayLike, npt.ArrayLike]:
    
    
    if os.path.isfile(Path(tokenizer_path)):
        # These are for the .json files
        tokenizer = PreTrainedTokenizerFast.from_pretrained(
            pretrained_model_name_or_path=tokenizer_path
        )

    else:
        # These are for the bpe tokenizers
        tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_path)
        
    special_tokens = {
            "unk_token": "[UNK]",
            "cls_token": "[CLS]",
            "sep_token": "[SEP]",
            "pad_token": "[PAD]",
            "mask_token": "[MASK]",
            "bos_token": "[BOS]",
            "eos_token": "[EOS]",
        }

        # for some reason, we need to add the special tokens even though they are in the json file
    tokenizer.add_special_tokens(special_tokens)

    model = BertForMaskedLM.from_pretrained(model_checkpoint)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()

    if fasta_contains_aminoacid:
        dataset = FastaAminoAcidDataset(file_path=fasta_path)
    else:
        dataset = FastaDataset(
            file_path=fasta_path,
            num_char_per_token = 3,
            convert_to_aa = False,
            tokenizer_type = "cpe_tokenizer"
        )

    data_collator = GenSLMColatorForLanguageModeling(
        train_mode=False,
        tokenizer=tokenizer,
    )

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        num_workers=4,
        pin_memory=True,
    )

    embeddings = generate_embeddings_and_logits(model, dataloader)

    return embeddings

embeddings= llm_inference(
    tokenizer_path,
    model_checkpoint,
    fasta_path,
    return_codon = False,
    return_aminoacid = False,
    batch_size = 1,
    fasta_contains_aminoacid = False,
)


3


  0%|          | 0/3 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling pa

{'input_ids': tensor([[  5,  58, 543,  ...,   3,   3,   3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}


 33%|███▎      | 1/3 [00:01<00:02,  1.05s/it]

{'input_ids': tensor([[   5,  179, 1268,  ...,    3,    3,    3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}


 67%|██████▋   | 2/3 [00:01<00:00,  1.12it/s]

{'input_ids': tensor([[   5,  103, 1974,  ...,    3,    3,    3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}


100%|██████████| 3/3 [00:02<00:00,  1.12it/s]


In [45]:
tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_path)
tokenizer(
            "aaas;gq8934LAGQ#$Iawenfo;",
            return_tensors="pt",
            truncation=True,
            padding='max_length',
           # return_special_tokens_mask=self.train_mode and self.mlm,
            max_length=1024,
        )


{'input_ids': tensor([[  5,  97, 523,  ...,   3,   3,   3]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}

[array([[[-0.23334244,  1.3598535 , -1.421997  , ..., -0.08454225,
          -1.5931135 , -0.34243402],
         [-2.2037168 ,  0.06538261, -1.5209543 , ..., -1.1352464 ,
          -1.692257  , -0.98212147],
         [-0.8462671 ,  0.87533736, -1.2767893 , ..., -0.3343385 ,
          -1.6808283 , -0.3408133 ],
         ...,
         [-0.12490162,  0.22917472, -1.2925127 , ..., -0.38309816,
          -1.9702008 , -0.5219698 ],
         [ 0.52098024, -0.7124421 ,  0.72963756, ...,  1.1690844 ,
          -0.5265065 ,  0.02509315],
         [-0.6468866 ,  0.22047757, -1.2802352 , ...,  0.2552388 ,
          -2.201959  ,  0.05643922]]], dtype=float32),
 array([[[ 0.19624719,  0.9082316 , -1.5482137 , ..., -0.10049539,
          -1.6603034 , -0.4468241 ],
         [-1.7120521 ,  0.7479217 , -1.1822919 , ..., -0.08151136,
          -0.8333578 ,  0.11761697],
         [-0.28487328,  0.6458622 , -1.2428639 , ..., -0.09778947,
          -1.8639753 , -0.41054076],
         ...,
         [-0.32222

In [52]:
np.array(embeddings).shape

(3, 1, 1024, 240)