In [1]:
from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()

# Verify the path is set
print("TRITON_LIBCUDA_PATH:", os.getenv("TRITON_LIBCUDA_PATH"))

TRITON_LIBCUDA_PATH: /nix/store/z8ac4sgxc4h86zfmlz7yi0kkv95wgz84-graphics-drivers/lib


In [2]:
import torch
from evo2 import Evo2

evo2_model = Evo2("evo2_1b_base")

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 25/25 [00:00<00:00, 314.46it/s]


Extra keys in state_dict: {'blocks.3.mixer.dense._extra_state', 'blocks.17.mixer.dense._extra_state', 'unembed.weight', 'blocks.24.mixer.dense._extra_state', 'blocks.10.mixer.dense._extra_state'}


In [None]:
# example

sequence = "ACGTAG"
input_ids = (
    torch.tensor(
        evo2_model.tokenizer.tokenize(sequence),
        dtype=torch.int,
    )
    .unsqueeze(0)
    .to("cuda:0")
)

layer_name = "blocks.24.mlp.l3"

outputs, embeddings = evo2_model(input_ids, return_embeddings=True, layer_names=[layer_name])

print("Embeddings shape: ", embeddings[layer_name].shape)

In [3]:
from Bio import SeqIO
import numpy as np
import pandas as pd
from tqdm import tqdm

In [4]:
seq_subset_30k = list(SeqIO.parse("../dataset/1Jan2025_genomes_subset_30k.fa", "fasta"))

In [7]:
# Process all sequences in the FASTA file and extract embeddings

# Create lists to store results and failed sequences
results = []
failed_sequences = []

# Process each sequence
for record in tqdm(seq_subset_30k):
    try:
        # Extract sequence and header
        sequence = str(record.seq)
        header = record.description

        # Make sure GPU memory is cleared before processing
        torch.cuda.empty_cache()

        # Tokenize and get embeddings
        input_ids = (
            torch.tensor(
                evo2_model.tokenizer.tokenize(sequence),
                dtype=torch.int,
            )
            .unsqueeze(0)
            .to("cuda:0")
        )

        layer_name = "blocks.24.mlp.l3"

        # Get embeddings with explicit dtype to avoid BFloat16 issues
        with torch.amp.autocast("cuda", enabled=False):
            outputs, embeddings = evo2_model(
                input_ids, return_embeddings=True, layer_names=[layer_name]
            )

        # Extract the embeddings tensor and ensure it's float32
        embedding_tensor = embeddings[layer_name].to(torch.float32)

        # Average over the sequence length dimension to get a 1920-dim vector
        # Shape goes from [1, n, 1920] to [1, 1920] to [1920]
        avg_embedding = embedding_tensor.mean(dim=1).squeeze().cpu().numpy()

        # Store results
        results.append({"header": header, "embedding": avg_embedding})

        # Clear GPU cache to free memory
        torch.cuda.empty_cache()

    except Exception as e:
        print(f"Error processing sequence {header}: {e}")
        # Record the failed sequence
        failed_sequences.append(header)
        # Force GPU memory cleanup
        torch.cuda.empty_cache()
        continue

# Extract headers
headers = [r["header"] for r in results]

# Create a numpy array of all embeddings
embeddings_array = np.stack([r["embedding"] for r in results])

print(f"Processed {len(results)} sequences")
print(f"Embeddings array shape: {embeddings_array.shape}")
print(f"Number of headers: {len(headers)}")

# Save embeddings and headers with model-specific names
np.save("../results/embeddings_evo2.npy", embeddings_array)
pd.DataFrame({"header": headers}).to_csv("../results/headers_evo2.csv", index=False)

# Save failed sequences
if failed_sequences:
    print(f"Failed to process {len(failed_sequences)} sequences")
    pd.DataFrame({"header": failed_sequences}).to_csv(
        "../results/failed_sequences_evo2.csv", index=False
    )

 58%|█████▊    | 1938/3316 [28:50<54:33,  2.38s/it]  

Error processing sequence PP599996 Yersinia phage vB_YpM_MHG7, complete genome.: CUDA out of memory. Tried to allocate 3.40 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.63 GiB is free. Including non-PyTorch memory, this process has 12.96 GiB memory in use. Of the allocated memory 6.72 GiB is allocated by PyTorch, and 4.08 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 58%|█████▊    | 1939/3316 [28:51<48:58,  2.13s/it]

Error processing sequence PP599995 Yersinia phage vB_YpM_MHG54, complete genome.: CUDA out of memory. Tried to allocate 3.40 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.63 GiB is free. Including non-PyTorch memory, this process has 12.96 GiB memory in use. Of the allocated memory 6.72 GiB is allocated by PyTorch, and 4.08 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 59%|█████▊    | 1940/3316 [28:53<45:26,  1.98s/it]

Error processing sequence PP599994 Yersinia phage vB_YpM_MHG39, complete genome.: CUDA out of memory. Tried to allocate 3.38 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.66 GiB is free. Including non-PyTorch memory, this process has 12.93 GiB memory in use. Of the allocated memory 6.70 GiB is allocated by PyTorch, and 4.06 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 59%|█████▊    | 1946/3316 [29:08<53:59,  2.36s/it]  

Error processing sequence PP537607 Yersinia phage vB_YpM_MHG101, complete genome.: CUDA out of memory. Tried to allocate 3.41 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.87 GiB is free. Including non-PyTorch memory, this process has 12.72 GiB memory in use. Of the allocated memory 6.60 GiB is allocated by PyTorch, and 3.96 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 59%|█████▉    | 1964/3316 [29:44<19:33,  1.15it/s]  

Error processing sequence PP516625 Yersinia phage vB_YpM_MHG38, complete genome.: CUDA out of memory. Tried to allocate 3.41 GiB. GPU 0 has a total capacity of 15.60 GiB of which 3.39 GiB is free. Including non-PyTorch memory, this process has 12.20 GiB memory in use. Of the allocated memory 6.43 GiB is allocated by PyTorch, and 3.62 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 59%|█████▉    | 1966/3316 [29:51<49:48,  2.21s/it]

Error processing sequence PP516623 Yersinia phage vB_YpM_MDG94-186, complete genome.: CUDA out of memory. Tried to allocate 3.38 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.68 GiB is free. Including non-PyTorch memory, this process has 12.91 GiB memory in use. Of the allocated memory 6.70 GiB is allocated by PyTorch, and 4.06 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 64%|██████▍   | 2131/3316 [36:53<1:01:46,  3.13s/it]

Error processing sequence NC_070916 Yersinia phage vB_YenM_06.16-2, complete genome.: CUDA out of memory. Tried to allocate 3.38 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.78 GiB is free. Including non-PyTorch memory, this process has 12.81 GiB memory in use. Of the allocated memory 6.69 GiB is allocated by PyTorch, and 4.04 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 64%|██████▍   | 2132/3316 [36:54<49:55,  2.53s/it]  

Error processing sequence NC_070911 Salmonella phage BIS20, complete genome.: CUDA out of memory. Tried to allocate 3.38 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.79 GiB is free. Including non-PyTorch memory, this process has 12.80 GiB memory in use. Of the allocated memory 6.68 GiB is allocated by PyTorch, and 4.04 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 65%|██████▌   | 2158/3316 [38:23<55:10,  2.86s/it]  

Error processing sequence NC_049392 Escherichia phage ESSI2_ev239 genome assembly, chromosome: 1.: CUDA out of memory. Tried to allocate 3.34 GiB. GPU 0 has a total capacity of 15.60 GiB of which 3.34 GiB is free. Including non-PyTorch memory, this process has 12.26 GiB memory in use. Of the allocated memory 6.51 GiB is allocated by PyTorch, and 3.68 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 67%|██████▋   | 2238/3316 [42:24<54:11,  3.02s/it]  

Error processing sequence NC_029003 Salmonella phage SEN1, complete genome.: CUDA out of memory. Tried to allocate 3.40 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.66 GiB is free. Including non-PyTorch memory, this process has 12.93 GiB memory in use. Of the allocated memory 6.71 GiB is allocated by PyTorch, and 4.17 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 68%|██████▊   | 2254/3316 [43:28<1:00:42,  3.43s/it]

Error processing sequence NC_019932 Erwinia phage ENT90, complete genome.: CUDA out of memory. Tried to allocate 3.38 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.69 GiB is free. Including non-PyTorch memory, this process has 12.90 GiB memory in use. Of the allocated memory 6.68 GiB is allocated by PyTorch, and 4.17 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 75%|███████▌  | 2488/3316 [54:44<31:47,  2.30s/it]  

Error processing sequence LR597637 Escherichia phage ESSI2_ev239 genome assembly, chromosome: 1.: CUDA out of memory. Tried to allocate 3.34 GiB. GPU 0 has a total capacity of 15.60 GiB of which 3.19 GiB is free. Including non-PyTorch memory, this process has 12.40 GiB memory in use. Of the allocated memory 6.52 GiB is allocated by PyTorch, and 3.68 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 80%|████████  | 2655/3316 [1:00:40<26:26,  2.40s/it]

Error processing sequence KT630644 Salmonella phage SEN1, complete genome.: CUDA out of memory. Tried to allocate 3.40 GiB. GPU 0 has a total capacity of 15.60 GiB of which 3.01 GiB is free. Including non-PyTorch memory, this process has 12.58 GiB memory in use. Of the allocated memory 6.58 GiB is allocated by PyTorch, and 3.74 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 91%|█████████ | 3010/3316 [1:05:42<12:48,  2.51s/it]

Error processing sequence KP282674 Acidianus bottle-shaped virus 3 strain ABV3, complete genome.: CUDA out of memory. Tried to allocate 3.26 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.83 GiB is free. Including non-PyTorch memory, this process has 12.76 GiB memory in use. Of the allocated memory 6.54 GiB is allocated by PyTorch, and 3.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 92%|█████████▏| 3043/3316 [1:06:56<10:41,  2.35s/it]

Error processing sequence KC139521 Salmonella phage FSL SP-004, complete genome.: CUDA out of memory. Tried to allocate 3.40 GiB. GPU 0 has a total capacity of 15.60 GiB of which 2.27 GiB is free. Including non-PyTorch memory, this process has 13.32 GiB memory in use. Of the allocated memory 6.69 GiB is allocated by PyTorch, and 4.35 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


 92%|█████████▏| 3049/3316 [1:07:06<06:56,  1.56s/it]

Error processing sequence HQ110084 Erwinia phage ENT90, complete genome.: CUDA out of memory. Tried to allocate 3.38 GiB. GPU 0 has a total capacity of 15.60 GiB of which 3.33 GiB is free. Including non-PyTorch memory, this process has 12.27 GiB memory in use. Of the allocated memory 6.39 GiB is allocated by PyTorch, and 3.60 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


100%|██████████| 3316/3316 [1:12:44<00:00,  1.32s/it]

Processed 3300 sequences
Embeddings array shape: (3300, 1920)
Number of headers: 3300
Failed to process 16 sequences



