In [1]:
from dotenv import load_dotenv
import os

# Read key/value pairs from the local .env file into the process environment
load_dotenv()

# Echo the Triton CUDA library location so a missing setting shows up as None
print("TRITON_LIBCUDA_PATH:", os.environ.get("TRITON_LIBCUDA_PATH"))

TRITON_LIBCUDA_PATH: /nix/store/z8ac4sgxc4h86zfmlz7yi0kkv95wgz84-graphics-drivers/lib


In [2]:
import torch
from evo2 import Evo2

# Instantiate Evo2 from the "evo2_1b_base" checkpoint name. The resulting
# `evo2_model` object (model call + `.tokenizer`) is reused by the cells below
# that compute embeddings.
# NOTE(review): those cells move their inputs to "cuda:0", so the constructor
# presumably places the model on that device — confirm.
evo2_model = Evo2("evo2_1b_base")

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 25/25 [00:00<00:00, 235.37it/s]


Extra keys in state_dict: {'blocks.17.mixer.dense._extra_state', 'blocks.3.mixer.dense._extra_state', 'unembed.weight', 'blocks.24.mixer.dense._extra_state', 'blocks.10.mixer.dense._extra_state'}


In [None]:
# Minimal example: embed one short DNA string and inspect the result shape.

sequence = "ACGTAG"
layer_name = "blocks.24.mlp.l3"

# Tokenize, add a leading batch axis, then move the ids onto the first GPU
token_ids = evo2_model.tokenizer.tokenize(sequence)
input_ids = torch.tensor(token_ids, dtype=torch.int).unsqueeze(0).to("cuda:0")

# Ask the model to hand back the activations of the requested layer as well
outputs, embeddings = evo2_model(
    input_ids,
    return_embeddings=True,
    layer_names=[layer_name],
)

print("Embeddings shape: ", embeddings[layer_name].shape)

In [3]:
from Bio import SeqIO
import numpy as np
import pandas as pd
from tqdm import tqdm

In [6]:
# Read every record of the FASTA file into memory up front. Materializing the
# parser output as a list gives it a length (so tqdm can show a total) and
# lets later cells iterate over it repeatedly.
seq_matrix = list(SeqIO.parse("../dataset/matrix_genomes_11k.fasta", "fasta"))

In [7]:
# Process all sequences in the FASTA file and extract one mean-pooled
# embedding vector per sequence, then persist embeddings, headers and the
# list of sequences that could not be processed.

# Layer whose activations are pooled. It never changes, so it is set once
# here instead of on every loop iteration.
layer_name = "blocks.24.mlp.l3"

# Make sure the output directory exists *before* the long-running loop, so a
# missing folder cannot throw away the computed embeddings at save time.
os.makedirs("../results", exist_ok=True)

# Accumulators for successful results and for headers of failed records
results = []
failed_sequences = []

# Start from a clean allocator cache
torch.cuda.empty_cache()

# Process each sequence
for record in tqdm(seq_matrix):
    # Bind the header before anything that can raise, so the except branch
    # always reports the record that actually failed (never a stale header
    # from the previous iteration, and never an unbound name).
    header = record.description

    try:
        sequence = str(record.seq)

        # Tokenize, add a batch axis and move the ids to the GPU
        input_ids = (
            torch.tensor(
                evo2_model.tokenizer.tokenize(sequence),
                dtype=torch.int,
            )
            .unsqueeze(0)
            .to("cuda:0")
        )

        # Autocast is disabled to avoid BFloat16 issues. no_grad makes it
        # explicit that this is inference only (harmless if the model already
        # disables gradient tracking internally).
        with torch.no_grad(), torch.amp.autocast("cuda", enabled=False):
            outputs, embeddings = evo2_model(
                input_ids, return_embeddings=True, layer_names=[layer_name]
            )

        # Extract the embeddings tensor and ensure it's float32
        embedding_tensor = embeddings[layer_name].to(torch.float32)

        # Average over the sequence-length dimension:
        # [1, n, hidden] -> [1, hidden] -> [hidden]
        avg_embedding = embedding_tensor.mean(dim=1).squeeze().cpu().numpy()

        results.append({"header": header, "embedding": avg_embedding})

    except Exception as e:
        # Best-effort: log, remember the failed record, and keep going
        print(f"Error processing sequence {header}: {e}")
        failed_sequences.append(header)

    finally:
        # Release cached GPU memory after every record, success or failure
        torch.cuda.empty_cache()

# Save failed sequences first so the list is persisted even when nothing
# could be embedded at all.
if failed_sequences:
    print(f"Failed to process {len(failed_sequences)} sequences")
    pd.DataFrame({"header": failed_sequences}).to_csv(
        "../results/failed_sequences_evo2_matrix_small.csv", index=False
    )

# np.stack([]) would raise an opaque "need at least one array" error; fail
# with an explicit message instead.
if not results:
    raise ValueError("No sequences were embedded successfully; nothing to save.")

# Extract headers
headers = [r["header"] for r in results]

# Create a numpy array of all embeddings, one row per processed sequence
embeddings_array = np.stack([r["embedding"] for r in results])

print(f"Processed {len(results)} sequences")
print(f"Embeddings array shape: {embeddings_array.shape}")
print(f"Number of headers: {len(headers)}")

# Save embeddings and headers with model-specific names; row i of the array
# corresponds to row i of the headers CSV.
np.save("../results/embeddings_evo2_matrix_small.npy", embeddings_array)
pd.DataFrame({"header": headers}).to_csv("../results/headers_evo2_matrix_small.csv", index=False)

100%|██████████| 1024/1024 [30:16<00:00,  1.77s/it]

Processed 1024 sequences
Embeddings array shape: (1024, 1920)
Number of headers: 1024



