In [3]:
from dotenv import load_dotenv
import torch
import sys
import os


# Load environment variables through python-dotenv (originally noted as: reads the
# .env file in the same directory).
load_dotenv()

# Report which CUDA toolkit location, if any, the environment points to.
print(f"CUDA_HOME: {os.getenv('CUDA_HOME', 'Not set')}")


def check_cuda():
    """Print the PyTorch and Python versions plus details of the current CUDA device.

    Nothing is returned; the report goes to stdout. When CUDA is not available
    only the version lines and a "no devices" notice are printed.
    """
    print(f"PyTorch version: {torch.__version__}")
    print(f"Python version: {sys.version}")

    has_cuda = torch.cuda.is_available()
    print(f"\nCUDA available: {has_cuda}")

    # Guard clause: without CUDA there are no device properties to query.
    if not has_cuda:
        print("\nNo CUDA devices available")
        return

    dev_index = torch.cuda.current_device()
    props = torch.cuda.get_device_properties(dev_index)
    total_gib = props.total_memory / 1024**3  # bytes -> GiB

    print("\nCUDA Device Details:")
    print(f"  Device: {torch.cuda.get_device_name(dev_index)}")
    print(f"  Total memory: {total_gib:.2f} GB")
    print(f"  CUDA capability: {props.major}.{props.minor}")
    print(f"  Number of CUDA devices: {torch.cuda.device_count()}")


check_cuda()

CUDA_HOME: /run/current-system/sw
PyTorch version: 2.6.0+cu124
Python version: 3.12.9 (main, Feb 12 2025, 14:50:50) [Clang 19.1.6 ]

CUDA available: True

CUDA Device Details:
  Device: NVIDIA GeForce RTX 4060 Ti
  Total memory: 15.60 GB
  CUDA capability: 8.9
  Number of CUDA devices: 1


## model loading


In [4]:
# model can be downloaded from https://huggingface.co/lingxusb/megaDNA_updated/resolve/main/megaDNA_phage_145M.pt
model_path = "./checkpoints/megaDNA_phage_145M.pt"  # path to the downloaded checkpoint

# Use the GPU when PyTorch can see one and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA. Set this to "cpu" or "cuda" by hand
# to force a specific device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(security): weights_only=False unpickles the complete model object, which can
# execute arbitrary code embedded in the file. Only load checkpoints obtained from a
# source you trust.
model = torch.load(model_path, map_location=torch.device(device), weights_only=False)

# Switch to evaluation mode. As the last expression of the cell, the returned module
# is also displayed, showing the model structure.
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


MEGADNA(
  (start_tokens): ParameterList(
      (0): Parameter containing: [torch.float32 of size 512 (cuda:0)]
      (1): Parameter containing: [torch.float32 of size 256 (cuda:0)]
      (2): Parameter containing: [torch.float32 of size 196 (cuda:0)]
  )
  (token_embs): ModuleList(
    (0): Embedding(6, 196)
    (1): Sequential(
      (0): Embedding(6, 196)
      (1): Rearrange('... r d -> ... (r d)')
      (2): LayerNorm((3136,), eps=1e-05, elementwise_affine=True)
      (3): Linear(in_features=3136, out_features=256, bias=True)
      (4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
    (2): Sequential(
      (0): Embedding(6, 196)
      (1): Rearrange('... r d -> ... (r d)')
      (2): LayerNorm((200704,), eps=1e-05, elementwise_affine=True)
      (3): Linear(in_features=200704, out_features=512, bias=True)
      (4): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
  )
  (transformers): ModuleList(
    (0): Transformer(
      (layers): ModuleList(
       

## sequence generation


In [2]:
import numpy as np

# Vocabulary lookup table: a token id is the position of its symbol in this list.
nucleotides = ["**", "A", "T", "C", "G", "#"]


def token2nucleotide(s):
    """Return the vocabulary symbol stored at index ``s`` of ``nucleotides``."""
    symbol = nucleotides[s]
    return symbol


# Length of the random DNA primer the model is given to start from.
PRIME_LENGTH = 4

# Number of generation runs.
num_seq = 2

# Maximal length for the generated sequence (upper limit for the model is 131K).
context_length = 10_000

In [None]:
# Generate `num_seq` sequences and write each one to its own FASTA file
# (generate_1.fna, generate_2.fna, ...).
#
# The model loaded in the "model loading" cell is reused for every run. It is
# deliberately NOT reloaded from disk or deleted inside the loop: reloading the same
# checkpoint each iteration only costs time, and `del model` would remove the global
# `model` that the later embedding / loss cells need, making them fail with a
# NameError on "Restart & Run All".
model.eval()  # make sure the model is in evaluation mode before generating

for j in range(num_seq):
    # Set the random DNA primer: PRIME_LENGTH token ids drawn from 1..4, with a
    # leading batch dimension added by the [None,] index.
    primer_sequence = (
        torch.tensor(np.random.choice(np.arange(1, 5), PRIME_LENGTH)).long().to(device)[None,]
    )
    # .tolist() yields plain Python ints, which index the vocabulary directly.
    primer_DNA = "".join(map(token2nucleotide, primer_sequence[0].tolist()))
    print(f"Primer sequence: {primer_DNA}\n{'*' * 100}")

    # Generate a sequence using the model
    seq_tokenized = model.generate(
        primer_sequence, seq_len=context_length, temperature=0.95, filter_thres=0.0
    )
    # Decode token ids to symbols. Converting once with .tolist() avoids iterating
    # over thousands of 0-d tensors and produces the same string.
    generated_sequence = "".join(map(token2nucleotide, seq_tokenized.squeeze().cpu().tolist()))

    # Split the generated sequence into contigs at the '#' character
    contigs = generated_sequence.split("#")

    # Write the non-empty contigs to a .fna file. `idx` is the contig's position in
    # the split result, so numbering may skip values where empty contigs were dropped.
    output_file_path = f"generate_{1 + j}.fna"
    with open(output_file_path, "w") as file:
        for idx, contig in enumerate(contigs):
            if len(contig) > 0:
                file.write(f">contig_{idx}\n{contig}\n")

    # Clean up per-run tensors (including the generated token tensor) so that
    # empty_cache() can actually return their memory; the model itself is kept.
    del primer_sequence, seq_tokenized, generated_sequence
    torch.cuda.empty_cache()

## mutagenesis


## embedding and loss


In [6]:
import numpy as np

In [13]:
# a random input sequence
encoded_sequence = np.random.choice(np.arange(1, 5), 100)
input_seq = torch.tensor(encoded_sequence).unsqueeze(0).to(device)

# get embeddings
embeddings = model(input_seq, return_value="embedding")
print(embeddings)

[tensor([[[ 0.0109, -0.0855,  0.0778,  ..., -0.0298,  0.0041,  0.0096],
         [-0.0152,  0.8417, -0.6062,  ...,  0.1078,  0.3091, -0.2047]]],
       device='cuda:0', grad_fn=<MulBackward0>), tensor([[[-0.6653,  0.2984,  0.3873,  ...,  0.0366,  0.0596, -0.0057],
         [-0.8001,  0.3894,  0.3046,  ..., -0.3062,  0.0349,  0.4216],
         [-0.6077,  0.2663,  0.5808,  ..., -0.1424, -0.1995, -0.0598],
         ...,
         [-0.4043,  0.2778,  0.5528,  ...,  0.1454, -0.2260,  0.9171],
         [-0.4392,  0.2235,  0.5886,  ...,  0.1115, -0.2188,  0.9542],
         [-0.5363,  0.2499,  0.6020,  ...,  0.0849, -0.2252,  0.9662]]],
       device='cuda:0', grad_fn=<MulBackward0>), tensor([[[ 8.5990e-02,  9.8310e-02, -7.9113e-03,  ...,  5.1405e-01,
           2.1096e-01, -9.6927e-03],
         [ 4.5448e-02,  2.6922e-02,  2.8464e-02,  ...,  1.2249e-02,
          -9.8482e-02,  6.3418e-02],
         [ 2.6853e-03,  2.7896e-02, -5.7082e-02,  ...,  1.2375e-02,
          -2.2628e-02,  6.4354e-02],


  self.gen = func(*args, **kwds)


In [14]:
# Note on the previous cell: embeddings[0:3] stores the embeddings from the three transformer layers.

# Compute the model loss for `input_seq`; the same forward call is used as for the
# embeddings, with `return_value` selecting the loss instead.
loss = model(
    input_seq,
    return_value="loss",
)

print(loss)

tensor(1.4495, device='cuda:0', grad_fn=<NllLoss2DBackward0>)
