# Sequence Generation and Alignment Analysis with Evo2
This notebook demonstrates how to generate biological sequences using the Evo2 model and analyze them using Biopython alignments.

## Setup and Dependencies

First, let's import the required libraries and set up our environment. Note that you need Jupyter installed to run this notebook.


In [4]:
import argparse
import csv
import os
from pathlib import Path
from typing import List, Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.Seq import Seq

from evo2 import Evo2
from evo2.utils import make_phylotag_from_gbif

# Set random seeds for reproducibility.
# A single constant keeps every library seeded with the same value.
SEED = 42
np.random.seed(SEED)              # NumPy's legacy global RNG
torch.manual_seed(SEED)           # PyTorch default generator
torch.cuda.manual_seed_all(SEED)  # every CUDA device; silently ignored without CUDA



## Model Initialization
Let's initialize our Evo2 model. We'll use the 7B parameter version as a default.

In [5]:
# Checkpoint identifier handed to the Evo2 constructor.
model_name = "evo2_7b"

# Constructing Evo2 loads the checkpoint; expect download/progress logs below.
model = Evo2(model_name)

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 32263.88it/s]




Found complete file in repo: evo2_7b.pt


100%|██████████| 32/32 [00:00<00:00, 180.96it/s]


Extra keys in state_dict: {'blocks.2.mixer.mixer.filter.t', 'blocks.16.mixer.mixer.filter.t', 'blocks.20.mixer.mixer.filter.t', 'blocks.9.mixer.mixer.filter.t', 'blocks.27.mixer.mixer.filter.t', 'blocks.17.mixer.dense._extra_state', 'blocks.31.mixer.attn._extra_state', 'blocks.24.mixer.dense._extra_state', 'blocks.17.mixer.attn._extra_state', 'blocks.13.mixer.mixer.filter.t', 'blocks.10.mixer.attn._extra_state', 'blocks.10.mixer.dense._extra_state', 'blocks.30.mixer.mixer.filter.t', 'blocks.31.mixer.dense._extra_state', 'blocks.24.mixer.attn._extra_state', 'blocks.3.mixer.dense._extra_state', 'blocks.3.mixer.attn._extra_state', 'unembed.weight', 'blocks.6.mixer.mixer.filter.t', 'blocks.23.mixer.mixer.filter.t'}


  state = torch.load(state, map_location="cuda")
  return torch_load(state, map_location=device)


## Data Loading
Next, we'll define a function to load our example sequences.


In [23]:
def read_sequences(input_file: Path) -> Tuple[List[str], List[str]]:
    """
    Read sequences and their optional names from a CSV file.

    The first row is treated as a header and skipped. For every following
    non-empty row, column 0 is collected as a sequence and column 1 (when
    present) is collected as that sequence's name.

    Expected CSV format (the header text itself is not interpreted):
    sequence,name
    ACGTACGT,example_name
    ...

    Args:
        input_file: Location of the CSV file; passed straight to ``open``.

    Returns:
        A ``(sequences, names)`` tuple. ``names`` only receives an entry for
        rows that actually have a second column, so it can be shorter than
        ``sequences`` when the name column is missing on some rows.
    """
    input_seqs: List[str] = []
    names: List[str] = []

    # 'utf-8-sig' drops a leading byte-order mark; newline='' lets the csv
    # module do its own newline handling.
    with open(input_file, encoding='utf-8-sig', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header. The default value keeps a completely empty file
        # from raising StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines; indexing them would raise
            # IndexError, so skip them.
            if not row:
                continue
            input_seqs.append(row[0])
            if len(row) > 1:
                names.append(row[1])

    return input_seqs, names

# Load example data
PROMPT_LEN = 500  # leading bases handed to the model as the prompt
TARGET_LEN = 500  # following bases held out as the reference continuation

sequences, names = read_sequences(Path('../../vortex/test/data/prompts.csv'))

# For 'autocomplete', we split the data into input and target sequences
input_seqs = [seq[:PROMPT_LEN] for seq in sequences]
target_seqs = [seq[PROMPT_LEN:PROMPT_LEN + TARGET_LEN] for seq in sequences]

# Slicing never fails on short strings, so a sequence shorter than
# PROMPT_LEN + TARGET_LEN would silently yield a truncated (or empty) target.
# Surface that instead of letting it skew the similarity numbers later.
short_seq_indices = [
    idx for idx, seq in enumerate(sequences)
    if len(seq) < PROMPT_LEN + TARGET_LEN
]
if short_seq_indices:
    print(
        f"Warning: sequences at indices {short_seq_indices} are shorter than "
        f"{PROMPT_LEN + TARGET_LEN} bases; their targets are truncated"
    )

print(f"Loaded {len(sequences)} sequence pairs")

Loaded 4 sequence pairs


### Now it's time to generate!

In [24]:
# Ask the model for a 500-token continuation of every prompt.
generations = model.generate(input_seqs, n_tokens=500, temperature=1.0)

# Keep the generated strings around for the alignment step and show them.
generated_seqs = generations.sequences
print(generated_seqs)

Initializing inference params with max_seqlen=1000
Prompt: "GAATAGGAACAGCTCCGGTCTACAGCTCCCAGCGTGAGCGACGCAGAAGACGGTGATTTCTGCATTTCCATCTGAGGTACCGGGTTCATCTCACTAGGGAGTGCCAGACAGTGGGCGCAGGCCAGTGTGTGTGCGCACCGTGCGCGAGCCGAAGCAGGGCGAGGCATTGCCTCACCTGGGAAGCGCAAGGGGTCAGGGAGTTCCCTTTCCGAGTCAAAGAAAGGGGTGATGGACGCACCTGGAAAATCGGGTCACTCCCACCCGAATATTGCGCTTTTCAGACCGGCTTAAGAAACGGCGCACCACGAGACTATATCCCACACCTGGCTCAGAGGGTCCTACGCCCACGGAATCTCGCTGATTGCTAGCACAGCAGTCTGAGATCAAACTGCAAGGCGGCAACGAGGCTGGGGGAGGGGCGCCCGCCATTGCCCAGGCTTGCTTAGGTAAACAAAGCAGCCGGGAAGCTCGAACTGGGTGGAGCCCACCACAGCTCAAGG",	Output: "AGGCCTGCCTGCCTCTGTAGGCTCCACCTCCGGGGGAAGGGCACAGCCCAACAAAAGGCGGCAGACACCTCTGCAGACTTAAATGTCCCTGTCTGACAGCTTTGAAGAGAGCAGTGGTTCTCCTAGCACGCAGCTGGAGATCTGAGAACGGGCAGACTGCCTCCTCAAGTGGGTCCCTGACCCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCAGCAGGGGCACACTGACACCTCACACGGCAGGGTATTCCAACAGACCTGCAGCTGAGGATCCTGTCTGCAAGACAGCTTAGGCCCTACAACAGTCTTGCAGCCACCTCTACTGATGTAGGAAAGCCTGCCTGCCTCTGTAGGCTCCACCTCTGGGAGCAGGGCATAGACAAACAAAAAGAGGCAGCAGCAGCCTCAGCAGACAGAAACC

## Alignment Analysis
### Let's analyze our generated sequences using Biopython's alignment tools.

In [28]:
def analyze_alignments(generated_seqs: List[str],
                       target_seqs: List[str],
                       names: Optional[List[str]] = None
                      ) -> List[dict]:
    """
    Analyze and visualize alignments between generated and target sequences.

    Each generated sequence is globally aligned to the target at the same
    position (match=2, mismatch=-1, gap open=-0.5, gap extend=-0.1). The
    alignment and its summary numbers are printed, and the numbers are also
    collected into a dict per pair.

    Pairs are formed with ``zip``, so if the two lists differ in length the
    surplus entries of the longer list are ignored.

    Args:
        generated_seqs: List of generated sequences
        target_seqs: List of target sequences
        names: Optional list of sequence names; ``names[i]`` labels pair ``i``
            when it exists.

    Returns:
        List of alignment metrics for each sequence pair. Every dict has:
            'similarity': identical aligned positions as a percentage of the
                target length,
            'score': alignment score,
            'length': length of the target sequence,
            'gaps': total '-' characters in both aligned strings,
            'name': the pair's name (only when one is available).
        If either sequence of a pair is empty, no alignment is attempted and
        that pair gets similarity 0.0, score 0.0 and gaps 0.
    """
    metrics = []
    print("\nSequence Alignments:")

    for i, (gen_seq, target_seq) in enumerate(zip(generated_seqs, target_seqs)):
        has_name = bool(names) and i < len(names)
        if has_name:
            print(f"\nAlignment {i+1} ({names[i]}):")
        else:
            print(f"\nAlignment {i+1}:")

        # An empty sequence cannot be aligned, and an empty target would also
        # make the similarity division below divide by zero. Skip the aligner
        # entirely for such pairs.
        alignments = []
        if gen_seq and target_seq:
            alignments = pairwise2.align.globalms(
                Seq(gen_seq), Seq(target_seq),
                match=2,
                mismatch=-1,
                open=-0.5,
                extend=-0.1,
                # Only the first alignment is used below, so do not recover
                # every co-optimal traceback.
                one_alignment_only=True,
            )

        if alignments:
            best_alignment = alignments[0]
            print(format_alignment(*best_alignment))

            aligned_gen, aligned_target = best_alignment[0], best_alignment[1]
            # Count identical columns, ignoring any column that holds a gap.
            matches = sum(a == b for a, b in zip(aligned_gen, aligned_target)
                          if a != '-' and b != '-')
            similarity = (matches / len(target_seq)) * 100
            score = best_alignment[2]
            gaps = aligned_gen.count('-') + aligned_target.count('-')
        else:
            print("No alignment computed (empty sequence).")
            similarity, score, gaps = 0.0, 0.0, 0

        seq_metrics = {
            'similarity': similarity,
            'score': score,
            'length': len(target_seq),
            'gaps': gaps
        }

        if has_name:
            seq_metrics['name'] = names[i]

        metrics.append(seq_metrics)

        print(f"Sequence similarity: {similarity:.2f}%")
        print(f"Alignment score: {score:.2f}")

    return metrics

# Compare every generated continuation with its held-out target.
alignment_metrics = analyze_alignments(
    generated_seqs=generated_seqs,
    target_seqs=target_seqs,
    names=names,
)


Sequence Alignments:

Alignment 1 (L1RE2):
AGGCCTGCCTGCCTCTGTAGGCTCCACCTCC-GGGGGA-AGGGCACAGCC-CAA-CAAAA-GGCG-GCAG-ACACCTCTGCAGACTTAAA-TGTCCCTGTCTGACAGCTTTGAAGAGAGCAGTGGTTCTCCT-AGCACGCAGCTGGAGATCTGAGAACGGGCAGACTGCCTCCTCAAGTGGGTCCCTGACCCCTGACCCCCGAGCAGCCTAACTGGGAGGCACCCCCCAGCAGGGGCACACTGACACCTCACACGGCAGGGTATTCCAACAGACCTGCAGCTGAGGA-TCCTGTCTGCA--AGACAG-----CTTAGG-C--CCTAC-AACAGTCTTGCAGCCACCTCTACTGAT--GTAGGAAAGCCTGCCTGCC-TCTGTAGGC-TCCACC-TC-TGGG---AG-C----AGGGCATAG--ACAAACA-A-AAAGA-GGCAG-------CAGCAGCCTCAGCAGACA------GAAAC-C---ATACCGCCT-G-GCAGC-T-T--TG------AAGAGA--GCAGTGGATC-TC-C---CAACACGG-----AGGT-TGAGATCTGAGAACGGACA-GAC---
||||||||||||||||||||||||||||| | |||||  |||||||||   ||| ||||| |||  |||| | |||||||||||||| || ||||||||||||||||||||||||||||||||||||||||  |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||  |||||||||    ||| ||     | ||   |  || |  || ||    | |  ||  |||||  |   | |  ||| |  

## Generate with species prompt

In [11]:
species = 'Phascolarctos cinereus'  # Koala

# Build the species tag used as the prompt (looked up through GBIF).
species_tag_prompt = make_phylotag_from_gbif(species)

print(f"Species tag prompt: {species_tag_prompt}") # Check if the GBIF API returned a valid species tag!

# Generate a sequence conditioned on the species tag
koala_sequence = model.generate(
    [species_tag_prompt],
    n_tokens=500,
    temperature=1.0,
)

print("Koala sequence:")
print(koala_sequence.sequences[0])

Species tag prompt: |D__ANIMALIA;P__CHORDATA;C__MAMMALIA;O__DIPROTODONTIA;F__PHASCOLARCTIDAE;G__PHASCOLARCTOS;S__PHASCOLARCTOS CINEREUS|
Initializing inference params with max_seqlen=616
Prompt: "|D__ANIMALIA;P__CHORDATA;C__MAMMALIA;O__DIPROTODONTIA;F__PHASCOLARCTIDAE;G__PHASCOLARCTOS;S__PHASCOLARCTOS CINEREUS|",	Output: "TAGTACCCCGTCCAATATTCGGAAAACGAGAACTGGACGAACTGAACTTACTTCTTTGTTGATGCACGGGAAGGATCTTCAGCTTATCACCGTCGCGTCGATCAAGTTACTGACTCACAATTCTTCTTTCTCTTCGAGGTCCTTTTCTAGATTTGTAAAGTTACGTTAGGTATTAATATCTACCGCATGTTCCGTCCAAAGTAAACGCTCCCCCTAACTGCATTATATTAAGCCGAACCGAACGAAGTTGCGCAGAAACTATGAACGTTTCCGTATTTGCGGAAGATATCTCTCAACTCTCTGCAAACTGAAATAAGCCAGTGAATATAACAATGAACGTTTCCGTATTTGCGGAAGATATCTCTCAACTCTCTGCAAACTGAAATAAGCCAGTGAATATAACAATAGAAAACCTTCGCACCTTACATTCGCGTCCATTAGGTATGCAGGCAGTTCGGCCGGGCCGAAGAATAAGAAGCCACCCCAACTCTGCAAAAAAA",	Score: -1.2092068195343018
Koala sequence:
TAGTACCCCGTCCAATATTCGGAAAACGAGAACTGGACGAACTGAACTTACTTCTTTGTTGATGCACGGGAAGGATCTTCAGCTTATCACCGTCGCGTCGATCAAGTTACTGACTCACAATTCTTCTTTC