In [None]:
from transformers import AutoModelForMaskedLM

# Load the pretrained CodonBERT masked-LM from a local directory.
# Download the CodonBERT_Sanofi model first and point this path at it.
codonbert_dir = '../CodonBERT_Sanofi/codonbert_models/codonbert'
model = AutoModelForMaskedLM.from_pretrained(codonbert_dir)


In [None]:
# Read sequences from a FASTA file
def read_fasta(filename):
    """Parse a FASTA file into a dict mapping header line -> sequence.

    Every line starting with '>' opens a new record. The stripped header line
    (including the leading '>') is used as the key, and all following lines up
    to the next header are stripped and concatenated into the record's sequence.

    Lines that appear before the first header belong to no record and are
    ignored. A header without any sequence lines is stored with an empty
    string, wherever it occurs in the file. If a header is repeated, the later
    record replaces the earlier one.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        dict: header line -> concatenated sequence, in file order.
    """
    sequences = {}
    sequence_name = None
    chunks = []  # pieces of the current record, joined once per record

    # Iterate over the file handle so the file is streamed line by line.
    with open(filename, 'r') as file:
        for line in file:
            if line.startswith('>'):
                if sequence_name is not None:  # close the previous record
                    sequences[sequence_name] = ''.join(chunks)
                sequence_name = line.strip()
                # Always start the new record from scratch so that text seen
                # before the first header cannot leak into it.
                chunks = []
            elif sequence_name is not None:
                chunks.append(line.strip())

    # Save the last record
    if sequence_name is not None:
        sequences[sequence_name] = ''.join(chunks)

    return sequences

fasta_file = 'exp_nature_gene.faa'
sequences = read_fasta(fasta_file)
# Use the first record of the file as the working sequence.
sequence = list(sequences.values())[0]
# To try a short hard-coded example instead, assign the sequence directly:
#sequence = 'AUGCCAAACACCCUGGCAUGCCCC'
# Format the sequence so it is compatible with CodonBERT: cut it into
# 3-nucleotide codons and separate the codons with spaces.
codon_starts = range(0, len(sequence), 3)
codons = [sequence[start:start + 3] for start in codon_starts]
spaced_sequence = ' '.join(codons)
print(spaced_sequence)

In [None]:
#Example: Get model outputs for a sequence with one masked Codon
from tokenizer import get_tokenizer
import torch
# Example input: one codon is replaced by the '[MASK]' token, the others are context.
seq = '[MASK] AAU GAU ACG GAA GCG AUC'
tokenizer = get_tokenizer()
input_ids = tokenizer.encode(seq).ids
input_ids = torch.tensor([input_ids], dtype=torch.int64)  # batch_size = 1
# Inference only: skip gradient tracking, as the masking loops later in this
# notebook already do.
with torch.no_grad():
    outputs = model(input_ids)
outputs

In [None]:
# Output logits have shape (batch size, number of tokens, vocab_size)
outputs.logits.shape

In [None]:
#Example: Predict the masked codon in example sequence and compare with real
import numpy as np
# Unmasked version of the example held in `seq`, used only to report the true
# codon. Looking the codon up in `spaced_sequence` would read whichever
# sequence was loaded last (e.g. the FASTA record), not this example.
# NOTE(review): keep this in sync with `seq` if the example is changed.
example_codons = 'AUG AAU GAU ACG GAA GCG AUC'.split()

seq_codons = seq.split()
assert len(seq_codons) == len(example_codons) and all(
    masked in ('[MASK]', real) for masked, real in zip(seq_codons, example_codons)
), "example_codons does not match the masked example in `seq`"

idx = seq_codons.index('[MASK]')
print(idx)

# Find the mask in the token ids instead of assuming a fixed offset between
# codon position and token position.
mask_position = input_ids[0].tolist().index(tokenizer.token_to_id('[MASK]'))
# .item() converts the 0-dim tensor to a plain int for the tokenizer.
predicted_id = torch.argmax(outputs.logits[0, mask_position]).item()

print('real: ', example_codons[idx])
print('predicted: ', tokenizer.decode([predicted_id]))


In [None]:
# Mask each codon one by one in a sequence and predict the masked codon.
# A prediction counts as correct only when it is exactly the original codon.
correct_predictions = 0
mask_token_id = tokenizer.token_to_id('[MASK]')  # loop-invariant, look it up once

for idx, masked_codon in enumerate(codons):
    masked_seq = ' '.join(codons[:idx] + ['[MASK]'] + codons[idx+1:])
    masked_input_ids = tokenizer.encode(masked_seq).ids
    masked_input_ids = torch.tensor([masked_input_ids], dtype=torch.int64)  # batch_size = 1

    with torch.no_grad():
        outputs = model(masked_input_ids)

    # Position of the masked token, found by searching the token ids so no
    # fixed offset between codon index and token index is assumed.
    mask_token_index = masked_input_ids[0].tolist().index(mask_token_id)
    # .item() converts the 0-dim tensor to a plain int for the tokenizer.
    predicted_token_id = torch.argmax(outputs.logits[0, mask_token_index]).item()
    predicted_codon = tokenizer.decode([predicted_token_id])

    print(f"Masked sequence: {masked_seq}")
    print(f"Real codon: {masked_codon}")
    print(f"Predicted codon: {predicted_codon}")

    if masked_codon == predicted_codon:
        correct_predictions += 1

total_codons = len(codons)
# An empty sequence has no defined accuracy; report nan instead of dividing by zero.
accuracy = correct_predictions / total_codons if total_codons else float('nan')

print(f"Total correct predictions: {correct_predictions}")
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Find the accuracy of predictions by comparing the real and predicted masked codons. Synonymous codons (same amino acid) count as correct.
from tokenizer import get_tokenizer
import torch
import numpy as np

# Short example sequence for this cell (`model` must already be loaded).
# Note: this rebinds `sequence`, `codons` and `spaced_sequence`.
sequence = 'AUGAAUGAUACGGAAGCGAUC'
codon_starts = range(0, len(sequence), 3)
codons = [sequence[start:start + 3] for start in codon_starts]
spaced_sequence = ' '.join(codons)
print('Original sequence:', spaced_sequence)

# Genetic code lookup: one-letter amino acid -> list of RNA codons encoding it.
# The three stop codons are grouped under the key 'X'.
codon_table = {
    'G': ['GGU', 'GGC', 'GGA', 'GGG'],
    'A': ['GCU', 'GCC', 'GCA', 'GCG'],
    'V': ['GUU', 'GUC', 'GUA', 'GUG'],
    'L': ['CUU', 'CUC', 'CUA', 'CUG', 'UUA', 'UUG'],
    'I': ['AUU', 'AUC', 'AUA'],
    'P': ['CCU', 'CCA', 'CCG', 'CCC'],
    'F': ['UUU', 'UUC'],
    'Y': ['UAU', 'UAC'],
    'W': ['UGG'],
    'S': ['UCU', 'UCA', 'UCC', 'UCG', 'AGU', 'AGC'],
    'T': ['ACU', 'ACC', 'ACG', 'ACA'],
    'M': ['AUG'],
    'C': ['UGU', 'UGC'],
    'N': ['AAU', 'AAC'],
    'Q': ['CAA', 'CAG'],
    'D': ['GAU', 'GAC'],
    'E': ['GAA', 'GAG'],
    'K': ['AAA', 'AAG'],
    'R': ['CGU', 'CGC', 'CGG', 'CGA', 'AGA', 'AGG'],
    'H': ['CAU', 'CAC'],
    'X': ['UAA', 'UAG', 'UGA']
}


def get_amino_acid(codon):
    """Return the `codon_table` key whose codon list contains `codon`.

    The match is exact (upper-case RNA triplets as written in the table).
    Returns None when the codon appears in none of the lists.
    """
    matches = (aa for aa, synonyms in codon_table.items() if codon in synonyms)
    return next(matches, None)

# Build the tokenizer; `model` is expected to exist already from an earlier cell.
tokenizer = get_tokenizer()
seq = ' '.join(codons)

# Token ids of the unmasked sequence, wrapped in a list so batch_size = 1
unmasked_encoding = tokenizer.encode(seq)
input_ids = torch.tensor([unmasked_encoding.ids], dtype=torch.int64)

# Mask each codon one by one and predict it. A prediction counts as correct
# when it encodes the same amino acid as the original codon.
correct_predictions = 0
mask_token_id = tokenizer.token_to_id('[MASK]')  # loop-invariant, look it up once

for idx, masked_codon in enumerate(codons):
    masked_seq = ' '.join(codons[:idx] + ['[MASK]'] + codons[idx+1:])
    masked_input_ids = tokenizer.encode(masked_seq).ids
    masked_input_ids = torch.tensor([masked_input_ids], dtype=torch.int64)  # batch_size = 1

    with torch.no_grad():
        outputs = model(masked_input_ids)

    # Position of the masked token, found by searching the token ids so no
    # fixed offset between codon index and token index is assumed.
    mask_token_index = masked_input_ids[0].tolist().index(mask_token_id)
    # .item() converts the 0-dim tensor to a plain int for the tokenizer.
    predicted_token_id = torch.argmax(outputs.logits[0, mask_token_index]).item()
    predicted_codon = tokenizer.decode([predicted_token_id])

    real_amino_acid = get_amino_acid(masked_codon)
    predicted_amino_acid = get_amino_acid(predicted_codon)

    print(f"Masked sequence: {masked_seq}")
    print(f"Real codon: {masked_codon} (Amino acid: {real_amino_acid})")
    print(f"Predicted codon: {predicted_codon} (Amino acid: {predicted_amino_acid})")

    # get_amino_acid returns None for anything outside the codon table; two
    # unknowns (None == None) must not be scored as a synonymous match.
    if real_amino_acid is not None and real_amino_acid == predicted_amino_acid:
        correct_predictions += 1

total_codons = len(codons)
# An empty sequence has no defined accuracy; report nan instead of dividing by zero.
accuracy = correct_predictions / total_codons if total_codons else float('nan')

print(f"Total correct predictions: {correct_predictions}")
print(f"Accuracy: {accuracy:.2f}")
