In [None]:
#Copyright R. R. Syahdi, 2025

In [None]:
# Install Biopython if you haven't already
%pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import random


In [None]:
# The 20 standard amino acids, in the order replacements are sampled from.
_STANDARD_AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"

# Residues treated as "similar" to one another. Every standard amino acid
# appears in exactly one group; P and C are alone in theirs.
_SIMILAR_GROUPS = (
    ('A', 'G', 'S', 'T'),
    ('V', 'L', 'I', 'M'),
    ('F', 'Y', 'W'),
    ('P',),
    ('D', 'E'),
    ('N', 'Q'),
    ('K', 'R', 'H'),
    ('C',),
)


def _similarity_group(amino_acid):
    """Return the similarity group containing ``amino_acid``, or ``()`` if there is none."""
    for group in _SIMILAR_GROUPS:
        if amino_acid in group:
            return group
    return ()


def _pick_similar(amino_acid, rng):
    """Pick another member of ``amino_acid``'s group, or None when no alternative exists."""
    options = [aa for aa in _similarity_group(amino_acid) if aa != amino_acid]
    return rng.choice(options) if options else None


def _pick_dissimilar(amino_acid, rng):
    """Pick a standard amino acid from outside ``amino_acid``'s similarity group."""
    excluded = set(_similarity_group(amino_acid))
    options = [aa for aa in _STANDARD_AMINO_ACIDS if aa not in excluded]
    return rng.choice(options)


def mutate_fasta(
    input_fasta,
    output_fasta="mutated_sequences.fasta",
    mutation_fraction=0.20,
    similar_weight=0.10,
    dissimilar_weight=0.90,
    random_seed=42
):
    """
    Mutate the first amino acid sequence of a FASTA file and save the result.

    Only the first record of ``input_fasta`` is read; any further records are
    ignored. ``int(len(sequence) * mutation_fraction)`` distinct positions are
    chosen at random and each one is replaced by a different residue:

    * a "similar" substitution picks another member of the residue's
      similarity group (see ``_SIMILAR_GROUPS``);
    * a "dissimilar" substitution picks one of the 20 standard amino acids
      from outside that group.

    The kind of substitution is drawn per position with the relative weights
    ``similar_weight`` / ``dissimilar_weight``. When a similar substitution is
    impossible (the group has a single member, or the residue is not one of
    the 20 standard amino acids) a dissimilar one is used instead.

    Parameters
    ----------
    input_fasta : path or handle accepted by ``Bio.SeqIO.parse``
        FASTA file to read.
    output_fasta : str
        Path of the FASTA file to write. The single output record has the id
        ``mutated_sequence`` and a description recording the mutation
        percentage and the seed.
    mutation_fraction : float
        Fraction of positions to mutate, between 0 and 1 inclusive. The
        resulting count is truncated toward zero.
    similar_weight, dissimilar_weight : float
        Relative weights passed to ``random.choices`` for choosing the
        substitution kind.
    random_seed : int or None
        Seed for a private random number generator. The same seed and input
        reproduce the same output; the module-level ``random`` state is not
        modified.

    Returns
    -------
    None
        The result is written to ``output_fasta`` and a two-line summary is
        printed.

    Raises
    ------
    ValueError
        If ``mutation_fraction`` is outside ``[0, 1]`` or the FASTA file
        contains no record.
    """

    # Imported lazily so the function stays self-contained when it is copied
    # out of the notebook into a module.
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    import random

    # Fail early with a clear message instead of an opaque error from
    # random.sample ("Sample larger than population or is negative").
    if not 0 <= mutation_fraction <= 1:
        raise ValueError(
            f"mutation_fraction must be between 0 and 1, got {mutation_fraction!r}"
        )

    first_record = next(iter(SeqIO.parse(input_fasta, "fasta")), None)
    if first_record is None:
        raise ValueError("No sequence found in FASTA file.")
    original_sequence = str(first_record.seq)

    # A private generator yields the same draws as random.seed(random_seed)
    # followed by module-level calls, without reseeding the global RNG that
    # the rest of the notebook may rely on.
    rng = random.Random(random_seed)

    sequence_list = list(original_sequence)
    num_mutations = int(len(sequence_list) * mutation_fraction)
    mutation_positions = rng.sample(range(len(sequence_list)), num_mutations)

    for pos in mutation_positions:
        original = sequence_list[pos]
        mutation_type = rng.choices(
            ['similar', 'dissimilar'],
            weights=[similar_weight, dissimilar_weight],
            k=1
        )[0]

        new_aa = _pick_similar(original, rng) if mutation_type == 'similar' else None
        if new_aa is None:
            # Either a dissimilar mutation was drawn, or no similar residue
            # exists for this position.
            new_aa = _pick_dissimilar(original, rng)

        # Both pickers exclude the original residue, so new_aa always differs
        # from it and no retry loop is needed.
        sequence_list[pos] = new_aa

    mutated_sequence = "".join(sequence_list)

    mutated_record = SeqRecord(
        Seq(mutated_sequence),
        id="mutated_sequence",
        description=f"{round(mutation_fraction*100)}% mutations, seed={random_seed}"
    )
    with open(output_fasta, "w") as handle:
        SeqIO.write([mutated_record], handle, "fasta")

    print(f"Mutated sequence saved to {output_fasta}")
    print(f"Original length: {len(original_sequence)} | Mutations: {num_mutations}")


In [None]:
# Mutate 70% of the positions of the first sequence in the 5NN8 FASTA file,
# using the function's default seed and similar/dissimilar weights.
mutate_fasta(
    input_fasta="rcsb_pdb_5NN8.fasta",
    output_fasta="mutated_5NN8.fasta",
    mutation_fraction=0.7,
)

✅ Mutated sequence saved to mutated_5NN8.fasta
Original length: 872 | Mutations: 610
