In [2]:
# This script takes an input FASTA file and writes a FASTA file in which each input sequence is mutated at a user-provided rate.
# The mutation rate ranges from 0 to 1. A value of 0.5 means that 50% of the nucleotides in each sequence will be mutated to a different base.

import random
from Bio import SeqIO

print("Loaded libraries!")

Loaded libraries!


In [20]:
# This function mutates a fraction (mutation_rate) of the positions in a given sequence, changing each selected A/C/G/U base to a different nucleotide.

def mutate_dna(dna_string, mutation_rate, alphabet="ACGU"):
    """Return a copy of ``dna_string`` with a fraction of its bases substituted.

    ``int(mutation_rate * len(dna_string))`` positions are drawn without
    replacement, and each one is replaced by a different, randomly chosen
    symbol from ``alphabet``. Only positions that already hold a symbol of
    ``alphabet`` are eligible, so other characters (ambiguity codes, gaps, ...)
    are never modified and never use up part of the mutation budget. If fewer
    eligible positions exist than requested, all of them are mutated.

    Matching is case-sensitive; upper-case the sequence first if needed.

    Args:
        dna_string: Sequence to mutate.
        mutation_rate: Fraction of positions to mutate, between 0 and 1.
        alphabet: Symbols that may be mutated and used as replacements.
            Defaults to the RNA alphabet ``"ACGU"``; pass ``"ACGT"`` for DNA.

    Returns:
        The mutated sequence, as a string of the same length as the input.

    Raises:
        ValueError: If ``mutation_rate`` is outside ``[0, 1]`` (including NaN)
            or ``alphabet`` has fewer than two distinct symbols.
    """
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= mutation_rate <= 1:
        raise ValueError("Mutation rate must be between 0 and 1.")

    # De-duplicate while keeping order, then pre-compute for every symbol the
    # symbols it may turn into (everything in the alphabet except itself).
    symbols = list(dict.fromkeys(alphabet))
    if len(symbols) < 2:
        raise ValueError("Alphabet must contain at least two distinct symbols.")
    replacements = {
        base: [other for other in symbols if other != base] for base in symbols
    }

    dna_list = list(dna_string)
    num_mutations = int(mutation_rate * len(dna_list))

    # Sample only among positions that can actually be mutated; sampling over
    # the whole sequence and skipping the rest would silently lower the rate.
    mutable_indices = [
        index for index, base in enumerate(dna_list) if base in replacements
    ]
    num_mutations = min(num_mutations, len(mutable_indices))

    for index in random.sample(mutable_indices, num_mutations):
        dna_list[index] = random.choice(replacements[dna_list[index]])

    return "".join(dna_list)  # Convert back to a string

In [22]:
# Path of the input FASTA file; a bare filename, so it is resolved against the current working directory.
f = "SILVA_138.2_SSURef_NR99_tax_silva_filtered.fasta"

In [28]:
mutation_rate = 0.01

# Build the output name by swapping only a trailing ".fasta" extension.
# str.replace() would rewrite every ".fasta" occurrence in the path and, for an
# input without that extension, return the path unchanged -- the output would
# then be the input file itself, which open(..., "w") truncates before reading.
fasta_ext = ".fasta"
mutated_ext = f".mutated_{mutation_rate}.fasta"
if f.endswith(fasta_ext):
    out_file = f[: -len(fasta_ext)] + mutated_ext
else:
    out_file = f + mutated_ext

with open(out_file, "w") as out:
    for record_index, record in enumerate(SeqIO.parse(f, "fasta")):
        header = str(record.description)
        # Upper-case first: mutate_dna only recognises upper-case bases.
        sequence = str(record.seq).upper()
        mutated_seq = mutate_dna(sequence, mutation_rate)
        out.write(f">{header}\n{mutated_seq}\n")
        # Overwrite the same output line with the index of the last record written.
        print(record_index, end="\r")

397475