In [1]:
import itertools
import random
import math
import re

In [2]:
def mutate(base, sub_dict, versions=3, mut_per_ver=(1,1,1,1,2,2,2,2,3,3,3,4), max_length=150):
    """Yield randomly mutated, tandem-repeated variants of ``base``.

    For each variant a mutation count is drawn from ``mut_per_ver``; that many
    positions are then picked at random (with replacement, so the same position
    may be hit more than once) and replaced by a random entry of
    ``sub_dict[original_residue]``. The mutated unit is repeated as many whole
    times as fit into ``max_length``.

    Parameters
    ----------
    base : str
        The repeat unit to mutate.
    sub_dict : mapping
        Maps a residue to the collection of residues it may be replaced with.
        Residues that are missing from the mapping (or map to an empty
        collection) are never selected for mutation.
    versions : int
        Number of variants to yield.
    mut_per_ver : sequence of int
        Pool from which the number of mutation draws per variant is chosen;
        repeated values weight the choice.
    max_length : int
        Upper bound on the length of each yielded string.

    Yields
    ------
    str
        ``new_base * floor(max_length / len(new_base))``. This is the empty
        string when the unit is longer than ``max_length``.

    Notes
    -----
    An empty ``base`` yields nothing. If no position of ``base`` has a
    substitution available, the unmutated repeat is yielded ``versions`` times.
    """
    if not base:
        # Nothing to mutate and no way to compute a copy number.
        return

    # Restrict the draw to positions that can actually be substituted, so a
    # residue absent from sub_dict cannot trigger a KeyError mid-generation.
    mutable_positions = [
        i for i, residue in enumerate(base)
        if residue in sub_dict and sub_dict[residue]
    ]

    for _ in range(versions):
        residues = list(base)
        if mutable_positions:
            for _ in range(random.choice(mut_per_ver)):
                index = random.choice(mutable_positions)
                # Always substitute relative to the ORIGINAL residue, even if
                # this position was already changed earlier in this variant.
                residues[index] = random.choice(sub_dict[base[index]])
        new_base = ''.join(residues)
        copies = math.floor(max_length / len(new_base))
        yield new_base * copies


In [3]:
def load_fasta(fas_filename):
    """Read a FASTA file into a list of ``(header, sequence)`` tuples.

    Parameters
    ----------
    fas_filename : str
        Path of the FASTA file to read.

    Returns
    -------
    list of (str, str)
        One tuple per ``>`` header line, in file order. ``header`` is the
        header line without the leading ``>`` and without the line ending.
        ``sequence`` is the concatenation of every non-blank line between
        this header and the next one (or end of file), with surrounding
        whitespace removed from each line.

    Notes
    -----
    Lines before the first header are ignored. A header with no sequence
    lines yields an empty sequence string rather than raising.
    """
    fas_list = []
    header = None
    seq_parts = []

    with open(fas_filename, 'r') as fas_file:
        for line in fas_file:
            line = line.rstrip('\r\n')
            if line.startswith('>'):
                # A new record starts: flush the one collected so far.
                if header is not None:
                    fas_list.append((header, ''.join(seq_parts)))
                header = line[1:]
                seq_parts = []
            elif header is not None:
                chunk = line.strip()
                if chunk:
                    seq_parts.append(chunk)

    # Flush the final record, which has no following header to trigger it.
    if header is not None:
        fas_list.append((header, ''.join(seq_parts)))
    return fas_list

In [4]:
def determine_repeats(sequence, mute=1, max_repeat_length=0):
    """Estimate the repeat-unit length of ``sequence``.

    For every candidate length ``rep`` the sequence is cut into consecutive
    fragments of that length, all fragment pairs are compared position by
    position, and the fraction of identical positions is recorded. The
    candidate with the highest fraction wins; ties go to the shortest one.

    Parameters
    ----------
    sequence : str
        Sequence to analyse.
    mute : any
        When falsy, the fragments and scores of every candidate are printed.
        Any truthy value silences the output.
    max_repeat_length : int
        EXCLUSIVE upper bound on the candidate lengths; candidates start at 5.
        A falsy value (the default) means ``int(len(sequence) / 2)``.

    Returns
    -------
    int
        The best-scoring candidate repeat length.

    Raises
    ------
    ValueError
        If no candidate length can be scored, i.e. the sequence is too short
        or ``max_repeat_length`` is 5 or less.
    """
    seq_len = len(sequence)
    if not max_repeat_length:
        max_repeat_length = int(seq_len/2)

    # A candidate yields (seq_len + 1) // rep fragments and needs at least two
    # of them to be compared. Larger candidates would divide by zero when the
    # match fraction is computed, so they are dropped up front.
    largest_scorable = (seq_len + 1) // 2
    repeat_ranges = list(range(5, min(max_repeat_length, largest_scorable + 1)))
    if not repeat_ranges:
        raise ValueError(
            'no candidate repeat length can be scored for a sequence of '
            'length {0} (max_repeat_length={1})'.format(seq_len, max_repeat_length)
        )

    match_fraction = []
    for rep in repeat_ranges:
        # The fragment count is floor((seq_len + 1) / rep): a trailing partial
        # fragment is only kept when it is exactly one residue short.
        seq_frags = [sequence[r*rep:(r + 1)*rep] for r in range((seq_len + 1) // rep)]
        if not mute:
            print('\n'.join(seq_frags))

        total = 0
        match = 0
        for frag_a, frag_b in itertools.combinations(seq_frags, 2):
            # zip stops at the shorter fragment, as the comparison requires.
            for res_a, res_b in zip(frag_a, frag_b):
                total += 1
                if res_a == res_b:
                    match += 1

        fraction = match / total
        if not mute:
            print(fraction)
            print()
        match_fraction.append(fraction)

    maximum = max(match_fraction)
    best_index = match_fraction.index(maximum)
    if not mute:
        print(maximum)
        print(best_index)
    return repeat_ranges[best_index]

In [5]:
# Substitution table consumed by mutate(): each key is a one-letter residue
# code and the value lists the residues it may be replaced with. A letter that
# appears several times in a value is proportionally more likely to be drawn
# by random.choice.
#
# NOTE(review): there are no entries for 'R', 'H' or 'M'.
# NOTE(review): the values for 'G' and 'L' contain the key itself, so a
# "mutation" at those residues can leave the residue unchanged - confirm this
# is intended.
substitutions = dict(
    Q='EK',
    E='QK',
    K='QE',
    A='STV',
    S='TPGA',
    T='SPGA',
    G='GGSSPADNT',
    P='KGAS',
    N='DDQQGGAKS',
    D='NNQQGGAKS',
    C='VILAST',
    V='IIIIILLLLLFFAWYTS',
    I='VVVVVLLLLLFFAWYTS',
    L='VVVVVLIIIIFFAWYTS',
    F='YYYWWILV',
    Y='FFFWWILV',
    W='FFFYYYILV',
)

In [8]:
# Load the starting sequences as a list of (header, sequence) tuples.
# The path is relative, so it resolves against the kernel's working directory.
fasta_list = load_fasta('./example/initial_seq/selected.fas')

In [9]:
# Upper bound on the length of every generated sequence.
max_length = 200

# Flat FASTA-style output: a '>header' line followed by its sequence line.
mutated_sequences = []

# Sequences already emitted (for any input); used to drop duplicates.
seen = {}

# NOTE(review): `random` is not seeded in the visible cells, so the generated
# variants differ from run to run.
for name, sequence in fasta_list:
    # NOTE(review): 50 is passed positionally and therefore binds to `mute`,
    # not `max_repeat_length`; being truthy it only silences debug output.
    # If a 50-residue cap was intended it has to be passed by keyword.
    repeat_len = determine_repeats(sequence, 50)
    single_repeat = sequence[:repeat_len]

    # Version 0 is the unmutated repeat unit tiled up to max_length.
    og_seq = single_repeat * math.floor(max_length / len(single_repeat))
    if og_seq not in seen:
        seen[og_seq] = 1
        mutated_sequences.extend(['>{0}_MUT0'.format(name), og_seq])

    # Versions 1..80 are random mutants of the same repeat unit.
    for v, seq in enumerate(mutate(single_repeat,
                                   substitutions,
                                   versions=80,
                                   max_length=max_length)):
        if seq in seen:
            continue
        seen[seq] = 1
        mutated_sequences.extend(['>{0}_MUT{1}'.format(name,v+1), seq])

# Two list entries (header + sequence) per generated record.
print(len(mutated_sequences) // 2, 'sequences generated')

453 sequences generated


In [10]:
# Write the header/sequence lines one per line, ending with a final newline.
with open('./example/mutated_seq/mutated_sequences.fas', 'w') as output:
    output.write('\n'.join(mutated_sequences) + '\n')
    