## Generate artificial data with implanted motifs

In [13]:
import numpy as np
import os
import pandas as pd
from Bio import SeqIO
import random

#### set motif parameters

In [14]:
# File-mode switch read by write_seqs: True -> append ('a'), False -> overwrite ('w').
append = False

# Candidate strings for the first and the second motif core; one entry of
# each list is picked at random for every positive sequence.
core1 = ['CCC']
core2 = ['GGG']

l = 3    # core length, used when bounding the insertion position
d = 4    # mean of the normal distribution the core-to-core gap is drawn from
sig = 2  # standard deviation of that distribution

alphabet = list('ACGT')  # symbols the random background is drawn from
#alphabet2 = ['A']       # (disabled) single-letter alternative

seq_length = 40
no_sequences = 6000  # generated for EACH class (positive and negative)

#### generate sequences

In [15]:
def _implant_core(bases, start, core):
    """Overwrite ``bases`` (a list of characters) with ``core`` from ``start``.

    Positions outside the sequence are skipped, so a core that does not fit
    is truncated rather than raising.
    """
    for offset, base in enumerate(core):
        idx = start + offset
        # The lower bound matters: a negative list index would not fail, it
        # would silently wrap around and write near the END of the sequence.
        if 0 <= idx < len(bases):
            bases[idx] = base


def generate_sequences(no_sequences):
    """Generate random positive and negative sequences; implant a motif in the positives.

    Both sets consist of ``no_sequences`` strings of ``seq_length`` symbols
    drawn uniformly from ``alphabet``. Every positive sequence then receives
    one two-part motif: a core picked from ``core1``, a gap, and a core
    picked from ``core2``. The gap is drawn from a normal distribution with
    mean ``d`` and standard deviation ``sig``, truncated to an integer and
    clamped to be non-negative.

    The settings ``alphabet``, ``seq_length``, ``l``, ``d``, ``sig``,
    ``core1`` and ``core2`` are read from module level.

    Args:
        no_sequences: number of sequences to generate for each class.

    Returns:
        A tuple ``(positive_sequences, negative_sequences)`` of two lists of
        strings.
    """
    no_motifs = no_sequences

    # generate random background sequences
    positive_sequences = [
        ''.join(random.choice(alphabet) for _ in range(seq_length))
        for _ in range(no_sequences)
    ]
    negative_sequences = [
        ''.join(random.choice(alphabet) for _ in range(seq_length))
        for _ in range(no_sequences)
    ]

    # assume one motif per sequence; select sequences for motif insertion
    seqs_to_insert = random.sample(range(no_sequences), no_motifs)

    for i in seqs_to_insert:
        ins = random.choice(range(seq_length - l * 2 - d))  # insertion position

        # The normal draw can be negative. A negative gap would place core 2
        # on top of (or before) core 1, and for small `ins` would index from
        # the end of the sequence. Clamp so core 2 never starts before the
        # end of core 1.
        dist = max(0, int(np.random.normal(d, sig, 1)[0]))

        c1 = random.choice(core1)  # select core 1
        c2 = random.choice(core2)  # select core 2

        bases = list(positive_sequences[i])  # strings are immutable
        _implant_core(bases, ins, c1)
        # A large gap may push core 2 past the end of the sequence; the part
        # that does not fit is dropped.
        _implant_core(bases, ins + len(c1) + dist, c2)

        positive_sequences[i] = ''.join(bases)

    return positive_sequences, negative_sequences

#### save sequences as fasta files

In [16]:
def _write_fasta(path, sequences, mode):
    """Write ``sequences`` to ``path`` as '> header N' / sequence line pairs.

    ``mode`` is the file mode handed to ``open`` ('w' or 'a'). Header
    numbers start at 1 for each call.
    """
    with open(path, mode) as f:
        for number, seq in enumerate(sequences, start=1):
            print('> header', number, file=f)
            print(seq, file=f)


def write_seqs(positive_sequences, negative_sequences, file_pos, file_neg):
    """Save the positive and negative sequences as two FASTA-style files.

    The module-level ``append`` flag selects the file mode: append when it
    is true, overwrite otherwise. The same mode is used for BOTH files so
    that repeated appending runs keep the two files in step (previously
    only the positives honoured the flag and the negatives were always
    overwritten).

    Args:
        positive_sequences: sequences written to ``file_pos``.
        negative_sequences: sequences written to ``file_neg``.
        file_pos: output path for the positive sequences.
        file_neg: output path for the negative sequences.
    """
    mode = 'a' if append else 'w'
    _write_fasta(file_pos, positive_sequences, mode)
    _write_fasta(file_neg, negative_sequences, mode)

#### generate dataset

In [17]:
# Build the toy dataset and write each class to its own FASTA file.
write_seqs(
    *generate_sequences(no_sequences),
    file_pos='positives_toy.fasta',
    file_neg='negatives_toy.fasta',
)