## Generate artificial data with implanted motifs

In [1]:
import numpy as np
import os
import pandas as pd
from Bio import SeqIO
import random
import itertools

#### set motif parameters

In [2]:
# --- output behaviour ---
# When True, the positive output file is opened in append mode instead of
# being overwritten.
append = False

# --- alphabet ---
alphabet = list('ACGT')
# every 3-mer over the alphabet (4**3 = 64 strings), in lexicographic order
random_3mers = list(map(''.join, itertools.product(alphabet, repeat=3)))

# --- motif definition ---
# The implanted motif is "core1 - gap - core2"; one core is picked at random
# from each list.
core1 = ['TAG']
core2 = ['ACT']
l = 3           # length of a core (both cores are 3-mers)
# parameters of the negative binomial distribution the gap length is drawn from
r, p = 40, 0.8

#alphabet2 = ['A']

# --- dataset size ---
seq_length = 40          # length of every generated sequence
no_sequences = 15000     # number of sequences per set (positive and negative)
ratio_with_motif = 1     # fraction of positive sequences that receive a motif

#### generate sequences

In [3]:
def generate_sequences(no_sequences):
    """Generate random positive/negative sequences and implant a gapped motif.

    Two lists of ``no_sequences`` random strings of length ``seq_length`` are
    drawn over ``alphabet``.  ``int(no_sequences * ratio_with_motif)`` of the
    positive sequences then receive exactly one motif: a core chosen from
    ``core1``, a gap of ``dist`` untouched bases, and a core chosen from
    ``core2``.  ``dist`` is drawn from a negative binomial distribution with
    parameters ``r`` and ``p``.  Negative sequences are left fully random.

    Every setting other than ``no_sequences`` is read from module-level
    variables.  Randomness comes from both ``random`` and ``np.random``, so
    both generators have to be seeded to get reproducible output.

    Args:
        no_sequences: Number of sequences to generate for each of the two sets.

    Returns:
        A tuple ``(positive_sequences, negative_sequences)`` of two lists of
        strings, each of length ``no_sequences``.
    """
    no_motifs = int(no_sequences * ratio_with_motif)

    # generate random sequences
    positive_sequences = [
        ''.join(random.choice(alphabet) for _ in range(seq_length))
        for _ in range(no_sequences)
    ]
    negative_sequences = [
        ''.join(random.choice(alphabet) for _ in range(seq_length))
        for _ in range(no_sequences)
    ]

    # assume one motif per sequence; select sequences for motif insertion
    seqs_to_insert = random.sample(range(no_sequences), no_motifs)

    for i in seqs_to_insert:
        dist = int(np.random.negative_binomial(r, p, 1)[0])  # gap between cores
        c1 = random.choice(core1)    # select core 1
        c2 = random.choice(core2)    # select core 2

        # Valid start positions are 0 .. seq_length - motif_span inclusive, so
        # the motif may also end exactly on the last base.  The span is derived
        # from the chosen cores, so cores of differing lengths are placed
        # correctly.
        motif_span = len(c1) + dist + len(c2)
        # If the motif is longer than the sequence, fall back to position 0;
        # the bounds checks below then truncate it.
        ins = random.randrange(max(1, seq_length - motif_span + 1))

        seq = list(positive_sequences[i])   # convert to list for item assignment

        for offset, base in enumerate(c1):  # insert core 1
            if ins + offset < len(seq):
                seq[ins + offset] = base

        core2_start = ins + len(c1) + dist
        for offset, base in enumerate(c2):  # insert core 2
            if core2_start + offset < len(seq):
                seq[core2_start + offset] = base

        positive_sequences[i] = ''.join(seq)

    return positive_sequences, negative_sequences

#### save sequences as fasta or fastq files

In [4]:
def write_seqs(positive_sequences, negative_sequences, file_pos, file_neg):
    """Write both sequence sets as FASTA-style records to the given paths.

    Each record is a ``> header <n>`` line (``n`` counts from 1 within the
    call) followed by the sequence on its own line.

    Args:
        positive_sequences: Sequences written to ``file_pos``.
        negative_sequences: Sequences written to ``file_neg``.
        file_pos: Path of the positive file; opened in append mode when the
            module-level ``append`` flag is true, otherwise overwritten.
        file_neg: Path of the negative file; always overwritten.
    """
    # NOTE(review): only the positive file honours `append`; the negative
    # file is truncated on every call -- confirm this asymmetry is intended.
    pos_mode = 'a' if append else 'w'

    # positives
    with open(file_pos, pos_mode) as out:
        out.writelines(
            f'> header {number}\n{sequence}\n'
            for number, sequence in enumerate(positive_sequences, start=1)
        )

    # negatives
    with open(file_neg, 'w') as out:
        out.writelines(
            f'> header {number}\n{sequence}\n'
            for number, sequence in enumerate(negative_sequences, start=1)
        )

In [5]:
def write_seqs_fastq(positive_sequences, negative_sequences, file_pos, file_neg):
    """Write both sequence sets as FASTQ files.

    ``.fastq`` is appended to both file stems.  Each record consists of four
    lines: ``@header <n>`` (``n`` counts from 1 within the call), the
    sequence, a ``+`` separator, and a quality string made of one ``~`` per
    base.

    Args:
        positive_sequences: Sequences written to ``file_pos + '.fastq'``.
        negative_sequences: Sequences written to ``file_neg + '.fastq'``.
        file_pos: Stem of the positive file; opened in append mode when the
            module-level ``append`` flag is true, otherwise overwritten.
        file_neg: Stem of the negative file; always overwritten.
    """
    # NOTE(review): only the positive file honours `append`; the negative
    # file is truncated on every call -- confirm this asymmetry is intended.
    pos_mode = 'a' if append else 'w'

    # positives
    with open(file_pos + '.fastq', pos_mode) as out:
        out.writelines(
            f'@header {number}\n{sequence}\n+\n{"~" * len(sequence)}\n'
            for number, sequence in enumerate(positive_sequences, start=1)
        )

    # negatives
    with open(file_neg + '.fastq', 'w') as out:
        out.writelines(
            f'@header {number}\n{sequence}\n+\n{"~" * len(sequence)}\n'
            for number, sequence in enumerate(negative_sequences, start=1)
        )
            
def write_seqs_fasta(positive_sequences, negative_sequences, file_pos, file_neg):
    """Write both sequence sets as FASTA files.

    ``.fasta`` is appended to both file stems.  Each record is a
    ``>header <n>`` line (``n`` counts from 1 within the call) followed by
    the sequence on its own line.

    Args:
        positive_sequences: Sequences written to ``file_pos + '.fasta'``.
        negative_sequences: Sequences written to ``file_neg + '.fasta'``.
        file_pos: Stem of the positive file; opened in append mode when the
            module-level ``append`` flag is true, otherwise overwritten.
        file_neg: Stem of the negative file; always overwritten.
    """
    # NOTE(review): only the positive file honours `append`; the negative
    # file is truncated on every call -- confirm this asymmetry is intended.
    pos_mode = 'a' if append else 'w'

    # positives
    with open(file_pos + '.fasta', pos_mode) as out:
        out.writelines(
            f'>header {number}\n{sequence}\n'
            for number, sequence in enumerate(positive_sequences, start=1)
        )

    # negatives
    with open(file_neg + '.fasta', 'w') as out:
        out.writelines(
            f'>header {number}\n{sequence}\n'
            for number, sequence in enumerate(negative_sequences, start=1)
        )

#### generate dataset

In [6]:
# Generate the toy dataset and store it as positives_toy.fastq / negatives_toy.fastq.
write_seqs_fastq(
    *generate_sequences(no_sequences),
    file_pos='positives_toy',
    file_neg='negatives_toy',
)