# In [None]:
import numpy as np
import sys
from random import random, randint
import time

n = 200  # nucleotides per sequence
m = 4999  # number of sequences - 1 per class

then = time.time()

nucleotides = {
    0: [1, 0, 0, 0],  # A
    1: [0, 1, 0, 0],  # T
    2: [0, 0, 1, 0],  # C
    3: [0, 0, 0, 1]  # G
}

transcription_factors = {
    'arid3a': [
        [27, 0, 1, 27, 27, 20],
        [0, 0, 9, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 27, 17, 0, 0, 6]
    ]
    
}


def get_cumulative_ppm(pfm):
    ppm = np.array([[0. for j in range(len(pfm[i]))] for i in range(len(pfm))])
    for i in range(len(ppm)):
        for j in range(len(ppm[i])):
            ppm[i][j] = pfm[i][j] / sum(pfm[k][j] for k in range(len(pfm))) + (0 if i == 0 else ppm[i - 1][j])
    return ppm


def generate_tf_sequence(ppm):
    tf = []
    for j in range(len(ppm[0])):
        num = random()
        for i in range(len(ppm)):
            if num < ppm[i][j]:
                tf.append(nucleotides[i])
                break
    return np.array(tf)


def generate_dna_sequence():
    return np.array([nucleotides[int(random() * 4)] for i in range(n)])


arid3a = get_cumulative_ppm(transcription_factors['arid3a'])

control_x = np.array([generate_dna_sequence()])
control_y = np.array([0])

bind_x = np.array([generate_dna_sequence()])
bind_y = np.array([1])

for i in range(m):
    control_x = np.vstack((control_x, [generate_dna_sequence()]))
    control_y = np.append(control_y, 0)

    arid3a_site = generate_tf_sequence(arid3a)
    bind_x = np.vstack((bind_x, [generate_dna_sequence()]))
    loc = randint(0, n - arid3a_site.shape[0] - 1)
    bind_x[i + 1][loc:loc + arid3a_site.shape[0]] = arid3a_site
    bind_y = np.append(bind_y, 1)
    sys.stdout.write('\r' + str(i * 100 // (m+1)) + ' %    done')

combined_x = np.concatenate([control_x, bind_x])
combined_y = np.concatenate([control_y, bind_y])

shuffle = np.random.permutation(len(combined_x))
combined_x = combined_x[shuffle]
combined_y = combined_y[shuffle]

np.save('train_x.npy', combined_x, allow_pickle=True)
np.save('train_y.npy', combined_y, allow_pickle=True)

print('\nCompleted in ' + str(time.time() - then) + 'seconds')
print(combined_x.shape, combined_y.shape)

# 76 %    done