In [1]:
import numpy as np
import random
import sys
np.set_printoptions(threshold=sys.maxsize)

In [2]:
def generate_score(seq_len, scores):
    """Build a random score string.

    Draws ``seq_len`` symbols, one ``random.choice(scores)`` call per
    position, and concatenates them in the order they were drawn.

    Args:
        seq_len: Number of symbols to draw.
        scores: Sequence of score symbols (strings) to draw from.

    Returns:
        str: The concatenated draws; empty when ``seq_len`` is 0.
    """
    return ''.join(random.choice(scores) for _ in range(seq_len))


def count_scores(score_seq, count_mx, scores_dt):
    """Tally the adjacent-symbol transitions of one score sequence.

    For every neighbouring pair ``(prev, nxt)`` in ``score_seq`` the cell
    ``count_mx[scores_dt[prev], scores_dt[nxt]]`` is incremented by one.

    Args:
        score_seq: Sequence of score symbols; fewer than two symbols add
            nothing.
        count_mx: Array indexed ``[from, to]``; it is updated in place.
        scores_dt: Mapping from score symbol to its row/column index.

    Returns:
        The same ``count_mx`` object that was passed in.

    Raises:
        KeyError: If a symbol taking part in a pair is missing from
            ``scores_dt``.
    """
    for prev_sym, next_sym in zip(score_seq, score_seq[1:]):
        count_mx[scores_dt[prev_sym], scores_dt[next_sym]] += 1

    return count_mx


def transition_matrix(count_mx):
    """Row-normalise a transition count matrix into probabilities.

    Each row of ``count_mx`` is divided by its own sum, so every row that
    holds at least one count sums to 1.

    A row whose counts sum to zero (a state never seen as the source of a
    transition) is returned as all zeros. Dividing such a row by its sum
    would fill it with NaN and emit a RuntimeWarning.

    Args:
        count_mx: 2-D array-like of counts, indexed ``[from, to]``.

    Returns:
        numpy.ndarray: New float array with the shape of ``count_mx``;
        the input is not modified.
    """
    counts = np.asarray(count_mx, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)

    # `where` skips the division for empty rows; `out` supplies their zeros.
    return np.divide(
        counts,
        row_totals,
        out=np.zeros_like(counts),
        where=row_totals != 0,
    )


# Highly simplified three-level scoring alphabet, for testing purposes only.
scores = 'LMH'
scores_dt = dict(zip(scores, range(len(scores))))
print(scores_dt)

seq_len = 250
n = 3

# Randomly generated demo sequences; they are replaced just below by the
# hand-written ones, so only the latter are counted.
random_scores = [generate_score(seq_len, scores) for _ in range(n)]
count_mx = np.zeros((len(scores), len(scores)))

random_scores = [
    'HHHHHHHHMMMMLLLLLL',
    'HHHHHMMMHHHHMMMMMMMLLLMLLLLLL',
    'MMHHHHHHHMMMHHHMMMMMMLLLLMMMMLLLLLLLL',
]

# Accumulate the transitions of every sequence into the one count matrix,
# then row-normalise it into transition probabilities.
for score_seq in random_scores:
    count_mx = count_scores(score_seq, count_mx, scores_dt)
print(count_mx)
trans_mx = transition_matrix(count_mx)
print(trans_mx)

{'L': 0, 'M': 1, 'H': 2}
[[22.  2.  0.]
 [ 5. 22.  3.]
 [ 0.  5. 22.]]
[[0.91666667 0.08333333 0.        ]
 [0.16666667 0.73333333 0.1       ]
 [0.         0.18518519 0.81481481]]


In [3]:
# State whose successors are sampled below.
init_char = 'H'

# Draw 30 candidate successors of `init_char`. Labels and probabilities are
# both taken in `scores_dt` order, so each label is paired with its own
# column of `trans_mx`. The earlier hard-coded call paired row 0 (the 'L'
# row) with the labels in reverse order and never used `init_char`.
np.random.choice(list(scores_dt.keys()), 30, p=trans_mx[scores_dt[init_char]])

array(['M', 'M', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H'], dtype='<U1')

In [45]:
def simulate_error(scores_dt, trans_mx, score, len_scores, next_s):
    """Extend a score string by sampling successive symbols.

    On each of ``len_scores`` steps the last symbol of ``score`` selects a
    row of ``trans_mx`` (through ``scores_dt``); one symbol is then drawn
    from ``next_s`` with that row as the probabilities and appended.

    Args:
        scores_dt: Mapping from score symbol to its row index in ``trans_mx``.
        trans_mx: Array whose rows are probability vectors over ``next_s``.
        score: Starting string; its last symbol seeds the first draw.
        len_scores: Number of symbols to append.
        next_s: Candidate symbols; ``next_s[i]`` is drawn with the
            probability found at position ``i`` of the selected row.

    Returns:
        ``score`` followed by the ``len_scores`` sampled symbols.
    """
    for _ in range(len_scores):
        current_row = scores_dt[score[-1]]
        drawn = np.random.choice(next_s, 1, p=trans_mx[current_row])
        score += drawn[0]

    return score

# Start the chain at 'H' and append 30 sampled scores to it.
# NOTE(review): the recorded run of this cell (execution count 45) ended in
# KeyError: 'H'. Presumably `scores_dt` had already been rebuilt by the cells
# further down, whose alphabet has no 'H' - confirm by re-running in order.
score = 'H'
next_s = list(scores_dt.keys())
score = simulate_error(scores_dt, trans_mx, score, 30, next_s)

KeyError: 'H'

In [5]:
def sniff_data(example_file, scores, scores_dt, max_lines=40000, total_lines=3994444):
    """Count score transitions in the leading part of a text file.

    Every fourth line of ``example_file`` is stripped of trailing whitespace
    and passed to ``count_scores``. Presumably this is the quality line of a
    four-line FASTQ record; confirm against the input format.

    Reading stops once ``max_lines`` lines have been consumed. At that point
    the share of the file read so far is printed as a percentage of
    ``total_lines``.

    Args:
        example_file: Path of the file to read.
        scores: Sequence of all score symbols; its length sets the size of
            the square count matrix.
        scores_dt: Mapping from score symbol to its row/column index.
        max_lines: Positive number of lines after which reading stops, or
            ``None`` to read the whole file without printing progress.
        total_lines: Denominator of the printed percentage. The default is
            presumably the line count of the original example file;
            pass the real value for other inputs.

    Returns:
        numpy.ndarray: ``len(scores)`` x ``len(scores)`` matrix of
        transition counts, indexed ``[from, to]``.
    """
    count_mx = np.zeros([len(scores), len(scores)])
    with open(example_file) as f:
        for line_no, line in enumerate(f, start=1):
            if line_no % 4 == 0:
                count_mx = count_scores(line.rstrip(), count_mx, scores_dt)
            if max_lines is not None and line_no >= max_lines:
                print((line_no / total_lines) * 100)
                break  # stop the file iteration early

    return count_mx
    
# The 42 consecutive ASCII symbols from '!' to 'J' (presumably the Phred+33
# quality alphabet - confirm); each symbol is mapped to its position.
scores = '''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJ'''
scores_dt = dict(zip(scores, range(len(scores))))
print(scores_dt)

# Count the score transitions found in the example reads.
example_file = '../tests/test_error_model1/simulated_R1.fastq'
count_mx = sniff_data(example_file, scores, scores_dt)

{'!': 0, '"': 1, '#': 2, '$': 3, '%': 4, '&': 5, "'": 6, '(': 7, ')': 8, '*': 9, '+': 10, ',': 11, '-': 12, '.': 13, '/': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, ';': 26, '<': 27, '=': 28, '>': 29, '?': 30, '@': 31, 'A': 32, 'B': 33, 'C': 34, 'D': 35, 'E': 36, 'F': 37, 'G': 38, 'H': 39, 'I': 40, 'J': 41}
1.0013909320045544


In [6]:
# Indices of the columns of count_mx that hold no counts at all, i.e. the
# scores never observed as the target of a transition.
# NOTE(review): despite its name, `keep_scores` lists the indices that the
# following cells delete, not the ones they keep.
keep_scores = np.where(~count_mx.any(axis=0))[0]
keep_scores

array([ 0,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 16, 17, 18,
       19, 20, 21, 23, 24, 25, 26, 28, 29, 30, 31, 33, 34, 35, 36, 37, 39,
       41])

In [7]:
# Remove the unobserved scores from both axes so the matrix stays square:
# first their columns (axis=1), then the rows with the same indices (axis=0).
# NOTE(review): the rows are chosen by the empty-column test only; a score
# seen solely as a transition source would lose its row - confirm acceptable.
# NOTE(review): count_mx is rebound here, so running this cell a second time
# deletes further rows/columns using the now-stale indices.
count_mx = np.delete(count_mx, keep_scores, axis=1)
count_mx = np.delete(count_mx, keep_scores, axis=0)

In [8]:
# Re-index the surviving symbols 0..k-1, in their original order, after
# leaving out every symbol whose old index is listed in keep_scores.
scores_dt = {
    symbol: new_idx
    for new_idx, symbol in enumerate(
        sym for sym, old_idx in scores_dt.items() if old_idx not in keep_scores
    )
}
scores_dt

{'#': 0, '.': 1, '7': 2, '<': 3, 'A': 4, 'G': 5, 'I': 6}

In [9]:
# Write count_mx as comma-separated text to count_mx.csv (relative path).
np.savetxt('count_mx.csv', count_mx, delimiter=',')

In [50]:
# Row-normalise the counts into transition probabilities and print them.
trans_mx = transition_matrix(count_mx)
print(trans_mx)

[[7.21476510e-01 1.00671141e-02 3.35570470e-03 2.65100671e-01
  0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.63327971e-05 3.43780879e-01 1.04562567e-01 2.40230946e-01
  1.47664818e-01 1.63744457e-01 0.00000000e+00]
 [0.00000000e+00 3.31260383e-01 1.90271329e-01 4.86433531e-02
  2.79805767e-01 1.50019168e-01 0.00000000e+00]
 [3.02201756e-04 2.42221902e-01 1.60742553e-02 2.07123327e-01
  1.90257591e-01 3.44020722e-01 0.00000000e+00]
 [1.54964281e-05 1.05752791e-01 1.76969209e-02 3.92886106e-02
  2.15333199e-01 6.21613385e-01 2.99597609e-04]
 [1.24861820e-05 2.83028449e-02 2.36654769e-03 1.17644806e-02
  8.25270034e-02 6.38040568e-01 2.36986069e-01]
 [2.27437878e-06 3.24212695e-03 2.11517227e-04 1.93208478e-03
  1.71852061e-02 3.00811612e-01 6.76615179e-01]]


In [71]:
import time

# Benchmark: average time needed to simulate one score string.
N_SIMULATIONS = 10000  # also the divisor of the reported average
SIM_LENGTH = 150       # symbols appended to the starting symbol per run

# The candidate symbols never change, so build the list once, outside the loop.
next_s = list(scores_dt.keys())

start = time.perf_counter()
for count in range(1, N_SIMULATIONS + 1):
    # Every simulation starts afresh from the single symbol 'I'.
    score = simulate_error(scores_dt, trans_mx, 'I', SIM_LENGTH, next_s)
print(score)  # last simulated string, as a sanity check
print((time.perf_counter() - start) / N_SIMULATIONS)

IIGA...A..GIGGGGIIGGIGGGGIIIGGAGGIIIGGGGIIIGGGGGGGG.AGGGAAGGGGG<.AGGGGGGGGIIIGGGAGGIIIIGGGIIGGIIGGGGIGGIIIIIGIIGIIIIGGGIGIIIII.<GIIIIIIIIIIAAGGGGGGIIII
0.007457117414474487
