In [25]:
from kmer import kmer_featurization
import random
import warnings
import math

def pseudo(input, k, M, tolerance = 0.01, max_iterations = 1000):

    N = len(input)
    # Initialization: random assign each read to a cluster
    random.seed(20221021)
    # Q1: Do we have to ensure that every number appear at least once?
    # For example, can we have no reads assigned to cluster 4 at first iteration?
    # A1: Yes
    # Q2: What we do if M > number of reads? Do we force M = number of reads then?
    # A2: add a warning to the users
    if M > N:
        warnings.warn("Notice: M > number of reads!")

    Y = []
    for i in range(N):
        if i+1 > M:
            Y.append(random.randint(1,M))
        else:
            Y.append(i+1)
    random.shuffle(Y)

    Y = [1,3,3,2,1]

    q_im = []
    # NOTE: for q_im, i index is i, while j index is the m^th cluster
    for i in range(N):
        l = []
        for m in range(1, M+1):
            if Y[i] == m:
                l.append(1)
            else:
                l.append(0)
        q_im.append(l)
    print(f"q_im: {q_im}")

    # TODO: wrap everything below with a while loop for each iteration
    alpha = []
    for i in range(1,M+1):
        alpha.append(Y.count(i)/N)
    print(f"alpha: {alpha}")
    round = 0
    new_q_im = []
    for read in input:
        if k > len(input):
            k = len(input)
        obj = kmer_featurization(k)
        # counts of the words of the k-mers in the i^th read
        # x_ij = count of word w_j in the current read
        x_i = obj.obtain_kmer_feature_for_one_sequence(read, write_number_of_occurrences=True)
        # list of posterior probability q_im: read x_i belong to species m
        print(f"X_i: {x_i}") # Note: x_i should never change

        l = 4**len(input[0]) # if length of a word is 3, then l = 64
        p_x_i_lambda_m = [] # length of this array should be M
        for m in range(M):
            p_x_i_lambda_k = 1 # this value will be added to p_x_i_lambda_m
            for j in range(l):
                numerator = 0
                denominator = 0
                for i in range(N): # 1. calculate lambda_mj
                    numerator += (q_im[i][m] * x_i[j])
                    if round == 0: # if we are at our first iteration
                        denominator += (q_im[i][m] * len(input[i]))
                    else:
                        denominator += (q_im[i][m])
                lambda_mj = numerator / denominator
                # print(f"lambda_mj: {lambda_mj}")
                p_x_i_lambda_k = p_x_i_lambda_k * ((math.e**lambda_mj)*(lambda_mj**x_i[j])/(math.factorial(x_i[j])))
            p_x_i_lambda_m.append(p_x_i_lambda_k)
            # print(f"read: {read}, p_x_i_lambda_m: {p_x_i_lambda_m}")
        new_q = []
        for m_ in range(M):
            numerator = alpha[m_] * p_x_i_lambda_m[m_]
            denominator = 1
            for k_ in range(M):
                denominator *= (alpha[k_] * p_x_i_lambda_m[k_])
            new_q.append(numerator/denominator)
        new_q_im.append(new_q)
        round += 1
    print(f"new_q_im: {new_q_im}")

In [6]:
from kmer import kmer_featurization  # import the module kmer_featurization from the kmer.py file

seq_list = 'AAAAAAA'  # a list of DNA sequences

k = 3  # choose the value for k
obj = kmer_featurization(k)  # initialize a kmer_featurization object
kmer_features = obj.obtain_kmer_feature_for_one_sequence(seq_list, write_number_of_occurrences=True)
# If you would like the k-mer features to be the percentage of occurrences (ranging from 0 to 1) as stated above, then leave write_number_of_occurrences as False (the default). If you prefer the features to be the counts for each k-mer occurrence, then set it to True.
print(kmer_features)

[5. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [23]:
import random
# random.seed(20221021)
Y = []
M = 10
N = 5
for i in range(N):
    Y.append(random.randint(1,M))
print(Y)
alpha = []
for i in range(1,M+1):
    alpha.append(Y.count(i)/N)
print(alpha)

[7, 2, 6, 6, 3]
[0.0, 0.2, 0.2, 0.0, 0.0, 0.4, 0.2, 0.0, 0.0, 0.0]


In [26]:
input  = ["AAA", "CCC", "TTT", "GGG", "AAC"]
k = 5
M = 3
pseudo(input, k, M)

q_im: [[1, 0, 0], [0, 0, 1], [0, 0, 1], [0, 1, 0], [1, 0, 0]]
alpha: [0.4, 0.2, 0.4]
X_i: [0. 0. 0. ... 0. 0. 0.]
X_i: [0. 0. 0. ... 0. 0. 0.]
X_i: [0. 0. 0. ... 0. 0. 0.]
X_i: [0. 0. 0. ... 0. 0. 0.]
X_i: [0. 0. 0. ... 0. 0. 0.]
new_q_im: [[12.499999999999998, 6.249999999999999, 12.499999999999998], [12.499999999999998, 6.249999999999999, 12.499999999999998], [12.499999999999998, 6.249999999999999, 12.499999999999998], [12.499999999999998, 6.249999999999999, 12.499999999999998], [12.499999999999998, 6.249999999999999, 12.499999999999998]]


In [4]:
import random
Y = []
N=10
M=8
for i in range(N):
    if i+1 > M:
        Y.append(random.randint(1,M))
    else:
        Y.append(i+1)
random.shuffle(Y)
Y

[5, 8, 6, 5, 1, 3, 8, 7, 4, 2]