<a href="https://colab.research.google.com/github/santiagoahl/rna-taxonomy-prediction/blob/main/functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
def get_data(taxonomy):
  """
  Collect the codon chains of one taxonomy and the distinct codons they contain.

  Reads the module-level DataFrame `df` through its 'Taxonomy' and 'Codons'
  columns.

  Params:
    - taxonomy: value compared against the 'Taxonomy' column
  Output:
    - data: the 'Codons' entries of every row whose taxonomy matches
    - cod: list of the distinct codons found in those entries (the order
      comes from a set, so it is arbitrary)
  """
  is_selected = df['Taxonomy'] == taxonomy
  data = df[is_selected]['Codons']
  # Flatten all chains into a single list, then deduplicate through a set.
  flattened = [codon for chain in data for codon in chain]
  cod = list(set(flattened))
  return data, cod

In [None]:
def prob(prev, post, chain):
  """
  Relative frequency of the adjacent codon pair (prev, post) in a chain.

  Params:
    - prev: codon expected at the first position of the pair
    - post: codon expected immediately after prev
    - chain: sequence of codons (must support len() and slicing)
  Output:
    - probability: float. Number of positions where prev is directly
      followed by post, divided by len(chain). 0.0 for an empty chain.
  """
  # len() instead of truthiness so array-like chains are handled too.
  # An empty chain holds no pairs; return 0.0 rather than dividing by zero.
  if len(chain) == 0:
    return 0.0
  # Pair every element with its successor and count the exact matches.
  occurrences = sum(
    1
    for codon, next_codon in zip(chain, chain[1:])
    if codon == prev and next_codon == post
  )
  # Normalised by the chain length (not by the number of pairs), so that
  # len(chain) * probability gives back the raw pair count.
  probability = occurrences / len(chain)
  return probability

In [None]:
# Quick experiment: try prob() on a random chain
from numpy.random import randint

In [None]:
# Random test chain: 20 integers drawn from {0, 1, 2, 3}.
A = [randint(0, 4) for _ in range(20)]

In [None]:
# Show the generated chain so the pair counts can be checked by hand.
A

[2, 0, 3, 0, 3, 3, 2, 3, 1, 3, 3, 2, 0, 2, 3, 0, 2, 3, 2, 0]

In [None]:
# Frequency of the pair (3, 3) in the random chain A.
prev, post = 3, 3
p = prob(prev, post, A)

Prob:  0.05
Prob:  0.1


In [None]:
def transition_matrix(chain, codons):
  """
  Build the matrix of adjacent-pair frequencies for a single chain.

  Params:
    - chain: sequence of codons (must support len() and slicing; its items
      must be hashable)
    - codons: sequence of codons labelling the rows and columns
  Output:
    - mat: np.array where mat[i][j] is the number of times codons[i] is
      directly followed by codons[j] in chain, divided by len(chain).
      Every entry is 0.0 when the chain is empty.
  """
  from collections import Counter  # local import: the block stays self-contained

  size = len(chain)
  # Count every adjacent pair in one pass instead of rescanning the whole
  # chain once per (row, column) combination.
  pair_counts = Counter(zip(chain, chain[1:]))
  # Counter yields 0 for pairs that never occur; the guard on size avoids a
  # division by zero for an empty chain.
  mat = [
    [pair_counts[(first, second)] / size if size else 0.0 for second in codons]
    for first in codons
  ]
  return np.array(mat)

In [None]:
# Codon labels for the toy example: the integers 0..3.
c = list(range(4))

In [None]:
# Pair-frequency matrix of the random chain A over the labels in c.
T = transition_matrix(A, c)


 {0: 0, 1: 1, 2: 2, 3: 3} 

hi
20
Prob  0+0 : 0.05
Prob  0+1 : 0.0
Prob  0+2 : 0.0
Prob  0+3 : 0.15
Prob  1+0 : 0.1
Prob  1+1 : 0.1
Prob  1+2 : 0.1
Prob  1+3 : 0.05
Prob  2+0 : 0.05
Prob  2+1 : 0.05
Prob  2+2 : 0.1
Prob  2+3 : 0.0
Prob  3+0 : 0.05
Prob  3+1 : 0.15
Prob  3+2 : 0.0
Prob  3+3 : 0.0


In [None]:
# Expect (len(c), len(c)): one row and one column per codon label.
T.shape

(4, 4)

In [None]:
# prob() divides by len(chain), which is 20 for A, so scaling by 20 shows the raw pair counts.
20*T

array([[1., 0., 0., 3.],
       [2., 2., 2., 1.],
       [1., 1., 2., 0.],
       [1., 3., 0., 0.]])

In [None]:
# Codon labels that index the rows and columns of T.
c

[0, 1, 2, 3]

In [None]:
# Chain to compare against the counts displayed for 20*T.
A

[1, 0, 3, 0, 0, 3, 1, 1, 2, 1, 1, 3, 1, 2, 2, 2, 0, 3, 1, 0]

In [None]:
def taxonomy_transition_matrices(tax_data, codons):
  """
  Compute one transition matrix per chain of a taxonomy.

  Params:
    - tax_data: collection of chains belonging to a single taxonomy
    - codons: the codons used as row/column labels of every matrix
  Output:
    - matrices: list with the result of transition_matrix(chain, codons)
      for each chain, in iteration order

  Prints a progress line "<position> / <total>" at every 20th chain.
  """
  matrices = []
  for position, chain in enumerate(tax_data, start=1):
    # Report progress before processing every 20th chain.
    if position % 20 == 0:
      print(str(position), '/', len(tax_data))
    matrices.append(transition_matrix(chain, codons))
  return matrices

In [None]:
# Six independent random test chains, each with 20 integers drawn from {0, 1, 2, 3}.
A, B, C, D, E, F = [[randint(0, 4) for _ in range(20)] for _ in range(6)]

In [None]:
# Toy "taxonomy": the six random chains grouped as one dataset.
td = [A, B, C, D, E, F]

In [None]:
# One pair-frequency matrix per chain in td, over the labels in c.
m = taxonomy_transition_matrices(td, c)


 {0: 0, 1: 1, 2: 2, 3: 3} 

hi
20
Prob  0+0 : 0.1
Prob  0+1 : 0.2
Prob  0+2 : 0.0
Prob  0+3 : 0.05
Prob  1+0 : 0.15
Prob  1+1 : 0.05
Prob  1+2 : 0.2
Prob  1+3 : 0.0
Prob  2+0 : 0.05
Prob  2+1 : 0.1
Prob  2+2 : 0.0
Prob  2+3 : 0.0
Prob  3+0 : 0.05
Prob  3+1 : 0.0
Prob  3+2 : 0.0
Prob  3+3 : 0.0

 {0: 0, 1: 1, 2: 2, 3: 3} 

hi
20
Prob  0+0 : 0.0
Prob  0+1 : 0.2
Prob  0+2 : 0.0
Prob  0+3 : 0.05
Prob  1+0 : 0.05
Prob  1+1 : 0.0
Prob  1+2 : 0.05
Prob  1+3 : 0.15
Prob  2+0 : 0.05
Prob  2+1 : 0.05
Prob  2+2 : 0.0
Prob  2+3 : 0.05
Prob  3+0 : 0.15
Prob  3+1 : 0.0
Prob  3+2 : 0.1
Prob  3+3 : 0.05

 {0: 0, 1: 1, 2: 2, 3: 3} 

hi
20
Prob  0+0 : 0.05
Prob  0+1 : 0.1
Prob  0+2 : 0.25
Prob  0+3 : 0.0
Prob  1+0 : 0.05
Prob  1+1 : 0.05
Prob  1+2 : 0.0
Prob  1+3 : 0.0
Prob  2+0 : 0.2
Prob  2+1 : 0.0
Prob  2+2 : 0.15
Prob  2+3 : 0.05
Prob  3+0 : 0.05
Prob  3+1 : 0.0
Prob  3+2 : 0.0
Prob  3+3 : 0.0

 {0: 0, 1: 1, 2: 2, 3: 3} 

hi
20
Prob  0+0 : 0.05
Prob  0+1 : 0.05
Prob  0+2 : 0.05
Prob  0+3 : 0.1
Prob

In [None]:
# Inspect the resulting list of matrices (one 4x4 array per chain).
m

[array([[0.1 , 0.2 , 0.  , 0.05],
        [0.15, 0.05, 0.2 , 0.  ],
        [0.05, 0.1 , 0.  , 0.  ],
        [0.05, 0.  , 0.  , 0.  ]]), array([[0.  , 0.2 , 0.  , 0.05],
        [0.05, 0.  , 0.05, 0.15],
        [0.05, 0.05, 0.  , 0.05],
        [0.15, 0.  , 0.1 , 0.05]]), array([[0.05, 0.1 , 0.25, 0.  ],
        [0.05, 0.05, 0.  , 0.  ],
        [0.2 , 0.  , 0.15, 0.05],
        [0.05, 0.  , 0.  , 0.  ]]), array([[0.05, 0.05, 0.05, 0.1 ],
        [0.  , 0.  , 0.  , 0.1 ],
        [0.1 , 0.05, 0.15, 0.  ],
        [0.05, 0.05, 0.1 , 0.1 ]]), array([[0.15, 0.05, 0.  , 0.1 ],
        [0.1 , 0.  , 0.05, 0.05],
        [0.05, 0.1 , 0.1 , 0.05],
        [0.  , 0.05, 0.1 , 0.  ]]), array([[0.  , 0.1 , 0.05, 0.1 ],
        [0.15, 0.  , 0.05, 0.  ],
        [0.1 , 0.  , 0.05, 0.1 ],
        [0.05, 0.1 , 0.05, 0.05]])]