# Consensus and Profile

1. Read in fasta sequences and construct sequence matrix (dna_mat)
2. Construct profile (profile_mat)
3. Construct consensus matrix (cons_mat) - a matrix if more than one consensus string is possible

In [1]:
#import Bio

In [13]:
from Bio import SeqIO

# Quick look at the input file: for every FASTA record print its identifier,
# the repr of its sequence, and its length.
for seq_record in SeqIO.parse("test.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

Rosalind_1
Seq('ATCCAGCT', SingleLetterAlphabet())
8
Rosalind_2
Seq('GGGCAACT', SingleLetterAlphabet())
8
Rosalind_3
Seq('ATGGATCT', SingleLetterAlphabet())
8
Rosalind_4
Seq('AAGCAACC', SingleLetterAlphabet())
8
Rosalind_5
Seq('TTGGAACT', SingleLetterAlphabet())
8
Rosalind_6
Seq('ATGCCATT', SingleLetterAlphabet())
8
Rosalind_7
Seq('ATGGCACT', SingleLetterAlphabet())
8


In [23]:
import numpy as np

from Bio import SeqIO

# Parse every record while the file is open; the handle is not needed afterwards
# because the records are materialised into a list.
with open("test.fasta", "r") as fasta_handle:
    fasta_record = list(SeqIO.parse(fasta_handle, "fasta"))

# Length of each record, in file order.
strings_length = list(map(len, fasta_record))
# One list of single-character strings per record.
dna_strings = [list(str(record.seq)) for record in fasta_record]

In [24]:
# Parsed sequences: one list of single-character strings per FASTA record.
dna_strings

[['A', 'T', 'C', 'C', 'A', 'G', 'C', 'T'],
 ['G', 'G', 'G', 'C', 'A', 'A', 'C', 'T'],
 ['A', 'T', 'G', 'G', 'A', 'T', 'C', 'T'],
 ['A', 'A', 'G', 'C', 'A', 'A', 'C', 'C'],
 ['T', 'T', 'G', 'G', 'A', 'A', 'C', 'T'],
 ['A', 'T', 'G', 'C', 'C', 'A', 'T', 'T'],
 ['A', 'T', 'G', 'G', 'C', 'A', 'C', 'T']]

In [25]:
# Build the positional (column-wise) matrix and the profile (count) matrix.

# Every sequence must have the same length, otherwise the columns are
# ill-defined: a longer sequence would be silently truncated and a shorter one
# would fail part-way through with an opaque IndexError.
if len(set(strings_length)) != 1:
    raise ValueError(
        "expected at least one sequence, all of equal length; got lengths {}".format(
            strings_length
        )
    )
string_length = strings_length[0]

# Transpose: dna_mat[position] lists the base found at that position in each
# input sequence, in input order.
dna_mat = [list(column) for column in zip(*dna_strings)]

# Per-position occurrence counts for each nucleotide.
A_mat, C_mat, G_mat, T_mat = (
    [column.count(base) for column in dna_mat] for base in "ACGT"
)

# Rows are A, C, G, T (in that order); columns are sequence positions.
profile_mat = np.array([A_mat, C_mat, G_mat, T_mat])

In [26]:
# Transposed sequences: one row per sequence position, one entry per input sequence.
dna_mat

[['A', 'G', 'A', 'A', 'T', 'A', 'A'],
 ['T', 'G', 'T', 'A', 'T', 'T', 'T'],
 ['C', 'G', 'G', 'G', 'G', 'G', 'G'],
 ['C', 'C', 'G', 'C', 'G', 'C', 'G'],
 ['A', 'A', 'A', 'A', 'A', 'C', 'C'],
 ['G', 'A', 'T', 'A', 'A', 'A', 'A'],
 ['C', 'C', 'C', 'C', 'C', 'T', 'C'],
 ['T', 'T', 'T', 'C', 'T', 'T', 'T']]

In [27]:
# Expect (4, string_length): one row per nucleotide, one column per position.
profile_mat.shape

(4, 8)

In [28]:
# Count matrix: rows are A, C, G, T (top to bottom); columns are sequence positions.
profile_mat

array([[5, 1, 0, 0, 5, 5, 0, 0],
       [0, 0, 1, 4, 2, 0, 6, 1],
       [1, 1, 6, 3, 0, 1, 0, 0],
       [1, 5, 0, 0, 0, 1, 1, 6]])

In [29]:
# Largest count found in each column (sequence position) of the profile.
profile_mat.max(axis=0)

array([5, 5, 6, 4, 5, 5, 6, 6])

In [35]:
# Profile row index -> nucleotide letter (profile rows are A, C, G, T).
cons_dict = dict(enumerate("ACGT"))

# For every position, the row holding the largest count. np.argmax reports only
# the first maximum, so ties resolve in A, C, G, T order and a single consensus
# is produced.
cons_indices = np.argmax(profile_mat, axis=0)
cons_indices_list = cons_indices.tolist()
cons_indices_list

[0, 3, 2, 1, 0, 0, 1, 3]

In [36]:
# Translate each per-position row index into its nucleotide letter
# (cons_string is a list of letters), then print them as one unbroken string.
cons_string = [cons_dict[row_index] for row_index in cons_indices_list]
print(*cons_string, sep="")

ATGCAACT


In [37]:
# Print the profile matrix, one labelled line per nucleotide.

# Convert the array to nested Python lists once, then render every row as
# space-separated counts behind its nucleotide label.
count_rows = profile_mat.tolist()
A_mat_str = "A: " + " ".join(str(count) for count in count_rows[0])
C_mat_str = "C: " + " ".join(str(count) for count in count_rows[1])
G_mat_str = "G: " + " ".join(str(count) for count in count_rows[2])
T_mat_str = "T: " + " ".join(str(count) for count in count_rows[3])

profile_lines = (A_mat_str, C_mat_str, G_mat_str, T_mat_str)
print("{}\n{}\n{}\n{}".format(*profile_lines))

A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6
