In [1]:
from Bio import SeqIO
import numpy as np

## Read in the sequence file and count the number of sequences.

In [2]:
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
filename='/Users/tracy/Desktop/Computational Medicine/Project/test.fasta'

# One character array per FASTA record, in file order.
arrays = list()

with open(filename, "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        arrays.append(np.array(list(record.seq)))

# Unaligned sequences usually differ in length. NumPy >= 1.24 raises
# ValueError when asked to build a regular array from ragged input, so fall
# back to a 1-D object array that holds one character array per sequence.
# Equal-length input still produces the same 2-D character array as before.
try:
    seq = np.array(arrays)
except ValueError:
    seq = np.array(arrays, dtype=object)

# Count the records directly so the result does not depend on the shape of seq.
num_seq = len(arrays)

## Run a MUSCLE alignment (https://www.ebi.ac.uk/Tools/msa/muscle/) on the sequences to obtain a Clustal (.clw) alignment file

## Read in the clustal alignment file.

In [27]:
# Read the whole Clustal alignment file into a single string.
# The `with` block guarantees the handle is closed; a bare `x.close`
# (no parentheses) only references the method and leaves the file open.
with open(r'/Users/tracy/Desktop/Computational Medicine/Project/test_ala.clw', 'r') as x:
    aln = x.read()
# print(aln)

<function TextIOWrapper.close()>

## Convert the .clw file contents into a list of strings. The last string is the similarity (conservation) score line, and the others are the aligned sequences.

### Remove title and empty lines

In [5]:
# Break the alignment text into lines, keeping only the non-empty ones.
ls = [line for line in aln.split('\n') if line]
# The first remaining line is the Clustal title line; drop it.
del ls[0]
# ls

### Remove accession number

In [6]:
no_acc = []
# Width of a full alignment line, measured on the first sequence line.
line_len = len(ls[0].split(' ')[-1])
# Width of the final alignment block, measured on the last sequence line
# (the very last line of the file is that block's score line).
last_seq_len = len(ls[-2].split(' ')[-1])

# Every alignment block consists of num_seq sequence lines followed by one
# score line. Deriving the block size from num_seq (instead of a literal 5)
# keeps this cell correct for any number of input sequences.
block_size = num_seq + 1

# All score lines are padded to the full line width here; the final block may
# be narrower, so its score line is re-padded separately afterwards.
for ele, line in enumerate(ls):
    if ele % block_size != block_size - 1:
        # Sequence line: keep only the residues after the accession number.
        no_acc.append(line.split(' ')[-1])
    else:
        # Score line: strip the leading blanks, then right-justify so the
        # symbols line up with the alignment columns again.
        # NOTE(review): assumes trailing blanks of the score line are preserved
        # in the file - confirm against the aligner's output.
        no_acc.append(line.lstrip().rjust(line_len))

In [7]:
# The final alignment block can be shorter than a full line, so its score line
# is stripped again and right-justified to the width of the last sequence line.
last_line = no_acc[-1].lstrip().rjust(last_seq_len)
no_acc[-1] = last_line
# no_acc

### Get the desired list of strings

In [9]:
# no_acc cycles through num_seq sequence chunks followed by one score chunk.
# Taking every rows_per_block-th entry, starting at each offset, and joining
# them yields one full-length string per sequence plus the score string last.
rows_per_block = num_seq + 1
seq_mat = [''.join(no_acc[offset::rows_per_block]) for offset in range(rows_per_block)]
# seq_mat

### Get the starting and ending positions of the conserved regions, based on the scores. Keep a range only if its length is at least 8.

https://en.wikipedia.org/wiki/Clustal about scoring:

asterisk *   -   positions that have a single and fully conserved residue

colon :  -  conservation between groups of strongly similar properties with a score greater than .5 on the PAM 250 matrix

period .  -  conservation between groups of weakly similar properties with a score less than or equal to .5 on the PAM 250 matrix

#### !!! The starting position is inclusive and ending position is not, to make indexing more convenient !!!

In [16]:
# The last string in seq_mat is the concatenated score (conservation) line.
align_score = seq_mat[-1]

In [17]:
def FindConservedRegionWithMinimumLength(scores, requirement, min_len = 8):
    """Return one [start, end) range per conserved region of sufficient length.

    scores      -- the alignment score string (last string of the list of strings)
    requirement -- list of accepted score symbols, any of '*', ':' or '.'
    min_len     -- minimum region length; 8 in our case

    FindConservedRegionFixedLength reports a range for every qualifying start
    position, so consecutive entries can share the same end. Only the first
    entry for each end position is kept here.
    """
    candidates = FindConservedRegionFixedLength(scores, requirement, min_len)

    conserved = []
    previous_end = None
    for candidate in candidates:
        # A new end position marks the first (longest) range of a new region.
        if previous_end is None or candidate[1] != previous_end:
            conserved.append(candidate)
        previous_end = candidate[1]
    return conserved

In [18]:
def FindConservedRegionFixedLength(scores, requirement, min_len = 8):
    """Return [start, end) ranges of conserved runs at least min_len long.

    scores      -- string of per-column alignment score symbols
    requirement -- collection of symbols that count as conserved
    min_len     -- minimum number of consecutive conserved columns

    For every start index whose run of accepted symbols extends at least
    min_len columns, a [start, end] pair is appended, where end is the
    exclusive index at which that run stops. Several consecutive starts inside
    one long run therefore share the same end.

    A run that reaches the end of scores is handled without indexing past the
    string, and a run of exactly min_len columns ending at the last column is
    included.
    """
    total = len(scores)
    align_ind = []

    # Last start position (exclusive) that can still hold min_len columns;
    # capped at total so a non-positive min_len cannot index past the string.
    start_limit = min(total, total - min_len + 1)

    # Exclusive end of the most recently scanned run; reused for every start
    # inside that run so the run is scanned only once.
    run_end = 0
    for i in range(start_limit):
        if scores[i] not in requirement:
            continue
        if run_end <= i:
            run_end = i + 1
            # Bounds check first: the run may extend to the end of scores.
            while run_end < total and scores[run_end] in requirement:
                run_end += 1
        if run_end - i >= min_len:
            align_ind.append([i, run_end])
    return align_ind

In [26]:
# The second argument controls how conserved the consensus sequence must be:
# ['*'] is the most conserved, followed by ['*',':'], followed by ['*',':','.'].
# The third argument is the minimum length of a conserved region.
positions = FindConservedRegionWithMinimumLength(align_score, ['*',':'], 8)
positions

[[419, 428],
 [709, 717],
 [877, 885],
 [986, 1000],
 [1079, 1088],
 [1089, 1101],
 [1145, 1153],
 [1165, 1188],
 [1202, 1213],
 [1340, 1352],
 [1389, 1398],
 [1404, 1412]]

### Pick a reference sequence from the virus sequences, and take its residues over each range obtained in the previous step as the consensus sequence. (TODO: confirm that this approach is acceptable.)