# Count match/mismatch/insertion/deletion
- Given a MAF file (output from the LAST aligner), we would like to count matches, mismatches, insertions, and deletions with respect to the reference sequence.

## a class to represent alignment

In [1]:
class maf_item():
    """One alignment block of a MAF file: an "a" line plus its "s" lines.

    For example,
      a score=27 EG2=4.7e+04 E=2.6e-05
      s humanMito 2170 145 + 16571 AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...
      s fuguMito  1648 142 + 16447 AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...
    This data can be stored as follows (append() takes ONE list per row):
      entry = maf_item('score=27 EG2=4.7e+04 E=2.6e-05')
      entry.append(['humanMito', 2170, 145, '+', 16571, 'AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...'])
      entry.append(['fuguMito',  1648, 142, '+', 16447, 'AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...'])
    http://last.cbrc.jp/doc/last-tutorial.html

    Rows are addressed either by insertion index (0, 1, ...) or by name.
    Field values are stored exactly as given (no type conversion), except
    that the alignment string is upper-cased.
    """

    # Order of the values returned by get(); the last one is the aligned sequence.
    _FIELDS = ('name', 'start', 'alnSize', 'strand', 'seqSize', 'alignment')

    def __init__(self, a):
        """Create an empty alignment; `a` is the text of the "a" line (e.g. 'score=27 ...')."""
        self._a = a
        self._name    = []
        self._name_hash = {}   # name -> row index (the latest row wins on duplicate names)
        self._start   = []
        self._alnSize = []
        self._strand  = []
        self._seqSize = []
        self._alignment = []
        self.other_fields = {}

    def __str__(self):
        """Return a one-line summary: the "a" line text and the number of rows."""
        return self._a + " ({0} sequences)".format(self.size())

    def size(self):
        """Return the number of sequences (rows) stored in this alignment."""
        return len(self._name)

    def append(self, l):
        """Add one sequence row to the alignment.

        `l` is an indexable of at least six items:
        [name, start, alnSize, strand, seqSize, alignment].
        """
        name, start, alnSize, strand, seqSize, alignment = l[0], l[1], l[2], l[3], l[4], l[5]
        self._name_hash[name] = self.size()
        self._name.append(name)
        self._start.append(start)
        self._alnSize.append(alnSize)
        self._strand.append(strand)
        self._seqSize.append(seqSize)
        # upper-case so that soft-masked (lower-case) bases compare equal to upper-case ones
        self._alignment.append(alignment.upper())

    def add_field(self, key, val):
        """Attach additional information about the alignment under `key`."""
        self.other_fields[key] = val

    def count_sub(self, ref_id = 0, tar_id = 1):
        """Count match/mismatch/insertion/deletion of row `tar_id` relative to row `ref_id`.

        Returns [match, mismatch, insertion, deletion], or [-1, -1, -1, -1]
        when either index is not smaller than size().

        A column with a gap ('-') only in the reference is an insertion, a gap
        only in the target is a deletion.  A column where BOTH rows have a gap
        (possible when the pair is taken from a block with more than two rows)
        says nothing about this pair and is skipped.

        Raises IndexError if the target row is shorter than the reference row.
        """
        if self.size() <= max(ref_id, tar_id):
            return [-1, -1, -1, -1]
        seq_ref = self.get(ref_id)[5]
        seq_tar = self.get(tar_id)[5]
        match = mismatch = insertion = deletion = 0
        # Index into the target instead of using zip(): a truncated target row
        # must raise rather than silently under-count.
        for position, ref_base in enumerate(seq_ref):
            tar_base = seq_tar[position]
            if ref_base == '-' and tar_base == '-':
                continue
            if ref_base == '-':
                insertion += 1
            elif tar_base == '-':
                deletion += 1
            elif ref_base == tar_base:
                match += 1
            else:
                mismatch += 1
        return [match, mismatch, insertion, deletion]

    def count(self, ref_id = 0, tar_id = 1):
        """Return the four counts followed by the metadata of both rows.

        Layout: [match, mismatch, insertion, deletion,
                 ref name, start, alnSize, strand, seqSize,
                 tar name, start, alnSize, strand, seqSize].

        When an index is not smaller than size(), the counts are the
        [-1, -1, -1, -1] sentinel of count_sub() and the five metadata slots
        of the missing row are None, so the result always has 14 items.
        """
        n_meta = len(self._FIELDS) - 1   # every field except the alignment string
        count_stats = list(self.count_sub(ref_id, tar_id))
        for seq_id in (ref_id, tar_id):
            if seq_id < self.size():
                count_stats += self.get(seq_id)[:-1]
            else:
                count_stats += [None] * n_meta
        return count_stats

    def get_id(self, name):
        """Return the row index of the sequence called `name` (KeyError if unknown)."""
        return self._name_hash[name]

    def get_by_name(self, name):
        """Return the row of the sequence called `name`, in the same layout as get()."""
        return self.get(self.get_id(name))

    def get(self, i):
        """Return row `i` as [name, start, alnSize, strand, seqSize, alignment]."""
        return [self._name[i], self._start[i], self._alnSize[i],
                self._strand[i], self._seqSize[i], self._alignment[i]]

    def dump(self):
        """Return the whole alignment as text: summary line, column header, one tab-separated line per row."""
        strs = [str(self), '\t'.join(self._FIELDS)]
        strs += ['\t'.join([str(x) for x in self.get(i)]) for i in range(self.size())]
        return '\n'.join(strs)


### test code for this class

- we can construct an alignment by adding the elements in maf file

In [2]:
# Build the two-row humanMito/fuguMito example alignment and print its tab-separated dump.
# Each row is handed to append() as a single list:
# [name, start, alnSize, strand, seqSize, alignment].
entry = maf_item('score=27 EG2=4.7e+04 E=2.6e-05')
entry.append(['humanMito', 2170, 145, '+', 16571, 'AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...'])
entry.append(['fuguMito',  1648, 142, '+', 16447, 'AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...'])
print(entry.dump())

score=27 EG2=4.7e+04 E=2.6e-05 (2 sequences)
name	start	alnSize	strand	seqSize	alignment
humanMito	2170	145	+	16571	AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...
fuguMito	1648	142	+	16447	AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...


- We can retrieve a sequence from an alignment by its name

In [3]:
# Look a row up by sequence name; the cell displays the returned six-item list.
entry.get_by_name('humanMito')

['humanMito',
 2170,
 145,
 '+',
 16571,
 'AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...']

- We can also count match/mismatch/insertion/deletion of a pair of sequences

In [4]:
# The four counts (match, mismatch, insertion, deletion) come first, followed by
# the reference row and the target row, each without its alignment string.
print(entry.count(ref_id=0, tar_id=1))

[35, 3, 0, 2, 'humanMito', 2170, 145, '+', 16571, 'fuguMito', 1648, 142, '+', 16447]


## main function

In [5]:
def count_main(maf_file, ref_id=0, tar_id=1):
    """Tabulate match/mismatch/insertion/deletion counts for every alignment in a MAF file.

    Parameters
    ----------
    maf_file : path of the MAF file to read.
    ref_id, tar_id : row indices (within each alignment block) of the
        reference and the target sequence; they default to the first and
        the second "s" line.

    Returns
    -------
    A tab-separated table as one string: a header line, then one line per
    alignment block holding whatever maf_item.count(ref_id, tar_id) returns.

    Raises
    ------
    ValueError if a data line appears before the first "a" line.
    """
    rows = ['\t'.join(['match', 'mismatch', 'insertion', 'deletion',
                       'ref_name', 'ref_start', 'ref_alnSize', 'ref_strand', 'ref_seqSize',
                       'tar_name', 'tar_start', 'tar_alnSize', 'tar_strand', 'tar_seqSize'])]

    def flush(finished):
        # Emit the table line of a completed alignment block (nothing to do before the first block).
        if finished is not None:
            rows.append('\t'.join([str(x) for x in finished.count(ref_id=ref_id, tar_id=tar_id)]))

    entry = None
    with open(maf_file, 'r') as f:
        for line_no, l in enumerate(f, start=1):
            # skip comment lines and blank lines
            if l.startswith('#') or len(l.strip()) == 0:
                continue
            if l.startswith('a'):
                # an "a" line closes the previous block and opens a new one
                flush(entry)
                # strip() so the stored header does not carry the line's trailing newline
                entry = maf_item(l[1:].strip())
                continue
            if entry is None:
                raise ValueError(
                    '{0}: line {1} appears before the first "a" line'.format(maf_file, line_no))
            if l.startswith('s'):
                entry.append(l.split()[1:])
            else:
                # any other line type is kept as an extra field keyed by its first token
                splitted_line = l.split()
                entry.add_field(splitted_line[0], ' '.join(splitted_line[1:]))
    # the last block has no following "a" line to trigger its output
    flush(entry)
    return '\n'.join(rows)

### How does it work?

In [6]:
# NOTE(review): hard-coded absolute path; change it to the location of your own MAF file before running.
maf_file = '/home/ytanigaw/myalns.maf'

In [7]:
# count_main() returns a single string; print() it so the tabs and newlines are rendered as a table.
print(count_main(maf_file))

match	mismatch	insertion	deletion	ref_name	ref_start	ref_alnSize	ref_strand	ref_seqSize	tar_name	tar_start	tar_alnSize	tar_strand	tar_seqSize
1723	608	21	0	humanMito	5916	2331	+	16571	fuguMito	5456	2352	+	16447
1381	681	13	0	humanMito	8746	2062	+	16571	fuguMito	8292	2075	+	16447
1318	720	7	6	humanMito	3320	2044	+	16571	fuguMito	2859	2045	+	16447
1034	490	6	0	humanMito	12514	1524	+	16571	fuguMito	12107	1530	+	16447
983	465	0	0	humanMito	14432	1448	+	16571	fuguMito	14029	1448	+	16447
863	427	0	0	humanMito	10924	1290	+	16571	fuguMito	10486	1290	+	16447
621	275	9	10	humanMito	598	906	+	16571	fuguMito	16	905	+	16447
183	10	0	0	humanMito	2914	193	+	16571	fuguMito	2433	193	+	16447
309	113	6	1	humanMito	2422	423	+	16571	fuguMito	1916	428	+	16447
271	105	9	5	humanMito	5512	381	+	16571	fuguMito	5059	385	+	16447
211	76	13	2	humanMito	1742	289	+	16571	fuguMito	1182	300	+	16447
77	13	3	0	humanMito	1549	90	+	16571	fuguMito	961	93	+	16447
63	8	2	0	humanMito	12264	71	+	16571	fuguMito	11840	73	+	16447
