In [1]:
import numpy as np

In [116]:
class maf_item():
    # This class can store an alignment in a maf file
    # 
    # For example,
    #   a score=27 EG2=4.7e+04 E=2.6e-05
    #   s humanMito 2170 145 + 16571 AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...
    #   s fuguMito  1648 142 + 16447 AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...
    # This data can be stored as follows
    #  entry = maf_item('score=27 EG2=4.7e+04 E=2.6e-05')
    #  entry.append('humanMito', 2170, 145, '+', 16571, 'AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...')
    #  entry.append('fuguMito',  1648, 142, '+', 16447, 'AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...')
    # http://last.cbrc.jp/doc/last-tutorial.html
    def __init__(self, a):
        self._a = a
        self._name    = []
        self._name_hash = {}
        self._start   = []
        self._alnSize = []
        self._strand  = []
        self._seqSize = []
        self._alignment = []
        self.other_fields = {}
    def __str__(self):
        strs = []
        strs.append(self._a + " ({0} sequences)".format(self.size()))
        return('\n'.join(strs))
    def size(self):
        return(len(self._name))
    def append(self, l):
        name, start, alnSize, strand, seqSize, alignment = l[0], l[1], l[2], l[3], l[4], l[5]
        # add a sequence into alignment
        self._name_hash[name] = self.size()
        self._name.append(name)
        self._start.append(start)
        self._alnSize.append(alnSize)
        self._strand.append(strand)
        self._seqSize.append(seqSize)
        self._alignment.append(alignment.upper())
    def add_field(self, key, val):
        # add some additional information about alignment
        self.other_fields[key] = val
        
    def count_sub(self, ref_id = 0, tar_id = 1):
        # count match/mismatch/insertion/deletion
        if(self.size() <= max(ref_id, tar_id)):
            return([-1, -1, -1, -1])
        else:
            match = 0
            mismatch = 0
            insertion = 0
            deletion = 0
            seq_ref = self.get(ref_id)[5]
            seq_tar = self.get(tar_id)[5]
            for position in range(len(seq_ref)):
                if(seq_ref[position] == '-'):
                    insertion += 1
                elif(seq_tar[position] == '-'):
                    deletion += 1
                elif(seq_ref[position] == seq_tar[position]):
                    match += 1
                else:
                    mismatch += 1
            return([match, mismatch, insertion, deletion])
    def count(self, ref_id = 0, tar_id = 1):
        count_stats = []
        count_stats += self.get(ref_id)[:-1]
        count_stats += self.get(tar_id)[:-1]
        count_stats += self.count_sub(ref_id, tar_id)
        return(count_stats)
    def get_id(self, name):
        return(self._name_hash[name])
    def get_by_name(self, name):
        return(self.get(self.get_id(name)))
    def get(self, i):
        # get a component of the alignment
        return([self._name[i], self._start[i], self._alnSize[i], 
                self._strand[i], self._seqSize[i], self._alignment[i]])
    def dump(self):
        # show the alignment
        strs = []
        strs.append(self._a + " ({0} sequences)".format(self.size()))
        strs.append('\t'.join(['name', 'start', 'alnSize', 'strand', 'seqSize', 'alignment']))
        strs += ['\t'.join([str(x) for x in self.get(i)]) for i in range(self.size())]
        return('\n'.join(strs))


In [117]:
entry = maf_item('score=27 EG2=4.7e+04 E=2.6e-05')
entry.append(['humanMito', 2170, 145, '+', 16571, 'AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...'])
entry.append(['fuguMito',  1648, 142, '+', 16447, 'AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...'])
print(entry.dump())

score=27 EG2=4.7e+04 E=2.6e-05 (2 sequences)
name	start	alnSize	strand	seqSize	alignment
humanMito	2170	145	+	16571	AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...
fuguMito	1648	142	+	16447	AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...


In [118]:
entry.get_by_name('humanMito')

['humanMito',
 2170,
 145,
 '+',
 16571,
 'AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...']

In [119]:
entry.count(ref_id=0, tar_id=1)

['humanMito',
 2170,
 145,
 '+',
 16571,
 'fuguMito',
 1648,
 142,
 '+',
 16447,
 35,
 3,
 0,
 2]

In [103]:
sample_maf = '''# LAST version 759
#
# a=7 b=1 A=7 B=1 e=22 d=13 x=21 y=9 z=21 D=1e+06 E=3.01732e+07
# R=01 u=2 s=2 S=0 M=0 T=0 m=10 l=1 n=10 k=1 w=1000 t=0.910239 j=3 Q=0
# /tmp/last-759/humdb
# Reference sequences=1 normal letters=16571
# lambda=1.09602 K=0.335388
#
#    A  C  G  T
# A  1 -1 -1 -1
# C -1  1 -1 -1
# G -1 -1  1 -1
# T -1 -1 -1  1
#
# Coordinates are 0-based.  For - strand matches, coordinates
# in the reverse complement of the 2nd sequence are used.
#
# name start alnSize strand seqSize alignment
#
# batch 0
a score=31 EG2=5.9e+02 E=3.2e-07
s humanMito 3217 87 + 16571 AAGAACAGGGTTTGTTAAGATGGCAGAGCCCGGTAATCGCATAAAACTTAaaactTTACAGTCAGAGGTTCAATTCCTCTTCTTAAC
s fuguMito  2742 85 + 16447 AAGAACATGACATGTTAGTGTGGCAGAGCCCGGTA-TTGCAAAAGCCTTAAACCCTT-CGAACAGAGGTTCAACTCCTCTCCCTAAC

a score=27 EG2=4.7e+04 E=2.6e-05
s humanMito 2170 145 + 16571 AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATGTTAGTATAAGTAA
s fuguMito  1648 142 + 16447 AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTTAAAGCTCTAGCACATCCCTGCCACAAATACC-AATAAAACACTCCTAACCCCTTCCCCTACCGGGCTTTTCTATGCTTCCATAGAAGAAATTATGCTAAAATGAGTAA

a score=26 EG2=1.4e+05 E=7.6e-05
s humanMito 15977 56 + 16571 CCACCATTAGCACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTTCTTTCAT
s fuguMito  15578 56 + 16447 CCACCACTGGCTCCCAAAGCCAGCATTCTTAATTAAACTACTTTTTGATAATACAT

a score=25 EG2=4.2e+05 E=0.00023
s humanMito 8300 80 + 16571 TAAAGCTAACTTAGCATTAACCTTTTAAGTTAAAGATTAAGAGAACCAACACCTCTTTACAGTGAAATGCCCCAACTAAA
s fuguMito  7849 77 + 16447 TAAAACGGACACAGCGTTAGCCTTTTAAGCTAAAAATGGTGCCTACCAAACACCCTT---AGTGAAATGCCTCAACTCAA

# Query sequences=1
'''

In [107]:
maf_file = '/home/ytanigaw/myalns.maf'

In [125]:
def count(maf_file):
    entry = None
    strs = []
    with open(maf_file, 'r') as f:
        for l in f:
            if((not l.startswith('#')) and len(l.strip()) > 0):
                if(l.startswith('a')):
                    if(entry != None):
                        strs.append('\t'.join([str(x) for x in entry.count(ref_id=0, tar_id=1)]))
                    entry = maf_item(l[2:])
                elif(l.startswith('s')):
                    entry.append(l.split()[1:])
                else:
                    splitted_line = l.split()
                    entry.add_field(splitted_line[0], ' '.join(splitted_line[1:]))
    if(entry != None):
        strs.append('\t'.join([str(x) for x in entry.count(ref_id=0, tar_id=1)]))
    return('\n'.join(strs))

In [126]:
print(count(maf_file))

humanMito	5916	2331	+	16571	fuguMito	5456	2352	+	16447	1723	608	21	0
humanMito	8746	2062	+	16571	fuguMito	8292	2075	+	16447	1381	681	13	0
humanMito	3320	2044	+	16571	fuguMito	2859	2045	+	16447	1318	720	7	6
humanMito	12514	1524	+	16571	fuguMito	12107	1530	+	16447	1034	490	6	0
humanMito	14432	1448	+	16571	fuguMito	14029	1448	+	16447	983	465	0	0
humanMito	10924	1290	+	16571	fuguMito	10486	1290	+	16447	863	427	0	0
humanMito	598	906	+	16571	fuguMito	16	905	+	16447	621	275	9	10
humanMito	2914	193	+	16571	fuguMito	2433	193	+	16447	183	10	0	0
humanMito	2422	423	+	16571	fuguMito	1916	428	+	16447	309	113	6	1
humanMito	5512	381	+	16571	fuguMito	5059	385	+	16447	271	105	9	5
humanMito	1742	289	+	16571	fuguMito	1182	300	+	16447	211	76	13	2
humanMito	1549	90	+	16571	fuguMito	961	93	+	16447	77	13	3	0
humanMito	12264	71	+	16571	fuguMito	11840	73	+	16447	63	8	2	0
humanMito	16456	71	+	16571	fuguMito	15935	71	+	16447	53	18	0	0
humanMito	3217	87	+	16571	fuguMito	2742	85	+	16447	66	19	0	2
humanMito	2170	145

In [100]:
's fuguMito  7849 77 + 16447 TAAAACGGACACAGCGTTAGCCTTTTAAGCTAAAAATGGTGCCTACCAAACACCCTT---AGTGAAATGCCTCAACTCAA'.split()

['s',
 'fuguMito',
 '7849',
 '77',
 '+',
 '16447',
 'TAAAACGGACACAGCGTTAGCCTTTTAAGCTAAAAATGGTGCCTACCAAACACCCTT---AGTGAAATGCCTCAACTCAA']

In [25]:
def get_alignments(maf_file):
    alns = []    
    with open(maf_file, 'r') as f:
        alignment_block_str = []
        for l in f:
            if(not l.startswith('#') and len(l.strip()) > 0):
                if(l.startswith('a')):
                    if(len(alignment_block_str) > 0):
                        block = aln(alignment_block_str[0], alignment_block_str[1:])
                        alns.append(block)
                        alignment_block = []
                        alignment_block_str.append(l.strip())
                else:
                    alignment_block_str.append(l.strip())            
        alns.append(aln(alignment_block_str[0], alignment_block_str[1:]))
    return(alns)

In [26]:
import numpy as np

In [27]:
maf_file = '/home/ytanigaw/data/nanopore/20161008_wgs_caucasian_48hr.20k.maf'

In [28]:
alns = get_alignments(maf_file)

TypeError: 'aln' object is not callable

In [29]:
for aln in alns:
    print(aln.count_stats_pair(0, 1))
    

IndexError: list index out of range