In [1]:
import numpy as np

In [93]:
class maf_item():
    # This class can store an alignment in a maf file
    # 
    # For example,
    #   a score=27 EG2=4.7e+04 E=2.6e-05
    #   s humanMito 2170 145 + 16571 AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...
    #   s fuguMito  1648 142 + 16447 AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...
    # This data can be stored as follows
    #  entry = maf_item('score=27 EG2=4.7e+04 E=2.6e-05')
    #  entry.append('humanMito', 2170, 145, '+', 16571, 'AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...')
    #  entry.append('fuguMito',  1648, 142, '+', 16447, 'AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...')
    # http://last.cbrc.jp/doc/last-tutorial.html
    def __init__(self, a):
        self._a = a
        self._name    = []
        self._name_hash = {}
        self._start   = []
        self._alnSize = []
        self._strand  = []
        self._seqSize = []
        self._alignment = []
        self.other_fields = {}
    def __str__(self):
        strs = []
        strs.append(self._a + " ({0} sequences)".format(self.size()))
        return('\n'.join(strs))
    def size(self):
        return(len(self._name))
    def append(self, name, start, alnSize, strand, seqSize, alignment):
        # add a sequence into alignment
        self._name_hash[name] = self.size()
        self._name.append(name)
        self._start.append(start)
        self._alnSize.append(alnSize)
        self._strand.append(strand)
        self._seqSize.append(seqSize)
        self._alignment.append(alignment.upper())
    def add_field(self, key, val):
        # add some additional information about alignment
        self.other_fields[key] = val
        
    def count_sub(self, ref_id = 0, tar_id = 1):
        # count match/mismatch/insertion/deletion
        if(self.size() <= max(ref_id, tar_id)):
            return([-1, -1, -1, -1])
        else:
            match = 0
            mismatch = 0
            insertion = 0
            deletion = 0
            seq_ref = self.get(ref_id)[5]
            seq_tar = self.get(tar_id)[5]
            for position in range(len(seq_ref)):
                if(seq_ref[position] == '-'):
                    insertion += 1
                elif(seq_tar[position] == '-'):
                    deletion += 1
                elif(seq_ref[position] == seq_tar[position]):
                    match += 1
                else:
                    mismatch += 1
            return([match, mismatch, insertion, deletion])
    def count(self, ref_id = 0, tar_id = 1):
        count_stats = []
        count_stats += self.get(ref_id)[:-1]
        count_stats += self.get(tar_id)[:-1]
        count_stats += self.count_sub(ref_id, tar_id)
        return(count_stats)
    def get_id(self, name):
        return(self._name_hash[name])
    def get_by_name(self, name):
        return(self.get(self.get_id(name)))
    def get(self, i):
        # get a component of the alignment
        return([self._name[i], self._start[i], self._alnSize[i], 
                self._strand[i], self._seqSize[i], self._alignment[i]])
    def dump(self):
        # show the alignment
        strs = []
        strs.append(self._a + " ({0} sequences)".format(self.size()))
        strs.append('\t'.join(['name', 'start', 'alnSize', 'strand', 'seqSize', 'alignment']))
        strs += ['\t'.join([str(x) for x in self.get(i)]) for i in range(self.size())]
        return('\n'.join(strs))


In [94]:
entry = maf_item('score=27 EG2=4.7e+04 E=2.6e-05')
entry.append('humanMito', 2170, 145, '+', 16571, 'AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...')
entry.append('fuguMito',  1648, 142, '+', 16447, 'AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...')
print(entry.dump())

score=27 EG2=4.7e+04 E=2.6e-05 (2 sequences)
name	start	alnSize	strand	seqSize	alignment
humanMito	2170	145	+	16571	AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...
fuguMito	1648	142	+	16447	AGTAGGCTTAGAAGCAGCCACCA--CAAGAAAGCGTT...


In [95]:
entry.get_by_name('humanMito')

['humanMito',
 2170,
 145,
 '+',
 16571,
 'AGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTT...']

In [98]:
entry.count(ref_id=0, tar_id=1)

['humanMito',
 2170,
 145,
 '+',
 16571,
 'fuguMito',
 1648,
 142,
 '+',
 16447,
 35,
 3,
 0,
 2]

In [25]:
def get_alignments(maf_file):
    alns = []    
    with open(maf_file, 'r') as f:
        alignment_block_str = []
        for l in f:
            if(not l.startswith('#') and len(l.strip()) > 0):
                if(l.startswith('a')):
                    if(len(alignment_block_str) > 0):
                        block = aln(alignment_block_str[0], alignment_block_str[1:])
                        alns.append(block)
                        alignment_block = []
                        alignment_block_str.append(l.strip())
                else:
                    alignment_block_str.append(l.strip())            
        alns.append(aln(alignment_block_str[0], alignment_block_str[1:]))
    return(alns)

In [26]:
import numpy as np

In [27]:
maf_file = '/home/ytanigaw/data/nanopore/20161008_wgs_caucasian_48hr.20k.maf'

In [28]:
alns = get_alignments(maf_file)

TypeError: 'aln' object is not callable

In [29]:
for aln in alns:
    print(aln.count_stats_pair(0, 1))
    

IndexError: list index out of range