# summary 
- In this note, I wrote a piece of code to 
  - read maf file (output from LAST aligner)
  - count match/mismatch/indel

## maf file
- Here is a sample maf file

In [1]:
                                    maf_file = '../sample_data/cDNA-sample.maf'

In [2]:
!cat $maf_file

# LAST version 759
#
# a=21 b=9 A=21 B=9 e=115 d=115 x=114 y=44 z=114 D=100 E=1.72574e+06
# R=01 u=0 s=2 S=0 M=0 T=0 m=10 l=1 n=10 k=1 w=1000 t=4.36661 j=3 Q=1
# /share/PI/mrivas/data/hg19/hg19.lastdb
# Reference sequences=93 normal letters=2897310462
# lambda=0.228526 K=0.433378
#
#    A  C  G  T
# A  6 -18 -18 -18
# C -18  6 -18 -18
# G -18 -18  6 -18
# T -18 -18 -18  6
#
# Coordinates are 0-based.  For - strand matches, coordinates
# in the reverse complement of the 2nd sequence are used.
#
# name start alnSize strand seqSize alignment
#
# m=0.01 s=135 d=1 c=0 t=1e-05 M=7 S=1.7
# trans=-148
#
# Query sequences=1
a score=438 mismap=1e-10
s chr9                                                                        97802874 270 + 141213431 GGAGAGCAGGGACCTTTCTCCCTCCCCTCCTCTATGTCTTTCTGACCAGGTGCCCCCAGGACGCCTATTGCTGATGGGCATAG-GCATTCCCAGTTCCCTCCTGGCCCAAGCTCCTGGAG-GTGGGACCACACATGACAGAGCGACACCCAGCTTGTCCTCTCTTTTGCAAAGTGCAGGATGAGAATTTTGGGTTATCCTGGCTCCTCCCATGTTTTTTGCAGCT

## maf file parser
- parse consists of 
  1. class to store alignment
  1. parser function

In [3]:
class aln():
    def __init__(self, a_filed, alignment):
        self.a_filed = a_filed
        self.src = []
        self.start = []
        self.size = []
        self.strand = []
        self.srcSize = []
        self.text = []
        self.other = []
        for entry in alignment:
            if(entry.startswith('s')):
                elements = entry.split()
                self.src.append(elements[1])
                self.start.append(elements[2])
                self.size.append(elements[3])
                self.strand.append(elements[4])
                self.srcSize.append(elements[5])
                self.text.append(elements[6])
            else:
                self.other.append(entry)
        self.num_seq = len(self.src)
    def count_stats_pair(self, ref, target):
        insertion = 0
        deletion = 0
        match = 0
        mismatch = 0
        for position in range(len(self.text[ref])):
            if(self.text[ref] == '_'):
                insertion += 1
            elif(self.text[target] == '_'):
                deletion += 1
            elif(self.text[ref] == self.text[target]):
                match += 1
            else:
                mismatch += 1
        return([match, mismatch, insertion, deletion])
    def count_stats(self, ref = 0, targets = None):
        if(targets == None):
            return([self.count_stats_pair(ref, j) for j in range(self.num_seq)])
        else:
            return([self.count_stats_pair(ref, j) for j in targets])
    def __str__(self):
        strs = []
        for i in range(self.num_seq):
            strs.append(self.text[i])
        return('\n'.join(strs))

In [4]:
def get_alignments(maf_file):
    alns = []    
    with open(maf_file, 'r') as f:
        alignment_block_str = []
        for l in f:
            if(not l.startswith('#') and len(l.strip()) > 0):
                if(l.startswith('a')):
                    if(len(alignment_block_str) > 0):
                        block = aln(alignment_block_str[0], alignment_block_str[1:])
                        alignment_block = []
                        alignment_block_str.append(l.strip())
                else:
                    alignment_block_str.append(l.strip())            
        alns.append(aln(alignment_block_str[0], alignment_block_str[1:]))
    return(alns)

## usage

In [5]:
alns = get_alignments(maf_file)

In [6]:
for aln in alns:
    print(aln.count_stats())

[[272, 0, 0, 0]]
