In [31]:
from Bio import AlignIO
from Bio.AlignIO import MafIO
import os

First, we parse the MAF alignment with AlignIO.

In [32]:
# Open the chr21 MAF alignment with AlignIO.
# NOTE: the input file is too large to be committed to the repository.
alignment = AlignIO.parse(
    handle='../data/small_subset/chr21.maf',
    format='maf',
)

We can now index the MAF file with `MafIO.MafIndex`, using human chr21 as the target sequence:

In [33]:
# Index the MAF file, using the human chr21 sequence as the target.
idx = MafIO.MafIndex(
    sqlite_file="../data/chr21.mafindex",
    maf_file='../data/small_subset/chr21.maf',
    target_seqname="Homo_sapiens.chr21",
)

If we want to retrieve a 50 kb window, we can do so with the search method:

In [45]:
# Query the index for the 20,000,000-20,050,000 window on the target sequence.
results = idx.search(starts=[20000000], ends=[20050000])
# Write every returned alignment block to a single FASTA file; the call's
# return value is left as the cell's last expression so it is displayed.
AlignIO.write(
    results,
    '../results/small_subset/chr21_subset.fa',
    "fasta",
)

15

We can also save each of the alignments separately:

In [47]:
# Query the same window again and write each alignment block to its own
# FASTA file, numbered by its position in the search results.
results = idx.search(starts=[20000000], ends=[20050000])
for block_index, block in enumerate(results):
    AlignIO.write(
        block,
        '../results/small_subset/individual_alignments/chr21_subset{}.fa'.format(block_index),
        "fasta",
    )

And we can save the coordinates easily:

In [78]:
def get_coord_dict(align, out_pattern='../results/small_subset/individual_alignments/chr21_subset{}.fa'):
    """
    Build a per-block dictionary of species names, start coordinates and
    gap masks for an iterable of alignment blocks.

    Parameters
    ----------
    align : iterable
        Iterable of alignment blocks (e.g. the result of ``idx.search``).
        Each block is iterated to obtain its records; every record must
        expose ``name``, ``annotations['start']`` and ``seq``.
    out_pattern : str or None, optional
        Path template with one ``{}`` placeholder that receives the block
        index. Each block is written to that path in FASTA format, as the
        original implementation did. Pass ``None`` to skip writing.

    Returns
    -------
    dict
        Maps the block index (0-based, in iteration order) to a dict with:
        ``'names'`` - record names truncated at the first ``'.'``,
        ``'start'`` - the ``'start'`` annotation of each record,
        ``'gaps'``  - one string per record with ``'1'`` for a nucleotide
        and ``'0'`` for a gap (``'-'``), one character per alignment column.
    """
    dct = {}
    # Iterate over the argument itself: the previous version ignored it and
    # read the notebook-global `results` instead.
    for i, block in enumerate(align):
        if out_pattern is not None:
            AlignIO.write(block, out_pattern.format(i), "fasta")
        names, starts, gaps = [], [], []
        for record in block:
            # Keep only the part of the name before the first dot
            # (e.g. "Homo_sapiens.chr21" -> "Homo_sapiens").
            names.append(record.name.split('.')[0])
            starts.append(record.annotations['start'])
            gaps.append(''.join('0' if nt == '-' else '1' for nt in record.seq))
        dct[i] = {'names': names, 'start': starts, 'gaps': gaps}
    return dct

In [81]:
# Run the window query again before building the coordinate dictionary
# (presumably because the search result is a one-shot iterator - confirm).
results = idx.search(starts=[20000000], ends=[20050000])
dct = get_coord_dict(results)
# Display the entry for the first alignment block.
dct[0]

{'names': ['Gorilla_gorilla_gorilla',
  'Homo_sapiens',
  'Pan_paniscus',
  'Pongo_abelii'],
 'start': [6586419, 19999946, 20071554, 7175441],
 'gaps': ['111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111

As a sanity check, we can compare two versions of the end coordinate for every record: the one taken from the alignment annotations (start + size) and the one obtained by adding the sum of the binary gap vector (i.e. the number of non-gap positions) to the start coordinate.

In [85]:
# Sanity check: for every record, the end coordinate taken from the
# annotations (start + size) should equal start + number of non-gap
# characters in the aligned sequence.
dct2 = {}
results = idx.search([20000000], [20050000])

for i, align in enumerate(results):
    dct2[i] = {'end': [], 'end2': []}
    # The per-block FASTA files are not rewritten here: that write was
    # unrelated to the check and is already done where the blocks are saved.
    for record in align:
        start = record.annotations['start']
        # End according to the 'size' annotation of the record.
        dct2[i]['end'].append(start + record.annotations['size'])
        # End according to the count of non-gap characters in the sequence.
        dct2[i]['end2'].append(start + sum(1 for n in record.seq if n != '-'))

# One line per alignment block; True means both end estimates agree for
# every record in that block.
for i in dct2:
    print(dct2[i]['end'] == dct2[i]['end2'])

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
