In [1]:
from Bio import AlignIO
from Bio.AlignIO import MafIO
import os
import pandas as pd

First, we parse the MAF alignment with Biopython's `AlignIO`.

In [3]:
# Location of the chr21 MAF alignment.
# NOTE: this file is too large to be committed to the repository.
maf_path = '../data/small_subset/chr21.maf'
# Parse the MAF alignment
alignment = AlignIO.parse(maf_path, 'maf')

We can now index the MAF file with `MafIO.MafIndex`, using `Homo_sapiens.chr21` as the target sequence:

In [4]:
# Index file, MAF alignment and the sequence the index is keyed on
index_path = "../data/chr21.mafindex"
maf_path = '../data/small_subset/chr21.maf'
target_seq = "Homo_sapiens.chr21"
idx = MafIO.MafIndex(index_path, maf_path, target_seq)

If we want to retrieve a 50 kb window, we can do so with the search method:

In [124]:
# Boundaries of the 50 kb window to retrieve
window_starts = [20000000]
window_ends = [20050000]
results = idx.search(window_starts, window_ends)
# Write every retrieved alignment to one FASTA file; the value returned
# by the write call is displayed as the cell output
AlignIO.write(results, '../results/small_subset/chr21_subset.fa', "fasta")

15

We can also save each of the alignments separately:

In [125]:
# Re-run the search and write each alignment to its own numbered FASTA file
results = idx.search([20000000], [20050000])
fasta_template = '../results/small_subset/individual_alignments/chr21_subset{}.fa'
for block_no, block in enumerate(results):
    out_path = fasta_template.format(block_no)
    AlignIO.write(block, out_path, "fasta")

And we can save the coordinates easily:

In [126]:
def get_coord_df(align, fasta_template='../results/small_subset/individual_alignments/chr21_subset{}.fa'):
    """
    Build a data frame with one row per record of every alignment block.

    Parameters
    ----------
    align : iterable
        Iterable of alignment blocks (e.g. the result of ``idx.search``).
        Each block is iterated to obtain records exposing ``name``,
        ``annotations['start']`` and ``seq``.
    fasta_template : str or None, optional
        Path template with one ``{}`` placeholder that is filled with the
        block number; each block is written there in FASTA format as a
        side effect. Pass ``None`` to skip writing files. The default
        reproduces the path this function has always written to.

    Returns
    -------
    pandas.DataFrame
        Columns:
        - 'file': number of the block (position in ``align``),
        - 'species': part of the record name before the first '.',
        - 'chr': remainder of the record name (chromosome/contig),
        - 'start': start coordinate taken from the record annotations,
        - 'gaps': string with '0' for each gap ('-') and '1' otherwise.
        The index restarts at 0 for every block. An empty frame with the
        same columns is returned when ``align`` yields no blocks.
    """
    columns = ['file', 'species', 'chr', 'start', 'gaps']
    # One data frame per block; they are concatenated once at the end
    # instead of growing a frame inside the loop.
    frames = []
    # Iterate over the blocks that were passed in (not over a global)
    for i, block in enumerate(align):
        # Write individual FASTA file (optional side effect)
        if fasta_template is not None:
            AlignIO.write(block, fasta_template.format(i), "fasta")
        dct = {'file': [], 'species': [], 'chr': [], 'start': [], 'gaps': []}
        for record in block:
            # Split "species.contig" at the first dot only, so contig
            # names that contain dots (e.g. CM003404.1) stay intact
            species, _, contig = record.name.partition('.')
            dct['file'].append(i)
            dct['species'].append(species)
            dct['chr'].append(contig)
            dct['start'].append(record.annotations['start'])
            # Encode gaps in a binary format: 0 = gap, 1 = nucleotide
            dct['gaps'].append(''.join('0' if n == '-' else '1' for n in record.seq))
        frames.append(pd.DataFrame(dct, columns=columns))
    # pd.concat raises on an empty list, so handle "no blocks" explicitly
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)

In [127]:
# Re-run the search for the same 50 kb window and tabulate its coordinates
window_starts = [20000000]
window_ends = [20050000]
results = idx.search(window_starts, window_ends)
get_coord_df(results)

Unnamed: 0,file,species,chr,start,gaps
0,0,Gorilla_gorilla_gorilla,chr21,6586419,1111111111111111111111111111111111111111111111...
1,0,Homo_sapiens,chr21,19999946,1111111111111111111111111111111111111111111111...
2,0,Pan_paniscus,CM003404.1,20071554,1111111111111111111111111111111111111111111111...
3,0,Pongo_abelii,CM009283.2,7175441,1111111111111111111111111111111111111111111111...
0,1,Gorilla_gorilla_gorilla,chr21,6590887,1111111111111111111111111111111111111111111111...
1,1,Homo_sapiens,chr21,20004414,1111111111111111111111111111111111111111111111...
2,1,Pan_paniscus,CM003404.1,20076022,1111111111111111111111111111111111111111111111...
3,1,Pongo_abelii,CM009283.2,7179789,1111111111111111111111111111111111111111111111...
0,2,Gorilla_gorilla_gorilla,chr21,6595123,1111111111111111111111111111111111111111111111...
1,2,Homo_sapiens,chr21,20008672,1111111111111111111111111111111111111111111111...


As a sanity check, we can compare the end coordinates derived from the alignment annotations (start + size) with the end coordinates obtained by adding the number of non-gap positions in each sequence to its start coordinate.

In [111]:
# Re-run the search for the window before iterating over it again
results = idx.search([20000000], [20050000])
dct2 = {}

fasta_template = '../results/small_subset/individual_alignments/chr21_subset{}.fa'
for block_no, block in enumerate(results):
    # 'end'  : start + annotated size
    # 'end2' : start + number of non-gap characters in the sequence
    ends = {'end': [], 'end2': []}
    dct2[block_no] = ends
    AlignIO.write(block, fasta_template.format(block_no), "fasta")
    for rec in block:
        begin = rec.annotations['start']
        ends['end'].append(begin + rec.annotations['size'])
        ends['end2'].append(begin + sum(1 for base in rec.seq if base != '-'))

# One line per block: True when both ways of computing the ends agree
for ends in dct2.values():
    print(ends['end'] == ends['end2'])

True
True
True
True
True
True
True
True
True
True
True
True
True
True
