In [1]:
import os
import gffutils
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
import json
from pyfaidx import Fasta
from Bio import SeqIO
import re
from pprint import pprint as pp

In [2]:
# globals
# species whose aligned sequences are scanned for cds coordinates
template_species_list = ["Bcur", "Bdor", "Bole", "Ccap"]
# NOTE(review): not referenced in the cells visible here -- confirm it is still needed
transvestigated_species_set = {'Bcor', 'Blat', 'Bzon', 'Afra', 'Bmin', 'Bjar', 'Aobl'}
# NOTE(review): gff_path and groups_fn are not referenced in the cells visible here
gff_path = "./input/gff/"
fasta_path = "./input/fasta/"
groups_fn = "./input/groups_filtered_6181genes.txt"
# directory paths are joined to file names with '+', so they must keep their trailing '/'
output_path = "./output/"
intermediate_path = "./intermediate/"
aligned_fasta_path = "./input/aligned_13spp_fasta/"

# gap filter parameters
# both are upper bounds (compared with '>' / '<='); 0 rejects any sequence containing a '-'
max_gap_percent = 0
max_gap_length = 0
# cds length filter parameters
# inclusive bounds on (end - start) of an aligned cds
min_cds_length = 400
max_cds_length = 600

In [3]:
# gffutils databases: key is the file name up to '.gff.db', value is the opened FeatureDB
gff_fn = {}
for name in os.listdir(intermediate_path):
    if ".gff.db" in name:
        gff_fn[name.split('.gff.db')[0]] = intermediate_path + name
gff = {}
for key, value in gff_fn.items():
    gff[key] = gffutils.FeatureDB(value)

# nucleotide fasta files: key is the file name up to '.nt.fasta'; index (.fai) files are skipped
fasta_fn = {}
for name in os.listdir(fasta_path):
    if ".nt.fasta" not in name:
        continue
    if ".nt.fasta.fai" in name:
        continue
    fasta_fn[name.split('.nt.fasta')[0]] = fasta_path + name
fasta = {}
for key, value in fasta_fn.items():
    fasta[key] = Fasta(value)

# ortholog groups, read from the json file in the intermediate directory
with open(intermediate_path + "groups.json", 'r') as f:
    parent_groups = json.load(f)

# aligned fasta files: key is the file name up to '.13spp'; only paths are stored here
aligned_fasta_fn = {}
for name in os.listdir(aligned_fasta_path):
    if ".fasta.aln" not in name:
        continue
    if ".fasta.aln.fai" in name:
        continue
    aligned_fasta_fn[name.split('.13spp')[0]] = aligned_fasta_path + name

In [4]:
# define functions to parse coordinates of cds's from concatenated aligned fasta w/ n's and -'s
# minimum number of n's (alignment gaps excluded) for a run to count as a breakpoint
nnn = 50
# a run starts at an 'n' and may continue through any mix of 'n' and '-' characters
_BREAKPOINT_REGEX = re.compile(r"n+[-n]*")

def findBreakpoints(seq):
    """Locate the n-runs that separate cds's in a concatenated aligned sequence.

    A candidate run starts at a lowercase 'n' and extends over any following
    'n' and '-' characters (so gaps trailing the n's are part of the run, gaps
    preceding them are not). A run is reported only if it contains at least
    `nnn` n's once the '-' characters are removed.

    The previous pattern, ``n+[-+n+]*``, had a literal '+' inside the
    character class, so '+' characters were absorbed into a run and counted
    towards the `nnn` threshold. Only 'n' and '-' are accepted now.

    Args:
        seq: the aligned sequence as a str.

    Returns:
        A list of (start, end) tuples, end exclusive, in left-to-right order.
    """
    return [
        match.span()
        for match in _BREAKPOINT_REGEX.finditer(seq)
        if len(match.group().replace('-', '')) >= nnn
    ]

def findExonCoords(seq):
    """Return the (start, end) spans of `seq` that lie between breakpoints.

    The breakpoints come from findBreakpoints(seq). The first span starts at
    0, every later span starts where a breakpoint ends, each span ends where
    the next breakpoint starts, and the last span ends at len(seq). With no
    breakpoints the whole sequence is returned as a single span.

    Args:
        seq: the aligned sequence as a str.

    Returns:
        A list of (start, end) tuples, end exclusive; one more than the
        number of breakpoints.
    """
    gaps = findBreakpoints(seq)
    exon_starts = [0] + [gap_end for _, gap_end in gaps]
    exon_ends = [gap_start for gap_start, _ in gaps] + [len(seq)]
    return list(zip(exon_starts, exon_ends))
    
def gapPercent(seq):
    """Return the percentage (0.0-100.0) of characters in `seq` that are '-'.

    Args:
        seq: a str, or any object whose str() is the sequence text.

    Returns:
        100 * (number of '-') / len(seq) as a float. An empty sequence has no
        gaps and yields 0.0 rather than raising ZeroDivisionError.
    """
    seq = str(seq)
    gappedLen = len(seq)
    if gappedLen == 0:
        return 0.0
    return (100.0 * seq.count('-')) / gappedLen

def longestGap(seq):
    """Return the length of the longest uninterrupted run of '-' in `seq`.

    Args:
        seq: a str, or any object whose str() is the sequence text.

    Returns:
        The run length as an int, or 0 when `seq` contains no '-'.
    """
    gap_runs = re.findall(r"-+", str(seq))
    if not gap_runs:
        return 0
    return max(len(run) for run in gap_runs)

In [5]:
# parse every aligned fasta file: aligned_fasta[ortho][record id] = SeqRecord
aligned_fasta = {}
for ortho, aln_fn in aligned_fasta_fn.items():
    records_by_id = {}
    for seq_record in SeqIO.parse(aln_fn, "fasta", alphabet=IUPAC.ambiguous_dna):
        # a later record with the same id replaces the earlier one
        records_by_id[seq_record.id] = seq_record
    aligned_fasta[ortho] = records_by_id

In [6]:
# parse coords from template species in aligned fasta's and trash entries w/ all gaps
# coords[ortho][sp] is a single (start, end) tuple, or a list of such tuples
# when more than one non-gap cds is found for that species
coords = {}
for ortho, records in aligned_fasta.items():
    sp_coords = coords[ortho] = {}
    for sp in template_species_list:
        aligned_seq = str(records[sp].seq)
        for start, end in findExonCoords(aligned_seq):
            segment = aligned_seq[start:end]
            if segment.count('-') == len(segment):
                continue  # nothing but gaps (or empty): discard
            span = (start, end)
            if sp not in sp_coords:
                sp_coords[sp] = span
            elif isinstance(sp_coords[sp], list):
                sp_coords[sp].append(span)
            else:
                # second hit for this species: promote the stored tuple to a list
                sp_coords[sp] = [sp_coords[sp], span]

In [7]:
# sanity check: report every ortho,sp whose entry is a list, i.e. more than one non-gap cds
for ortho, sp_coords in coords.items():
    for sp, entry in sp_coords.items():
        if not isinstance(entry, list):
            continue
        print("error, multiple non-gap template cds's for {},{}: {}".format(ortho, sp, entry))

In [8]:
# Filter aligned exons
# ortho_coords[ortho][coord] = set of template species whose cds at `coord`
# passed the length, gap-percent and gap-length filters
ortho_coords = {}
for ortho, sp_coords in coords.items():
    passing = ortho_coords[ortho] = {}
    for sp, coord in sp_coords.items():
        # An entry is a list when several non-gap cds's were found for this
        # species. Unpacking it as (start, end) would fail with an obscure
        # TypeError/ValueError, so skip it explicitly instead.
        if isinstance(coord, list):
            print("warning, skipping {},{}: multiple non-gap template cds's {}".format(ortho, sp, coord))
            continue

        # filter for length (bounds are inclusive)
        start, end = coord
        if not min_cds_length <= end - start <= max_cds_length:
            continue

        # filter for gap percent
        seq = str(aligned_fasta[ortho][sp].seq[start:end])
        if gapPercent(seq) > max_gap_percent:
            continue

        # filter for gap length
        if longestGap(seq) > max_gap_length:
            continue

        # prep to filter for species membership of ortho
        passing.setdefault(coord, set()).add(sp)

# set of coords per ortho which were represented in all template species
universal_ortho_coords = {}
for ortho, passing in ortho_coords.items():
    for coord, sp_set in passing.items():
        if len(sp_set) == len(template_species_list):
            universal_ortho_coords.setdefault(ortho, set()).add(coord)
        else:
            print("warning, {} {} has only {}".format(ortho, coord, sp_set))



In [9]:
import pandas
from bokeh.charts import Histogram, show
from bokeh.io import output_notebook

In [10]:
# one row per universally represented (ortho, coord) pair, with its aligned length
data = [
    (ortho, (start, end), end - start)
    for ortho in sorted(universal_ortho_coords.keys())
    for start, end in sorted(universal_ortho_coords[ortho])
]

df = pandas.DataFrame.from_records(
    data=data,
    columns=['ortho', 'coord', 'Aligned CDS Length'],
)
print(df.describe())

output_notebook()
hist = Histogram(df, values="Aligned CDS Length", title="Aligned CDS Length Histogram")
show(hist)

       Aligned CDS Length
count          989.000000
mean           487.566229
std             58.328895
min            400.000000
25%            435.000000
50%            480.000000
75%            539.000000
max            600.000000


In [11]:
# fasta prep
# fasta_prep[ortho] = SeqRecords sliced to each universal coord; for every coord
# the template species come first, then the remaining species, each in sorted id order
fasta_prep = {}
for ortho, coord_set in universal_ortho_coords.items():
    records = fasta_prep[ortho] = []
    for start, end in coord_set:
        template_recs = []
        other_recs = []
        for sp in sorted(aligned_fasta[ortho]):
            source = aligned_fasta[ortho][sp]
            sliced = SeqRecord(source.seq[start:end], id=sp, description=source.description)
            if sp in template_species_list:
                template_recs.append(sliced)
            else:
                other_recs.append(sliced)
        records.extend(template_recs)
        records.extend(other_recs)


In [12]:
# pretty-print the whole fasta_prep dict (every ortho with its full record list)
pp(fasta_prep)

{'orth10028_686-1116': [SeqRecord(seq=Seq('gtcgcaattttcgcctatgactcggccactgtacccggcacggcggccacacgc...ttg', IUPACAmbiguousDNA()), id='Bcur', name='<unknown name>', description='Bcur cds913_1', dbxrefs=[]),
                        SeqRecord(seq=Seq('gtcgcaattttcgcatatgactccgccacagtaccgggcacagaggccacacgc...ttg', IUPACAmbiguousDNA()), id='Bdor', name='<unknown name>', description='Bdor cds9243_1', dbxrefs=[]),
                        SeqRecord(seq=Seq('gtcgcaattttcgcctacgactccgccacagttccgggcacggaggccacacgc...ttg', IUPACAmbiguousDNA()), id='Bole', name='<unknown name>', description='Bole cds9629_1', dbxrefs=[]),
                        SeqRecord(seq=Seq('gtcgcaattttcgcctacgactccgccacagtaccagctacggcgaccacacgt...ttg', IUPACAmbiguousDNA()), id='Ccap', name='<unknown name>', description='Ccap cds3088_1', dbxrefs=[]),
                        SeqRecord(seq=Seq('atcgcaattttcgcctacgattcggcaaccgtaccgggcacggcgacaacgcaa...ttg', IUPACAmbiguousDNA()), id='Aobl', name='<unknown name>', description='Aobl T

In [13]:
# number of orthos in fasta_prep (one entry per key of universal_ortho_coords)
len(fasta_prep)

989

In [14]:
# drop records that exceed the gap limits, then keep only orthos that still have >= 8 records
gap_filtered = {}
for ortho, rec_list in fasta_prep.items():
    kept = [
        rec for rec in rec_list
        if not (gapPercent(rec.seq) > max_gap_percent or longestGap(rec.seq) > max_gap_length)
    ]
    if len(kept) >= 8:
        gap_filtered[ortho] = kept
fasta_prep = gap_filtered

In [15]:
# number of orthos left in fasta_prep at this point in the notebook
len(fasta_prep)

959

In [16]:
# fasta output: one "<ortho>.13spp.fasta" file per ortho, records in fasta_prep order
for ortho, rec_list in fasta_prep.items():
    out_fn = output_path + ortho + ".13spp.fasta"
    with open(out_fn, "w") as f:
        f.writelines(rec.format("fasta") for rec in rec_list)

In [17]:
# one row per ortho with the number of records that survived filtering
data = [(ortho, len(fasta_prep[ortho])) for ortho in sorted(fasta_prep.keys())]

df = pandas.DataFrame.from_records(
    data=data,
    columns=['ortho', 'sp_count'],
)
print(df.describe())

output_notebook()
hist = Histogram(df, values="sp_count", title="Species Count per Ortho Histogram")
show(hist)

         sp_count
count  959.000000
mean    10.549531
std      1.309750
min      8.000000
25%     10.000000
50%     11.000000
75%     11.000000
max     13.000000


In [18]:
# analyze coverage: coverage[record id] = set of orthos that contain a record with that id
coverage = {}
for ortho, rec_list in fasta_prep.items():
    for seqRec in rec_list:
        coverage.setdefault(seqRec.id, set()).add(ortho)

In [19]:
# list species from fewest to most covered orthos (ties keep coverage's key order)
for sp in sorted(coverage.keys(), key=lambda name: len(coverage[name])):
    print("species: {}\torthos: {}".format(sp, len(coverage[sp])))

species: Bjar	orthos: 347
species: Aobl	orthos: 400
species: Bmin	orthos: 432
species: Asus	orthos: 698
species: Btry	orthos: 789
species: Afra	orthos: 859
species: Blat	orthos: 908
species: Bzon	orthos: 920
species: Bcor	orthos: 928
species: Bcur	orthos: 959
species: Bole	orthos: 959
species: Ccap	orthos: 959
species: Bdor	orthos: 959


In [20]:
# every record id seen in coverage, in sorted order
full_species_list = sorted(coverage)
full_species_list

['Afra',
 'Aobl',
 'Asus',
 'Bcor',
 'Bcur',
 'Bdor',
 'Bjar',
 'Blat',
 'Bmin',
 'Bole',
 'Btry',
 'Bzon',
 'Ccap']

In [21]:
# position of each species within full_species_list
species_index = {sp: i for i, sp in enumerate(full_species_list)}
species_index

{'Afra': 0,
 'Aobl': 1,
 'Asus': 2,
 'Bcor': 3,
 'Bcur': 4,
 'Bdor': 5,
 'Bjar': 6,
 'Blat': 7,
 'Bmin': 8,
 'Bole': 9,
 'Btry': 10,
 'Bzon': 11,
 'Ccap': 12}

In [22]:
# number of shared orthos for every unordered species pair, printed from smallest to largest
pair_overlaps = []
for sp1 in full_species_list:
    # start at sp1's own position so each pair is visited once; sp1 itself is skipped
    for sp2 in full_species_list[species_index[sp1]:]:
        if sp1 == sp2:
            continue
        shared = coverage[sp1].intersection(coverage[sp2])
        pair_overlaps.append((sp1, sp2, len(shared)))
pair_overlaps.sort(key=lambda rec: rec[2])

for sp1, sp2, count in pair_overlaps:
    print("sp1: {}\tsp2: {}\torthos: {}".format(sp1,sp2,count))

sp1: Aobl	sp2: Bjar	orthos: 152
sp1: Bjar	sp2: Bmin	orthos: 165
sp1: Aobl	sp2: Bmin	orthos: 189
sp1: Asus	sp2: Bjar	orthos: 258
sp1: Bjar	sp2: Btry	orthos: 296
sp1: Aobl	sp2: Asus	orthos: 308
sp1: Afra	sp2: Bjar	orthos: 314
sp1: Asus	sp2: Bmin	orthos: 320
sp1: Aobl	sp2: Btry	orthos: 327
sp1: Bjar	sp2: Blat	orthos: 327
sp1: Bcor	sp2: Bjar	orthos: 336
sp1: Bjar	sp2: Bzon	orthos: 337
sp1: Bmin	sp2: Btry	orthos: 346
sp1: Bcur	sp2: Bjar	orthos: 347
sp1: Bdor	sp2: Bjar	orthos: 347
sp1: Bjar	sp2: Bole	orthos: 347
sp1: Bjar	sp2: Ccap	orthos: 347
sp1: Aobl	sp2: Blat	orthos: 378
sp1: Afra	sp2: Aobl	orthos: 379
sp1: Aobl	sp2: Bzon	orthos: 386
sp1: Aobl	sp2: Bcor	orthos: 389
sp1: Afra	sp2: Bmin	orthos: 392
sp1: Aobl	sp2: Bcur	orthos: 400
sp1: Aobl	sp2: Bdor	orthos: 400
sp1: Aobl	sp2: Bole	orthos: 400
sp1: Aobl	sp2: Ccap	orthos: 400
sp1: Blat	sp2: Bmin	orthos: 409
sp1: Bmin	sp2: Bzon	orthos: 413
sp1: Bcor	sp2: Bmin	orthos: 422
sp1: Bcur	sp2: Bmin	orthos: 432
sp1: Bdor	sp2: Bmin	orthos: 432
sp1: Bmi

In [23]:
# orthos covered by both Aobl and Bjar
coverage['Aobl'] & coverage['Bjar']

{'orth2345_455-1052',
 'orth2347_606-1023',
 'orth2351_737-1215',
 'orth2392_922-1381',
 'orth2395_2323-2777',
 'orth2405_0-588',
 'orth2412_253-701',
 'orth2412_965-1455',
 'orth2427_1553-2044',
 'orth2428_0-543',
 'orth2472_1850-2286',
 'orth2478_127-581',
 'orth2481_1660-2204',
 'orth2501_474-969',
 'orth2505_123-557',
 'orth2512_861-1408',
 'orth2514_0-495',
 'orth2528_741-1231',
 'orth2530_2367-2800',
 'orth2537_108-513',
 'orth2544_667-1171',
 'orth2547_1733-2295',
 'orth2550_125-563',
 'orth2553_637-1195',
 'orth2564_0-505',
 'orth2564_555-986',
 'orth2595_334-822',
 'orth2596_149-624',
 'orth2598_829-1324',
 'orth2612_715-1187',
 'orth2616_2941-3434',
 'orth2620_140-593',
 'orth2627_82-673',
 'orth2635_2540-2998',
 'orth2645_196-696',
 'orth2669_505-939',
 'orth2675_762-1190',
 'orth2677_1118-1599',
 'orth2688_542-1008',
 'orth2729_281-760',
 'orth2752_209-725',
 'orth2765_1213-1616',
 'orth2765_1666-2131',
 'orth2765_690-1163',
 'orth2765_89-516',
 'orth2772_374-940',
 'orth27

In [24]:
# one row per ortho covered by both Aobl and Bjar, with its surviving record count
shared_orthos = coverage['Aobl'].intersection(coverage['Bjar'])
data = [(ortho, len(fasta_prep[ortho])) for ortho in sorted(shared_orthos)]

df = pandas.DataFrame.from_records(
    data=data,
    columns=['ortho', 'sp_count'],
)
print(df.describe())

output_notebook()
hist = Histogram(df, values="sp_count", title="Histogram of Species Count per Ortho shaired by Aobl and Bjar")
show(hist)

         sp_count
count  152.000000
mean    11.967105
std      0.979375
min      9.000000
25%     11.000000
50%     12.000000
75%     13.000000
max     13.000000
