In [1]:
import os
import gffutils
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
import json
from pyfaidx import Fasta
from Bio import SeqIO
import re
from pprint import pprint as pp

In [2]:
# --- Species sets -----------------------------------------------------------
# All 13 species codes used in this notebook.
full_species_list = [
    'Bjar', 'Aobl', 'Bmin', 'Asus', 'Btry', 'Afra', 'Blat',
    'Bzon', 'Bcor', 'Ccap', 'Bcur', 'Bole', 'Bdor',
]
# Four-species subset.
species_list = ["Bcur", "Bdor", "Bole", "Ccap"]
# Seven-species subset.
transvestigated_species_set = {
    'Bcor', 'Blat', 'Bzon', 'Afra', 'Bmin', 'Bjar', 'Aobl',
}

# --- Directory roots --------------------------------------------------------
input_path = "./input/"
intermediate_path = "./intermediate/"
output_path = "./output/"

# --- Input / intermediate locations -----------------------------------------
gff_path = "./input/gff/"
fasta_path = "./input/fasta/"
groups_fn = "./input/groups_filtered_6181genes.txt"
p3_out_path = "./input/p3out_200_600_MaxNs2_lowerTM/"
aligned_fasta_path = "./intermediate/13spp_aligned_trimmed_filtered_fasta/"

In [3]:
# Load the per-species list of alternate species from alternate_sp.json
# and pretty-print it for inspection.
alternate_sp_fn = input_path + "alternate_sp.json"
with open(alternate_sp_fn, 'r') as json_handle:
    alternate_sp = json.load(json_handle)
pp(alternate_sp)

{'Afra': ['Asus', 'Aobl', 'Ccap'],
 'Aobl': ['Afra', 'Asus', 'Ccap'],
 'Asus': ['Afra', 'Aobl', 'Ccap'],
 'Bcor': ['Bzon', 'Bdor'],
 'Bjar': ['Btry', 'Blat', 'Bdor'],
 'Blat': ['Bdor'],
 'Bmin': ['Bole'],
 'Btry': ['Bjar', 'Blat', 'Bdor'],
 'Bzon': ['Bcor', 'Bdor']}


In [4]:
# Map each ortholog name (the file name minus the ".13spp.fasta" suffix) to the
# path of its aligned FASTA file.
# The suffix is matched exactly, so index files (".13spp.fasta.fai") and any
# other derived file that merely contains ".13spp.fasta" in its name are skipped.
# The listing is sorted so the dictionary order is reproducible between runs.
fasta_suffix = ".13spp.fasta"
fasta_fn = {name[:-len(fasta_suffix)]: aligned_fasta_path + name
            for name in sorted(os.listdir(aligned_fasta_path))
            if name.endswith(fasta_suffix)}

In [5]:
# Parse every ortholog FASTA file into {record id: SeqRecord}, collected in
# `fasta` under the ortholog name.
fasta = {}
for ortho, fasta_file in fasta_fn.items():
    records_by_id = {}
    for seq_record in SeqIO.parse(fasta_file, "fasta", alphabet=IUPAC.ambiguous_dna):
        # A later record with the same id replaces an earlier one.
        records_by_id[seq_record.id] = seq_record
    fasta[ortho] = records_by_id

In [6]:
# For every Primer3 output file, record the template interval that lies strictly
# between the first returned primer pair:
#   primer[ortho] = (start, end)
# where `start` is the first index after the left primer and `end` is the first
# index of the right primer, so template[start:end] is the primer-free interior.
primer = {}
for p3_out_fn in os.listdir(p3_out_path):
    ortho = p3_out_fn.split('.degenerate.p3.out')[0]
    with open(p3_out_path + p3_out_fn, 'r') as f:
        # Each useful line is KEY=VALUE. Split on the first '=' only so values that
        # contain '=' stay intact, and skip blank lines or a bare '=' line, which
        # carry no key.
        lines = {}
        for line in f:
            key, sep, value = line.strip().partition('=')
            if sep and key != '':
                lines[key] = value
    # Compare by value: `is not '0'` tests object identity and only appears to work
    # because of interpreter-level string caching. A file without the tag is
    # treated as having returned no pairs.
    if lines.get('PRIMER_PAIR_NUM_RETURNED', '0') != '0':
        # Kept on purpose: the cells below inspect `test`, `left`, `l_len`,
        # `right` and `r_len` from the last file that returned a pair.
        test = lines
        left, l_len = lines['PRIMER_LEFT_0'].split(',')
        right, r_len = lines['PRIMER_RIGHT_0'].split(',')
        # PRIMER_LEFT_0 is "<first index>,<length>"; PRIMER_RIGHT_0 is
        # "<last index>,<length>" (consistent with the slicing check further down).
        start = int(left) + int(l_len)
        end = int(right) - int(r_len) + 1
        primer[ortho] = (start, end)
primer

{'orth10028_686-1116': (138, 303),
 'orth10262_574-1105': (218, 467),
 'orth10315_1281-1727': (151, 312),
 'orth10339_505-906': (150, 318),
 'orth10425_0-403': (74, 301),
 'orth10425_453-875': (117, 306),
 'orth2462_53-471': (72, 354),
 'orth2544_667-1171': (189, 433),
 'orth2553_637-1195': (179, 346),
 'orth2566_1289-1740': (91, 290),
 'orth2612_715-1187': (24, 405),
 'orth2616_2941-3434': (121, 409),
 'orth2645_196-696': (71, 404),
 'orth2765_1666-2131': (206, 363),
 'orth2872_268-817': (254, 456),
 'orth2884_2067-2578': (84, 362),
 'orth2894_628-1207': (20, 200),
 'orth2923_57-459': (171, 362),
 'orth2935_486-1024': (129, 292),
 'orth2955_2160-2617': (273, 429),
 'orth3092_555-1082': (27, 469),
 'orth3129_389-852': (44, 337),
 'orth3465_0-564': (123, 383),
 'orth3556_678-1151': (153, 424),
 'orth3682_81-506': (44, 344),
 'orth3696_1155-1653': (126, 303),
 'orth3767_964-1531': (333, 507),
 'orth3814_815-1335': (56, 324),
 'orth3819_0-468': (21, 228),
 'orth3835_0-577': (50, 387),
 'o

In [7]:
# Show the raw Primer3 key/value record kept in `test` by the primer-parsing
# loop (the most recently processed file that returned at least one primer pair).
test

{'PRIMER_INTERNAL_NUM_RETURNED': '0',
 'PRIMER_LEFT_0': '195,20',
 'PRIMER_LEFT_0_END_STABILITY': '3.9600',
 'PRIMER_LEFT_0_GC_PERCENT': '52.632',
 'PRIMER_LEFT_0_HAIRPIN_TH': '0.00',
 'PRIMER_LEFT_0_PENALTY': '0.780121',
 'PRIMER_LEFT_0_SELF_ANY_TH': '19.57',
 'PRIMER_LEFT_0_SELF_END_TH': '5.34',
 'PRIMER_LEFT_0_SEQUENCE': 'TCRCACACCTATCGCATGCA',
 'PRIMER_LEFT_0_TM': '55.220',
 'PRIMER_LEFT_1': '194,21',
 'PRIMER_LEFT_1_END_STABILITY': '3.9600',
 'PRIMER_LEFT_1_GC_PERCENT': '50.000',
 'PRIMER_LEFT_1_HAIRPIN_TH': '0.00',
 'PRIMER_LEFT_1_PENALTY': '1.013615',
 'PRIMER_LEFT_1_SELF_ANY_TH': '19.57',
 'PRIMER_LEFT_1_SELF_END_TH': '5.34',
 'PRIMER_LEFT_1_SEQUENCE': 'TTCRCACACCTATCGCATGCA',
 'PRIMER_LEFT_1_TM': '56.014',
 'PRIMER_LEFT_2': '195,20',
 'PRIMER_LEFT_2_END_STABILITY': '3.9600',
 'PRIMER_LEFT_2_GC_PERCENT': '52.632',
 'PRIMER_LEFT_2_HAIRPIN_TH': '0.00',
 'PRIMER_LEFT_2_PENALTY': '0.780121',
 'PRIMER_LEFT_2_SELF_ANY_TH': '19.57',
 'PRIMER_LEFT_2_SELF_END_TH': '5.34',
 'PRIMER_LEFT_

In [8]:
# Sanity check for indices: the slices taken from the template must reproduce
# the primer sequences reported in the same record.
template = Seq(test['SEQUENCE_TEMPLATE'], alphabet=IUPAC.ambiguous_dna)
left_begin = int(left)
left_stop = left_begin + int(l_len)
right_begin = int(right) - int(r_len) + 1
right_stop = int(right) + 1
interior = template[left_stop:right_begin]

# Left primer as reported, then as sliced from the template.
print(Seq(test['PRIMER_LEFT_0_SEQUENCE'], alphabet=IUPAC.ambiguous_dna))
print(template[left_begin:left_stop])

# Right primer (reverse-complemented), then as sliced from the template.
print(Seq(test['PRIMER_RIGHT_0_SEQUENCE'], alphabet=IUPAC.ambiguous_dna).reverse_complement())
print(template[right_begin:right_stop])

# Region between the two primers and its length, followed by the product
# size reported in the record.
print(interior)
print(len(interior))
print(test['PRIMER_PAIR_0_PRODUCT_SIZE'])

TCRCACACCTATCGCATGCA
TCRCACACCTATCGCATGCA
CCTTTCGYAAGGACAGYGCCA
CCTTTCGYAAGGACAGYGCCA
GATGCARCGYTGTCARTCATCRCGCGCCTTTCCYCGYAACGCATCGCGWACYTCRCACTCDTCDGCSGCWGGNGCWCTBTCGCCWACVCGKTCYTTYAGCCARKCCTCMAGYCCRCCMAAGACMATDAAYACRRYGGARAGCCARAAYGATATYACGAAATTTCATTTRCGHYTKGTYGAYAARCTRCGYAAGT
194
235


In [9]:
# Trim every ortholog that has a primer pair to its own (start, end) interval.
# The interval is looked up in `primer` per ortholog instead of reusing whatever
# values the loop variables `ortho`, `start` and `end` were left holding by
# earlier cells, which could pair one ortholog with another's coordinates.
trimmed_fasta = {
    ortho_name: {sp: record[primer[ortho_name][0]:primer[ortho_name][1]]
                 for sp, record in fasta[ortho_name].items()}
    for ortho_name in fasta
    if ortho_name in primer
}

In [10]:
from copy import deepcopy

# Build two views per ortholog that has a primer interval:
#   trimmed_fasta[ortho] - the existing records sliced to [start:end]
#   padded_fasta[ortho]  - the same, plus a stand-in record for every species
#                          in full_species_list that is absent from the ortholog
padded_fasta = {}
trimmed_fasta = {}
for ortho, records in fasta.items():
    if ortho not in primer:
        continue
    start, end = primer[ortho]

    padding = {}
    for sp in full_species_list:
        if sp in records:
            continue
        # Use the first listed alternate species that is present; if none is
        # present the species simply gets no padding record.
        donor = next((alt for alt in alternate_sp[sp] if alt in records), None)
        if donor is None:
            continue
        donor_record = records[donor]
        padding[sp] = SeqRecord(donor_record.seq[start:end],
                                id=sp,
                                description="PADDING " + donor_record.description)

    trimmed_fasta[ortho] = {sp: record[start:end] for sp, record in records.items()}
    # Padding entries first, then the real (trimmed) records.
    padded_fasta[ortho] = {**padding, **trimmed_fasta[ortho]}

In [11]:
# Distinct numbers of species present per ortholog before padding.
set(map(len, trimmed_fasta.values()))

{8, 9, 10, 11, 12, 13}

In [12]:
# Distinct numbers of species present per ortholog after padding.
set(map(len, padded_fasta.values()))

{13}

In [13]:
# Rank (1-based) used to sort records when writing FASTA files.
sp_order = {
    sp: rank
    for rank, sp in enumerate(
        ['Bcur', 'Bdor', 'Bole', 'Ccap', 'Bcor', 'Blat', 'Bzon',
         'Afra', 'Bmin', 'Bjar', 'Aobl', 'Asus', 'Btry'],
        start=1,
    )
}

In [14]:
# Write, for each ortholog, the list of species that had a record before
# padding to pre_padding_species.json.
species_before_padding = {ortho: list(records) for ortho, records in trimmed_fasta.items()}
with open(intermediate_path + "pre_padding_species.json", 'w') as json_out:
    json.dump(species_before_padding, json_out)

In [15]:
# Write one FASTA file per ortholog containing the trimmed records, ordered by
# their rank in sp_order.
trimmed_out_dir = output_path + "trimmed_fasta/"
# open(..., "w") does not create missing directories, so make sure the target
# directory exists before writing.
os.makedirs(trimmed_out_dir, exist_ok=True)
for ortho in trimmed_fasta.keys():
    with open(trimmed_out_dir + ortho + ".13spp.fasta", "w") as f:
        for seqReq in sorted(trimmed_fasta[ortho].values(), key=lambda x: sp_order[x.id]):
            f.write(seqReq.format("fasta"))

In [16]:
# Write one FASTA file per ortholog containing the padded record set (real
# records plus stand-ins), ordered by their rank in sp_order.
padded_out_dir = output_path + "padded_fasta/"
# open(..., "w") does not create missing directories, so make sure the target
# directory exists before writing.
os.makedirs(padded_out_dir, exist_ok=True)
for ortho in padded_fasta.keys():
    with open(padded_out_dir + ortho + ".13spp.fasta", "w") as f:
        for seqReq in sorted(padded_fasta[ortho].values(), key=lambda x: sp_order[x.id]):
            f.write(seqReq.format("fasta"))