In [1]:
import os
import gffutils
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
import json
from pyfaidx import Fasta
from Bio import SeqIO
import re
from pprint import pprint as pp

In [2]:
# --- Species codes -----------------------------------------------------------
# All 13 four-letter species codes.
full_species_list = [
    'Bjar', 'Aobl', 'Bmin', 'Asus', 'Btry', 'Afra', 'Blat',
    'Bzon', 'Bcor', 'Ccap', 'Bcur', 'Bole', 'Bdor',
]
# Four-code subset of full_species_list.
species_list = ["Bcur", "Bdor", "Bole", "Ccap"]
# Seven-code subset of full_species_list (disjoint from species_list).
transvestigated_species_set = {
    'Bcor', 'Blat', 'Bzon', 'Afra', 'Bmin', 'Bjar', 'Aobl',
}

# --- Directory layout (all paths relative to the notebook, with trailing '/') -
input_path = "./input/"
gff_path = "./input/gff/"
fasta_path = "./input/fasta/"
groups_fn = "./input/groups_filtered_6181genes.txt"
intermediate_path = "./intermediate/"
aligned_fasta_path = "./intermediate/13spp_aligned_trimmed_filtered_fasta/"
primer_products_path = "./intermediate/primer_products/"
output_path = "./output/"

# One primer3 output directory per "MaxNs" setting (1, 2, 3), in that order;
# the position in this list is used as the index into per-setting results.
p3_out_path = [
    "./input/p3out_200_600_MaxNs1_lowerTM/",
    "./input/p3out_200_600_MaxNs2_lowerTM/",
    "./input/p3out_200_600_MaxNs3_lowerTM/",
]

In [3]:
# Map each name (the part of the file name before '.13spp.fasta') to the path
# of its FASTA file in primer_products_path. Entries whose name contains
# '.13spp.fasta.fai' (the index files) are left out.
fasta_fn = dict(
    (entry.split('.13spp.fasta')[0], primer_products_path + entry)
    for entry in os.listdir(primer_products_path)
    if ".13spp.fasta" in entry and ".13spp.fasta.fai" not in entry
)

In [4]:
# Parse every primer3 output file in the three p3_out_path directories.
#
# primer[i] maps a name (the file name up to '.degenerate.p3.out') to a
# 7-tuple of strings:
#   (target length = product size minus both primer lengths,
#    PRIMER_LEFT_0 wrapped in double quotes,
#    PRIMER_RIGHT_0 wrapped in double quotes,
#    left primer sequence, right primer sequence,
#    left primer Tm, right primer Tm)
# Files reporting PRIMER_PAIR_NUM_RETURNED == '0' contribute no entry.
primer = [dict() for i in range(3)]
for i in range(3):
    p3_dir = p3_out_path[i]
    for p3_out_fn in os.listdir(p3_dir):
        ortho = p3_out_fn.split('.degenerate.p3.out')[0]
        with open(p3_dir + p3_out_fn, 'r') as f:
            # Each record line is KEY=VALUE. partition() splits on the first
            # '=' only, so blank lines and values that themselves contain '='
            # no longer break the parse; lines with an empty key (e.g. a lone
            # '=' terminator) are dropped.
            pairs = (line.strip().partition('=') for line in f)
            lines = {key: value for key, sep, value in pairs if key != ''}
        # Compare by value: identity checks against string literals only
        # work by accident of interning and warn on Python >= 3.8.
        if lines['PRIMER_PAIR_NUM_RETURNED'] != '0':
            test = lines  # debugging alias of the last parsed record; kept in case it is inspected elsewhere
            # PRIMER_LEFT_0 / PRIMER_RIGHT_0 are two comma-separated fields;
            # the second one is used as the primer length.
            left, l_len = lines['PRIMER_LEFT_0'].split(',')
            right, r_len = lines['PRIMER_RIGHT_0'].split(',')
            target_len = int(lines['PRIMER_PAIR_0_PRODUCT_SIZE']) - int(l_len) - int(r_len)
            primer[i][ortho] = (
                str(target_len),
                # These two values contain a comma, so they are pre-quoted
                # for the hand-written CSV produced later in the notebook.
                '"' + lines['PRIMER_LEFT_0'] + '"',
                '"' + lines['PRIMER_RIGHT_0'] + '"',
                lines['PRIMER_LEFT_0_SEQUENCE'],
                lines['PRIMER_RIGHT_0_SEQUENCE'],
                lines['PRIMER_LEFT_0_TM'],
                lines['PRIMER_RIGHT_0_TM'],
            )

In [5]:
# Flatten the three per-setting dicts into rows of
# (name, setting number 1..3, *primer fields).
data = [
    (name, level + 1, *primer[level][name])
    for level in range(3)
    for name in primer[level]
]

In [6]:
# Order rows by their first two columns: name, then setting number.
data = sorted(data, key=lambda row: (row[0], row[1]))

In [7]:
# Column names for the output table, in the same order as the fields of each
# row in `data`: name, setting number, then the seven primer fields.
header = [
    'Exon_Name',
    'ambiguities_allowed',
    'Target_Sequence_Length',
    'PRIMER_LEFT_0',
    'PRIMER_RIGHT_0',
    'PRIMER_LEFT_0_SEQUENCE',
    'PRIMER_RIGHT_0_SEQUENCE',
    'PRIMER_LEFT_0_TM',
    'PRIMER_RIGHT_0_TM',
]

In [8]:
# Convert every field of every row to str so rows can be joined as text.
data = [tuple(map(str, row)) for row in data]

In [9]:
import csv
# Write the header followed by one comma-joined line per row. Fields are
# joined verbatim (no CSV escaping is applied here) and the file has no
# trailing newline.
with open("./output/13spp_exon_primer_data.csv", "w") as out:
    out.write("\n".join(",".join(row) for row in [header] + data))

In [10]:
# Names that have a primer pair at setting "1", "2" and "3" respectively
# (column 1 of `data` holds the setting number as a string at this point).
a, b, c = (
    {row[0] for row in data if row[1] == level}
    for level in ("1", "2", "3")
)
# The seven regions of the three-set Venn diagram.
# Note: `x & y - z` parses as `x & (y - z)`, which is the same set as
# `(x & y) - z`; the parentheses below only make that explicit.
venn = [
    a - b - c,      # only in a
    b - c - a,      # only in b
    c - a - b,      # only in c
    (a & b) - c,    # in a and b, not c
    (b & c) - a,    # in b and c, not a
    (c & a) - b,    # in c and a, not b
    a & b & c,      # in all three
]

In [11]:
# Sizes of the three per-setting name sets.
tuple(len(group) for group in (a, b, c))

(57, 177, 344)

In [12]:
# Size of each Venn region, in the order they were built.
list(map(len, venn))

[0, 0, 167, 0, 120, 0, 57]

In [13]:
# Number of names with a primer pair at each of the three settings.
[len(primer[level]) for level in (0, 1, 2)]

[57, 177, 344]