In [2]:
#!/usr/bin/env python
# coding: utf-8

import os
import gffutils
from Bio.Alphabet import IUPAC
import json
from Bio import SeqIO
import sqlite3
import argparse
from config import full_species_list, species_list, transvestigated_species_set, summary_fn, \
    primer3_path, unpadded_primer_product_path, db_path, json_path, pi_score_path

In [13]:
# Notebook-local overrides for five of the path settings imported from
# `config` in the import cell (`db_path` keeps its imported value).
# Directory values end in "/" because other cells build file names by plain
# string concatenation (e.g. `primer3_path + p3_out_fn`).
# NOTE(review): "summory.csv" looks like a typo for "summary.csv" -- confirm
# against whatever reads/writes this file before renaming it.
summary_fn = "../output0/summory.csv"
primer3_path = "../intermediate0/primer_design/"
unpadded_primer_product_path = "../output0/primerProducts/"
json_path = "../intermediate0/json/"
pi_score_path = "../intermediate0/phylo_informativeness/tapir_out/phylogenetic-informativeness.sqlite"

In [4]:
# Map each ortholog name (file name with ".13spp.fasta" removed) to the path
# of its FASTA file in `unpadded_primer_product_path`.
# The filter matches on the file-name *suffix*: this skips the ".fai" index
# files and also any other derivative (e.g. "x.13spp.fasta.bak") that a
# substring test would let through and that would collide on the same key.
fasta_fn = {name.split('.13spp.fasta')[0]: os.path.join(unpadded_primer_product_path, name)
            for name in os.listdir(unpadded_primer_product_path)
            if name.endswith('.13spp.fasta')}

In [6]:
# Load every FASTA file listed in `fasta_fn` into memory:
#   fasta[ortho][record id] -> SeqRecord
# If a record id occurs more than once in a file, the last record wins.
# NOTE(review): `Bio.Alphabet` and the `alphabet=` argument were removed in
# Biopython 1.78, so this cell requires an older Biopython -- confirm the
# pinned version.
fasta = {}
for ortho, fasta_path in fasta_fn.items():
    records = SeqIO.parse(fasta_path, "fasta", alphabet=IUPAC.ambiguous_dna)
    fasta[ortho] = {record.id: record for record in records}


In [20]:
# Parse the primer3 output ("KEY=VALUE" lines) found in `primer3_path`.
#
# Result: primer[ortho] is a list with one tuple per returned primer pair,
# indexed by the pair number used in the primer3 tag names:
#   (product size minus both primer lengths, as str,
#    left primer sequence, right primer sequence,
#    left primer Tm, right primer Tm)
# Files without a PRIMER_PAIR_NUM_RETURNED tag, or reporting '0' pairs, add
# no entry to `primer`.
primer = {}
for p3_out_fn in [fn for fn in os.listdir(primer3_path) if ".p3.out" in fn]:
    ortho = p3_out_fn.split('.degenerate.p3.out')[0]
    with open(os.path.join(primer3_path, p3_out_fn), 'r') as f:
        # partition() splits on the first '=' only, so blank lines and values
        # that themselves contain '=' no longer break the key/value unpacking.
        fields = [line.strip().partition('=') for line in f]
    # Drop entries with an empty key (blank lines and the lone "=" line).
    # Compare by value: `is not ''` tests object identity, which is
    # implementation-dependent and a SyntaxWarning on Python >= 3.8.
    lines = {key: value for key, sep, value in fields if key != ''}
    if 'PRIMER_PAIR_NUM_RETURNED' not in lines:
        continue
    if lines['PRIMER_PAIR_NUM_RETURNED'] != '0':
        primer[ortho] = []
        for variation in range(int(lines['PRIMER_PAIR_NUM_RETURNED'])):
            # PRIMER_LEFT_n / PRIMER_RIGHT_n hold two comma-separated fields;
            # the second is used here as the primer length.
            left, l_len = lines['PRIMER_LEFT_{}'.format(variation)].split(',')
            right, r_len = lines['PRIMER_RIGHT_{}'.format(variation)].split(',')
            primer[ortho].append((
                str(int(lines['PRIMER_PAIR_{}_PRODUCT_SIZE'.format(variation)]) - int(l_len) - int(r_len)),
                lines['PRIMER_LEFT_{}_SEQUENCE'.format(variation)],
                lines['PRIMER_RIGHT_{}_SEQUENCE'.format(variation)],
                lines['PRIMER_LEFT_{}_TM'.format(variation)],
                lines['PRIMER_RIGHT_{}_TM'.format(variation)]))


In [21]:
# Read the average `pi` value of every locus from the sqlite database at
# `pi_score_path`; `name_score` maps the locus name (with ".13spp.fasta"
# removed) to that average.
# NOTE(review): `conn` is not closed in this cell -- confirm whether later
# cells still need it.
conn = sqlite3.connect(pi_score_path)
pi_cursor = conn.execute("select loci.locus, avg(pi) from loci, discrete where loci.id = discrete.id group by loci.locus")
name_score = {locus.split(".13spp.fasta")[0]: mean_pi
              for locus, mean_pi in pi_cursor.fetchall()}


In [22]:
# Load pre_padding_species.json from `json_path`. The summary cell indexes
# the result by ortholog name and iterates over the species stored there.
with open(json_path + "pre_padding_species.json") as pre_padding_file:
    pre_padd_sp = json.loads(pre_padding_file.read())


In [23]:
# Load the ortholog groups from groups.json in `json_path`. The summary cell
# reads it as parent_groups[group id][species] to obtain the key of a feature
# in that species' GFF database.
with open(json_path + "groups.json") as groups_file:
    parent_groups = json.loads(groups_file.read())


In [24]:
# Open every gffutils database found in `db_path`.
#   gff_fn: "<name>" -> path of "<name>.gff.db"
#   gff:    "<name>" -> gffutils.FeatureDB opened on that path
# The filter matches on the file-name *suffix* so that side files such as
# "x.gff.db-journal" are not picked up; a substring test would map them to
# the same key as the real database and hand a non-database file to FeatureDB.
gff_fn = {name.split('.gff.db')[0]: os.path.join(db_path, name)
          for name in os.listdir(db_path)
          if name.endswith('.gff.db')}
gff = {key: gffutils.FeatureDB(value) for key, value in gff_fn.items()}

In [34]:
# Build the summary rows, one per (scored locus, species) pair:
#   (locus name, score as str, species, product annotation or "N/A",
#    *the primer tuple selected for that locus)
data = []
for ortho_plus, score in name_score.items():
    # NOTE(review): assumes every locus name ends in a three-character suffix
    # whose last character is a single-digit primer-pair index -- confirm
    # (this breaks for indices >= 10).
    ortho, variation = ortho_plus[:-3], int(ortho_plus[-1:])
    group_id = ortho.split("_")[0]
    score_text = str(score)
    for sp in pre_padd_sp[ortho]:
        # Look the feature up once and reuse it for both the test and the read.
        feature = gff[sp][parent_groups[group_id][sp]]
        product = feature['product'][0] if 'product' in feature.attributes.keys() else "N/A"
        data.append((ortho_plus, score_text, sp, product, *primer[ortho][variation]))
