In [3]:
#!/usr/bin/env python
# coding: utf-8

import argparse
import csv
import json
import os
import sqlite3

import gffutils
from Bio import SeqIO
from Bio.Alphabet import IUPAC

from config import full_species_list, species_list, transvestigated_species_set, summary_fn, \
    primer3_path, unpadded_primer_product_path, db_path, json_path, pi_score_path

In [14]:
# Local overrides for path settings that were also imported from `config`;
# after this cell runs, these values shadow the imported ones.
# `db_path` is not overridden here, so it keeps the value imported from `config`.
# NOTE(review): "summory.csv" looks like a typo for "summary.csv" - confirm
# before renaming, since other tooling may already read this file name.
summary_fn = "../output0/summory.csv"
primer3_path = "../intermediate0/primer_design/"
unpadded_primer_product_path = "../output0/primerProducts/"
json_path = "../intermediate0/json/"
# This is a directory (note the trailing slash): the cell that loops over
# os.listdir(pi_score_path) opens each file inside it with sqlite3.
pi_score_path = "../intermediate0/phylo_informativeness/sql/"

In [5]:
# Map each ortholog name to the path of its "<ortho>.13spp.fasta" file,
# skipping the ".fai" index files that live in the same directory.
fasta_fn = {name.split('.13spp.fasta')[0]: unpadded_primer_product_path + name for name in
            os.listdir(unpadded_primer_product_path) if
            ((".13spp.fasta" in name) and (".13spp.fasta.fai" not in name))}

# Parse every fasta file into {ortholog: {record id: SeqRecord}}.
fasta = {}
for ortho, ortho_path in fasta_fn.items():
    fasta[ortho] = {seq_record.id: seq_record
                    for seq_record in SeqIO.parse(ortho_path,
                                                  "fasta", alphabet=IUPAC.ambiguous_dna)}

# Collect the primer pairs reported in every "*.p3.out" file:
#   primer[ortho] = [(target_length, left_seq, right_seq, left_tm, right_tm), ...]
# with one tuple per returned pair, in the order the file numbers them.
# All tuple members are kept as strings.
primer = {}
for p3_out_fn in [fn for fn in os.listdir(primer3_path) if ".p3.out" in fn]:
    ortho = p3_out_fn.split('.degenerate.p3.out')[0]

    # Read the "KEY=VALUE" records. partition() splits on the first '=' only,
    # so a value containing '=' is kept intact, and lines without '=' or with
    # an empty key (for example a lone "=" line) are skipped instead of
    # breaking the two-name unpacking.
    lines = {}
    with open(primer3_path + p3_out_fn, 'r') as f:
        for raw_line in f:
            key, sep, value = raw_line.strip().partition('=')
            if sep and key != '':
                lines[key] = value

    # No pair count in the file: nothing to record for this ortholog.
    if 'PRIMER_PAIR_NUM_RETURNED' not in lines:
        continue

    # Compare by value; identity checks against string literals only work by
    # interpreter caching accident and raise SyntaxWarning on Python 3.8+.
    pair_count = int(lines['PRIMER_PAIR_NUM_RETURNED'])
    if pair_count == 0:
        continue

    primer[ortho] = []
    for variation in range(pair_count):
        # PRIMER_LEFT_n / PRIMER_RIGHT_n hold two comma-separated fields;
        # the second one is used as the primer length.
        left, l_len = lines['PRIMER_LEFT_{}'.format(variation)].split(',')
        right, r_len = lines['PRIMER_RIGHT_{}'.format(variation)].split(',')
        # Target length = product size minus both primer lengths.
        target_len = (int(lines['PRIMER_PAIR_{}_PRODUCT_SIZE'.format(variation)])
                      - int(l_len) - int(r_len))
        primer[ortho].append((
            str(target_len),
            lines['PRIMER_LEFT_{}_SEQUENCE'.format(variation)],
            lines['PRIMER_RIGHT_{}_SEQUENCE'.format(variation)],
            lines['PRIMER_LEFT_{}_TM'.format(variation)],
            lines['PRIMER_RIGHT_{}_TM'.format(variation)]))



In [None]:
# grab pi scores from sql database
# NOTE(review): this cell opens pi_score_path as ONE sqlite file. The config
# cell sets pi_score_path to a directory ("…/sql/"), in which case the
# per-file loop in the next cell is the one to run - confirm which is intended.
conn = sqlite3.connect(pi_score_path)
try:
    # One row per locus: (locus name, average pi over its rows in `discrete`).
    name_score = conn.execute("select loci.locus, avg(pi) from loci, discrete where loci.id = discrete.id group by loci.locus").fetchall()
finally:
    # Release the database handle even when the query fails.
    conn.close()
# Key the scores by locus name with the ".13spp.fasta" suffix removed.
name_score = {line[0].split(".13spp.fasta")[0]: line[1] for line in name_score}

In [20]:
# Gather (locus, average pi) rows from every sqlite file in pi_score_path and
# key them by locus name with the ".13spp.fasta" suffix removed.
pi_rows = []
# sorted(): os.listdir() order is arbitrary, and when the same locus appears
# in several files the last one read wins - sorting makes that reproducible.
# The loop variable keeps the name `fn` because a later cell displays it.
for fn in sorted(os.listdir(pi_score_path)):
    conn = sqlite3.connect(pi_score_path + fn)
    try:
        # extend() appends in place instead of rebuilding the list per file.
        pi_rows.extend(conn.execute("select loci.locus, avg(pi) from loci, discrete where loci.id = discrete.id group by loci.locus").fetchall())
    finally:
        # Close each database handle, even when the query fails.
        conn.close()
name_score = {line[0].split(".13spp.fasta")[0]: line[1] for line in pi_rows}
name_score

{'orth10339_505-906_V0': 0.11965817591736035,
 'orth10339_505-906_V1': 0.11972981364195596,
 'orth10339_505-906_V2': 0.12088042803177984,
 'orth10339_505-906_V3': 0.11967331929236469,
 'orth10339_505-906_V4': 0.11914637797444803,
 'orth10362_263-601_V0': 0.1861669961974035,
 'orth10362_263-601_V1': 0.18670778943570618,
 'orth10362_263-601_V2': 0.1859988477866422,
 'orth10362_263-601_V3': 0.18695058477234602,
 'orth10362_263-601_V4': 0.17091053103179074,
 'orth10473_387-639_V0': 0.45664350953582905,
 'orth10473_387-639_V1': 0.4553357976783274,
 'orth10473_387-639_V2': 0.45664350953582905,
 'orth10473_387-639_V3': 0.4553357976783274,
 'orth10473_387-639_V4': 0.4556353970594979,
 'orth2556_0-869_V0': 2.1276404029479936,
 'orth2556_0-869_V1': 2.1291697851411358,
 'orth2556_0-869_V2': 2.1306675243237065,
 'orth2556_0-869_V3': 2.1293829581521733,
 'orth2556_0-869_V4': 2.130205124232249,
 'orth2782_175-1267_V0': 4.119581662095557,
 'orth2782_175-1267_V1': 4.120363931521129,
 'orth2782_175-126

In [13]:
# Leftover inspection cell: displays the current value of `fn`
# (presumably the last file name left behind by the loop over
# os.listdir(pi_score_path) - verify; safe to delete once confirmed).
fn

'0.sqlite'

In [None]:
# Load pre_padding_species.json; it is indexed by ortholog name further down.
with open(json_path + "pre_padding_species.json", 'r') as species_handle:
    pre_padd_sp = json.load(species_handle)

# Load groups.json (the ortholog groups).
with open(json_path + "groups.json", 'r') as groups_handle:
    parent_groups = json.load(groups_handle)

# Locate every "*.gff.db" file in db_path, keyed by the file name with the
# ".gff.db" suffix removed.
gff_fn = {}
for db_name in os.listdir(db_path):
    if ".gff.db" in db_name:
        gff_fn[db_name.split('.gff.db')[0]] = db_path + db_name

# Open a gffutils feature database for each located file.
gff = {}
for db_key, db_file in gff_fn.items():
    gff[db_key] = gffutils.FeatureDB(db_file)

# Build one summary record per (scored locus variation, species):
#   (exon name, pi score, species, gene product, *primer tuple)
data = []
for ortho_plus in name_score:
    # Names look like "<ortho>_V<n>". Splitting on the last "_V" also handles
    # variation numbers with more than one digit, which fixed-width slicing
    # ([:-3] / [-1:]) would misparse. A name without "_V<n>" fails loudly in
    # int() with ValueError.
    ortho, _, variation_text = ortho_plus.rpartition("_V")
    variation = int(variation_text)

    # The ortholog group id is the part of the name before the first "_";
    # it does not depend on the species, so compute it once per locus.
    group_id = ortho.split("_")[0]
    score = name_score[ortho_plus]

    for sp in pre_padd_sp[ortho]:
        # Look the feature up once and reuse it for both the test and the read.
        feature = gff[sp][parent_groups[group_id][sp]]
        if 'product' in feature.attributes.keys():
            product = feature['product'][0]
        else:
            product = "N/A"
        data.append((ortho_plus, str(score), sp, product, *primer[ortho][variation]))

# Rank of each species in the summary, starting at 1 in the order listed.
sp_order = {species: rank for rank, species in enumerate(
    ['Bcur', 'Bdor', 'Bole', 'Ccap', 'Bcor', 'Blat', 'Bzon',
     'Afra', 'Bmin', 'Bjar', 'Aobl', 'Asus', 'Btry'],
    start=1)}


def _record_sort_key(record):
    """Order records by exon name (field 0), then by species rank (field 2)."""
    return (record[0], sp_order[record[2]])


data = sorted(data, key=_record_sort_key)

# Column names of the summary file, in the same order as the record fields.
header = ['Exon_Name',
          'PI_Score',
          'Species',
          'Gene_Product',
          'Target_Sequence_Length',
          'PRIMER_LEFT_0_SEQUENCE',
          'PRIMER_RIGHT_0_SEQUENCE',
          'PRIMER_LEFT_0_TM',
          'PRIMER_RIGHT_0_TM']

filename = summary_fn

# Create the output directory when the path has one; os.makedirs('') raises
# for a bare file name, so skip it in that case.
out_dir = os.path.dirname(filename)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write through csv.writer so that fields containing commas or quotes (gene
# product descriptions can contain them) are quoted instead of silently
# shifting columns. newline="" lets the writer control line endings, which
# stay "\n" as before; the file now also ends with a final newline.
with open(filename, "w", newline="") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(header)
    writer.writerows(data)
