In [1]:
import os
import gffutils
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
import json
from pyfaidx import Fasta
from Bio import SeqIO
import re
from pprint import pprint as pp

In [2]:
# Species codes: the full 13-species list, a four-species subset, and a
# separate set holding seven of the other codes.
full_species_list = [
    'Bjar', 'Aobl', 'Bmin', 'Asus', 'Btry', 'Afra', 'Blat',
    'Bzon', 'Bcor', 'Ccap', 'Bcur', 'Bole', 'Bdor',
]
species_list = [
    "Bcur",
    "Bdor",
    "Bole",
    "Ccap",
]
transvestigated_species_set = {
    'Bcor', 'Blat', 'Bzon', 'Afra',
    'Bmin', 'Bjar', 'Aobl',
}

# Top-level working directories (all relative to the notebook's directory).
input_path = "./input/"
intermediate_path = "./intermediate/"
output_path = "./output/"

# Locations underneath the input directory.
gff_path = "./input/gff/"
fasta_path = "./input/fasta/"
p3_out_path = "./input/P3/"
groups_fn = "./input/groups_filtered_6181genes.txt"

# Locations underneath the intermediate directory.
aligned_fasta_path = "./intermediate/13spp_aligned_trimmed_filtered_fasta/"
primer_products_path = "./intermediate/primer_products/"

In [3]:
# Map each name (file name up to '.13spp.fasta') to the path of its FASTA file
# in the primer-products directory; '.13spp.fasta.fai' files are left out.
fasta_fn = {
    fname.split('.13spp.fasta')[0]: primer_products_path + fname
    for fname in os.listdir(primer_products_path)
    if ".13spp.fasta" in fname
    if ".13spp.fasta.fai" not in fname
}

In [1]:
# Parse every file in p3_out_path as KEY=VALUE records and, for each file that
# reports at least one returned primer pair, keep the data of pair 0.
# primer maps the ortholog name (file name up to '.degenerate.p3.out') to a
# tuple of strings:
#   (product size minus both primer lengths,
#    left primer sequence, right primer sequence, left Tm, right Tm)
primer = {}
for p3_out_fn in os.listdir(p3_out_path):
    ortho = p3_out_fn.split('.degenerate.p3.out')[0]
    with open(p3_out_path + p3_out_fn, 'r') as f:
        # partition() splits on the first '=' only, so a value that itself
        # contains '=' stays intact. Lines without '=' (e.g. blank lines) and
        # lines with an empty key (a lone '=') are dropped instead of
        # breaking the unpacking.
        fields = (line.strip().partition('=') for line in f)
        lines = {key: value for key, sep, value in fields if sep and key != ''}
    # Compare by value: 'is not' on strings tests object identity, which only
    # works by accident of interning and raises SyntaxWarning on Python >= 3.8.
    if lines['PRIMER_PAIR_NUM_RETURNED'] != '0':
        # PRIMER_LEFT_0 / PRIMER_RIGHT_0 are two comma-separated values; the
        # second one is used as the primer length below.
        left, l_len = lines['PRIMER_LEFT_0'].split(',')
        right, r_len = lines['PRIMER_RIGHT_0'].split(',')
        target_len = int(lines['PRIMER_PAIR_0_PRODUCT_SIZE']) - int(l_len) - int(r_len)
        primer[ortho] = (str(target_len),
                         lines['PRIMER_LEFT_0_SEQUENCE'],
                         lines['PRIMER_RIGHT_0_SEQUENCE'],
                         lines['PRIMER_LEFT_0_TM'],
                         lines['PRIMER_RIGHT_0_TM'])
len(primer)

NameError: name 'os' is not defined

In [5]:
# Load the score table: the first line is skipped as a header, then each
# remaining line maps its first whitespace-separated column (with everything
# from '.13spp.fasta' onward removed) to its second column, kept as a string.
# The dict is built directly, so name_score is never temporarily a list.
name_score = {}
with open("input/net_PI_avg_edited.txt", 'r') as f:
    next(f, None)  # skip the header line (no error if the file is empty)
    for line in f:
        columns = line.split()
        if not columns:
            # A blank line has no columns; skip it instead of raising IndexError.
            continue
        name_score[columns[0].split(".13spp.fasta")[0]] = columns[1]
name_score

{'orth10028_686-1116': '4.26486481538003',
 'orth10034_3763-4322': '2.30216568298139',
 'orth10035_908-1372': '1.61196100068671',
 'orth10042_1449-1930': '1.41767419387774',
 'orth10105_534-1065': '3.65135234429511',
 'orth10109_1227-1657': '1.29763311738691',
 'orth10137_1188-1615': '2.19702843687596',
 'orth10141_63-635': '3.93648389715765',
 'orth10224_389-987': '3.26277068563926',
 'orth10233_165-726': '3.3414386496408',
 'orth10262_574-1105': '2.49713352567272',
 'orth10269_2267-2754': '3.06389374876962',
 'orth10270_1066-1466': '3.08421455978465',
 'orth10284_1808-2365': '4.10268009904138',
 'orth10315_1281-1727': '2.39861397882675',
 'orth10339_505-906': '1.77714674842832',
 'orth10425_0-403': '2.70739607632113',
 'orth10425_453-875': '0.572206440918691',
 'orth10532_1225-1640': '1.09366571298634',
 'orth2345_455-1052': '6.13865510256151',
 'orth2347_606-1023': '4.52136572074222',
 'orth2351_737-1215': '5.35582567382585',
 'orth2392_922-1381': '5.2425226509734',
 'orth2395_2323-

In [6]:
# Read pre_padding_species.json from the intermediate directory into pre_padd_sp.
pre_padding_json = intermediate_path + "pre_padding_species.json"
with open(pre_padding_json, 'r') as handle:
    pre_padd_sp = json.load(handle)

In [7]:
# Read the ortholog groups (groups.json in the intermediate directory).
groups_json = intermediate_path + "groups.json"
with open(groups_json, 'r') as handle:
    parent_groups = json.load(handle)

In [8]:
# gff_fn: name (file name up to '.gff.db') -> path of each database file found
# in the intermediate directory.
gff_fn = {
    db_name.split('.gff.db')[0]: intermediate_path + db_name
    for db_name in os.listdir(intermediate_path)
    if ".gff.db" in db_name
}
# gff: the same keys, each mapped to an open gffutils.FeatureDB for that file.
gff = dict(
    (label, gffutils.FeatureDB(db_path))
    for label, db_path in gff_fn.items()
)

In [9]:
# Build one row per (exon, species) pair:
#   (exon name, score, species, gene product, *primer fields for the exon).
# The gene product is the first 'product' attribute of the species' feature
# for this ortholog group, or "N/A" when the feature has no such attribute.
data = []
for ortho in primer:
    # Key into parent_groups: the exon name up to its first underscore.
    group_id = ortho.split("_")[0]
    for sp in pre_padd_sp[ortho]:
        feature = gff[sp][parent_groups[group_id][sp]]
        has_product = 'product' in feature.attributes.keys()
        product = feature['product'][0] if has_product else "N/A"
        score = name_score[ortho]
        data.append((ortho, score, sp, product) + tuple(primer[ortho]))

In [10]:
# 1-based rank of every species code; the position in the tuple is the rank.
sp_order = {
    code: rank
    for rank, code in enumerate(
        (
            'Bcur', 'Bdor', 'Bole', 'Ccap',
            'Bcor', 'Blat', 'Bzon', 'Afra',
            'Bmin', 'Bjar', 'Aobl', 'Asus',
            'Btry',
        ),
        start=1,
    )
}

In [14]:
def _row_sort_key(row):
    """Return the sort key of one row: (exon name, rank of its species in sp_order)."""
    return row[0], sp_order[row[2]]


# Order rows by exon name, then by species rank within each exon.
data = sorted(data, key=_row_sort_key)

In [12]:
# Column names for the exported table: four descriptive columns followed by
# five primer-related columns.
header = [
    'Exon_Name', 'PI_Score', 'Species', 'Gene_Product',
] + [
    'Target_Sequence_Length',
    'PRIMER_LEFT_0_SEQUENCE', 'PRIMER_RIGHT_0_SEQUENCE',
    'PRIMER_LEFT_0_TM', 'PRIMER_RIGHT_0_TM',
]

In [15]:
import csv

# Write the header row followed by every row of data as CSV.
# csv.writer quotes any field containing a comma, quote or newline (a gene
# product description may contain commas); joining with "," by hand would
# silently split such a field into extra columns.
# newline="" lets the csv module control line endings, as its documentation
# requires; lineterminator="\n" keeps the "\n" row separator used before.
# Unlike the manual join, the file now ends with a trailing newline.
with open("./output/13spp_exon_primer_data.csv", "w", newline="") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(header)
    writer.writerows(data)