In [None]:
import os
import gffutils
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
import json
from pyfaidx import Fasta
from Bio import SeqIO
import re
from pprint import pprint as pp

In [None]:
# --- Species codes -----------------------------------------------------------
# Full list of species codes (13 entries).
full_species_list = [
    "Bjar", "Aobl", "Bmin", "Asus", "Btry", "Afra", "Blat",
    "Bzon", "Bcor", "Ccap", "Bcur", "Bole", "Bdor",
]
# Four-species subset of the codes above.
species_list = ["Bcur", "Bdor", "Bole", "Ccap"]
# Seven-species subset of the codes above, kept as a set for membership tests.
transvestigated_species_set = {
    "Bcor", "Blat", "Bzon", "Afra", "Bmin", "Bjar", "Aobl",
}

# --- Input locations ---------------------------------------------------------
input_path = "./input/"
gff_path = "./input/gff/"
fasta_path = "./input/fasta/"
groups_fn = "./input/groups_filtered_6181genes.txt"

# --- Intermediate and output locations ---------------------------------------
intermediate_path = "./intermediate/"
aligned_fasta_path = "./intermediate/13spp_aligned_trimmed_filtered_fasta/"
output_path = "./output/"

In [None]:
# Map each ortholog name to the path of its aligned FASTA file.
# Files are selected by their exact ".13spp.fasta" suffix, so index files
# (".13spp.fasta.fai") and any other derivative such as "*.13spp.fasta.bak"
# are skipped; only that trailing suffix is stripped to form the key.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order, and downstream cells iterate over this dict.
fasta_suffix = ".13spp.fasta"
fasta_fn = {
    name[:-len(fasta_suffix)]: os.path.join(aligned_fasta_path, name)
    for name in sorted(os.listdir(aligned_fasta_path))
    if name.endswith(fasta_suffix)
}

In [None]:
# Parse every aligned FASTA file listed in fasta_fn.
# Result layout: fasta[ortholog name][record id] -> SeqRecord.
fasta = {}
for ortho, fasta_file in fasta_fn.items():
    parsed = SeqIO.parse(fasta_file, "fasta", alphabet=IUPAC.ambiguous_dna)
    # A later record with the same id replaces an earlier one.
    fasta[ortho] = {rec.id: rec for rec in parsed}

In [None]:
from Bio import motifs

# For every ortholog, build a motif from all of its upper-cased sequences and
# keep the motif's degenerate consensus, keyed by ortholog name.
fasta_degenerate = {}
for ortho, records_by_id in fasta.items():
    upper_seqs = [rec.upper().seq for rec in records_by_id.values()]
    seq = motifs.create(upper_seqs).degenerate_consensus
    fasta_degenerate[ortho] = seq

In [None]:
# Write one Primer3 input file (Boulder-IO "TAG=value" lines) per ortholog,
# using the ortholog's degenerate consensus as the template sequence.
primer_product_size_range = '400-600'
# Location of the Primer3 thermodynamic parameter files. The historical
# absolute path stays the default; set PRIMER3_CONFIG_PATH to run elsewhere.
primer_thermodynamic_parameters_path = os.environ.get(
    'PRIMER3_CONFIG_PATH',
    '/data0/opt/Primer3/primer3-2.3.6/src/primer3_config/')
primer_max_ns_accepted = '1'
primer_liberal_base = '1'

# One record per file; the final "=" line terminates the record. It ends with
# a newline so that several .p3 files can be concatenated without the
# terminator fusing with the next record's first tag.
p3_record_template = (
    "SEQUENCE_ID={}\n"
    "SEQUENCE_TEMPLATE={}\n"
    "PRIMER_PRODUCT_SIZE_RANGE={}\n"
    "PRIMER_THERMODYNAMIC_PARAMETERS_PATH={}\n"
    "PRIMER_MAX_NS_ACCEPTED={}\n"
    "PRIMER_LIBERAL_BASE={}\n"
    "=\n"
)

# Write into the configured output directory (not a hard-coded "output/")
# and make sure it exists before opening files in it.
if not os.path.isdir(output_path):
    os.makedirs(output_path)

for ortho, consensus in fasta_degenerate.items():
    sequence_id = ortho
    sequence_template = str(consensus)
    with open(os.path.join(output_path, ortho + ".degenerate.p3"), "w") as f:
        f.write(p3_record_template.format(
            sequence_id,
            sequence_template,
            primer_product_size_range,
            primer_thermodynamic_parameters_path,
            primer_max_ns_accepted,
            primer_liberal_base))

In [None]:
# Ortholog key used by the motif-inspection cells that follow.
ortho = "orth2472_1850-2286"

In [None]:
# Motif built from the first 10 positions of every upper-cased sequence of
# the selected ortholog.
m = motifs.create([rec.upper().seq[:10] for rec in fasta[ortho].values()])

In [None]:
# Display the motif's consensus sequence (per Bio.motifs: the most frequent letter at each position).
m.consensus

In [None]:
# Display the degenerate consensus; this is the same attribute used above to build fasta_degenerate.
m.degenerate_consensus

In [None]:
# Display the motif's count matrix (per Bio.motifs: occurrences of each letter at each position).
m.counts

In [None]:
# Display the anticonsensus sequence (per Bio.motifs: the least frequent letter at each position).
m.anticonsensus

In [None]:
# Display the background letter distribution currently attached to the motif.
m.background

In [None]:
# Render the motif as text in the "pfm" format.
m.format("pfm")

In [None]:
# Render the motif as text in the "jaspar" format.
m.format("jaspar")

In [None]:
# Render the motif as text in the "transfac" format.
m.format("transfac")

In [None]:
# Display the sequence instances the motif was created from (the 10-letter slices passed to motifs.create).
m.instances

In [None]:
# Display the pseudocounts currently configured on the motif.
m.pseudocounts

In [None]:
# Display the position-specific scoring matrix that Bio.motifs derives for this motif.
m.pssm

In [None]:
# Display the position weight matrix that Bio.motifs derives for this motif.
m.pwm