In [1]:
import os
import gffutils
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
import json
from pyfaidx import Fasta
from Bio import SeqIO
import re
from pprint import pprint as pp

In [2]:
# --- Species codes -----------------------------------------------------------
# The complete 13-code panel, followed by two subsets of it.
full_species_list = [
    'Bjar', 'Aobl', 'Bmin', 'Asus', 'Btry', 'Afra', 'Blat',
    'Bzon', 'Bcor', 'Ccap', 'Bcur', 'Bole', 'Bdor',
]
species_list = ["Bcur", "Bdor", "Bole", "Ccap"]
transvestigated_species_set = {
    'Bcor', 'Blat', 'Bzon', 'Afra', 'Bmin', 'Bjar', 'Aobl',
}

# --- Directory layout (all relative to the notebook's working directory) -----
input_path = "./input/"
gff_path = "./input/gff/"
fasta_path = "./input/fasta/"
groups_fn = "./input/groups_filtered_6181genes.txt"
intermediate_path = "./intermediate/"
aligned_fasta_path = "./intermediate/13spp_aligned_trimmed_filtered_fasta/"
output_path = "./output/"

In [3]:
# Map each alignment name (file name minus the ".13spp.fasta" suffix) to the
# path of its FASTA file inside `aligned_fasta_path`.
#
# Only names that END with the suffix are accepted. A substring test would
# also pick up index files (".13spp.fasta.fai") and stray copies such as
# "x.13spp.fasta.bak"; those would map to the same key as the real alignment
# and could silently replace it depending on directory listing order.
# The listing is sorted so the resulting dict order is reproducible.
alignment_suffix = ".13spp.fasta"
fasta_fn = {name[:-len(alignment_suffix)]: os.path.join(aligned_fasta_path, name)
            for name in sorted(os.listdir(aligned_fasta_path))
            if name.endswith(alignment_suffix)}

In [4]:
# Parse every alignment file listed in `fasta_fn`.
# Result: fasta[alignment name][record id] -> SeqRecord, read with the IUPAC
# ambiguous-DNA alphabet. If a file repeats a record id, the last one wins.
fasta = {}
for ortho, alignment_fn in fasta_fn.items():
    parsed_records = SeqIO.parse(alignment_fn, "fasta", alphabet=IUPAC.ambiguous_dna)
    fasta[ortho] = {record.id: record for record in parsed_records}

In [5]:
from Bio import motifs

# Collapse each parsed alignment into a single sequence:
# fasta_degenerate[alignment name] -> degenerate consensus of a motif built
# from the upper-cased sequences of all records in that alignment.
fasta_degenerate = {}
for ortho, records in fasta.items():
    instances = [record.seq.upper() for record in records.values()]
    fasta_degenerate[ortho] = motifs.create(instances).degenerate_consensus

In [7]:
# Write one Primer3 Boulder-IO input file per alignment:
#   <output_path>/<alignment name>.degenerate.p3
# The degenerate consensus is used as SEQUENCE_TEMPLATE.

# Primer3 settings shared by every record (kept as strings: they are written
# verbatim into the "TAG=value" lines).
primer_product_size_range = '400-600'
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
primer_thermodynamic_parameters_path = '/data0/opt/Primer3/primer3-2.3.6/src/primer3_config/'
primer_max_ns_accepted = '1'
primer_liberal_base = '1'

# A Boulder-IO record ends with a line containing only "=". The trailing
# newline keeps that terminator on its own line, so several .p3 files can be
# concatenated into one multi-record input without fusing "=" with the next
# record's SEQUENCE_ID line.
p3_record_template = (
    "SEQUENCE_ID={}\n"
    "SEQUENCE_TEMPLATE={}\n"
    "PRIMER_PRODUCT_SIZE_RANGE={}\n"
    "PRIMER_THERMODYNAMIC_PARAMETERS_PATH={}\n"
    "PRIMER_MAX_NS_ACCEPTED={}\n"
    "PRIMER_LIBERAL_BASE={}\n"
    "=\n")

# Use the configured output directory and make sure it exists, so a fresh
# checkout does not fail on the first open().
if not os.path.isdir(output_path):
    os.makedirs(output_path)

# Iterate the consensus dict itself: it is the data being written, so a
# missing entry can never raise KeyError here.
for ortho, degenerate_seq in fasta_degenerate.items():
    p3_fn = os.path.join(output_path, ortho + ".degenerate.p3")
    with open(p3_fn, "w") as f:
        f.write(p3_record_template.format(
            ortho,
            str(degenerate_seq),
            primer_product_size_range,
            primer_thermodynamic_parameters_path,
            primer_max_ns_accepted,
            primer_liberal_base))

In [19]:
# Select a single alignment (a key of `fasta`) to inspect in the cells below.
ortho = 'orth2472_1850-2286'

In [28]:
# Small demo motif: the first 10 positions of every upper-cased sequence in
# the selected alignment.
m = motifs.create([fasta[ortho][sp].upper().seq[:10] for sp in fasta[ortho].keys()])

In [29]:
# Consensus sequence of the 10-position demo motif.
m.consensus

Seq('AATTCGCTTA', IUPACAmbiguousDNA())

In [30]:
# Degenerate consensus of the demo motif.
# NOTE(review): in the recorded output this is identical to m.consensus, even
# though m.counts shows mixed columns (position 6: A=2/G=5/T=2, position 9:
# A=2/C=2/T=5). Minority bases are therefore not encoded as IUPAC ambiguity
# codes here — confirm this is acceptable for degenerate-primer design.
m.degenerate_consensus

Seq('AATTCGCTTA', IUPACAmbiguousDNA())

In [31]:
# Per-position letter counts. The recorded output has one row for each of the
# 15 letters of the ambiguous-DNA alphabet; only A/C/G/T rows are non-zero.
m.counts

{'A': [9, 9, 0, 0, 0, 2, 0, 0, 2, 9],
 'B': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'C': [0, 0, 0, 0, 9, 0, 9, 0, 2, 0],
 'D': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'G': [0, 0, 0, 0, 0, 5, 0, 0, 0, 0],
 'H': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'K': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'M': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'N': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'R': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'S': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'T': [0, 0, 9, 9, 0, 2, 0, 9, 5, 0],
 'V': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'W': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'Y': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [40]:
# Display the motif's anticonsensus sequence.
m.anticonsensus

Seq('GGGGGCGGGG', IUPACAmbiguousDNA())

In [41]:
# Background letter distribution; the recorded output is uniform (1/15) over
# all 15 alphabet letters, not over A/C/G/T only.
m.background

{'A': 0.06666666666666667,
 'B': 0.06666666666666667,
 'C': 0.06666666666666667,
 'D': 0.06666666666666667,
 'G': 0.06666666666666667,
 'H': 0.06666666666666667,
 'K': 0.06666666666666667,
 'M': 0.06666666666666667,
 'N': 0.06666666666666667,
 'R': 0.06666666666666667,
 'S': 0.06666666666666667,
 'T': 0.06666666666666667,
 'V': 0.06666666666666667,
 'W': 0.06666666666666667,
 'Y': 0.06666666666666667}

In [47]:
# Count matrix rendered in "pfm" format (four rows in the recorded output).
m.format("pfm")

'  9.00   9.00   0.00   0.00   0.00   2.00   0.00   0.00   2.00   9.00\n  0.00   0.00   0.00   0.00   9.00   0.00   9.00   0.00   2.00   0.00\n  0.00   0.00   0.00   0.00   0.00   5.00   0.00   0.00   0.00   0.00\n  0.00   0.00   9.00   9.00   0.00   2.00   0.00   9.00   5.00   0.00\n'

In [48]:
# Count matrix rendered in "jaspar" format; the header reads ">None" in the
# recorded output.
m.format("jaspar")

'>None \nA [  9.00   9.00   0.00   0.00   0.00   2.00   0.00   0.00   2.00   9.00]\nC [  0.00   0.00   0.00   0.00   9.00   0.00   9.00   0.00   2.00   0.00]\nG [  0.00   0.00   0.00   0.00   0.00   5.00   0.00   0.00   0.00   0.00]\nT [  0.00   0.00   9.00   9.00   0.00   2.00   0.00   9.00   5.00   0.00]\n'

In [49]:
# Count matrix rendered in "transfac" format; unlike the two formats above,
# the recorded output has a column for every one of the 15 alphabet letters.
m.format("transfac")

'P0      A      B      C      D      G      H      K      M      N      R      S      T      V      W      Y\n01      9      0      0      0      0      0      0      0      0      0      0      0      0      0      0      A\n02      9      0      0      0      0      0      0      0      0      0      0      0      0      0      0      A\n03      0      0      0      0      0      0      0      0      0      0      0      9      0      0      0      T\n04      0      0      0      0      0      0      0      0      0      0      0      9      0      0      0      T\n05      0      0      9      0      0      0      0      0      0      0      0      0      0      0      0      C\n06      2      0      0      0      5      0      0      0      0      0      0      2      0      0      0      G\n07      0      0      9      0      0      0      0      0      0      0      0      0      0      0      0      C\n08      0      0      0      0      0      0      0      0      0      0      

In [51]:
# The sequences the motif was built from (nine 10-mers in the recorded output).
m.instances

[Seq('AATTCGCTTA', IUPACAmbiguousDNA()),
 Seq('AATTCGCTCA', IUPACAmbiguousDNA()),
 Seq('AATTCACTTA', IUPACAmbiguousDNA()),
 Seq('AATTCGCTTA', IUPACAmbiguousDNA()),
 Seq('AATTCTCTAA', IUPACAmbiguousDNA()),
 Seq('AATTCTCTCA', IUPACAmbiguousDNA()),
 Seq('AATTCGCTAA', IUPACAmbiguousDNA()),
 Seq('AATTCGCTTA', IUPACAmbiguousDNA()),
 Seq('AATTCACTTA', IUPACAmbiguousDNA())]

In [54]:
# Pseudocounts per letter; all 0.0 in the recorded output.
m.pseudocounts

{'A': 0.0,
 'B': 0.0,
 'C': 0.0,
 'D': 0.0,
 'G': 0.0,
 'H': 0.0,
 'K': 0.0,
 'M': 0.0,
 'N': 0.0,
 'R': 0.0,
 'S': 0.0,
 'T': 0.0,
 'V': 0.0,
 'W': 0.0,
 'Y': 0.0}

In [56]:
# Position-specific scoring matrix. In the recorded output every entry whose
# count is zero is -inf (pseudocounts are all 0.0 in this notebook).
m.pssm

{'A': [3.906890595608518,
  3.906890595608518,
  -inf,
  -inf,
  -inf,
  1.7369655941662057,
  -inf,
  -inf,
  1.7369655941662057,
  3.906890595608518],
 'B': [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
 'C': [-inf,
  -inf,
  -inf,
  -inf,
  3.906890595608518,
  -inf,
  3.906890595608518,
  -inf,
  1.7369655941662057,
  -inf],
 'D': [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
 'G': [-inf,
  -inf,
  -inf,
  -inf,
  -inf,
  3.0588936890535687,
  -inf,
  -inf,
  -inf,
  -inf],
 'H': [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
 'K': [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
 'M': [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
 'N': [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
 'R': [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
 'S': [-inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
 'T': [-inf,
  -inf,
  3.906890595608518,
  3.906890595608518,
  -inf,
  1.73

In [57]:
# Position weight matrix: per-position letter frequencies, e.g. 2/9 ~ 0.222
# for A at position 6 in the recorded output.
m.pwm

{'A': (1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.2222222222222222,
  0.0,
  0.0,
  0.2222222222222222,
  1.0),
 'B': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'C': (0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.2222222222222222, 0.0),
 'D': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'G': (0.0, 0.0, 0.0, 0.0, 0.0, 0.5555555555555556, 0.0, 0.0, 0.0, 0.0),
 'H': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'K': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'M': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'N': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'R': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'S': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'T': (0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.2222222222222222,
  0.0,
  1.0,
  0.5555555555555556,
  0.0),
 'V': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'W': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'Y': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)}