In [None]:
import json
import os
import shutil

import gffutils
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from pyfaidx import Fasta


# --- Locations read by this notebook -------------------------------------
# FASTA files, one or more per species
fasta_path = "../data/part01/input/fasta/"
# gffutils databases ("*.gff.db")
db_path = "../data/part01/intermediate/gff_databases/"
# JSON intermediates (groups.json is read from here)
json_path = "../data/part01/intermediate/json/"

# --- Location written by this notebook -----------------------------------
# Root of the generated toy dataset (relative to the notebook's working dir)
toy_data_dir = "toy_data/"

In [None]:
# Load the ortholog groups from groups.json (parsed JSON object).
with open(json_path + "groups.json", "r") as f:
    parent_groups = json.load(f)

# Open one gffutils database per "<name>.gff.db" file found in db_path.
# The filename is matched on its *suffix*, not on a substring, so that
# neighbouring files such as "<name>.gff.db-journal" or "<name>.gff.db.bak"
# are not mistaken for databases.
GFF_DB_SUFFIX = ".gff.db"
gff_fn = {name[:-len(GFF_DB_SUFFIX)]: db_path + name
          for name in os.listdir(db_path)
          if name.endswith(GFF_DB_SUFFIX)}
gff = {key: gffutils.FeatureDB(value) for key, value in gff_fn.items()}

# Index every "<name>.nt.fasta" file in fasta_path by record id.
# Suffix matching also skips "<name>.nt.fasta.fai" index files (and any other
# derived file such as "<name>.nt.fasta.gz") without a special case.
NT_FASTA_SUFFIX = ".nt.fasta"
fasta_fn = {name[:-len(NT_FASTA_SUFFIX)]: fasta_path + name
            for name in os.listdir(fasta_path)
            if name.endswith(NT_FASTA_SUFFIX)}

# fasta[species][record_id] -> SeqRecord. Every record of every file is read
# into memory; if two records in one file share an id, the later one wins.
# NOTE(review): Bio.Alphabet and the `alphabet=` argument are not available
# from Biopython 1.78 onwards, so this cell needs an older Biopython --
# confirm the pinned version before upgrading.
fasta = {}
for sp, fn in fasta_fn.items():
    records = SeqIO.parse(fn, "fasta", alphabet=IUPAC.ambiguous_dna)
    fasta[sp] = {seq_record.id: seq_record for seq_record in records}

In [None]:
# Hand-picked ortholog group IDs that make up the toy dataset. The three
# lists are merged into a single set further down, so duplicates are harmless.
#
# NOTE(review): orth10018, orth10019 and orth10023 are listed in both `good`
# and `rejected_on_first_alignment` -- confirm whether that overlap is intended.
good = [
    "orth10018",
    "orth10019",
    "orth10023",
    "orth10028",
    "orth10034",
    "orth10035",
]

rejected_on_second_alignment = [
    "orth4341",
    "orth4271",
    "orth2902",
]

rejected_on_first_alignment = [
    "orth10015",
    "orth10018",
    "orth10019",
    "orth10020",
    "orth10022",
    "orth10023",
]

In [None]:
# All selected group IDs as one set; IDs that occur in more than one list
# collapse to a single entry.
toys = set().union(good, rejected_on_second_alignment, rejected_on_first_alignment)

# Subset of the full group table restricted to the selected IDs.
# An ID that is absent from parent_groups raises KeyError here.
toy_groups = {}
for group_id in toys:
    toy_groups[group_id] = parent_groups[group_id]

In [None]:
# Invert the toy groups: for each species, gather the set of values recorded
# for it across all groups (these are used below as feature IDs for GFF lookups).
sp_seqid = {}
for ortho, members in toy_groups.items():
    for sp, seqid in members.items():
        # Create the species' set on first sight, then add this group's entry.
        sp_seqid.setdefault(sp, set()).add(seqid)

In [None]:
# Rebuild the toy dataset from scratch: remove any previous output directory
# (errors, e.g. "does not exist", are ignored), then recreate the two
# sub-directories.
shutil.rmtree(toy_data_dir, ignore_errors=True)
for d in ["gff", "fasta"]:
    os.makedirs(toy_data_dir + d, exist_ok=True)

for sp in sorted(sp_seqid):
    # sp_seqid[sp] is a set of strings, whose iteration order changes between
    # interpreter runs; sorting keeps the generated files reproducible.
    # Each parent feature is looked up once and reused for both outputs.
    parents = [gff[sp][parent] for parent in sorted(sp_seqid[sp])]

    # GFF: every selected parent feature followed by all of its children.
    with open(toy_data_dir + "gff/" + sp + ".gff", "w") as f:
        for m in parents:
            f.write(str(m) + "\n")
            for c in gff[sp].children(m):
                f.write(str(c) + "\n")

    # FASTA: each sequence that carries at least one selected parent feature,
    # written once, in sorted order of sequence id.
    with open(toy_data_dir + "fasta/" + sp + ".fasta", "w") as f:
        for scaf in sorted({m.chrom for m in parents}):
            f.write(fasta[sp][scaf].format("fasta"))