In [10]:
import gffutils
import os
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from pprint import pprint as pp
import time

In [11]:
# Directory holding the per-species GFF files. It is combined with the file
# name by plain string concatenation (gff_path + "<species>.gff"), so the
# trailing slash is required.
gff_path = "input/gff/"
# Location of the FASTA inputs.
# NOTE(review): no trailing slash here, unlike gff_path/db_path -- confirm how
# the code that consumes this value joins it with a file name.
fasta_path = "input/fasta"
# Directory where the gffutils databases live (db_path + "<species>.gff.db").
db_path = "intermediate/"
# Species codes; each one is used as the stem of its GFF file name.
species_list = ["Bcur", "Bdor", "Bole", "Ccap"]
# Ortholog-group file that is passed to get_ortho_groups().
ortho_group_path = "input/new_groups_filter.txt"

In [12]:
def create_db(species):
    """Build the gffutils database for one species unless it already exists.

    The annotation is read from ``gff_path + species + ".gff"`` and the
    database is written to ``db_path + species + ".gff.db"``; both paths are
    resolved against the current working directory.

    Parameters
    ----------
    species : str
        Species code used as the stem of the GFF file name.

    Returns
    -------
    tuple
        ``(species, created)`` where ``created`` is True only when this call
        built a new database, and False when the file was already present.
    """
    gff_name = species + ".gff"
    base_dir = os.path.abspath(os.path.curdir)
    # One absolute path is used for both the existence check and the build,
    # so the two can never point at different files.
    db_file = os.path.join(base_dir, db_path + gff_name + ".db")
    if os.path.isfile(db_file):
        return (species, False)

    # The GFF path is handed to gffutils directly. gffutils.example_filename()
    # is a helper for locating the sample data bundled with gffutils and is
    # not meant for user files.
    gff_file = os.path.join(base_dir, gff_path + gff_name)
    gffutils.create_db(gff_file,
                       dbfn=db_file,
                       force=True,
                       merge_strategy='merge',
                       id_spec=['ID', 'Name'])
    return (species, True)

In [13]:
def connect_db(species):
    """Open the existing gffutils database for one species.

    The database is looked up at ``db_path + species + ".gff.db"`` relative
    to the current working directory.

    Parameters
    ----------
    species : str
        Species code used as the stem of the database file name.

    Returns
    -------
    tuple
        ``(species, db)`` where ``db`` is a ``gffutils.FeatureDB``.

    Raises
    ------
    FileNotFoundError
        If the database file does not exist (it is built by ``create_db``).
    """
    gff_name = species + ".gff"
    db_file = os.path.join(os.path.abspath(os.path.curdir),
                           db_path + gff_name + ".db")
    # Fail with an explicit message instead of falling through to the return
    # statement with no database object bound.
    if not os.path.isfile(db_file):
        raise FileNotFoundError(
            "gffutils database not found for {}: {}".format(species, db_file))
    return (species, gffutils.FeatureDB(db_file))

In [14]:
def get_exons(species):
    """Load every 'exon' feature from a species' database.

    Returns ``(species, exons)``: the species value handed back by
    ``connect_db`` and a list of the features produced by
    ``db.features_of_type('exon')``.
    """
    name, feature_db = connect_db(species)
    # Materialise the iterator so callers can take len() and index the result.
    exon_features = list(feature_db.features_of_type('exon'))
    return (name, exon_features)

In [15]:
def get_ortho_groups(ortho_group_path):
    """Parse an ortholog-group file into a nested dictionary.

    Every non-blank line is expected to have the form
    ``<group>: <species>|<id> <species>|<id> ...``. Blank lines are skipped.

    Parameters
    ----------
    ortho_group_path : str
        Path of the text file to read.

    Returns
    -------
    dict
        ``{group: {species: id, ...}, ...}``.

    Raises
    ------
    ValueError
        If a non-blank line contains no ``:`` separator.
    IndexError
        If a whitespace-separated entry contains no ``|`` separator.
    """
    groups = dict()
    with open(ortho_group_path, 'r') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            # Blank lines (e.g. a trailing empty line) carry no group.
            if not line:
                continue
            # Split on the first ':' only, so a ':' inside the data part does
            # not break the parse.
            ortho, sep, data = line.partition(':')
            if not sep:
                raise ValueError(
                    "line {} of {} has no ':' separator: {!r}".format(
                        line_no, ortho_group_path, line))
            members = dict()
            for elem in data.split():
                # Split each entry once; field 0 is the key, field 1 the value.
                fields = elem.split("|")
                members[fields[0]] = fields[1]
            groups[ortho.strip()] = members
    return groups

In [16]:
# Build any missing databases with one worker process per species, then
# report for each species whether its database was created by this run.
print("created?\n--------")
with Pool(len(species_list)) as pool:
    # create_db returns (species, created) pairs, which map straight to a dict.
    results = dict(pool.map(create_db, species_list))
for sp, status in results.items():
    print("{}: {}".format(sp, status))

created?
--------
Bole: False
Bdor: False
Ccap: False
Bcur: False


In [7]:
# Open every species database and load all exon features, timing the whole
# step. time.perf_counter() is used because time.clock() was deprecated in
# Python 3.3 and removed in 3.8; perf_counter() also reports elapsed time,
# whereas clock() reported processor time on Unix.
start = time.perf_counter()
gff_dbs = dict(connect_db(sp) for sp in species_list)
exons = {}
for sp in species_list:
    # get_exons returns (species, exon_list); only the list is kept.
    exons[sp] = get_exons(sp)[1]
    print("{} exons: {}".format(sp, len(exons[sp])))
end = time.perf_counter()
print("time: {}".format(end - start))

Bcur exons: 139222
Bdor exons: 106567
Bole exons: 110294
Ccap exons: 148755
time: 29.126488


In [18]:
# Parse the ortholog-group file. As the last expression of the cell, the
# returned dict is displayed in full as the cell output.
get_ortho_groups(ortho_group_path)

{'orth5079': {'Bcur': 'XP_011183264.1',
  'Bdor': 'XP_011213159.1',
  'Bole': 'XP_014102466.1',
  'Ccap': 'XP_004518896.1'},
 'orth8604': {'Bcur': 'XP_011191089.1',
  'Bdor': 'XP_011200648.1',
  'Bole': 'XP_014088512.1',
  'Ccap': 'XP_004530622.1'},
 'orth8446': {'Bcur': 'XP_011190749.1',
  'Bdor': 'XP_011202917.1',
  'Bole': 'XP_014097415.1',
  'Ccap': 'XP_004518720.1'},
 'orth4885': {'Bcur': 'XP_011182788.1',
  'Bdor': 'XP_011200805.1',
  'Bole': 'XP_014103068.1',
  'Ccap': 'XP_004525826.1'},
 'orth5456': {'Bcur': 'XP_011184225.1',
  'Bdor': 'XP_011197925.1',
  'Bole': 'XP_014091193.1',
  'Ccap': 'XP_004517413.1'},
 'orth10652': {'Bcur': 'XP_011196166.1',
  'Bdor': 'XP_011201657.1',
  'Bole': 'XP_014100851.1',
  'Ccap': 'XP_004525572.1'},
 'orth5158': {'Bcur': 'XP_011183467.1',
  'Bdor': 'XP_011206437.1',
  'Bole': 'XP_014085958.1',
  'Ccap': 'XP_012160166.1'},
 'orth5786': {'Bcur': 'XP_011184652.1',
  'Bdor': 'XP_011206211.1',
  'Bole': 'XP_014090249.1',
  'Ccap': 'XP_004534832.1'},