In [1]:
import gffutils
import os
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from pprint import pprint as pp
import time

In [2]:
gff_path = "input/gff/"
fasta_path = "input/fasta"
db_path = "intermediate/"
species_list = ["Bcur", "Bdor", "Bole", "Ccap"]

In [3]:
def create_db(species):
    gff_name = species + ".gff"
    created = False
    if not os.path.isfile(os.path.join(os.path.abspath(os.path.curdir),db_path + gff_name + ".db")):
        fn = gffutils.example_filename(os.path.join(os.path.abspath(os.path.curdir),gff_path + gff_name))
        db = gffutils.create_db(fn,
                                dbfn=db_path + gff_name + ".db",
                                force=True,
                                merge_strategy='merge',
                                id_spec=['ID', 'Name'])
        created = True
    return (species, created)

In [4]:
def connect_db(species):
    gff_name = species + ".gff"
    if os.path.isfile(os.path.join(os.path.abspath(os.path.curdir),db_path + gff_name + ".db")):
        db = gffutils.FeatureDB(os.path.join(os.path.abspath(os.path.curdir),db_path + gff_name + ".db"))
    return (species, db)

In [5]:
def get_exons(species):
    species,db = connect_db(species)
    return (species, [exon for exon in db.features_of_type('exon')])

In [6]:
print("created?\n" +
      "--------")
with Pool(len(species_list)) as p: results = {sp:db for sp,db in p.map(create_db, species_list)}
for sp,status in results.items():
    print("{}: {}".format(sp, status))

created?
--------
Bole: False
Bdor: False
Bcur: False
Ccap: False


In [7]:
start = time.clock()
gff_dbs = {sp:db for sp,db in [connect_db(sp) for sp in species_list]}
exons = {}
for sp in species_list:
    exons[sp] = [exon for exon in gff_dbs[sp].features_of_type('exon')]
    print("{} exons: {}".format(sp, len(exons[sp])))
end = time.clock()
print("time: {}".format(end - start))

Bcur exons: 139222
Bdor exons: 106567
Bole exons: 110294
Ccap exons: 148755
time: 33.675389


In [8]:
start = time.clock()
with Pool(len(species_list)) as p: exons = {sp:exons for sp,exons in p.map(get_exons, species_list)}
for sp in species_list:
    print("{} exons: {}".format(sp, len(exons[sp])))
end = time.clock()
print("time: {}".format(end - start))

Bcur exons: 139222
Bdor exons: 106567
Bole exons: 110294
Ccap exons: 148755
time: 29.014419000000004


In [9]:
start = time.clock()
with ThreadPool(len(species_list)) as p: exons = {sp:exons for sp,exons in p.map(get_exons, species_list)}
for sp in species_list:
    print("{} exons: {}".format(sp, len(exons[sp])))
end = time.clock()
print("time: {}".format(end - start))

Bcur exons: 139222
Bdor exons: 106567
Bole exons: 110294
Ccap exons: 148755
time: 130.422873
