Skip to content

Commit

Permalink
Merge pull request #1042 from andrewkern/dros_annot
Browse files Browse the repository at this point in the history
Merge the annotation-side generalization: annotation tracks are now selected by a configurable GFF source and feature type.

closes #618, resolves #562, closes #659
  • Loading branch information
andrewkern committed Oct 21, 2021
2 parents 80dbefe + b7cd71f commit 27918f1
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 295 deletions.
10 changes: 5 additions & 5 deletions maintenance/annotation_maint.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,18 +96,18 @@ def download_process_annotations():
if spc.annotations:
for an in spc.annotations:
CHROM_IDS = [chrom.id for chrom in spc.genome.chromosomes]
logger.info(f"Downloading GFF file {spc.id}")
logger.info(f"Downloading GFF file {an.id}")
gff = get_gff_recarray(an.url, an.gff_sha256)
logger.info("extracting exons {spc.id}")
# this is fragile-- is ensembl_havana always a feature?
logger.info(f"extracting annotations {an.id}")
exons = gff[
np.where(
np.logical_and(
gff.source == "ensembl_havana", gff.type == "exon"
gff.source == an.annotation_source,
gff.type == an.annotation_type,
)
)
]
logger.info(f"merging overlapping regions {spc.id}")
logger.info(f"merging overlapping regions {an.id}")
# create zarr store and zarr root
spc_name_path = os.path.join(annot_path, spc.id)
os.makedirs(spc_name_path, exist_ok=True)
Expand Down
4 changes: 3 additions & 1 deletion stdpopsim/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
@attr.s(kw_only=True)
class Annotation:
"""
Class representing a GFF3 annotation file.
Class representing an annotation track.
:ivar str ~.id: String that uniquely identifies the annotation.
:ivar species: The species to which this annotation applies.
Expand All @@ -37,6 +37,8 @@ class Annotation:
description = attr.ib()
citations = attr.ib(factory=list)
file_pattern = attr.ib()
annotation_source = attr.ib()
annotation_type = attr.ib()

def __attrs_post_init__(self):
self._cache = stdpopsim.CachedData(
Expand Down
1 change: 1 addition & 0 deletions stdpopsim/catalog/DroMel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from . import genetic_maps # noqa: F401
from . import demographic_models # noqa: F401
from . import dfes # noqa: F401
from . import annotations # noqa: F401
61 changes: 61 additions & 0 deletions stdpopsim/catalog/DroMel/annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import stdpopsim

_species = stdpopsim.get_species("DroMel")

# Both annotation tracks registered below are cut from the same GFF3 file
# (identical URL and checksum), so its location and SHA-256 are defined once.
#
# The URL is pinned to the release-51 directory rather than the "current"
# alias: the file name and checksum are specific to release 51, and a
# release-specific file stops resolving under "current" as soon as a newer
# Ensembl Genomes release is published.
_gff_url = (
    "http://ftp.ebi.ac.uk/ensemblgenomes/pub/release-51/metazoa/"
    "gff3/drosophila_melanogaster/"
    "Drosophila_melanogaster.BDGP6.32.51.gff3.gz"
)
_gff_sha256 = "d882d9a2af1c090ad69b4c81e54b809506f7a8d5fdd90597c6ed05c79ad502bc"


def _hoskins_citation():
    """
    Return a new citation for the annotation source publication.

    A fresh object is built on every call so that each annotation keeps its
    own ``Citation`` instance instead of sharing one between tracks.
    """
    return stdpopsim.Citation(
        # The article behind this DOI appeared in Genome Research vol. 25
        # (2015); the ".114" DOI suffix is not the publication year.
        year=2015,
        author="Hoskins et al",
        doi="https://doi.org/10.1101/gr.185579.114",
        reasons={stdpopsim.CiteReason.ANNOTATION},
    )


# Exon track: GFF rows with source "FlyBase" and type "exon".
_an = stdpopsim.Annotation(
    species=_species,
    id="FlyBase_BDGP6.32.51_exons",
    description="FlyBase exon annotations on BDGP6",
    url=_gff_url,
    gff_sha256=_gff_sha256,
    intervals_url=(
        "https://stdpopsim.s3-us-west-2.amazonaws.com/"
        "annotations/DroMel/FlyBase_BDGP6.32.51_exons.tar.gz"
    ),
    intervals_sha256="680ee9b0a565f85c561cd3672927cb2bc8649405a750532f91e1717b3ed8b993",
    citations=[_hoskins_citation()],
    file_pattern="flybase_exons_{id}.txt",
    annotation_source="FlyBase",
    annotation_type="exon",
)
_species.add_annotations(_an)

# CDS track: GFF rows with source "FlyBase" and type "CDS".
_an2 = stdpopsim.Annotation(
    species=_species,
    id="FlyBase_BDGP6.32.51_CDS",
    description="FlyBase CDS annotations on BDGP6",
    url=_gff_url,
    gff_sha256=_gff_sha256,
    intervals_url=(
        "https://stdpopsim.s3-us-west-2.amazonaws.com/"
        "annotations/DroMel/FlyBase_BDGP6.32.51_CDS.tar.gz"
    ),
    intervals_sha256="5f202f454e7d0051f863a5146e775c85e4f3e39bab434cc96701d46026eb7364",
    citations=[_hoskins_citation()],
    file_pattern="flybase_CDS_{id}.txt",
    annotation_source="FlyBase",
    annotation_type="CDS",
)
_species.add_annotations(_an2)
39 changes: 35 additions & 4 deletions stdpopsim/catalog/HomSap/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@

_an = stdpopsim.Annotation(
species=_species,
id="Ensembl_GRCh38_104_gff3",
description="Ensembl GFF3 annotations on GRCh38",
id="ensembl_havana_104_exons",
description="Ensembl Havana exon annotations on GRCh38",
url=(
"ftp://ftp.ensembl.org/pub/release-104/"
"gff3/homo_sapiens/Homo_sapiens.GRCh38.104.gff3.gz"
),
gff_sha256="313ad46bd4af78b45b9f5d8407bbcbd3f87f4be0747060e84b3b5eb931530ec1",
intervals_url=(
"https://stdpopsim.s3-us-west-2.amazonaws.com/"
"annotations/HomSap/Ensembl_GRCh38_104_gff3.tar.gz"
"annotations/HomSap/ensembl_havana_104_exons.tar.gz"
),
intervals_sha256="b0e864ec87274f3084e0d93161c8ed959b845c97a42cbad5bfe33f54c862716d",
intervals_sha256="5c356d092b31fa40bfce434994de276e9040ed9a80fc047a5e3b94410157f1cf",
citations=[
stdpopsim.Citation(
year=2018,
Expand All @@ -25,5 +25,36 @@
)
],
file_pattern="ensembl_havana_exons_{id}.txt",
annotation_source="ensembl_havana",
annotation_type="exon",
)
_species.add_annotations(_an)

# add CDS
_an2 = stdpopsim.Annotation(
species=_species,
id="ensembl_havana_104_CDS",
description="Ensembl Havana CDS annotations on GRCh38",
url=(
"ftp://ftp.ensembl.org/pub/release-104/"
"gff3/homo_sapiens/Homo_sapiens.GRCh38.104.gff3.gz"
),
gff_sha256="313ad46bd4af78b45b9f5d8407bbcbd3f87f4be0747060e84b3b5eb931530ec1",
intervals_url=(
"https://stdpopsim.s3-us-west-2.amazonaws.com/"
"annotations/HomSap/ensembl_havana_104_CDS.tar.gz"
),
intervals_sha256="24b36e6f88a6d995ecaee7ef965baa4d2ea850058ee2f5084efdbb0ea47f1c8e",
citations=[
stdpopsim.Citation(
year=2018,
author="Hunt et al",
doi="https://doi.org/10.1093/database/bay119",
reasons={stdpopsim.CiteReason.ANNOTATION},
)
],
file_pattern="ensembl_havana_CDS_{id}.txt",
annotation_source="ensembl_havana",
annotation_type="CDS",
)
_species.add_annotations(_an2)

0 comments on commit 27918f1

Please sign in to comment.