Merge pull request #676 from grahamgower/cache
Factor out download/cache code and add checksums.
grahamgower committed Nov 3, 2020
2 parents 408e49e + f59464e commit b9d2720
Showing 15 changed files with 652 additions and 352 deletions.
9 changes: 9 additions & 0 deletions docs/development.rst
@@ -921,11 +921,20 @@ which the genome is defined) as shown below:
description="FILL_ME",
long_description="FILL_ME",
url=("https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/dir/filename"),
sha256="FILL_ME",
file_pattern="name_{id}_more_name.txt",
citations=[_genetic_map_citation])
_species.add_genetic_map(_gm)
The SHA256 checksum of the genetic map tarball can be obtained using the
``sha256sum`` command from GNU coreutils. If this is not available on your
system, the following can instead be used:

.. code-block:: sh

    python -c 'from stdpopsim.utils import sha256; print(sha256("genetic_map.tgz"))'
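For reference, the helper streams the file through a standard SHA256 digest.
A minimal sketch of an equivalent computation using only the standard library
(an illustration, not necessarily the actual implementation in
``stdpopsim.utils``):

.. code-block:: python

    import hashlib

    def sha256(path, chunk_size=65536):
        # Stream the file in chunks so large tarballs need not fit in memory.
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()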
Once all this is done, submit a PR containing the code changes and wait for
directions on whom to send the compressed archive of genetic maps to
(currently Andrew Kern is the primary uploader, but please wait until directed
before sending him files).
128 changes: 44 additions & 84 deletions stdpopsim/annotations.py
@@ -2,15 +2,12 @@
Infrastructure for defining information about genome annotation.
"""
import logging

import attr
import pandas
import pathlib
import warnings
import os
import urllib.request
from . import cache
import zarr
import tempfile

import stdpopsim

logger = logging.getLogger(__name__)

@@ -34,116 +31,80 @@ def zarr_to_dataframe(path):
return df


@attr.s
class Annotation(object):
@attr.s(kw_only=True)
class Annotation:
"""
Class represnting a Annotation file
assume GFF3/GTF or similar
:ivar url: The URL where the packed and compressed GTF can be found
:vartype url: str
:ivar species_id: species id
:vartype id: str
:ivar species: a `stdpopsim.species` instance
:ivar annotation_description: description of annotation file
:vartype annotation_description: str
Class representing a GFF3 annotation file.
:ivar str ~.id: String that uniquely identifies the annotation.
:ivar species: The species to which this annotation applies.
:vartype species: :class:`.Species`
:ivar str url: The URL where the packed and compressed GFF3 can be found.
:ivar str zarr_url: The URL of the zarr cache of the GFF3.
:ivar str zarr_sha256: The SHA256 checksum of the zarr cache.
:ivar str ~.description: One line description of the annotation.
:ivar citations: List of citations for the annotation.
:vartype citations: list of :class:`.Citation`
"""
url = attr.ib(default=None)
zarr_url = attr.ib(default=None)
species = attr.ib(default=None)
id = attr.ib(default=None)
file_name = attr.ib(default=None)
description = attr.ib(default=None)
id = attr.ib()
species = attr.ib()
url = attr.ib()
zarr_url = attr.ib()
zarr_sha256 = attr.ib()
description = attr.ib()
citations = attr.ib(factory=list)
long_description = attr.ib(default=None)

def __attrs_post_init__(self):
self.file_name = os.path.basename(self.zarr_url)
self._cache = stdpopsim.CachedData(
namespace=f"annotations/{self.species.id}",
url=self.zarr_url,
sha256=self.zarr_sha256,
extract=False,
)

@property
def annot_cache_dir(self):
return pathlib.Path(cache.get_cache_dir()) / "annotations"

@property
def species_cache_dir(self):
return self.annot_cache_dir / self.species.id
def cache_path(self):
return self._cache.cache_path

def __str__(self):
s = "GTF Annotation:\n"
s += "\tspecies = {}\n".format(self.species.name)
s += "\tid = {}\n".format(self.id)
s += "\turl = {}\n".format(self.url)
s += "\tzarr url = {}\n".format(self.zarr_url)
s += "\tzarr url = {}\n".format(self.zarr_url)
s += "\tcached = {}\n".format(self.is_cached())
s += "\tcache_dir = {}\n".format(self.species_cache_dir)
s += "\tcache_path = {}\n".format(self.cache_path)
return s

def is_cached(self):
"""
Returns True if this annotation is cached locally.
"""
return os.path.exists(self.species_cache_dir)
return self._cache.is_valid()

def download(self):
"""
Downloads the zarr from the source URL and stores it in the
cache directory. If the annotation directory already exists it is first
removed.
Downloads the zarr file and stores it in the cache directory.
"""
self.file_name = os.path.basename(self.zarr_url)
if self.is_cached():
logger.info(f"Clearing cache {self.species_cache_dir}")
with tempfile.TemporaryDirectory() as tempdir:
dest = pathlib.Path(tempdir) / "will_be_deleted"
os.rename(self.annot_cache_dir, dest)
logger.debug(f"Checking species cache directory {self.species_cache_dir}")
os.makedirs(self.species_cache_dir, exist_ok=True)
download_file = f'{self.species_cache_dir}/{self.file_name}'
logger.info(f"Downloading Zarr file '{self.id}' from {self.zarr_url}")
logger.info(f"download_file: {download_file}")
logger.info(f"species_cache_dir: {self.species_cache_dir}")
if os.path.exists(download_file):
warnings.warn("multiple downloads?")
try:
urllib.request.urlretrieve(self.zarr_url, filename=download_file)
except urllib.error.URLError:
print(f"could not connect to {self.zarr_url}")
raise
logger.debug("Download Zarr complete")
logger.info(f"Storing Zarr in {self.species_cache_dir}")
self._cache.download()

def get_chromosome_annotations(self, id):
"""
Returns the pandas dataframe for
the chromosome with the specified id.
Returns the pandas dataframe for the chromosome with the specified id.
"""
chrom = self.species.genome.get_chromosome(id)
if not self.is_cached():
self.download()
annot_file = os.path.join(self.species_cache_dir, self.file_name)
if id is None:
raise ValueError("bad chrom id")
chr_prefix = "chr" # building this in for future generalization
if id.startswith(chr_prefix):
id = id[len(chr_prefix):]
if os.path.exists(annot_file):
bed = zarr_to_dataframe(annot_file)
assert type(bed) == pandas.DataFrame
ret = bed[bed.seqid == id]
if len(ret) == 0:
raise ValueError
else:
ret = None
raise ValueError(
"Warning: annotation file not found for chromosome: '{}'"
" on annotation: '{}', no annotations will be used".format(
id, self.id))
bed = zarr_to_dataframe(str(self.cache_path))
assert type(bed) == pandas.DataFrame
ret = bed[bed.seqid == chrom.id]
if len(ret) == 0:
raise ValueError(f"No annotations found for {id}")
return ret

def get_annotation_type_from_chromomosome(self, a_type, chrom_id, full_table=False):
"""
Returns all elements of
type a_type from chromosome
specified
Returns all elements of type a_type from the specified chromosome.
"""
annots = self.get_chromosome_annotations(chrom_id)
subset = annots[annots.type == a_type]
@@ -157,8 +118,7 @@ def get_annotation_type_from_chromomosome(self, a_type, chrom_id, full_table=Fal

def get_genes_from_chromosome(self, chrom_id, full_table=False):
"""
Returns all elements of
type gene from annotation
Returns all elements of type gene from the annotation.
"""
return self.get_annotation_type_from_chromomosome('gene', chrom_id,
full_table)
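The refactored Annotation delegates all download, checksum, and cache-path
logic to the CachedData helper (added in stdpopsim/cache.py, next file). A
minimal usage sketch, assuming a species accessor named get_annotations (by
analogy with get_genetic_map; both ids below are placeholders, not taken from
this diff):

    import stdpopsim

    species = stdpopsim.get_species("HomSap")  # placeholder species id
    an = species.get_annotations("example_gff3")  # hypothetical accessor and id
    # The first call downloads the zarr, verifies its SHA256, and stores it
    # in the cache; subsequent calls reuse the cached copy.
    genes = an.get_genes_from_chromosome("chr21")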
130 changes: 130 additions & 0 deletions stdpopsim/cache.py
@@ -4,8 +4,14 @@
import pathlib
import logging
import os
import urllib.parse
import tempfile
import warnings

import appdirs
import attr

from . import utils

logger = logging.getLogger(__name__)

@@ -43,3 +49,127 @@ def get_cache_dir():


set_cache_dir()


@attr.s(kw_only=True)
class CachedData:
"""
Downloadable data that will be cached locally.
The downloadable should be a single file. The local cache may
correspond to this same file, or to its extracted contents. In the latter
case, the downloaded file will be removed after archive extraction.
The downloaded file is compared against the expected SHA256 checksum,
and if correct, the checksum is then also stored locally.
:ivar str namespace: The namespace under which the cache will be stored.
This will be converted into a folder, by constructing folders in the
cache corresponding to each component of the namespace.
E.g. if we're on a unix system with cache under ``/path/to/cache``, and
``namespace="foo/bar"``, the cached data will live under
``/path/to/cache/foo/bar``.
:ivar str url: The URL of the data to be cached.
:ivar str sha256: The SHA256 checksum of the downloaded file.
:ivar bool extract: True if the downloaded file is a tarball that should be
extracted into the cached namespace, False otherwise.
"""
namespace = attr.ib(type=str)
url = attr.ib(type=str)
sha256 = attr.ib(type=str)
extract = attr.ib(type=bool)

def __attrs_post_init__(self):
u = urllib.parse.urlparse(self.url)
self._basename = pathlib.PurePath(u.path).name

@property
def sha256_file(self):
return get_cache_dir() / self.namespace / f"{self._basename}.sha256"

@property
def cache_path(self):
# the cache path could be a folder or a file, depending on self.extract
path = get_cache_dir() / self.namespace
if not self.extract:
path = path / self._basename
return path

def is_cached(self):
"""
Returns True if the data is cached locally.
"""
return self.cache_path.exists()

def is_valid(self):
"""
Returns True if the cached data matches the checksum.
"""
is_valid = False
if self.is_cached() and self.sha256_file.exists():
with open(self.sha256_file, "r") as f:
cached_sha256 = f.read().strip()
is_valid = self.sha256 == cached_sha256
return is_valid

def download(self):
"""
Downloads the file from the source URL and stores it in the cache.
If the local cache already exists, it is first removed.
"""
if self.is_cached():
logger.info(f"Clearing cache {self.cache_path}")
with tempfile.TemporaryDirectory(dir=get_cache_dir()) as tempdir:
# Atomically move to a temporary directory, which will be automatically
# deleted on exit.
dest = pathlib.Path(tempdir) / "will_be_deleted"
os.rename(self.cache_path, dest)

self.cache_path.parent.mkdir(parents=True, exist_ok=True)

logger.info(f"Downloading {self.url}")
# os.rename will not work on some Unixes if the source and dest are on
# different file systems. Keep the tempdir in the same directory as
# the destination to ensure it's on the same file system.
with tempfile.TemporaryDirectory(dir=get_cache_dir()) as tempdir:
tempdir = pathlib.Path(tempdir)
local_path = tempdir / "downloaded"
utils.download(self.url, local_path)

logger.debug("Checking SHA256")
download_sha256 = utils.sha256(local_path)
if download_sha256 != self.sha256:
# TODO: use a more appropriate exception here.
raise ValueError(
f"Expected SHA256={self.sha256}, but downloaded file has"
f"{download_sha256}."
)

if self.extract:
extract_dir = tempdir / "extracted"
extract_dir.mkdir()
logger.debug(f"Extracting {local_path}")
utils.untar(local_path, extract_dir)
local_path = extract_dir

# If this has all gone OK up to here we can now move the
# data into the cache location. This should minimise the
# chances of having malformed data in the cache.
logger.info(f"Saving to {self.cache_path}")
# os.rename is atomic, and will raise an OSError if the destination
# is a directory and already exists. Therefore, if we see the data
# already exists we assume that some other process has already
# downloaded it, and raise a warning.
# If the source and destination are regular files (such as when
# self.extract==False), the destination will be silently replaced
# on unix systems, but FileExistsError will be raised on windows.
try:
os.rename(local_path, self.cache_path)
except (OSError, FileExistsError):
warnings.warn(
"Error occured renaming map directory. Are multiple processes"
"downloading this map at the same time?")
return

# Write out the checksum.
with open(self.sha256_file, "w") as f:
print(self.sha256, file=f)
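The checksum handling above gives a reusable recipe for any downloadable
artifact. A minimal sketch of direct use, with placeholder URL and checksum
(CachedData is exposed as stdpopsim.CachedData, as used in annotations.py
above):

    import stdpopsim

    cached = stdpopsim.CachedData(
        namespace="annotations/HomSap",  # placeholder namespace
        url="https://example.com/annotation.zarr.zip",  # placeholder URL
        sha256="0" * 64,  # placeholder checksum
        extract=False,  # cache the downloaded file itself, don't untar it
    )
    if not cached.is_valid():
        cached.download()  # raises ValueError on a checksum mismatch
    print(cached.cache_path)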
1 change: 1 addition & 0 deletions stdpopsim/catalog/AraTha/__init__.py
@@ -99,6 +99,7 @@
url=(
"https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
"AraTha/salome2012_maps.tar.gz"),
sha256="49745e1cab87d59e33eacfdf66303839632d3b07883dd55a99fe1dc27b336ac6",
file_pattern="arab_chr{id}_map_loess.txt",
citations=[stdpopsim.Citation(
doi="https://doi.org/10.1038/hdy.2011.95",
1 change: 1 addition & 0 deletions stdpopsim/catalog/CanFam/__init__.py
@@ -139,6 +139,7 @@
""",
url="https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
"CanFam/dog_genetic_maps.tar.gz",
sha256="585afb424615e2fb0825d807db0b10fe1c797a6dbb804ecbb3fef5e8387d194f",
file_pattern="chr{id}_average_canFam3.1.txt",
citations=[
_CampbellEtAl.because(stdpopsim.CiteReason.GEN_MAP)
1 change: 1 addition & 0 deletions stdpopsim/catalog/DroMel/__init__.py
@@ -95,6 +95,7 @@
url=(
"https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
"DroMel/comeron2012_maps.tar.gz"),
sha256="08185a0e3b0ad26eefe69fc6bdb8f3f599a760e11e87dd343335b33d1563f62a",
file_pattern="genetic_map_comeron2012_dm6_chr{id}.txt",
citations=[stdpopsim.Citation(
author="Comeron et al",
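Each catalog entry now records the SHA256 of its hosted tarball. A hedged
sketch of verifying one by hand, mirroring what CachedData.download() does
internally (the local filename is illustrative):

    from stdpopsim import utils

    url = ("https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
           "DroMel/comeron2012_maps.tar.gz")
    expected = "08185a0e3b0ad26eefe69fc6bdb8f3f599a760e11e87dd343335b33d1563f62a"
    utils.download(url, "comeron2012_maps.tar.gz")
    assert utils.sha256("comeron2012_maps.tar.gz") == expected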
