Merge pull request #676 from grahamgower/cache
Factor out download/cache code and add checksums.
grahamgower committed Nov 3, 2020
2 parents 408e49e + f59464e commit b9d2720
Showing 15 changed files with 652 additions and 352 deletions.
9 changes: 9 additions & 0 deletions docs/development.rst
@@ -921,11 +921,20 @@ which the genome is defined) as shown below:
description="FILL_ME",
long_description="FILL_ME",
url=("https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/dir/filename"),
sha256="FILL_ME",
file_pattern="name_{id}_more_name.txt",
citations=[_genetic_map_citation])
_species.add_genetic_map(_gm)
The SHA256 checksum of the genetic map tarball can be obtained using the
``sha256sum`` command from GNU coreutils. If this is not available on your
system, the following can instead be used:

.. code-block:: sh

    python -c 'from stdpopsim.utils import sha256; print(sha256("genetic_map.tgz"))'
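For reference, the helper streams the file through a standard SHA256 digest.
A minimal sketch of an equivalent computation using only the standard library
(an illustration, not necessarily the actual implementation in
``stdpopsim.utils``):

.. code-block:: python

    import hashlib

    def sha256(path, chunk_size=65536):
        # Stream the file in chunks so large tarballs need not fit in memory.
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest()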
Once all this is done, submit a PR containing the code changes and wait for
directions on whom to send the compressed archive of genetic maps to
(currently Andrew Kern is the primary uploader, but please wait until directed
before sending him files).
128 changes: 44 additions & 84 deletions stdpopsim/annotations.py
@@ -2,15 +2,12 @@
Infrastructure for defining information about genome annotation.
"""
import logging

import attr
import pandas
import pathlib
import warnings
import os
import urllib.request
from . import cache
import zarr
import tempfile

import stdpopsim

logger = logging.getLogger(__name__)

@@ -34,116 +31,80 @@ def zarr_to_dataframe(path):
return df


@attr.s
class Annotation(object):
@attr.s(kw_only=True)
class Annotation:
"""
Class represnting a Annotation file
assume GFF3/GTF or similar
:ivar url: The URL where the packed and compressed GTF can be found
:vartype url: str
:ivar species_id: species id
:vartype id: str
:ivar species: a `stdpopsim.species` instance
:ivar annotation_description: description of annotation file
:vartype annotation_description: str
Class representing a GFF3 annotation file.
:ivar str ~.id: String that uniquely identifies the annotation.
:ivar species: The species to which this annotation applies.
:vartype species: :class:`.Species`
:ivar str url: The URL where the packed and compressed GFF3 can be found.
:ivar str zarr_url: The URL of the zarr cache of the GFF3.
:ivar str zarr_sha256: The SHA256 checksum of the zarr cache.
:ivar str ~.description: One line description of the annotation.
:ivar citations: List of citations for the annotation.
:vartype citations: list of :class:`.Citation`
"""
url = attr.ib(default=None)
zarr_url = attr.ib(default=None)
species = attr.ib(default=None)
id = attr.ib(default=None)
file_name = attr.ib(default=None)
description = attr.ib(default=None)
id = attr.ib()
species = attr.ib()
url = attr.ib()
zarr_url = attr.ib()
zarr_sha256 = attr.ib()
description = attr.ib()
citations = attr.ib(factory=list)
long_description = attr.ib(default=None)

def __attrs_post_init__(self):
self.file_name = os.path.basename(self.zarr_url)
self._cache = stdpopsim.CachedData(
namespace=f"annotations/{self.species.id}",
url=self.zarr_url,
sha256=self.zarr_sha256,
extract=False,
)

@property
def annot_cache_dir(self):
return pathlib.Path(cache.get_cache_dir()) / "annotations"

@property
def species_cache_dir(self):
return self.annot_cache_dir / self.species.id
def cache_path(self):
return self._cache.cache_path

def __str__(self):
s = "GTF Annotation:\n"
s += "\tspecies = {}\n".format(self.species.name)
s += "\tid = {}\n".format(self.id)
s += "\turl = {}\n".format(self.url)
s += "\tzarr url = {}\n".format(self.zarr_url)
s += "\tzarr url = {}\n".format(self.zarr_url)
s += "\tcached = {}\n".format(self.is_cached())
s += "\tcache_dir = {}\n".format(self.species_cache_dir)
s += "\tcache_path = {}\n".format(self.cache_path)
return s

def is_cached(self):
"""
Returns True if this annotation is cached locally.
"""
return os.path.exists(self.species_cache_dir)
return self._cache.is_valid()

def download(self):
"""
Downloads the zarr from the source URL and stores it in the
cache directory. If the annotation directory already exists it is first
removed.
Downloads the zarr file and stores it in the cache directory.
"""
self.file_name = os.path.basename(self.zarr_url)
if self.is_cached():
logger.info(f"Clearing cache {self.species_cache_dir}")
with tempfile.TemporaryDirectory() as tempdir:
dest = pathlib.Path(tempdir) / "will_be_deleted"
os.rename(self.annot_cache_dir, dest)
logger.debug(f"Checking species cache directory {self.species_cache_dir}")
os.makedirs(self.species_cache_dir, exist_ok=True)
download_file = f'{self.species_cache_dir}/{self.file_name}'
logger.info(f"Downloading Zarr file '{self.id}' from {self.zarr_url}")
logger.info(f"download_file: {download_file}")
logger.info(f"species_cache_dir: {self.species_cache_dir}")
if os.path.exists(download_file):
warnings.warn("multiple downloads?")
try:
urllib.request.urlretrieve(self.zarr_url, filename=download_file)
except urllib.error.URLError:
print(f"could not connect to {self.zarr_url}")
raise
logger.debug("Download Zarr complete")
logger.info(f"Storing Zarr in {self.species_cache_dir}")
self._cache.download()

def get_chromosome_annotations(self, id):
"""
Returns the pandas dataframe for
the chromosome with the specified id.
Returns the pandas dataframe for the chromosome with the specified id.
"""
chrom = self.species.genome.get_chromosome(id)
if not self.is_cached():
self.download()
annot_file = os.path.join(self.species_cache_dir, self.file_name)
if id is None:
raise ValueError("bad chrom id")
chr_prefix = "chr" # building this in for future generalization
if id.startswith(chr_prefix):
id = id[len(chr_prefix):]
if os.path.exists(annot_file):
bed = zarr_to_dataframe(annot_file)
assert type(bed) == pandas.DataFrame
ret = bed[bed.seqid == id]
if len(ret) == 0:
raise ValueError
else:
ret = None
raise ValueError(
"Warning: annotation file not found for chromosome: '{}'"
" on annotation: '{}', no annotations will be used".format(
id, self.id))
bed = zarr_to_dataframe(str(self.cache_path))
assert type(bed) == pandas.DataFrame
ret = bed[bed.seqid == chrom.id]
if len(ret) == 0:
raise ValueError(f"No annotations found for {id}")
return ret

def get_annotation_type_from_chromomosome(self, a_type, chrom_id, full_table=False):
"""
Returns all elements of
type a_type from chromosome
specified
Returns all elements of type a_type from the specified chromosome.
"""
annots = self.get_chromosome_annotations(chrom_id)
subset = annots[annots.type == a_type]
@@ -157,8 +118,7 @@ def get_annotation_type_from_chromomosome(self, a_type, chrom_id, full_table=Fal

def get_genes_from_chromosome(self, chrom_id, full_table=False):
"""
Returns all elements of
type gene from annotation
Returns all elements of type gene from the annotation.
"""
return self.get_annotation_type_from_chromomosome('gene', chrom_id,
full_table)
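The refactored Annotation delegates all download, checksum, and cache-path
logic to the CachedData helper (added in stdpopsim/cache.py, next file). A
minimal usage sketch, assuming a species accessor named get_annotations (by
analogy with get_genetic_map; both ids below are placeholders, not taken from
this diff):

    import stdpopsim

    species = stdpopsim.get_species("HomSap")  # placeholder species id
    an = species.get_annotations("example_gff3")  # hypothetical accessor and id
    # The first call downloads the zarr, verifies its SHA256, and stores it
    # in the cache; subsequent calls reuse the cached copy.
    genes = an.get_genes_from_chromosome("chr21")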
130 changes: 130 additions & 0 deletions stdpopsim/cache.py
@@ -4,8 +4,14 @@
import pathlib
import logging
import os
import urllib.parse
import tempfile
import warnings

import appdirs
import attr

from . import utils

logger = logging.getLogger(__name__)

@@ -43,3 +49,127 @@ def get_cache_dir():


set_cache_dir()


@attr.s(kw_only=True)
class CachedData:
"""
Downloadable data that will be cached locally.
The downloadable should be a single file. The local cache may
correspond to this same file, or to its extracted contents. In the latter
case, the downloaded file will be removed after archive extraction.
The downloaded file is compared against the expected SHA256 checksum,
and if correct, the checksum is then also stored locally.
:ivar str namespace: The namespace under which the cache will be stored.
This will be converted into a folder, by constructing folders in the
cache corresponding to each component of the namespace.
E.g. if we're on a unix system with cache under ``/path/to/cache``, and
``namespace="foo/bar"``, the cached data will live under
``/path/to/cache/foo/bar``.
:ivar str url: The URL of the data to be cached.
:ivar str sha256: The SHA256 checksum of the downloaded file.
:ivar bool extract: True if the downloaded file is a tarball that should be
extracted into the cached namespace, False otherwise.
"""
namespace = attr.ib(type=str)
url = attr.ib(type=str)
sha256 = attr.ib(type=str)
extract = attr.ib(type=bool)

def __attrs_post_init__(self):
u = urllib.parse.urlparse(self.url)
self._basename = pathlib.PurePath(u.path).name

@property
def sha256_file(self):
return get_cache_dir() / self.namespace / f"{self._basename}.sha256"

@property
def cache_path(self):
# the cache path could be a folder or a file, depending on self.extract
path = get_cache_dir() / self.namespace
if not self.extract:
path = path / self._basename
return path

def is_cached(self):
"""
Returns True if the data is cached locally.
"""
return self.cache_path.exists()

def is_valid(self):
"""
Returns True if the cached data matches the checksum.
"""
is_valid = False
if self.is_cached() and self.sha256_file.exists():
with open(self.sha256_file, "r") as f:
cached_sha256 = f.read().strip()
is_valid = self.sha256 == cached_sha256
return is_valid

def download(self):
"""
Downloads the file from the source URL and stores it in the cache.
If the local cache already exists, it is first removed.
"""
if self.is_cached():
logger.info(f"Clearing cache {self.cache_path}")
with tempfile.TemporaryDirectory(dir=get_cache_dir()) as tempdir:
# Atomically move to a temporary directory, which will be automatically
# deleted on exit.
dest = pathlib.Path(tempdir) / "will_be_deleted"
os.rename(self.cache_path, dest)

self.cache_path.parent.mkdir(parents=True, exist_ok=True)

logger.info(f"Downloading {self.url}")
# os.rename will not work on some Unixes if the source and dest are on
# different file systems. Keep the tempdir in the same directory as
# the destination to ensure it's on the same file system.
with tempfile.TemporaryDirectory(dir=get_cache_dir()) as tempdir:
tempdir = pathlib.Path(tempdir)
local_path = tempdir / "downloaded"
utils.download(self.url, local_path)

logger.debug("Checking SHA256")
download_sha256 = utils.sha256(local_path)
if download_sha256 != self.sha256:
# TODO: use a more appropriate exception here.
raise ValueError(
f"Expected SHA256={self.sha256}, but downloaded file has"
f"{download_sha256}."
)

if self.extract:
extract_dir = tempdir / "extracted"
extract_dir.mkdir()
logger.debug(f"Extracting {local_path}")
utils.untar(local_path, extract_dir)
local_path = extract_dir

# If this has all gone OK up to here we can now move the
# data into the cache location. This should minimise the
# chances of having malformed data in the cache.
logger.info(f"Saving to {self.cache_path}")
# os.rename is atomic, and will raise an OSError if the destination
# is a directory and already exists. Therefore, if we see the data
# already exists we assume that some other process has already
# downloaded it, and raise a warning.
# If the source and destination are regular files (such as when
# self.extract==False), the destination will be silently replaced
# on unix systems, but FileExistsError will be raised on windows.
try:
os.rename(local_path, self.cache_path)
except (OSError, FileExistsError):
warnings.warn(
"Error occured renaming map directory. Are multiple processes"
"downloading this map at the same time?")
return

# Write out the checksum.
with open(self.sha256_file, "w") as f:
print(self.sha256, file=f)
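The checksum handling above gives a reusable recipe for any downloadable
artifact. A minimal sketch of direct use, with placeholder URL and checksum
(CachedData is exposed as stdpopsim.CachedData, as used in annotations.py
above):

    import stdpopsim

    cached = stdpopsim.CachedData(
        namespace="annotations/HomSap",  # placeholder namespace
        url="https://example.com/annotation.zarr.zip",  # placeholder URL
        sha256="0" * 64,  # placeholder checksum
        extract=False,  # cache the downloaded file itself, don't untar it
    )
    if not cached.is_valid():
        cached.download()  # raises ValueError on a checksum mismatch
    print(cached.cache_path)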
1 change: 1 addition & 0 deletions stdpopsim/catalog/AraTha/__init__.py
@@ -99,6 +99,7 @@
url=(
"https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
"AraTha/salome2012_maps.tar.gz"),
sha256="49745e1cab87d59e33eacfdf66303839632d3b07883dd55a99fe1dc27b336ac6",
file_pattern="arab_chr{id}_map_loess.txt",
citations=[stdpopsim.Citation(
doi="https://doi.org/10.1038/hdy.2011.95",
1 change: 1 addition & 0 deletions stdpopsim/catalog/CanFam/__init__.py
@@ -139,6 +139,7 @@
""",
url="https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
"CanFam/dog_genetic_maps.tar.gz",
sha256="585afb424615e2fb0825d807db0b10fe1c797a6dbb804ecbb3fef5e8387d194f",
file_pattern="chr{id}_average_canFam3.1.txt",
citations=[
_CampbellEtAl.because(stdpopsim.CiteReason.GEN_MAP)
1 change: 1 addition & 0 deletions stdpopsim/catalog/DroMel/__init__.py
@@ -95,6 +95,7 @@
url=(
"https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
"DroMel/comeron2012_maps.tar.gz"),
sha256="08185a0e3b0ad26eefe69fc6bdb8f3f599a760e11e87dd343335b33d1563f62a",
file_pattern="genetic_map_comeron2012_dm6_chr{id}.txt",
citations=[stdpopsim.Citation(
author="Comeron et al",
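Each catalog entry now records the SHA256 of its hosted tarball. A hedged
sketch of verifying one by hand, mirroring what CachedData.download() does
internally (the local filename is illustrative):

    from stdpopsim import utils

    url = ("https://stdpopsim.s3-us-west-2.amazonaws.com/genetic_maps/"
           "DroMel/comeron2012_maps.tar.gz")
    expected = "08185a0e3b0ad26eefe69fc6bdb8f3f599a760e11e87dd343335b33d1563f62a"
    utils.download(url, "comeron2012_maps.tar.gz")
    assert utils.sha256("comeron2012_maps.tar.gz") == expected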
