Skip to content

Commit

Permalink
Merge pull request #40 from mcveanlab/zero-map-bugfix
Browse files Browse the repository at this point in the history
Fix race condition in map cache.
  • Loading branch information
jeromekelleher committed Apr 7, 2019
2 parents c8249ac + 1982ac8 commit b2662ad
Show file tree
Hide file tree
Showing 9 changed files with 246 additions and 55 deletions.
1 change: 0 additions & 1 deletion requirements/development.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,3 @@ sphinx-argparse
sphinx_rtd_theme
msprime
appdirs
requests
1 change: 0 additions & 1 deletion requirements/rtd.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# Requirements needed when building on readthedocs.
setuptools_scm
appdirs
requests
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
keywords='simulations, recombination map, models',
packages=['stdpopsim'],
include_package_data=True,
install_requires=["msprime", "requests", "appdirs"],
install_requires=["msprime", "appdirs"],
url='https://github.com/popgensims/stdpopsim',
project_urls={
'Bug Reports': 'https://github.com/popgensims/stdpopsim/issues',
Expand Down
3 changes: 3 additions & 0 deletions stdpopsim/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
except ImportError:
pass

# Cache handling for downloaded data.
from . cache import * # NOQA

# Internal modules. Import here to flatten the namespace.
from . genetic_maps import * # NOQA
from . models import * # NOQA
Expand Down
45 changes: 45 additions & 0 deletions stdpopsim/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
Cache handling for downloaded data.
"""
import pathlib
import logging
import os

import appdirs

logger = logging.getLogger(__name__)

_cache_dir = None


def set_cache_dir(cache_dir=None):
"""
The cache_dir is the directory in which stdpopsim stores and checks for
downloaded data. If the specified cache_dir is not None, this value is
converted to a pathlib.Path instance, which is used as the cache directory.
If cache_dir is None (the default), the cache directory is set either from
the environment variable `STDPOPSIM_CACHE` if it exists, or set to the
default location using the :mod:`appdirs` module.
No checks for existance, writability, etc. are performed by this function.
"""
if cache_dir is None:
cache_dir = os.environ.get("STDPOPSIM_CACHE", None)
if cache_dir is None:
cache_dir = appdirs.user_cache_dir("stdpopsim", "popgensims")
global _cache_dir
_cache_dir = pathlib.Path(cache_dir)
logging.info(f"Set cache_dir to {_cache_dir}")


def get_cache_dir():
"""
Returns the directory used to cache material downloaded by stdpopsim as a
pathlib.Path instance. Defaults to a directory 'stdpopsim' in a user cache directory
(e.g., ~/.cache/stdopsim on Unix flavours). See the :func:`.set_cache_dir` function
for how this value can be set.
"""
return _cache_dir


set_cache_dir()
48 changes: 34 additions & 14 deletions stdpopsim/genetic_maps.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
"""
Infrastructure for managing genetic maps.
"""
import os.path
import pathlib
import tempfile
import tarfile
import logging
import contextlib
import shutil
import inspect
import warnings
import os
import urllib.request

import appdirs
import requests
import msprime

import stdpopsim

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -43,6 +44,13 @@ def register_genetic_map(genetic_map):
registered_maps[key] = genetic_map


def all_genetic_maps():
"""
Returns all registered genetic maps.
"""
return list(registered_maps.values())


@contextlib.contextmanager
def cd(path):
"""
Expand Down Expand Up @@ -91,9 +99,9 @@ class GeneticMap(object):
"""

def __init__(self):
self.cache_dir = appdirs.user_cache_dir("stdpopsim", "popgensims")
self.species_cache_dir = os.path.join(self.cache_dir, self.species)
self.map_cache_dir = os.path.join(self.species_cache_dir, self.name)
self.cache_dir = pathlib.Path(stdpopsim.get_cache_dir()) / "genetic_maps"
self.species_cache_dir = self.cache_dir / self.species
self.map_cache_dir = self.species_cache_dir / self.name

# We use a bit of trickery here to dynamically get the species name and
# map ID from the subclass.
Expand Down Expand Up @@ -138,18 +146,21 @@ def download(self):
"""
if self.is_cached():
logger.info("Clearing cache {}".format(self.map_cache_dir))
shutil.rmtree(self.map_cache_dir)
with tempfile.TemporaryDirectory(dir=self.species_cache_dir) as tempdir:
# Atomically move to a temporary directory, which will be automatically
# deleted on exit.
os.rename(self.map_cache_dir, tempdir)
logger.debug("Making species cache directory {}".format(self.species_cache_dir))
os.makedirs(self.species_cache_dir, exist_ok=True)

logger.info("Downloading genetic map '{}' from {}".format(self.name, self.url))
response = requests.get(self.url, stream=True)
with tempfile.TemporaryDirectory() as tempdir:
# os.rename will not work on some Unixes if the source and dest are on
# different file systems. Keep the tempdir in the same directory as
# the destination to ensure it's on the same file system.
with tempfile.TemporaryDirectory(dir=self.species_cache_dir) as tempdir:
download_file = os.path.join(tempdir, "downloaded")
extract_dir = os.path.join(tempdir, "extracted")
with open(download_file, 'wb') as f:
for chunk in response.iter_content(chunk_size=1024):
f.write(chunk)
urllib.request.urlretrieve(self.url, filename=download_file)
logger.debug("Extracting genetic map")
os.makedirs(extract_dir)
with tarfile.open(download_file, 'r') as tf:
Expand All @@ -167,7 +178,16 @@ def download(self):
# extracted directory into the cache location. This should
# minimise the chances of having malformed maps in the cache.
logger.info("Storing map in {}".format(self.map_cache_dir))
shutil.move(extract_dir, self.map_cache_dir)
# os.rename is atomic, and will raise an OSError if the directory
# already exists. Therefore, if we see the map exists we assume
# that some other thread has already dowloaded it and raise a
# warning.
try:
os.rename(extract_dir, self.map_cache_dir)
except OSError:
warnings.warn(
"Error occured renaming map directory. Are several threads/processes"
"downloading this map at the same time?")

def contains_chromosome_map(self, name):
"""
Expand Down
44 changes: 44 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""
Package definition for tests. Defined to allow cross-importing.
"""
import unittest
import tempfile

import stdpopsim


class CacheReadingTest(unittest.TestCase):
"""
This should be used as the superclass of all tests that use downloaded data.
Rather than using the standard userdir for downloaded data, we use this
local directory that can be easily removed and modified without fear of
interfering with production code.
"""
cache_dir = "test_cache"
saved_cache_dir = None
saved_urls = {}

def setUp(self):
self.saved_cache_dir = stdpopsim.get_cache_dir()
stdpopsim.set_cache_dir(self.cache_dir)

def tearDown(self):
stdpopsim.set_cache_dir(self.saved_cache_dir)


class CacheWritingTest(unittest.TestCase):
"""
This should be used as the superclass of all tests that alter the
downloaded data cache in any non-standard way.
"""
saved_cache_dir = None
tmp_cache_dir = None

def setUp(self):
self.saved_cache_dir = stdpopsim.get_cache_dir()
self.tmp_cache_dir = tempfile.TemporaryDirectory()
stdpopsim.set_cache_dir(self.tmp_cache_dir.name)

def tearDown(self):
stdpopsim.set_cache_dir(self.saved_cache_dir)
del self.tmp_cache_dir
39 changes: 39 additions & 0 deletions tests/test_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""
Tests for the cache management code.
"""
import pathlib
import os

import appdirs

import stdpopsim
import tests


class TestSetCacheDir(tests.CacheWritingTest):
"""
Tests the set_cache_dir function.
"""
paths = [
"/somefile", "/some/other/file/", "relative/path", "relative/path/"]

def test_paths(self):
for test in self.paths:
stdpopsim.set_cache_dir(test)
self.assertEqual(stdpopsim.get_cache_dir(), pathlib.Path(test))
stdpopsim.set_cache_dir(pathlib.Path(test))
self.assertEqual(stdpopsim.get_cache_dir(), pathlib.Path(test))

def test_none(self):
stdpopsim.set_cache_dir(None)
cache_dir = pathlib.Path(appdirs.user_cache_dir("stdpopsim", "popgensims"))
self.assertEqual(stdpopsim.get_cache_dir(), cache_dir)

def test_environment_var(self):
try:
for test in self.paths:
os.environ["STDPOPSIM_CACHE"] = test
stdpopsim.set_cache_dir()
self.assertEqual(stdpopsim.get_cache_dir(), pathlib.Path(test))
finally:
os.environ.pop("STDPOPSIM_CACHE")

0 comments on commit b2662ad

Please sign in to comment.