From 41867c486a9a52624dac5eb96f8383c034d93af0 Mon Sep 17 00:00:00 2001
From: David Hoese
Date: Tue, 9 Apr 2019 12:12:13 -0500
Subject: [PATCH] Add GOES-16 ABI Hurricane Florence demo data case

Includes a lot of changes to make fancier glob patterns possible on GCP

---
 satpy/demo/__init__.py               | 66 +++++++++++++++++++++++--
 satpy/demo/_google_cloud_platform.py | 23 +++++++--
 satpy/tests/test_demo.py             | 74 +++++++++++++++++++++++++---
 satpy/tests/test_writers.py          |  2 +-
 4 files changed, 150 insertions(+), 15 deletions(-)

diff --git a/satpy/demo/__init__.py b/satpy/demo/__init__.py
index ebf00a6d12..322d01e546 100644
--- a/satpy/demo/__init__.py
+++ b/satpy/demo/__init__.py
@@ -56,7 +56,7 @@
 LOG = logging.getLogger(__name__)
 
 
-def makedirs(directory, exist_ok=False):
+def _makedirs(directory, exist_ok=False):
     """Python 2.7 friendly os.makedirs.
 
     After python 2.7 is dropped, just use `os.makedirs` with `exist_ok=True`.
@@ -69,7 +69,7 @@ def get_us_midlatitude_cyclone_abi(base_dir='.', method=None, force=False):
-    """Get GOES-16 ABI (CONUS sector) data from March 14th 00:00Z.
+    """Get GOES-16 ABI (CONUS sector) data from 2019-03-14 00:00Z.
 
     Args:
         base_dir (str): Base directory for downloaded files.
@@ -80,6 +80,8 @@ def get_us_midlatitude_cyclone_abi(base_dir='.', method=None, force=False):
             the local system. Warning: May delete non-demo files stored in
             download directory.
 
+    Total size: ~110MB
+
     """
     if method is None:
         method = 'gcsfs'
@@ -90,7 +92,63 @@ def get_us_midlatitude_cyclone_abi(base_dir='.', method=None, force=False):
     from ._google_cloud_platform import get_bucket_files
     patterns = ['gs://gcp-public-data-goes-16/ABI-L1b-RadC/2019/073/00/*0002*.nc']
     subdir = os.path.join(base_dir, 'abi_l1b', '20190314_us_midlatitude_cyclone')
-    makedirs(subdir, exist_ok=True)
+    _makedirs(subdir, exist_ok=True)
     filenames = get_bucket_files(patterns, subdir, force=force)
-    assert len(filenames) == 16, "Not all ABI files could be downloaded"
+    assert len(filenames) == 16, "Not all files could be downloaded"
     return filenames
+
+
+def get_hurricane_florence_abi(base_dir='.', method=None, force=False,
+                               channels=range(1, 17), num_frames=10):
+    """Get GOES-16 ABI (Meso sector) data from 2018-09-11 13:00Z to 17:00Z.
+
+    Args:
+        base_dir (str): Base directory for downloaded files.
+        method (str): Force download method for the data if not already cached.
+            Allowed options are: 'gcsfs'. Default of ``None`` will
+            choose the best method based on environment settings.
+        force (bool): Force re-download of data regardless of its existence on
+            the local system. Warning: May delete non-demo files stored in
+            download directory.
+        channels (list): Channels to include in download. Defaults to all
+            16 channels.
+        num_frames (int or slice): Number of frames to download. Maximum
+            240 frames. Default 10 frames.
+
+    Size per frame (all channels): ~15MB
+
+    Total size (default 10 frames, all channels): ~124MB
+
+    Total size (240 frames, all channels): ~3.5GB
+
+    """
+    if method is None:
+        method = 'gcsfs'
+    if method not in ['gcsfs']:
+        raise NotImplementedError("Demo data download method '{}' not "
+                                  "implemented yet.".format(method))
+    if isinstance(num_frames, (int, float)):
+        frame_slice = slice(0, num_frames)
+    else:
+        frame_slice = num_frames
+
+    from ._google_cloud_platform import get_bucket_files
+
+    patterns = []
+    for channel in channels:
+        # patterns += ['gs://gcp-public-data-goes-16/ABI-L1b-RadM/2018/254/1[3456]/'
+        #              '*C{:02d}*s20182541[3456]*.nc'.format(channel)]
+        patterns += [(
+            'gs://gcp-public-data-goes-16/ABI-L1b-RadM/2018/254/13/*C{:02d}*s201825413*.nc'.format(channel),
+            'gs://gcp-public-data-goes-16/ABI-L1b-RadM/2018/254/14/*C{:02d}*s201825414*.nc'.format(channel),
+            'gs://gcp-public-data-goes-16/ABI-L1b-RadM/2018/254/15/*C{:02d}*s201825415*.nc'.format(channel),
+            'gs://gcp-public-data-goes-16/ABI-L1b-RadM/2018/254/16/*C{:02d}*s201825416*.nc'.format(channel),
+        )]
+    subdir = os.path.join(base_dir, 'abi_l1b', '20180911_hurricane_florence_abi_l1b')
+    _makedirs(subdir, exist_ok=True)
+    filenames = get_bucket_files(patterns, subdir, force=force, pattern_slice=frame_slice)
+
+    actual_slice = frame_slice.indices(240)  # 240 max frames
+    num_frames = int((actual_slice[1] - actual_slice[0]) / actual_slice[2])
+    assert len(filenames) == len(channels) * num_frames, "Not all files could be downloaded"
+    return filenames

diff --git a/satpy/demo/_google_cloud_platform.py b/satpy/demo/_google_cloud_platform.py
index 40c8f78f88..35309279b0 100644
--- a/satpy/demo/_google_cloud_platform.py
+++ b/satpy/demo/_google_cloud_platform.py
@@ -41,18 +41,24 @@ def is_google_cloud_instance():
     return False
 
 
-def get_bucket_files(glob_pattern, base_dir, force=False):
+def get_bucket_files(glob_pattern, base_dir, force=False, pattern_slice=slice(None)):
     """Helper function to download files from Google Cloud Storage.
 
     Args:
         glob_pattern (str or list): Glob pattern string or series of patterns
             used to search for on Google Cloud Storage. The pattern should
-            include the "gs://" protocol prefix.
+            include the "gs://" protocol prefix. If a list of lists, then the
+            results of each sublist pattern are concatenated and the result is
+            treated as one pattern result. This is important for things like
+            ``pattern_slice`` and complicated glob patterns not supported by
+            GCP.
         base_dir (str): Root directory to place downloaded files on the local
             system.
         force (bool): Force re-download of data regardless of its existence on
             the local system. Warning: May delete non-demo files stored in
             download directory.
+        pattern_slice (slice): Slice object to limit the number of files
+            returned by each glob pattern.
""" if gcsfs is None: @@ -67,16 +73,25 @@ def get_bucket_files(glob_pattern, base_dir, force=False): fs = gcsfs.GCSFileSystem(token='anon') filenames = [] for gp in glob_pattern: - for fn in fs.glob(gp): + # handle multiple glob patterns being treated as one pattern + # for complicated patterns that GCP can't handle + if isinstance(gp, str): + glob_results = list(fs.glob(gp)) + else: + # flat list of results + glob_results = [fn for pat in gp for fn in fs.glob(pat)] + + for fn in glob_results[pattern_slice]: ondisk_fn = os.path.basename(fn) ondisk_pathname = os.path.join(base_dir, ondisk_fn) filenames.append(ondisk_pathname) - LOG.info("Downloading: {}".format(ondisk_pathname)) if force and os.path.isfile(ondisk_pathname): os.remove(ondisk_pathname) elif os.path.isfile(ondisk_pathname): + LOG.info("Found existing: {}".format(ondisk_pathname)) continue + LOG.info("Downloading: {}".format(ondisk_pathname)) fs.get('gs://' + fn, ondisk_pathname) if not filenames: diff --git a/satpy/tests/test_demo.py b/satpy/tests/test_demo.py index 00b5d6b9e9..0fc0ad15ff 100644 --- a/satpy/tests/test_demo.py +++ b/satpy/tests/test_demo.py @@ -30,10 +30,37 @@ import mock +class _GlobHelper(object): + """Create side effect function for mocking gcsfs glob method.""" + + def __init__(self, num_results): + """Initialize side_effect function for mocking gcsfs glob method. + + Args: + num_results (int or list): Number of results for each glob call + to return. If a list then number of results per call. The + last number is used for any additional calls. + + """ + self.current_call = 0 + if not isinstance(num_results, (list, tuple)): + num_results = [num_results] + self.num_results = num_results + + def __call__(self, pattern): + """The side effect function to be called as glob.""" + try: + num_results = self.num_results[self.current_call] + except IndexError: + num_results = self.num_results[-1] + self.current_call += 1 + return [pattern + '.{:03d}'.format(idx) for idx in range(num_results)] + + class TestDemo(unittest.TestCase): """Test demo data download functions.""" - @mock.patch('satpy.demo.google_cloud_platform.gcsfs') + @mock.patch('satpy.demo._google_cloud_platform.gcsfs') def test_get_us_midlatitude_cyclone_abi(self, gcsfs_mod): """Test data download function.""" from satpy.demo import get_us_midlatitude_cyclone_abi @@ -41,7 +68,9 @@ def test_get_us_midlatitude_cyclone_abi(self, gcsfs_mod): gcsfs_inst = mock.MagicMock() gcsfs_mod.GCSFileSystem.return_value = gcsfs_inst gcsfs_inst.glob.return_value = ['a.nc', 'b.nc'] + # expected 16 files, got 2 self.assertRaises(AssertionError, get_us_midlatitude_cyclone_abi) + # unknown access method self.assertRaises(NotImplementedError, get_us_midlatitude_cyclone_abi, method='unknown') gcsfs_inst.glob.return_value = ['a.nc'] * 16 @@ -50,18 +79,47 @@ def test_get_us_midlatitude_cyclone_abi(self, gcsfs_mod): for fn in filenames: self.assertEqual(expected, fn) + @mock.patch('satpy.demo._google_cloud_platform.gcsfs') + def test_get_hurricane_florence_abi(self, gcsfs_mod): + """Test data download function.""" + from satpy.demo import get_hurricane_florence_abi + gcsfs_mod.GCSFileSystem = mock.MagicMock() + gcsfs_inst = mock.MagicMock() + gcsfs_mod.GCSFileSystem.return_value = gcsfs_inst + # only return 5 results total + gcsfs_inst.glob.side_effect = _GlobHelper([5, 0]) + # expected 16 files * 10 frames, got 16 * 5 + self.assertRaises(AssertionError, get_hurricane_florence_abi) + self.assertRaises(NotImplementedError, get_hurricane_florence_abi, method='unknown') + + 
+        gcsfs_inst.glob.side_effect = _GlobHelper([int(240 / 16), 0, 0, 0] * 16)
+        filenames = get_hurricane_florence_abi()
+        self.assertEqual(10 * 16, len(filenames))
+
+        gcsfs_inst.glob.side_effect = _GlobHelper([int(240 / 16), 0, 0, 0] * 16)
+        filenames = get_hurricane_florence_abi(channels=[2, 3, 4])
+        self.assertEqual(10 * 3, len(filenames))
+
+        gcsfs_inst.glob.side_effect = _GlobHelper([int(240 / 16), 0, 0, 0] * 16)
+        filenames = get_hurricane_florence_abi(channels=[2, 3, 4], num_frames=5)
+        self.assertEqual(5 * 3, len(filenames))
+
+        gcsfs_inst.glob.side_effect = _GlobHelper([int(240 / 16), 0, 0, 0] * 16)
+        filenames = get_hurricane_florence_abi(num_frames=5)
+        self.assertEqual(5 * 16, len(filenames))
+
 
 class TestGCPUtils(unittest.TestCase):
     """Test Google Cloud Platform utilities."""
 
-    @mock.patch('satpy.demo.google_cloud_platform.urlopen')
+    @mock.patch('satpy.demo._google_cloud_platform.urlopen')
     def test_is_gcp_instance(self, uo):
         """Test is_google_cloud_instance."""
         from satpy.demo._google_cloud_platform import is_google_cloud_instance, URLError
         uo.side_effect = URLError("Test Environment")
         self.assertFalse(is_google_cloud_instance())
 
-    @mock.patch('satpy.demo.google_cloud_platform.gcsfs')
+    @mock.patch('satpy.demo._google_cloud_platform.gcsfs')
     def test_get_bucket_files(self, gcsfs_mod):
         """Test get_bucket_files basic cases."""
         from satpy.demo._google_cloud_platform import get_bucket_files
@@ -73,11 +131,15 @@ def test_get_bucket_files(self, gcsfs_mod):
         expected = [os.path.join('.', 'a.nc'), os.path.join('.', 'b.nc')]
         self.assertEqual(expected, filenames)
 
+        gcsfs_inst.glob.side_effect = _GlobHelper(10)
+        filenames = get_bucket_files(['*.nc', '*.txt'], '.', pattern_slice=slice(2, 5))
+        self.assertEqual(len(filenames), 3 * 2)
+        gcsfs_inst.glob.side_effect = None  # reset mock side effect
+
         gcsfs_inst.glob.return_value = ['a.nc', 'b.nc']
         self.assertRaises(OSError, get_bucket_files, '*.nc', 'does_not_exist')
 
-        # touch the file
-        open('a.nc', 'w').close()
+        open('a.nc', 'w').close()  # touch the file
         gcsfs_inst.get.reset_mock()
         gcsfs_inst.glob.return_value = ['a.nc']
         filenames = get_bucket_files('*.nc', '.')
@@ -96,7 +158,7 @@ def test_get_bucket_files(self, gcsfs_mod):
         gcsfs_inst.glob.return_value = []
         self.assertRaises(OSError, get_bucket_files, '*.nc', '.')
 
-    @mock.patch('satpy.demo.google_cloud_platform.gcsfs', None)
+    @mock.patch('satpy.demo._google_cloud_platform.gcsfs', None)
     def test_no_gcsfs(self):
         """Test that 'gcsfs' is required."""
         from satpy.demo._google_cloud_platform import get_bucket_files
diff --git a/satpy/tests/test_writers.py b/satpy/tests/test_writers.py
index 741776e2b7..6a2d4f94dc 100644
--- a/satpy/tests/test_writers.py
+++ b/satpy/tests/test_writers.py
@@ -43,7 +43,7 @@ def mkdir_p(path):
     # Use for python 2.7 compatibility
     # When python 2.7 support is dropped just use
-    # `os._makedirs(path, exist_ok=True)`
+    # `os.makedirs(path, exist_ok=True)`
     try:
         os.makedirs(path)
     except OSError as exc:  # Python >2.5
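
Reviewer note (not part of the patch): a minimal usage sketch of the new
``get_hurricane_florence_abi`` helper, assuming ``satpy`` and ``gcsfs`` are
installed and the public GOES-16 bucket is reachable anonymously. The channel
list and frame count below are illustrative choices, not defaults:

    from satpy import Scene
    from satpy.demo import get_hurricane_florence_abi

    # Download one frame of three channels. num_frames also accepts a
    # slice, e.g. slice(0, 240, 24) to thin out the 240 meso frames.
    filenames = get_hurricane_florence_abi(base_dir='.',
                                           channels=[8, 9, 10],
                                           num_frames=1)

    # Load the downloaded files with Satpy's ABI L1b reader.
    scn = Scene(reader='abi_l1b', filenames=filenames)
    scn.load(['C08', 'C09', 'C10'])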