diff --git a/.travis.yml b/.travis.yml
index 2b205a8..425eb68 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -37,7 +37,7 @@ install:
   - pip install pysatCDF >/dev/null
   # Custom pysat install
   - cd ..
-  - git clone https://github.com/pysat/pysat.git
+  - git clone --single-branch --branch develop-3 https://github.com/pysat/pysat.git
   - cd pysat
   - git checkout develop-3
   - python setup.py install
diff --git a/docs/figures/gnss_tec_vtec_example.png b/docs/figures/gnss_tec_vtec_example.png
new file mode 100644
index 0000000..8edb00f
Binary files /dev/null and b/docs/figures/gnss_tec_vtec_example.png differ
diff --git a/pysatMadrigal/instruments/__init__.py b/pysatMadrigal/instruments/__init__.py
index 2eb5b01..0ad832e 100644
--- a/pysatMadrigal/instruments/__init__.py
+++ b/pysatMadrigal/instruments/__init__.py
@@ -1,4 +1,8 @@
-from pysatMadrigal.instruments import dmsp_ivm, jro_isr
+# Import Madrigal instruments
+from pysatMadrigal.instruments import dmsp_ivm, gnss_tec, jro_isr
+
+# Import Madrigal methods
 from pysatMadrigal.instruments import methods  # noqa F401
 
-__all__ = ['dmsp_ivm', 'jro_isr']
+# Define variable name with all available instruments
+__all__ = ['dmsp_ivm', 'gnss_tec', 'jro_isr']
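For orientation, the expanded instrument registry above is exercised in the usual pysat way; a minimal sketch assuming a pysat 3 (develop-3) installation, with placeholder Madrigal credentials::

    import datetime as dt
    import pysat
    import pysatMadrigal as pymad

    # Instantiate the newly added GNSS TEC instrument by module reference
    tec = pysat.Instrument(inst_module=pymad.instruments.gnss_tec, tag='vtec')

    # Download and load one day of vertical TEC maps
    tec.download(dt.datetime(2017, 11, 19), dt.datetime(2017, 11, 20),
                 user='Firstname+Lastname', password='email@address.com')
    tec.load(date=dt.datetime(2017, 11, 19))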
diff --git a/pysatMadrigal/instruments/dmsp_ivm.py b/pysatMadrigal/instruments/dmsp_ivm.py
index 3997ae3..a33fa5b 100644
--- a/pysatMadrigal/instruments/dmsp_ivm.py
+++ b/pysatMadrigal/instruments/dmsp_ivm.py
@@ -64,8 +64,9 @@
 import numpy as np
 import pandas as pds
 
+from pysat.instruments.methods import general as ps_gen
+
 from pysatMadrigal.instruments.methods import madrigal as mad_meth
-from pysat.instruments.methods import general as mm_gen
 
 logger = logging.getLogger(__name__)
 
@@ -86,11 +87,10 @@
 # use the default CDAWeb method
 dmsp_fname1 = {'utd': 'dms_ut_{year:4d}{month:02d}{day:02d}_',
                '': 'dms_{year:4d}{month:02d}{day:02d}_'}
-dmsp_fname2 = {'utd': '.{version:03d}.hdf5', '': 's?.{version:03d}.hdf5'}
+dmsp_fname2 = {'utd': '.{version:03d}.{file_type}',
+               '': 's?.{version:03d}.{file_type}'}
 supported_tags = {ss: {kk: dmsp_fname1[kk] + ss[1:] + dmsp_fname2[kk]
                        for kk in inst_ids[ss]} for ss in inst_ids.keys()}
-list_files = functools.partial(mm_gen.list_files,
-                               supported_tags=supported_tags)
 
 # madrigal tags
 madrigal_inst_code = 8100
@@ -135,8 +135,63 @@ def init(self):
     return
 
 
+def list_files(tag=None, inst_id=None, data_path=None, format_str=None,
+               supported_tags=supported_tags,
+               fake_daily_files_from_monthly=False, delimiter=None,
+               file_type='hdf5'):
+    """Return a Pandas Series of every data file for this Instrument
+
+    Parameters
+    ----------
+    tag : string or NoneType
+        Denotes type of file to load.  Accepted types are 'utd' and ''.
+        (default=None)
+    inst_id : string or NoneType
+        Specifies the satellite ID for a constellation.  Not used.
+        (default=None)
+    data_path : string or NoneType
+        Path to data directory.  If None is specified, the value previously
+        set in Instrument.files.data_path is used. (default=None)
+    format_str : string or NoneType
+        User specified file format.  If None is specified, the default
+        formats associated with the supplied tags are used. (default=None)
+    supported_tags : dict or NoneType
+        Keys are inst_id, each containing a dict keyed by tag where the
+        values are file format template strings. (default=None)
+    fake_daily_files_from_monthly : bool
+        Some CDAWeb instrument data files are stored by month, interfering
+        with pysat's functionality of loading by day.  This flag, when True,
+        appends daily dates to monthly files internally.  These dates are
+        used by the load routine in this module to provide data by day.
+        (default=False)
+    delimiter : string
+        Delimiter string upon which files will be split (e.g., '.')
+        (default=None)
+    file_type : string
+        File format for Madrigal data.  Load routines currently only accept
+        'hdf5' and 'netCDF4', but any of the Madrigal options may be used
+        here. (default='hdf5')
+
+    Returns
+    -------
+    out : pysat.Files.from_os : pysat._files.Files
+        A class containing the verified available files
+
+    """
+    if supported_tags[inst_id][tag].find('{file_type}') > 0:
+        # Use replace rather than format so the remaining template fields
+        # (year, month, day, version) are preserved for pysat to parse
+        supported_tags[inst_id][tag] = supported_tags[inst_id][tag].replace(
+            '{file_type}', file_type)
+
+    out = ps_gen.list_files(
+        tag=tag, inst_id=inst_id, data_path=data_path, format_str=format_str,
+        supported_tags=supported_tags,
+        fake_daily_files_from_monthly=fake_daily_files_from_monthly,
+        delimiter=delimiter)
+
+    return out
+
+
 def download(date_array, tag='', inst_id='', data_path=None, user=None,
-             password=None):
+             password=None, file_type='hdf5'):
     """Downloads data from Madrigal.
 
     Parameters
@@ -158,9 +213,13 @@ def download(date_array, tag='', inst_id='', data_path=None, user=None,
         error if user not supplied. (default=None)
     password : string
         Password for data download. (default=None)
+    file_type : string
+        File format for Madrigal data.  Load routines currently only accept
+        'hdf5' and 'netCDF4', but any of the Madrigal options may be used
+        here. (default='hdf5')
 
-    Notes
-    -----
+    Note
+    ----
     The user's names should be provided in field user. Ritu Karidhal should
     be entered as Ritu+Karidhal
 
@@ -174,53 +233,44 @@ def download(date_array, tag='', inst_id='', data_path=None, user=None,
     """
-    mad_meth.download(date_array, inst_code=str(madrigal_inst_code),
-                      kindat=str(madrigal_tag[inst_id][tag]),
-                      data_path=data_path, user=user, password=password)
+    mad_meth.download(date_array, inst_code=str(madrigal_inst_code),
+                      kindat=str(madrigal_tag[inst_id][tag]),
+                      data_path=data_path, user=user, password=password,
+                      file_type=file_type)
+
+    return
 
 
-def default(inst):
-    pass
-
-
-def clean(inst):
+def clean(self):
     """Routine to return DMSP IVM data cleaned to the specified level
 
-    'Clean' enforces that both RPA and DM flags are <= 1
-    'Dusty' <= 2
-    'Dirty' <= 3
-    'None' None
+    Note
+    ----
+    Supports 'clean', 'dusty', 'dirty'
 
-    Routine is called by pysat, and not by the end user directly.
+    'clean' enforces that both RPA and DM flags are <= 1
+    'dusty' <= 2
+    'dirty' <= 3
+    'none' causes pysat to skip this routine
 
-    Parameters
-    -----------
-    inst : pysat.Instrument
-        Instrument class object, whose attribute clean_level is used to return
-        the desired level of data selectivity.
-
-    Notes
-    --------
-    Supports 'clean', 'dusty', 'dirty'
+    Routine is called by pysat, and not by the end user directly.
 
     """
-    if inst.tag == 'utd':
-        if inst.clean_level == 'clean':
-            idx, = np.where((inst['rpa_flag_ut'] <= 1)
-                            & (inst['idm_flag_ut'] <= 1))
-        elif inst.clean_level == 'dusty':
-            idx, = np.where((inst['rpa_flag_ut'] <= 2)
-                            & (inst['idm_flag_ut'] <= 2))
-        elif inst.clean_level == 'dirty':
-            idx, = np.where((inst['rpa_flag_ut'] <= 3)
-                            & (inst['idm_flag_ut'] <= 3))
+    if self.tag == 'utd':
+        if self.clean_level == 'clean':
+            idx, = np.where((self['rpa_flag_ut'] <= 1)
+                            & (self['idm_flag_ut'] <= 1))
+        elif self.clean_level == 'dusty':
+            idx, = np.where((self['rpa_flag_ut'] <= 2)
+                            & (self['idm_flag_ut'] <= 2))
+        elif self.clean_level == 'dirty':
+            idx, = np.where((self['rpa_flag_ut'] <= 3)
+                            & (self['idm_flag_ut'] <= 3))
         else:
-            idx = slice(0, inst.index.shape[0])
+            idx = slice(0, self.index.shape[0])
     else:
-        if inst.clean_level in ['clean', 'dusty', 'dirty']:
+        if self.clean_level in ['clean', 'dusty', 'dirty']:
             logger.warning('this level 1 data has no quality flags')
-        idx = slice(0, inst.index.shape[0])
+        idx = slice(0, self.index.shape[0])
 
     # downselect data based upon cleaning conditions above
-    inst.data = inst[idx]
+    self.data = self[idx]
 
     return
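To make the new '{file_type}' placeholder concrete: for the single-braced DMSP and JRO templates the extension is the only field that may be filled early, so plain string replacement is used. A standalone sketch (the file name is illustrative)::

    # A DMSP-style template with the '{file_type}' placeholder
    template = ('dms_ut_{year:4d}{month:02d}{day:02d}_15'
                '.{version:03d}.{file_type}')

    # replace resolves the extension while leaving the year, month, day,
    # and version fields for pysat's file parsing
    fmt = template.replace('{file_type}', 'hdf5')
    # fmt == 'dms_ut_{year:4d}{month:02d}{day:02d}_15.{version:03d}.hdf5'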
""" - if inst.tag == 'utd': - if inst.clean_level == 'clean': - idx, = np.where((inst['rpa_flag_ut'] <= 1) - & (inst['idm_flag_ut'] <= 1)) - elif inst.clean_level == 'dusty': - idx, = np.where((inst['rpa_flag_ut'] <= 2) - & (inst['idm_flag_ut'] <= 2)) - elif inst.clean_level == 'dirty': - idx, = np.where((inst['rpa_flag_ut'] <= 3) - & (inst['idm_flag_ut'] <= 3)) + if self.tag == 'utd': + if self.clean_level == 'clean': + idx, = np.where((self['rpa_flag_ut'] <= 1) + & (self['idm_flag_ut'] <= 1)) + elif self.clean_level == 'dusty': + idx, = np.where((self['rpa_flag_ut'] <= 2) + & (self['idm_flag_ut'] <= 2)) + elif self.clean_level == 'dirty': + idx, = np.where((self['rpa_flag_ut'] <= 3) + & (self['idm_flag_ut'] <= 3)) else: - idx = slice(0, inst.index.shape[0]) + idx = slice(0, self.index.shape[0]) else: - if inst.clean_level in ['clean', 'dusty', 'dirty']: + if self.clean_level in ['clean', 'dusty', 'dirty']: logger.warning('this level 1 data has no quality flags') - idx = slice(0, inst.index.shape[0]) + idx = slice(0, self.index.shape[0]) # downselect data based upon cleaning conditions above - inst.data = inst[idx] + self.data = self[idx] return diff --git a/pysatMadrigal/instruments/gnss_tec.py b/pysatMadrigal/instruments/gnss_tec.py new file mode 100644 index 0000000..8444cb5 --- /dev/null +++ b/pysatMadrigal/instruments/gnss_tec.py @@ -0,0 +1,291 @@ +# -*- coding: utf-8 -*-. +"""Supports the MIT Haystack GNSS TEC data products + +The Global Navigation Satellite System (GNSS) is used in conjunction with a +world-wide receiver network to produce total electron content (TEC) data +products, including vertical and line-of-sight TEC. + +Downloads data from the MIT Haystack Madrigal Database. + +Properties +---------- +platform + 'gnss' +name + 'tec' +tag + 'vtec' + +Examples +-------- +:: + + import datetime + import pysat + import pysatMadrigal as pymad + + vtec = pysat.Instrument(inst_module=pymad.instruments.gnss_tec, tag='vtec') + vtec.download(dt.datetime(2017, 11, 19), dt.datetime(2017, 11, 20), + user='Firstname+Lastname', password='email@address.com') + vtec.load(date=dt.datetime(2017, 11, 19)) + + +Note +---- + Please provide name and email when downloading data with this routine. + +""" + +import datetime as dt +import functools +import numpy as np + +from pysat.instruments.methods import general as ps_gen + +from pysatMadrigal.instruments.methods import madrigal as mad_meth + +import logging +logger = logging.getLogger(__name__) + + +platform = 'gnss' +name = 'tec' +tags = {'vtec': 'vertical TEC'} +inst_ids = {'': [tag for tag in tags.keys()]} +_test_dates = {'': {'vtec': dt.datetime(2017, 11, 19)}} +pandas_format = False + +# Support for the list files routine +# Use the default pysat method within a local routine that defines the +# file type +dname = '{{year:02d}}{{month:02d}}{{day:02d}}' +vname = '.{{version:03d}}' +supported_tags = {ss: {'vtec': ''.join(['gps', dname, 'g', vname, + ".{file_type}"])} + for ss in inst_ids.keys()} + +# madrigal tags +madrigal_inst_code = 8000 +madrigal_tag = {'': {'vtec': 3500}} # , 'los': 3505}} + +# support listing files currently available on remote server (Madrigal) +list_remote_files = functools.partial(mad_meth.list_remote_files, + supported_tags=supported_tags, + inst_code=madrigal_inst_code) + + +def init(self): + """Initializes the Instrument object with values specific to GNSS TEC + + Runs once upon instantiation. 
+ + """ + + ackn_str = ''.join(["GPS TEC data products and access through the ", + "Madrigal distributed data system are provided to ", + "the community by the Massachusetts Institute of ", + "Technology under support from U.S. National Science", + " Foundation grant AGS-1242204. Data for the TEC ", + "processing is provided by the following ", + "organizations: UNAVCO, Scripps Orbit and Permanent", + " Array Center, Institut Geographique National, ", + "France, International GNSS Service, The Crustal ", + "Dynamics Data Information System (CDDIS), National ", + "Geodetic Survey, Instituto Brasileiro de Geografia", + "e Estatística, RAMSAC CORS of Instituto Geográfico", + " Nacional del la República Agentina, Arecibo ", + "Observatory, Low-Latitude Ionospheric Sensor ", + "Network (LISN), Topcon Positioning Systems, Inc., ", + "Canadian High Arctic Ionospheric Network, ", + "Institute of Geology and Geophysics, Chinese ", + "Academy of Sciences, China Meterorology ", + "Administration, Centro di Ricerche Sismogiche, ", + "Système d’Observation du Niveau des Eaux Littorales", + " (SONEL), RENAG : REseau NAtional GPS permanent, ", + "and GeoNet—the official source of geological ", + "hazard information for New Zealand.\n", + mad_meth.cedar_rules()]) + + logger.info(ackn_str) + self.acknowledgements = ackn_str + self.references = "Rideout and Coster (2006) doi:10.1007/s10291-006-0029-5" + + return + + +def list_files(tag=None, inst_id=None, data_path=None, format_str=None, + supported_tags=supported_tags, + fake_daily_files_from_monthly=False, two_digit_year_break=99, + delimiter=None, file_type=''): + """Return a Pandas Series of every data file for this Instrument + + + Parameters + ----------- + tag : string or NoneType + Denotes type of file to load. Accepted types are . + (default=None) + inst_id : string or NoneType + Specifies the satellite ID for a constellation. Not used. + (default=None) + data_path : string or NoneType + Path to data directory. If None is specified, the value previously + set in Instrument.files.data_path is used. (default=None) + format_str : string or NoneType + User specified file format. If None is specified, the default + formats associated with the supplied tags are used. (default=None) + supported_tags : dict or NoneType + keys are inst_id, each containing a dict keyed by tag + where the values file format template strings. (default=None) + fake_daily_files_from_monthly : bool + Some CDAWeb instrument data files are stored by month, interfering + with pysat's functionality of loading by day. This flag, when true, + appends daily dates to monthly files internally. These dates are + used by load routine in this module to provide data by day. + two_digit_year_break : int + If filenames only store two digits for the year, then + '1900' will be added for years >= two_digit_year_break + and '2000' will be added for years < two_digit_year_break. + delimiter : string + Delimiter string upon which files will be split (e.g., '.') + file_type : string + File format for Madrigal data. Load routines currently only accepts + 'hdf5' and 'netCDF4', but any of the Madrigal options may be used + here. 
(default='netCDF4') + + Returns + -------- + out : pysat.Files.from_os : pysat._files.Files + A class containing the verified available files + + """ + if supported_tags[inst_id][tag].find('{file_type}') > 0: + supported_tags[inst_id][tag] = supported_tags[inst_id][tag].format( + file_type=file_type) + + out = ps_gen.list_files( + tag=tag, inst_id=inst_id, data_path=data_path, format_str=format_str, + supported_tags=supported_tags, + fake_daily_files_from_monthly=fake_daily_files_from_monthly, + two_digit_year_break=two_digit_year_break, delimiter=delimiter) + + return out + + +def download(date_array, tag='', inst_id='', data_path=None, user=None, + password=None, url='http://cedar.openmadrigal.org', + file_type='netCDF4'): + """Downloads data from Madrigal. + + Parameters + ---------- + date_array : array-like + list of datetimes to download data for. The sequence of dates need not + be contiguous. + tag : string + Tag identifier used for particular dataset. This input is provided by + pysat. (default='') + inst_id : string + Instrument ID string identifier used for particular dataset. This input + is provided by pysat. (default='') + data_path : string + Path to directory to download data to. (default=None) + user : string + User string input used for download. Provided by user and passed via + pysat. (default=None) + password : string + Password for data download. (default=None) + url : string + URL for Madrigal site (default='http://cedar.openmadrigal.org') + file_type : string + File format for Madrigal data. Load routines currently only accepts + 'hdf5' and 'netCDF4', but any of the Madrigal options may be used + here. (default='netCDF4') + + Note + ---- + The user's names should be provided in field user. Anthea Coster should + be entered as Anthea+Coster + + The password field should be the user's email address. These parameters + are passed to Madrigal when downloading. + + The affiliation field is set to pysat to enable tracking of pysat + downloads. + + """ + mad_meth.download(date_array, inst_code=str(madrigal_inst_code), + kindat=str(madrigal_tag[inst_id][tag]), + data_path=data_path, user=user, password=password, + file_type=file_type, url=url) + + return + + +def load(fnames, tag=None, inst_id=None, file_type='netCDF4'): + """ Routine to load the GNSS TEC data + + Parameters + ----------- + fnames : list + List of filenames + tag : string or NoneType + tag name used to identify particular data set to be loaded. + This input is nominally provided by pysat itself. (default=None) + inst_id : string or NoneType + Instrument ID used to identify particular data set to be loaded. + This input is nominally provided by pysat itself. (default=None) + file_type : string + File format for Madrigal data. Currently only accepts 'hdf5' and + 'netCDF4', but any of the supported Madrigal options may be used here. 
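The doubled braces in `dname` and `vname` above are deliberate: the single `.format` call in `list_files` fills only the file type while unescaping the other fields. A standalone sketch::

    template = ''.join(['gps', '{{year:02d}}{{month:02d}}{{day:02d}}', 'g',
                        '.{{version:03d}}', '.{file_type}'])

    # One format call resolves the extension and unescapes the rest
    fmt = template.format(file_type='netCDF4')
    # fmt == 'gps{year:02d}{month:02d}{day:02d}g.{version:03d}.netCDF4'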
+
+
+def download(date_array, tag='', inst_id='', data_path=None, user=None,
+             password=None, url='http://cedar.openmadrigal.org',
+             file_type='netCDF4'):
+    """Downloads data from Madrigal.
+
+    Parameters
+    ----------
+    date_array : array-like
+        list of datetimes to download data for.  The sequence of dates need
+        not be contiguous.
+    tag : string
+        Tag identifier used for particular dataset.  This input is provided
+        by pysat. (default='')
+    inst_id : string
+        Instrument ID string identifier used for particular dataset.  This
+        input is provided by pysat. (default='')
+    data_path : string
+        Path to directory to download data to. (default=None)
+    user : string
+        User string input used for download.  Provided by user and passed
+        via pysat. (default=None)
+    password : string
+        Password for data download. (default=None)
+    url : string
+        URL for Madrigal site (default='http://cedar.openmadrigal.org')
+    file_type : string
+        File format for Madrigal data.  Load routines currently only accept
+        'hdf5' and 'netCDF4', but any of the Madrigal options may be used
+        here. (default='netCDF4')
+
+    Note
+    ----
+    The user's names should be provided in field user. Anthea Coster should
+    be entered as Anthea+Coster
+
+    The password field should be the user's email address. These parameters
+    are passed to Madrigal when downloading.
+
+    The affiliation field is set to pysat to enable tracking of pysat
+    downloads.
+
+    """
+    mad_meth.download(date_array, inst_code=str(madrigal_inst_code),
+                      kindat=str(madrigal_tag[inst_id][tag]),
+                      data_path=data_path, user=user, password=password,
+                      file_type=file_type, url=url)
+
+    return
+
+
+def load(fnames, tag=None, inst_id=None, file_type='netCDF4'):
+    """Routine to load the GNSS TEC data
+
+    Parameters
+    ----------
+    fnames : list
+        List of filenames
+    tag : string or NoneType
+        tag name used to identify particular data set to be loaded.
+        This input is nominally provided by pysat itself. (default=None)
+    inst_id : string or NoneType
+        Instrument ID used to identify particular data set to be loaded.
+        This input is nominally provided by pysat itself. (default=None)
+    file_type : string
+        File format for Madrigal data.  Currently only accepts 'hdf5' and
+        'netCDF4', but any of the supported Madrigal options may be used
+        here. (default='netCDF4')
+
+    Returns
+    -------
+    data : xarray.Dataset
+        Object containing satellite data
+    meta : pysat.Meta
+        Object containing metadata such as column names and units
+
+    """
+    # Define the xarray coordinate dimensions (apart from time)
+    # Not needed for netCDF
+    xcoords = {'vtec': {('time', 'gdlat', 'glon', 'kindat', 'kinst'):
+                        ['gdalt', 'tec', 'dtec'],
+                        ('time', ): ['year', 'month', 'day', 'hour', 'min',
+                                     'sec', 'ut1_unix', 'ut2_unix', 'recno']}}
+
+    # Load the specified data
+    data, meta = mad_meth.load(fnames, tag, inst_id,
+                               xarray_coords=xcoords[tag],
+                               file_type=file_type)
+
+    # Squeeze the kindat and kinst 'coordinates', but keep them as floats
+    squeeze_dims = np.array(['kindat', 'kinst'])
+    squeeze_mask = [sdim in data.coords for sdim in squeeze_dims]
+    if np.any(squeeze_mask):
+        data = data.squeeze(dim=squeeze_dims[squeeze_mask])
+
+    # Fix the units for tec and dtec
+    if tag == 'vtec':
+        meta['tec'] = {meta.units_label: 'TECU'}
+        meta['dtec'] = {meta.units_label: 'TECU'}
+
+    return data, meta
+
+
+def clean(self):
+    """Routine to return GNSS TEC data at a specific level
+
+    Note
+    ----
+    Supports 'clean', 'dusty', 'dirty', or 'None'.
+    Routine is called by pysat, and not by the end user directly.
+
+    """
+    if self.tag == "vtec":
+        logger.info("".join(["Data provided at a clean level, further ",
+                             "cleaning may be performed using the ",
+                             "measurement error 'dtec'"]))
+
+    return
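The dict-style xarray_coords request used by the GNSS TEC load above is ultimately a pandas set_index/to_xarray conversion; a standalone sketch with toy values (column names are illustrative, not Madrigal output)::

    import numpy as np
    import pandas as pds

    # A flat Madrigal-style table: one row per (time, gdlat, glon) sample
    flat = pds.DataFrame({'time': np.repeat([0, 1], 4),
                          'gdlat': np.tile([45.0, 45.0, 50.0, 50.0], 2),
                          'glon': np.tile([10.0, 20.0], 4),
                          'tec': np.arange(8.0)})

    # Indexing by the coordinate columns and converting produces the
    # multi-dimensional Dataset that load() returns for the 'vtec' tag
    xdata = flat.set_index(['time', 'gdlat', 'glon']).to_xarray()
    print(xdata['tec'].dims)   # ('time', 'gdlat', 'glon')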
diff --git a/pysatMadrigal/instruments/jro_isr.py b/pysatMadrigal/instruments/jro_isr.py
index 5b110f7..5779b8c 100644
--- a/pysatMadrigal/instruments/jro_isr.py
+++ b/pysatMadrigal/instruments/jro_isr.py
@@ -66,15 +66,13 @@
 # support list files routine
 # use the default CDAWeb method
 jro_fname1 = 'jro{year:4d}{month:02d}{day:02d}'
-jro_fname2 = '.{version:03d}.hdf5'
+jro_fname2 = '.{version:03d}.{file_type}'
 supported_tags = {ss: {'drifts': jro_fname1 + "drifts" + jro_fname2,
                        'drifts_ave': jro_fname1 + "drifts_avg" + jro_fname2,
                        'oblique_stan': jro_fname1 + jro_fname2,
                        'oblique_rand': jro_fname1 + "?" + jro_fname2,
                        'oblique_long': jro_fname1 + "?" + jro_fname2}
                   for ss in inst_ids.keys()}
-list_files = functools.partial(mm_gen.list_files,
-                               supported_tags=supported_tags)
 
 # madrigal tags
 madrigal_inst_code = 10
@@ -86,9 +84,6 @@
                                       supported_tags=supported_tags,
                                       inst_code=madrigal_inst_code)
 
-# support load routine
-load = functools.partial(mad_meth.load, xarray_coords=['gdalt'])
-
 # Madrigal will sometimes include multiple days within a file
 # labeled with a single date.
 # Filter out this extra data using the pysat nanokernel processing queue.
@@ -123,8 +118,63 @@ def init(self):
     return
 
 
+def list_files(tag=None, inst_id=None, data_path=None, format_str=None,
+               supported_tags=supported_tags,
+               fake_daily_files_from_monthly=False,
+               delimiter=None, file_type='hdf5'):
+    """Return a Pandas Series of every data file for this Instrument
+
+    Parameters
+    ----------
+    tag : string or NoneType
+        Denotes type of file to load.  Accepted types are 'drifts',
+        'drifts_ave', 'oblique_stan', 'oblique_rand', and 'oblique_long'.
+        (default=None)
+    inst_id : string or NoneType
+        Specifies the satellite ID for a constellation.  Not used.
+        (default=None)
+    data_path : string or NoneType
+        Path to data directory.  If None is specified, the value previously
+        set in Instrument.files.data_path is used. (default=None)
+    format_str : string or NoneType
+        User specified file format.  If None is specified, the default
+        formats associated with the supplied tags are used. (default=None)
+    supported_tags : dict or NoneType
+        Keys are inst_id, each containing a dict keyed by tag where the
+        values are file format template strings. (default=None)
+    fake_daily_files_from_monthly : bool
+        Some CDAWeb instrument data files are stored by month, interfering
+        with pysat's functionality of loading by day.  This flag, when True,
+        appends daily dates to monthly files internally.  These dates are
+        used by the load routine in this module to provide data by day.
+        (default=False)
+    delimiter : string
+        Delimiter string upon which files will be split (e.g., '.')
+        (default=None)
+    file_type : string
+        File format for Madrigal data.  Load routines currently only accept
+        'hdf5' and 'netCDF4', but any of the Madrigal options may be used
+        here. (default='hdf5')
+
+    Returns
+    -------
+    out : pysat.Files.from_os : pysat._files.Files
+        A class containing the verified available files
+
+    """
+    if supported_tags[inst_id][tag].find('{file_type}') > 0:
+        # Use replace rather than format so the remaining template fields
+        # (year, month, day, version) are preserved for pysat to parse
+        supported_tags[inst_id][tag] = supported_tags[inst_id][tag].replace(
+            '{file_type}', file_type)
+
+    out = mm_gen.list_files(
+        tag=tag, inst_id=inst_id, data_path=data_path, format_str=format_str,
+        supported_tags=supported_tags,
+        fake_daily_files_from_monthly=fake_daily_files_from_monthly,
+        delimiter=delimiter)
+
+    return out
+
+
 def download(date_array, tag='', inst_id='', data_path=None, user=None,
-             password=None):
+             password=None, file_type='hdf5'):
     """Downloads data from Madrigal.
 
     Parameters
@@ -146,6 +196,9 @@ def download(date_array, tag='', inst_id='', data_path=None, user=None,
         error if user not supplied. (default=None)
     password : string
         Password for data download. (default=None)
+    file_type : string
+        File format for Madrigal data.  Currently only accepts 'netCDF4' and
+        'hdf5'. (default='hdf5')
 
     Notes
     -----
@@ -161,7 +214,88 @@ def download(date_array, tag='', inst_id='', data_path=None, user=None,
     """
     mad_meth.download(date_array, inst_code=str(madrigal_inst_code),
                       kindat=str(madrigal_tag[inst_id][tag]),
-                      data_path=data_path, user=user, password=password)
+                      data_path=data_path, user=user, password=password,
+                      file_type=file_type)
+
+
+def load(fnames, tag=None, inst_id=None, file_type='hdf5'):
+    """Routine to load the JRO ISR data
+
+    Parameters
+    ----------
+    fnames : list
+        List of filenames
+    tag : string or NoneType
+        tag name used to identify particular data set to be loaded.
+        This input is nominally provided by pysat itself. (default=None)
+    inst_id : string or NoneType
+        Instrument ID used to identify particular data set to be loaded.
+        This input is nominally provided by pysat itself. (default=None)
+    file_type : string
+        File format for Madrigal data.  Currently only accepts 'netCDF4' and
+        'hdf5'. (default='hdf5')
+
+    Returns
+    -------
+    data : xarray.Dataset
+        Object containing satellite data
+    meta : pysat.Meta
+        Object containing metadata such as column names and units
+
+    """
+    # Define the xarray coordinate dimensions (apart from time)
+    xcoords = {'drifts': {('time', 'gdalt', 'gdlatr', 'gdlonr', 'kindat',
+                           'kinst'): ['nwlos', 'range', 'vipn2', 'dvipn2',
+                                      'vipe1', 'dvipe1', 'vi72', 'dvi72',
+                                      'vi82', 'dvi82', 'paiwl', 'pacwl',
+                                      'pbiwl', 'pbcwl', 'pciel', 'pccel',
+                                      'pdiel', 'pdcel', 'jro10', 'jro11'],
+                          ('time', ): ['year', 'month', 'day', 'hour', 'min',
+                                       'sec', 'spcst', 'pl', 'cbadn',
+                                       'inttms', 'azdir7', 'eldir7',
+                                       'azdir8', 'eldir8', 'jro14', 'jro15',
+                                       'jro16', 'ut1_unix', 'ut2_unix',
+                                       'recno']},
+               'drifts_ave': {('time', 'gdalt', 'gdlatr', 'gdlonr', 'kindat',
+                               'kinst'): ['altav', 'range', 'vipn2',
+                                          'dvipn2', 'vipe1', 'dvipe1'],
+                              ('time', ): ['year', 'month', 'day', 'hour',
+                                           'min', 'sec', 'spcst', 'pl',
+                                           'cbadn', 'inttms', 'ut1_unix',
+                                           'ut2_unix', 'recno']},
+               'oblique_stan': {('time', 'gdalt', 'gdlatr', 'gdlonr',
+                                 'kindat', 'kinst'): ['rgate', 'ne', 'dne',
+                                                      'te', 'dte', 'ti',
+                                                      'dti', 'ph+', 'dph+',
+                                                      'phe+', 'dphe+'],
+                                ('time', ): ['year', 'month', 'day', 'hour',
+                                             'min', 'sec', 'azm', 'elm',
+                                             'pl', 'inttms', 'tfreq',
+                                             'ut1_unix', 'ut2_unix',
+                                             'recno']},
+               'oblique_rand': {('time', 'gdalt', 'gdlatr', 'gdlonr',
+                                 'kindat', 'kinst'): ['rgate', 'pop', 'dpop',
+                                                      'te', 'dte', 'ti',
+                                                      'dti', 'ph+', 'dph+',
+                                                      'phe+', 'dphe+'],
+                                ('time', ): ['year', 'month', 'day', 'hour',
+                                             'min', 'sec', 'azm', 'elm',
+                                             'pl', 'inttms', 'tfreq',
+                                             'ut1_unix', 'ut2_unix',
+                                             'recno']},
+               'oblique_long': {('time', 'gdalt', 'gdlatr', 'gdlonr',
+                                 'kindat', 'kinst'): ['rgate', 'pop', 'dpop',
+                                                      'te', 'dte', 'ti',
+                                                      'dti', 'ph+', 'dph+',
+                                                      'phe+', 'dphe+'],
+                                ('time', ): ['year', 'month', 'day', 'hour',
+                                             'min', 'sec', 'azm', 'elm',
+                                             'pl', 'inttms', 'tfreq',
+                                             'ut1_unix', 'ut2_unix',
+                                             'recno']}}
+
+    # Load the specified data
+    data, meta = mad_meth.load(fnames, tag, inst_id,
+                               xarray_coords=xcoords[tag],
+                               file_type=file_type)
+
+    # Squeeze the kindat, kinst, and geodetic reference 'coordinates',
+    # but keep them as floats
+    data = data.squeeze(dim=['kindat', 'kinst', 'gdlatr', 'gdlonr'])
+
+    return data, meta
 
 
 def clean(self):
@@ -169,10 +303,10 @@ def clean(self):
 
     Notes
     --------
-    Supports 'clean', 'dusty', 'dirty'
-    'Clean' is unknown for oblique modes, over 200 km for drifts
-    'Dusty' is unknown for oblique modes, over 200 km for drifts
-    'Dirty' is unknown for oblique modes, over 200 km for drifts
+    Supports 'clean'
+    'clean' is unknown for oblique modes, over 200 km for drifts
+    'dusty' is the same as clean
+    'dirty' is the same as clean
     'None' None
 
     Routine is called by pysat, and not by the end user directly.
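A usage sketch paralleling the GNSS TEC example, assuming pysat passes the extra file_type keyword through to this download routine (dates and credentials are placeholders)::

    import datetime as dt
    import pysat
    import pysatMadrigal as pymad

    jro = pysat.Instrument(inst_module=pymad.instruments.jro_isr,
                           tag='drifts')
    jro.download(dt.datetime(2010, 1, 19), dt.datetime(2010, 1, 20),
                 user='Firstname+Lastname', password='email@address.com',
                 file_type='hdf5')

    # Drift data load as an xarray Dataset with a 'gdalt' dimension
    jro.load(date=dt.datetime(2010, 1, 19))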
diff --git a/pysatMadrigal/instruments/methods/madrigal.py b/pysatMadrigal/instruments/methods/madrigal.py
index 3889889..ac34a7c 100644
--- a/pysatMadrigal/instruments/methods/madrigal.py
+++ b/pysatMadrigal/instruments/methods/madrigal.py
@@ -4,15 +4,12 @@
 
 """
 
-from __future__ import absolute_import
-from __future__ import print_function
-
 import datetime as dt
 import logging
 import numpy as np
 import os
 import pandas as pds
-import sys
+import xarray as xr
 
 import h5py
 from madrigalWeb import madrigalWeb
@@ -20,6 +17,7 @@
 import pysat
 
 logger = logging.getLogger(__name__)
+file_types = ['hdf5', 'netCDF4', 'simple']
 
 
 def cedar_rules():
@@ -36,9 +34,8 @@
     return ackn
 
 
-# support load routine
-def load(fnames, tag=None, inst_id=None, xarray_coords=[]):
-    """Loads data from Madrigal into Pandas.
+def load(fnames, tag=None, inst_id=None, xarray_coords=[], file_type='hdf5'):
+    """Loads data from Madrigal into Pandas or XArray
 
     This routine is called as needed by pysat. It is not intended
     for direct user interaction.
@@ -58,114 +55,226 @@ def load(fnames, tag=None, inst_id=None, xarray_coords=[], file_type='hdf5'):
         This input is nominally provided by pysat itself. (default='')
     xarray_coords : list
         List of keywords to use as coordinates if xarray output is desired
-        instead of a Pandas DataFrame (default=[])
+        instead of a Pandas DataFrame.  Can build an xarray Dataset that has
+        different coordinate dimensions by providing a dict inside the list
+        instead of coordinate variable name strings.  Each dict will have a
+        tuple of coordinates as the key and a list of variable strings as
+        the value.  For example,
+        xarray_coords=[{('time',): ['year', 'doy'],
+                        ('time', 'gdalt'): ['data1', 'data2']}]. (default=[])
+    file_type : string
+        File format for Madrigal data.  Currently only accepts 'netCDF4' and
+        'hdf5'. (default='hdf5')
 
     Returns
     -------
-    data : pds.DataFrame or xr.DataSet
-        A pandas DataFrame or xarray DataSet holding the data from the HDF5
+    data : pds.DataFrame or xr.Dataset
+        A pandas DataFrame or xarray Dataset holding the data from the HDF5
         file
     metadata : pysat.Meta
         Metadata from the HDF5 file, as well as default values from pysat
 
-    Examples
-    --------
-    ::
-
-        inst = pysat.Instrument('jro', 'isr', 'drifts')
-        inst.load(2010, 18)
-
     """
+    # Test the file format
+    if file_type not in ['netCDF4', 'hdf5']:
+        raise ValueError('unknown file format {:s}'.format(file_type))
 
-    # Ensure 'time' wasn't included as a coordinate, since it is the default
-    if 'time' in xarray_coords:
-        xarray_coords.pop(xarray_coords.index('time'))
-
-    # Open the specified file
-    filed = h5py.File(fnames[0], 'r')
-    # data
-    file_data = filed['Data']['Table Layout']
-    # metadata
-    file_meta = filed['Metadata']['Data Parameters']
-    # load up what is offered into pysat.Meta
+    # Initialize the output
     meta = pysat.Meta()
-    meta.info = {'acknowledgements':
-                 ' '.join(["See 'meta.Experiment_Notes' for instrument",
-                           "specific acknowledgements\n", cedar_rules()]),
-                 'references': "See 'meta.Experiment_Notes' for references"}
     labels = []
-    for item in file_meta:
-        # handle difference in string output between python 2 and 3
-        name_string = item[0]
-        unit_string = item[3]
-        desc_string = item[1]
-        if sys.version_info[0] >= 3:
-            name_string = name_string.decode('UTF-8')
-            unit_string = unit_string.decode('UTF-8')
-            desc_string = desc_string.decode('UTF-8')
-        labels.append(name_string)
-        meta[name_string.lower()] = {'long_name': name_string,
-                                     'units': unit_string,
-                                     'desc': desc_string}
-
-    # add additional metadata notes
-    # custom attributes attached to meta are attached to
-    # corresponding Instrument object when pysat receives
-    # data and meta from this routine
-    for key in filed['Metadata']:
-        if key != 'Data Parameters':
-            setattr(meta, key.replace(' ', '_'), filed['Metadata'][key][:])
-    # data into frame, with labels from metadata
-    data = pds.DataFrame.from_records(file_data, columns=labels)
-    # lowercase variable names
-    data.columns = [item.lower() for item in data.columns]
-    # datetime index from times
-    time_keys = np.array(['year', 'month', 'day', 'hour', 'min', 'sec'])
-    if not np.all([key in data.columns for key in time_keys]):
-        time_keys = [key for key in time_keys if key not in data.columns]
-        raise ValueError(' '.join(["unable to construct time index, missing",
-                                   "{:}".format(time_keys)]))
-
-    uts = 3600.0 * data.loc[:, 'hour'] + 60.0 * data.loc[:, 'min'] \
-        + data.loc[:, 'sec']
-    time = pysat.utils.time.create_datetime_index(year=data.loc[:, 'year'],
-                                                  month=data.loc[:, 'month'],
-                                                  day=data.loc[:, 'day'],
-                                                  uts=uts)
-    # Declare index or recast as xarray
-    if len(xarray_coords) > 0:
-        if not np.all([xkey.lower() in data.columns
-                       for xkey in xarray_coords]):
-            estr = 'unknown coordinate key in {:}, '.format(xarray_coords)
-            estr += 'use only {:}'.format(data.columns)
-            raise ValueError(estr)
-
-        # Append time to the data frame and add as the first coordinate
-        data = data.assign(time=pds.Series(time, index=data.index))
-        xarray_coords.insert(0, 'time')
-
-        # Set the indices
-        data = data.set_index(xarray_coords)
-
-        # Recast the data as an xarray
-        data = data.to_xarray()
-    else:
-        # Set the index to time, and put up a warning if there are duplicate
-        # times.  This could mean the data should be stored as an xarray
-        # DataSet
-        data.index = time
-
-        if np.any(time.duplicated()):
-            logger.warning(' '.join(["duplicated time indices, consider",
-                                     "specifying additional coordinates and",
-                                     "storing the data as an xarray DataSet"]))
+    fdata = []
+
+    # Load the file data
+    for fname in fnames:
+        if file_type == "netCDF4":
+            # Xarray natively opens netCDF data into a Dataset
+            file_data = xr.open_dataset(fname)
+
+            # Currently not saving file header data, as all metadata is at
+            # the data variable level
+            if len(labels) == 0:
+                for item in file_data.data_vars.keys():
+                    name_string = item
+                    unit_string = file_data[item].attrs['units']
+                    desc_string = file_data[item].attrs['description']
+                    labels.append(name_string)
+                    meta[name_string.lower()] = {'long_name': name_string,
+                                                 'units': unit_string,
+                                                 'desc': desc_string}
+                    # remove any metadata from xarray
+                    file_data[item].attrs = {}
+
+            # Reset UNIX timestamp as datetime and set it as an index
+            file_data = file_data.rename({'timestamps': 'time'})
+            time_data = pds.to_datetime(file_data['time'], unit='s')
+            data = file_data.assign_coords({'time': ('time', time_data)})
+
+        elif file_type == "hdf5":
+            # Open the specified file and get the data and metadata
+            filed = h5py.File(fname, 'r')
+            file_data = filed['Data']['Table Layout']
+            file_meta = filed['Metadata']['Data Parameters']
+
+            # Load up what is offered into pysat.Meta if this is the first
+            # file
+            if len(labels) == 0:
+                for item in file_meta:
+                    name_string = item[0].decode('UTF-8')
+                    unit_string = item[3].decode('UTF-8')
+                    desc_string = item[1].decode('UTF-8')
+                    labels.append(name_string)
+                    meta[name_string.lower()] = {'long_name': name_string,
+                                                 'units': unit_string,
+                                                 'desc': desc_string}
+
+            # Add additional metadata notes.  Custom attributes attached to
+            # meta are attached to the corresponding Instrument object when
+            # pysat receives data and meta from this routine
+            for key in filed['Metadata']:
+                if key != 'Data Parameters':
+                    setattr(meta, key.replace(' ', '_'),
+                            filed['Metadata'][key][:])
+
+            # data into frame, with labels from metadata
+            data = pds.DataFrame.from_records(file_data, columns=labels)
+
+            # lowercase variable names
+            data.columns = [item.lower() for item in data.columns]
+
+            # datetime index from times
+            time_keys = np.array(['year', 'month', 'day', 'hour', 'min',
+                                  'sec'])
+            if not np.all([key in data.columns for key in time_keys]):
+                time_keys = [key for key in time_keys
+                             if key not in data.columns]
+                raise ValueError(' '.join(["unable to construct time index,",
+                                           "missing {:}".format(time_keys)]))
+
+            uts = 3600.0 * data.loc[:, 'hour'] + 60.0 * data.loc[:, 'min'] \
+                + data.loc[:, 'sec']
+            time = pysat.utils.time.create_datetime_index(
+                year=data.loc[:, 'year'], month=data.loc[:, 'month'],
+                day=data.loc[:, 'day'], uts=uts)
+
+            # Ensure we don't try to create an xarray object with only time
+            # as the coordinate
+            coord_len = len(xarray_coords)
+            if 'time' in xarray_coords:
+                coord_len -= 1
+
+            # Declare index or recast as xarray
+            if coord_len > 0:
+                # If a list was provided, recast as a dict and grab the
+                # data columns
+                if not isinstance(xarray_coords, dict):
+                    xarray_coords = {tuple(xarray_coords):
+                                     [col for col in data.columns
+                                      if col not in xarray_coords]}
+
+                # Determine the order in which the keys should be processed:
+                # Greatest to least number of dimensions
+                len_dict = {len(xcoords): xcoords
+                            for xcoords in xarray_coords.keys()}
+                coord_order = [len_dict[xkey] for xkey in sorted(
+                    [lkey for lkey in len_dict.keys()], reverse=True)]
+
+                # Append time to the data frame
+                data = data.assign(time=pds.Series(time, index=data.index))
+
+                # Cycle through each of the coordinate dimensions
+                xdatasets = list()
+                for xcoords in coord_order:
+                    if not np.all([xkey.lower() in data.columns
+                                   for xkey in xcoords]):
+                        raise ValueError(''.join(['unknown coordinate key ',
+                                                  'in [{:}], use '.format(
+                                                      xcoords),
+                                                  'only: {:}'.format(
+                                                      data.columns)]))
+                    if not np.all([xkey.lower() in data.columns
+                                   for xkey in xarray_coords[xcoords]]):
+                        data_mask = [xkey.lower() in data.columns
+                                     for xkey in xarray_coords[xcoords]]
+                        if np.all(~np.array(data_mask)):
+                            raise ValueError(''.join([
+                                'all provided data variables [',
+                                '{:}] are unknown, use '.format(
+                                    xarray_coords[xcoords]),
+                                'only: {:}'.format(data.columns)]))
+                        else:
+                            logger.warning(''.join([
+                                'unknown data variable in [',
+                                '{:}], use only: {:}'.format(
+                                    xarray_coords[xcoords], data.columns)]))
+
+                            # Remove the coordinates that aren't present
+                            temp = np.array(
+                                xarray_coords[xcoords])[data_mask]
+                            xarray_coords[xcoords] = list(temp)
+
+                    # Select the desired data values
+                    sel_data = data[list(xcoords) + xarray_coords[xcoords]]
+
+                    # Remove duplicates before indexing, to ensure data with
+                    # the same values at different locations are kept
+                    sel_data = sel_data.drop_duplicates()
+
+                    # Set the indices
+                    sel_data = sel_data.set_index(list(xcoords))
+
+                    # Recast as an xarray
+                    xdatasets.append(sel_data.to_xarray())
+
+                # Merge all of the datasets
+                for i in np.arange(1, len(xdatasets)):
+                    xdatasets[0] = xdatasets[0].merge(xdatasets[i])
+
+                # Test to see that all data was retrieved
+                test_variables = [xkey for xkey
+                                  in xdatasets[0].variables.keys()]
+                ltest = len(test_variables)
+                ldata = len(data.columns)
+
+                if ltest != ldata:
+                    if ltest < ldata:
+                        estr = 'missing: {:}'.format(
+                            ' '.join([dvar for dvar in data.columns
+                                      if dvar not in test_variables]))
+                    else:
+                        estr = 'have extra: {:}'.format(
+                            ' '.join([tvar for tvar in test_variables
+                                      if tvar not in data.columns]))
+                    raise ValueError(''.join(['coordinates not supplied ',
+                                              'for all data columns: ',
+                                              '{:d} != '.format(ltest),
+                                              '{:d}; '.format(ldata), estr]))
+
+                data = xdatasets[0]
+            else:
+                # Set the index to time
+                data.index = time
+
+                # Raise a logging warning if there are duplicate times.
+                # This means the data should be stored as an xarray Dataset
+                if np.any(time.duplicated()):
+                    logger.warning(''.join(["duplicated time indices, ",
+                                            "consider specifying additional",
+                                            " coordinates and storing the ",
+                                            "data as an xarray Dataset"]))
+
+        fdata.append(data)
+
+    # If multiple files were loaded, merge the data together
+    for i in np.arange(1, len(fdata)):
+        fdata[0] = fdata[0].merge(fdata[i])
+    data = fdata[0]
 
     return data, meta
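Since the dimension-ordering step in the new load body is easy to misread, here is the same logic in isolation; a standalone sketch using the coordinate mapping from the docstring example (note it assumes each coordinate tuple has a unique length, which holds for the instruments in this changeset)::

    # Dict-style xarray_coords, as in the load docstring example
    xarray_coords = {('time',): ['year', 'doy'],
                     ('time', 'gdalt'): ['data1', 'data2']}

    # Process the most highly dimensioned coordinate sets first
    len_dict = {len(xcoords): xcoords for xcoords in xarray_coords.keys()}
    coord_order = [len_dict[xkey]
                   for xkey in sorted(len_dict.keys(), reverse=True)]
    # coord_order == [('time', 'gdalt'), ('time',)]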
 def download(date_array, inst_code=None, kindat=None, data_path=None,
              user=None, password=None, url="http://cedar.openmadrigal.org",
-             file_format='hdf5'):
+             file_type='hdf5'):
     """Downloads data from Madrigal.
 
     Parameters
@@ -189,10 +298,10 @@ def download(date_array, inst_code=None, kindat=None, data_path=None,
         Password for data download. (default=None)
     url : string
         URL for Madrigal site (default='http://cedar.openmadrigal.org')
-    file_format : string
-        File format for Madrigal data.  Load routines currently only accept
-        'hdf5', but any of the Madrigal options may be used here.
-        (default='hdf5')
+    file_type : string
+        File format for Madrigal data.  Load routines currently only accept
+        'hdf5' and 'netCDF4', but any of the Madrigal options may be used
+        here. (default='hdf5')
 
     Note
     ----
@@ -207,6 +316,10 @@ def download(date_array, inst_code=None, kindat=None, data_path=None,
 
     """
+    if file_type not in file_types:
+        raise ValueError("Unknown file format {:}, accepts {:}".format(
+            file_type, file_types))
+
     if inst_code is None:
         raise ValueError("Must supply Madrigal instrument code")
 
@@ -237,11 +350,19 @@ def download(date_array, inst_code=None, kindat=None, data_path=None,
                                      start=start, stop=stop)
 
     for mad_file in files:
+        # Build the local filename
         local_file = os.path.join(data_path,
                                   os.path.basename(mad_file.name))
+        if local_file.find(file_type) <= 0:
+            split_file = local_file.split(".")
+            split_file[-1] = file_type
+            local_file = ".".join(split_file)
 
         if not os.path.isfile(local_file):
             web_data.downloadFile(mad_file.name, local_file, user, password,
-                                  "pysat", format=file_format)
+                                  "pysat", format=file_type)
+
+    return
 
 
 def get_remote_filenames(inst_code=None, kindat=None, user=None,
@@ -310,10 +431,10 @@ def get_remote_filenames(inst_code=None, kindat=None, user=None,
     # TODO, implement user and password values in test code
     # specific to each instrument
     if user is None:
-        print('No user information supplied for download.')
+        logger.warning('No user information supplied for download.')
        user = 'pysat_testing'
     if password is None:
-        print('Please provide email address in password field.')
+        logger.warning('Please provide email address in password field.')
         password = 'pysat_testing@not_real_email.org'
 
     # If date_array supplied, overwrite start and stop
@@ -436,8 +557,8 @@ def list_remote_files(tag, inst_id, inst_code=None, kindat=None, user=None,
     stop : dt.datetime
         Ending time for the file list (defaults to time of run)
 
-    Notes
-    -----
+    Note
+    ----
     The user's names should be provided in field user. Ruby Payne-Scott
     should be entered as Ruby+Payne-Scott
 
@@ -492,13 +613,13 @@ def list_remote_files(tag, inst_id, inst_code=None, kindat=None, user=None,
     return pysat._files.process_parsed_filenames(stored, two_digit_year_break)
 
 
-def filter_data_single_date(self):
+def filter_data_single_date(inst):
     """Filters data to a single date.
 
     Parameters
     ----------
-    self : pysat.Instrument
-        This object
+    inst : pysat.Instrument
+        Instrument object to which this routine should be attached
 
     Note
     ----
@@ -536,9 +657,12 @@ def filter_data_single_date(inst):
 
     """
     # only do this if loading by date!
-    if self._load_by_date and self.pad is None:
+    if inst._load_by_date and inst.pad is None:
         # identify times for the loaded date
-        idx, = np.where((self.index >= self.date)
-                        & (self.index < (self.date + pds.DateOffset(days=1))))
+        idx, = np.where((inst.index >= inst.date)
+                        & (inst.index < (inst.date
+                                         + pds.DateOffset(days=1))))
+
         # downselect from all data
-        self.data = self[idx]
+        inst.data = inst[idx]
+
+    return
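The date filtering that `filter_data_single_date` performs is compact enough to preview in isolation; a standalone sketch with toy values (the two-day index is illustrative)::

    import datetime as dt
    import numpy as np
    import pandas as pds

    # A toy time index that spills past midnight, as Madrigal files can
    index = pds.date_range(dt.datetime(2010, 1, 18, 23), periods=4, freq='H')
    date = dt.datetime(2010, 1, 19)

    # Keep only the samples falling on the requested date
    idx, = np.where((index >= date)
                    & (index < date + pds.DateOffset(days=1)))
    # idx selects the last three samples (00:00, 01:00, 02:00 on Jan 19)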
diff --git a/setup.py b/setup.py
index 75a7f7f..c7820b4 100644
--- a/setup.py
+++ b/setup.py
@@ -17,11 +17,15 @@
     version = version_file.read().strip()
 
 # Define requirements
-install_requires = ['pysat', 'pandas', 'xarray', 'numpy']
+# netCDF support requires netCDF4-Python or scipy installation
+install_requires = ['pysat', 'pandas', 'xarray', 'numpy', 'netCDF4']
+
 # packages with Fortran code
 fortran_install = ['madrigalWeb', 'h5py']
+
 # flag, True if on readthedocs
 on_rtd = os.environ.get('READTHEDOCS') == 'True'
+
 # include Fortran for normal install
 # read the docs doesn't do Fortran
 if not on_rtd:
@@ -43,6 +47,7 @@
         "Intended Audience :: Science/Research",
         'License :: OSI Approved :: BSD License',
         "Natural Language :: English",
+        "Programming Language :: Python :: 3 :: Only",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",