Skip to content

Commit

Permalink
Merge pull request #14 from sfinkens/feedback-mike
Browse files Browse the repository at this point in the history
Attribute Finetuning
  • Loading branch information
sfinkens committed Jul 14, 2020
2 parents b664649 + 10b245e commit 916a3b9
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 74 deletions.
5 changes: 3 additions & 2 deletions bin/pygac-fdr-mda-update
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import argparse
import logging
from pygac_fdr.metadata import MetadataCollector, update_metadata
from pygac_fdr.metadata import MetadataCollector, MetadataUpdater
from pygac_fdr.utils import logging_on


Expand All @@ -34,4 +34,5 @@ if __name__ == '__main__':

collector = MetadataCollector()
mda = collector.read_sql(args.dbfile)
update_metadata(mda)
updater = MetadataUpdater()
updater.update(mda)
5 changes: 4 additions & 1 deletion bin/pygac-fdr-run
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import argparse
import logging
import satpy
from pygac_fdr.reader import read_gac
from pygac_fdr.writer import NetcdfWriter
from pygac_fdr.config import read_config
Expand Down Expand Up @@ -56,11 +57,13 @@ if __name__ == '__main__':
config['controls']['debug'] = args.debug

# Process files
satpy.CHUNK_SIZE = config['controls'].get('pytroll_chunk_size', 1024)
for filename in args.filenames:
LOG.info('Processing file {}'.format(filename))
try:
scene = read_gac(filename, reader_kwargs=config['controls'].get('reader_kwargs'))
writer = NetcdfWriter(global_attrs=config.get('metadata'),
writer = NetcdfWriter(global_attrs=config.get('global_attrs'),
gac_header_attrs=config.get('gac_header_attrs'),
fname_fmt=config['output'].get('fname_fmt'),
encoding=config['netcdf'].get('encoding'),
engine=config['netcdf'].get('engine'),
Expand Down
14 changes: 11 additions & 3 deletions etc/pygac-fdr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,20 @@ controls:
tle_name: TLE_%(satname)s.txt
tle_thresh: 1000
calibration_file: /data/avhrr_gac/calibration.json
pytroll_chunk_size: 1024
output:
output_dir: /data/avhrr_gac/output/
fname_fmt: AVHRR-GAC_FDR_{processing_level}_{platform}_{start_time}_{end_time}_{processing_mode}_{disposition_mode}_{creation_time}_{version_int:04d}.nc
metadata:
global_attrs:
id: DOI:10.5676/EUM/AVHRR_GAC_L1C_FDR/V0100
title: AVHRR GAC L1C FDR
product_version: 1.0.0
institution: EUMETSAT, ops@eumetsat.int
institution: EUMETSAT
creator_name: EUMETSAT
creator_url: https://www.eumetsat.int/
creator_email: ops@eumetsat.int
naming_authority: int.eumetsat
comment: Developed in cooperation with EUM CM SAF and the PyTroll community.
comment: Developed in cooperation with EUMETSAT CM SAF and the PyTroll community.
summary: >-
Fundamental Data Record (FDR) of measurements from the Advanced Very High Resolution Radiometer (AVHRR) at
full Global Area Coverage (GAC) resolution. AVHRR GAC measurements have been calibrated to Level 1C using the
Expand Down Expand Up @@ -44,6 +48,10 @@ metadata:
standard_name_vocabulary: CF Standard Name Table v73
licence: EUMETSAT data policy https://www.eumetsat.int/website/home/AboutUs/WhoWeAre/LegalFramework/DataPolicy/index.html
history:
gac_header_attrs:
title: Raw GAC Header
references: >-
NOAA POD & KLM user guides, https://www1.ncdc.noaa.gov/pub/data/satellite/publications/podguides/
netcdf:
engine: netcdf4
encoding:
Expand Down
153 changes: 103 additions & 50 deletions pygac_fdr/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,16 @@ class QualityFlags(IntEnum):
'dtype': np.int16,
'fill_value': FILL_VALUE_INT},
{'name': 'equator_crossing_longitude',
'long_name': 'Longitude where ascending node crosses the equator',
'long_name': 'Longitude where ascending node crosses the equator (can happen twice per file)',
'units': 'degrees_east',
'dtype': np.float64,
'fill_value': FILL_VALUE_FLOAT},
{'name': 'equator_crossing_time',
'long_name': 'UTC time when ascending node crosses the equator',
'long_name': 'UTC time when ascending node crosses the equator (can happen twice per file)',
'units': 'seconds since 1970-01-01 00:00:00',
'calendar': 'standard',
'dtype': np.float64,
'fill_value': FILL_VALUE_INT},
'fill_value': FILL_VALUE_FLOAT},
{'name': 'global_quality_flag',
'long_name': 'Global quality flag',
'comment': 'If this flag is everything else than "ok", it is recommended not '
Expand Down Expand Up @@ -127,8 +127,10 @@ def read_sql(self, dbfile):
with sqlite3.connect(dbfile) as con:
mda = pd.read_sql('select * from metadata', con)
mda = mda.set_index(['platform', 'level_1'])
for time_col in ['start_time', 'end_time', 'equator_crossing_time']:
mda[time_col] = mda[time_col].astype('datetime64[ns]')
mda.fillna(value=np.nan, inplace=True)
for col in mda.columns:
if 'time' in col:
mda[col] = mda[col].astype('datetime64[ns]')
return mda

def _collect_metadata(self, filenames):
Expand All @@ -138,16 +140,18 @@ def _collect_metadata(self, filenames):
LOG.debug('Collecting metadata from {}'.format(filename))
with xr.open_dataset(filename) as ds:
midnight_line = np.float64(self._get_midnight_line(ds['acq_time']))
eq_cross_lon, eq_cross_time = self._get_equator_crossing(ds)
eq_cross_lons, eq_cross_times = self._get_equator_crossings(ds)
rec = {'platform': ds.attrs['platform'].split('>')[-1].strip(),
'start_time': ds['acq_time'].values[0],
'end_time': ds['acq_time'].values[-1],
'along_track': ds.dims['y'],
'filename': filename,
'orbit_number_start': ds.attrs['orbit_number_start'],
'orbit_number_end': ds.attrs['orbit_number_end'],
'equator_crossing_longitude': eq_cross_lon,
'equator_crossing_time': eq_cross_time,
'equator_crossing_longitude_1': eq_cross_lons[0],
'equator_crossing_time_1': eq_cross_times[0],
'equator_crossing_longitude_2': eq_cross_lons[1],
'equator_crossing_time_2': eq_cross_times[1],
'midnight_line': midnight_line,
'overlap_free_start': np.nan,
'overlap_free_end': np.nan,
Expand All @@ -172,11 +176,11 @@ def _get_midnight_line(self, acq_time):
return incr[0]
return np.nan

def _get_equator_crossing(self, ds):
"""Determine where the ascending node crosses the equator.
def _get_equator_crossings(self, ds):
"""Determine where the ascending node(s) cross the equator.
Returns:
Longitude and UTC time
Longitudes and UTC times of first and second equator crossing, if any, NaN else.
"""
# Use coordinates in the middle of the swath
mid_swath = ds['latitude'].shape[1] // 2
Expand All @@ -185,9 +189,13 @@ def _get_equator_crossing(self, ds):
sign_change = np.sign(lat_shift) != np.sign(lat)
ascending = lat_shift > lat
lat_eq = lat.where(sign_change & ascending, drop=True)
if len(lat_eq) > 0:
return lat_eq['longitude'].values[0], lat_eq['acq_time'].values[0]
return np.nan, np.datetime64('NaT')

num_cross = min(2, len(lat_eq)) # Two crossings max
eq_cross_lons = np.full(2, np.nan)
eq_cross_times = np.full(2, np.datetime64('NaT'), dtype=lat_eq['acq_time'].dtype)
eq_cross_lons[0:num_cross] = lat_eq['longitude'].values[0:num_cross]
eq_cross_times[0:num_cross] = lat_eq['acq_time'].values[0:num_cross]
return eq_cross_lons, eq_cross_times

def _set_redundant_flag(self, df, window=20):
"""Flag redundant files in the given data frame.
Expand Down Expand Up @@ -346,39 +354,84 @@ def _calc_overlap(self, df, open_end=False):
return df


def update_metadata(mda):
"""Add additional metadata to level 1c files."""
# Since xarray cannot modify files in-place, use netCDF4 directly. See
# https://github.com/pydata/xarray/issues/2029.
for _, row in mda.iterrows():
LOG.debug('Updating metadata in {}'.format(row['filename']))
with netCDF4.Dataset(filename=row['filename'], mode='r+') as nc:
nc_acq_time = nc.variables['acq_time']
for mda in ADDITIONAL_METADATA:
mda = mda.copy()
var_name = mda.pop('name')
fill_value = mda.pop('fill_value')

# Create nc variable
try:
nc_var = nc.createVariable(var_name, datatype=mda.pop('dtype'),
fill_value=fill_value)
except RuntimeError:
# Variable already there
nc_var = nc.variables[var_name]

# Write data to nc variable. Since netCDF4 cannot handle NaN nor NaT, disable
# auto-masking, and set null-data to fill value manually. Furthermore, match
# timestamp encoding with the acq_time variable.
data = row[var_name]
nc_var.set_auto_mask(False)
if pd.isnull(data):
data = fill_value
elif isinstance(data, pd.Timestamp):
data = encode_cf_datetime(data, units=nc_acq_time.units,
calendar=nc_acq_time.calendar)[0]
nc_var[:] = data

# Set attributes of nc variable
for key, val in mda.items():
nc_var.setncattr(key, val)
class MetadataUpdater:
def update(self, mda):
"""Add additional metadata to level 1c files.
Since xarray cannot modify files in-place, use netCDF4 directly. See
https://github.com/pydata/xarray/issues/2029.
"""
mda = self._to_xarray(mda)
mda = self._stack(mda)
for irow in range(mda.dims['row']):
row = mda.isel(row=irow)
LOG.debug('Updating metadata in {}'.format(row['filename'].item()))
with netCDF4.Dataset(filename=row['filename'].item(), mode='r+') as nc:
self._update_file(nc=nc, row=row)

def _to_xarray(self, mda):
"""Convert pandas DataFrame to xarray Dataset."""
mda = xr.Dataset(mda)
mda = mda.rename({'dim_0': 'row'})
return mda

def _stack(self, mda):
"""Stack certain columns to simplify the netCDF file."""
# Stack equator crossing longitudes/times along a new dimension
mda['equator_crossing_time'] = (('row', 'num_eq_cross'), np.stack(
[mda['equator_crossing_time_1'].values,
mda['equator_crossing_time_2'].values]).transpose())
mda['equator_crossing_longitude'] = (('row', 'num_eq_cross'), np.stack(
[mda['equator_crossing_longitude_1'].values,
mda['equator_crossing_longitude_2'].values]).transpose())
return mda

def _create_nc_var(self, nc, var_name, fill_value, dtype, shape, dims):
"""Create netCDF variable and dimension."""
# Create dimension if needed (only 1D at the moment)
if dims:
dim_name, size = dims[0], shape[0]
if dim_name not in nc.dimensions:
nc.createDimension(dim_name, size=size)

# Create nc variable
if var_name in nc.variables:
nc_var = nc.variables[var_name]
else:
nc_var = nc.createVariable(var_name,
datatype=dtype,
fill_value=fill_value,
dimensions=dims)
return nc_var

def _update_file(self, nc, row):
"""Update metadata of a single file."""
for add_mda in ADDITIONAL_METADATA:
add_mda = add_mda.copy()
var_name = add_mda.pop('name')
fill_value = add_mda.pop('fill_value')
data = row[var_name]

# Create nc variable
nc_var = self._create_nc_var(nc=nc,
var_name=var_name,
fill_value=fill_value,
dtype=add_mda.pop('dtype'),
dims=data.dims,
shape=data.shape)

# Write data to nc variable. Since netCDF4 cannot handle NaN nor NaT, disable
# auto-masking, and set null-data to fill value manually.
nc_var.set_auto_mask(False)
if np.issubdtype(data.dtype, np.datetime64):
data, _, _ = encode_cf_datetime(data,
units=add_mda['units'],
calendar=add_mda['calendar'])
data = np.nan_to_num(data, nan=fill_value)
else:
data = data.fillna(fill_value).values
nc_var[:] = data

# Set attributes of nc variable
for key, val in add_mda.items():
nc_var.setncattr(key, val)
54 changes: 40 additions & 14 deletions pygac_fdr/tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,24 @@ def test_get_midnight_line(self):
np.datetime64('2010-01-01 00:02:00')])
np.testing.assert_equal(collector._get_midnight_line(acq_time), np.nan)

def test_get_equator_crossing(self):
def test_get_equator_crossings(self):
collector = MetadataCollector()

# No equator crossing
ds = xr.Dataset(
coords={'latitude': (('y', 'x'), [[999, 5.0, 999],
[999, 0.1, 999],
[999, -5.0, 999]]),
'longitude': (('y', 'x'), [[999, 1.0, 999],
[999, 2.0, 999],
[999, 3.0, 999]]),
'acq_time': ('y', np.arange(3).astype('datetime64[s]'))}
)
lons, times = collector._get_equator_crossings(ds)
np.testing.assert_equal(lons, [np.nan, np.nan])
self.assertTrue(np.all(np.isnat(times)))

# One equator crossing
ds = xr.Dataset(
coords={'latitude': (('y', 'x'), [[999, 5.0, 999],
[999, 0.1, 999],
Expand All @@ -291,21 +308,30 @@ def test_get_equator_crossing(self):
'acq_time': ('y', np.arange(5).astype('datetime64[s]'))}
)
ds['acq_time'].attrs['coords'] = 'latitude longitude'
collector = MetadataCollector()
lon, time = collector._get_equator_crossing(ds)
self.assertEqual(lon, 3)
self.assertEqual(time, np.datetime64('1970-01-01 00:00:02'))
lons, times = collector._get_equator_crossings(ds)
np.testing.assert_equal(lons, [3, np.nan])
np.testing.assert_equal(times, [np.datetime64('1970-01-01 00:00:02'),
np.datetime64('NaT')])

# No equator crossing
# More than two equator crossings
ds = xr.Dataset(
coords={'latitude': (('y', 'x'), [[999, 5.0, 999],
[999, 0.1, 999],
[999, -5.0, 999]]),
coords={'latitude': (('y', 'x'), [[999, -1, 999],
[999, 1, 999],
[999, -1, 999],
[999, 1, 999],
[999, -1, 999],
[999, 1, 999]]),
'longitude': (('y', 'x'), [[999, 1.0, 999],
[999, 2.0, 999],
[999, 3.0, 999]]),
'acq_time': ('y', np.arange(3).astype('datetime64[s]'))}
[999, 3.0, 999],
[999, 4.0, 999],
[999, 5.0, 999],
[999, 6.0, 999]]),
'acq_time': ('y', np.arange(6).astype('datetime64[s]'))}
)
lon, time = collector._get_equator_crossing(ds)
np.testing.assert_equal(lon, np.nan)
self.assertTrue(np.isnat(time))
ds['acq_time'].attrs['coords'] = 'latitude longitude'
collector = MetadataCollector()
lons, times = collector._get_equator_crossings(ds)
np.testing.assert_equal(lons, [1, 3])
np.testing.assert_equal(times, [np.datetime64('1970-01-01 00:00:00'),
np.datetime64('1970-01-01 00:00:02')])

0 comments on commit 916a3b9

Please sign in to comment.