Permalink
Browse files

Merge pull request #8170 from shoyer/netcdf-nullpadding

BUG: correctly pad netCDF files with null bytes
  • Loading branch information...
2 parents 00c116a + c8dc558 commit 306a2f0722e0404af6138ebd15c07789afd19893 @rgommers rgommers committed Dec 11, 2017
Showing with 137 additions and 36 deletions.
  1. +65 −24 scipy/io/netcdf.py
  2. +72 −12 scipy/io/tests/test_netcdf.py
View
@@ -62,27 +62,39 @@
NC_DIMENSION = b'\x00\x00\x00\n'
NC_VARIABLE = b'\x00\x00\x00\x0b'
NC_ATTRIBUTE = b'\x00\x00\x00\x0c'
-
+FILL_BYTE = b'\x81'
+FILL_CHAR = b'\x00'
+FILL_SHORT = b'\x80\x01'
+FILL_INT = b'\x80\x00\x00\x01'
+FILL_FLOAT = b'\x7C\xF0\x00\x00'
+FILL_DOUBLE = b'\x47\x9E\x00\x00\x00\x00\x00\x00'
TYPEMAP = {NC_BYTE: ('b', 1),
- NC_CHAR: ('c', 1),
- NC_SHORT: ('h', 2),
- NC_INT: ('i', 4),
- NC_FLOAT: ('f', 4),
- NC_DOUBLE: ('d', 8)}
+ NC_CHAR: ('c', 1),
+ NC_SHORT: ('h', 2),
+ NC_INT: ('i', 4),
+ NC_FLOAT: ('f', 4),
+ NC_DOUBLE: ('d', 8)}
+
+FILLMAP = {NC_BYTE: FILL_BYTE,
+ NC_CHAR: FILL_CHAR,
+ NC_SHORT: FILL_SHORT,
+ NC_INT: FILL_INT,
+ NC_FLOAT: FILL_FLOAT,
+ NC_DOUBLE: FILL_DOUBLE}
REVERSE = {('b', 1): NC_BYTE,
- ('B', 1): NC_CHAR,
- ('c', 1): NC_CHAR,
- ('h', 2): NC_SHORT,
- ('i', 4): NC_INT,
- ('f', 4): NC_FLOAT,
- ('d', 8): NC_DOUBLE,
+ ('B', 1): NC_CHAR,
+ ('c', 1): NC_CHAR,
+ ('h', 2): NC_SHORT,
+ ('i', 4): NC_INT,
+ ('f', 4): NC_FLOAT,
+ ('d', 8): NC_DOUBLE,
- # these come from asarray(1).dtype.char and asarray('foo').dtype.char,
- # used when getting the types from generic attributes.
- ('l', 4): NC_INT,
- ('S', 1): NC_CHAR}
+ # these come from asarray(1).dtype.char and asarray('foo').dtype.char,
+ # used when getting the types from generic attributes.
+ ('l', 4): NC_INT,
+ ('S', 1): NC_CHAR}
class netcdf_file(object):
@@ -429,7 +441,7 @@ def _write_att_array(self, attributes):
self._pack_int(len(attributes))
for name, values in attributes.items():
self._pack_string(name)
- self._write_values(values)
+ self._write_att_values(values)
else:
self.fp.write(ABSENT)
@@ -506,7 +518,7 @@ def _write_var_data(self, name):
if not var.isrec:
self.fp.write(var.data.tostring())
count = var.data.size * var.data.itemsize
- self.fp.write(b'0' * (var._vsize - count))
+ self._write_var_padding(var, var._vsize - count)
else: # record variable
# Handle rec vars with shape[0] < nrecs.
if self._recs > len(var.data):
@@ -529,12 +541,17 @@ def _write_var_data(self, name):
self.fp.write(rec.tostring())
# Padding
count = rec.size * rec.itemsize
- self.fp.write(b'0' * (var._vsize - count))
+ self._write_var_padding(var, var._vsize - count)
pos += self._recsize
self.fp.seek(pos)
self.fp.seek(pos0 + var._vsize)
- def _write_values(self, values):
+ def _write_var_padding(self, var, size):
+ encoded_fill_value = var._get_encoded_fill_value()
+ num_fills = size // len(encoded_fill_value)
+ self.fp.write(encoded_fill_value * num_fills)
+
+ def _write_att_values(self, values):
if hasattr(values, 'dtype'):
nc_type = REVERSE[values.dtype.char, values.dtype.itemsize]
else:
@@ -576,7 +593,7 @@ def _write_values(self, values):
values = values.byteswap()
self.fp.write(values.tostring())
count = values.size * values.itemsize
- self.fp.write(b'0' * (-count % 4)) # pad
+ self.fp.write(b'\x00' * (-count % 4)) # pad
def _read(self):
# Check magic bytes and version
@@ -620,7 +637,7 @@ def _read_att_array(self):
attributes = OrderedDict()
for attr in range(count):
name = asstr(self._unpack_string())
- attributes[name] = self._read_values()
+ attributes[name] = self._read_att_values()
return attributes
def _read_var_array(self):
@@ -732,7 +749,7 @@ def _read_var(self):
return name, dimensions, shape, attributes, typecode, size, dtype_, begin, vsize
- def _read_values(self):
+ def _read_att_values(self):
nc_type = self.fp.read(4)
n = self._unpack_int()
@@ -774,7 +791,7 @@ def _pack_string(self, s):
count = len(s)
self._pack_int(count)
self.fp.write(asbytes(s))
- self.fp.write(b'0' * (-count % 4)) # pad
+ self.fp.write(b'\x00' * (-count % 4)) # pad
def _unpack_string(self):
count = self._unpack_int()
@@ -995,6 +1012,30 @@ def __setitem__(self, index, data):
self.__dict__['data'] = np.resize(self.data, shape).astype(self.data.dtype)
self.data[index] = data
+ def _default_encoded_fill_value(self):
+ """
+ The default encoded fill-value for this Variable's data type.
+ """
+ nc_type = REVERSE[self.typecode(), self.itemsize()]
+ return FILLMAP[nc_type]
+
+ def _get_encoded_fill_value(self):
+ """
+ Returns the encoded fill value for this variable as bytes.
+
+ Taken from the _FillValue attribute if its encoded length equals the
+ item size; otherwise the default fill value for the data type is used.
+ """
+ if '_FillValue' in self._attributes:
+ fill_value = np.array(self._attributes['_FillValue'],
+ dtype=self.data.dtype).tostring()
+ if len(fill_value) == self.itemsize():
+ return fill_value
+ else:
+ return self._default_encoded_fill_value()
+ else:
+ return self._default_encoded_fill_value()
+
def _get_missing_value(self):
"""
Returns the value denoting "no data" for this variable.
@@ -166,6 +166,66 @@ def test_read_write_sio():
assert_equal(f_64.version_byte, 2)
+def test_bytes():
+ raw_file = BytesIO()
+ f = netcdf_file(raw_file, mode='w')
+ # Dataset only has a single variable, dimension and attribute to avoid
+ # any ambiguity related to order.
+ f.a = 'b'
+ f.createDimension('dim', 1)
+ var = f.createVariable('var', np.int16, ('dim',))
+ var[0] = -9999
+ var.c = 'd'
+ f.sync()
+
+ actual = raw_file.getvalue()
+
+ expected = (b'CDF\x01'
+ b'\x00\x00\x00\x00'
+ b'\x00\x00\x00\x0a'
+ b'\x00\x00\x00\x01'
+ b'\x00\x00\x00\x03'
+ b'dim\x00'
+ b'\x00\x00\x00\x01'
+ b'\x00\x00\x00\x0c'
+ b'\x00\x00\x00\x01'
+ b'\x00\x00\x00\x01'
+ b'a\x00\x00\x00'
+ b'\x00\x00\x00\x02'
+ b'\x00\x00\x00\x01'
+ b'b\x00\x00\x00'
+ b'\x00\x00\x00\x0b'
+ b'\x00\x00\x00\x01'
+ b'\x00\x00\x00\x03'
+ b'var\x00'
+ b'\x00\x00\x00\x01'
+ b'\x00\x00\x00\x00'
+ b'\x00\x00\x00\x0c'
+ b'\x00\x00\x00\x01'
+ b'\x00\x00\x00\x01'
+ b'c\x00\x00\x00'
+ b'\x00\x00\x00\x02'
+ b'\x00\x00\x00\x01'
+ b'd\x00\x00\x00'
+ b'\x00\x00\x00\x03'
+ b'\x00\x00\x00\x04'
+ b'\x00\x00\x00\x78'
+ b'\xd8\xf1\x80\x01')
+
+ assert_equal(actual, expected)
+
+
+def test_encoded_fill_value():
+ with netcdf_file(BytesIO(), mode='w') as f:
+ f.createDimension('x', 1)
+ var = f.createVariable('var', 'S1', ('x',))
+ assert_equal(var._get_encoded_fill_value(), b'\x00')
+ var._FillValue = b'\x01'
+ assert_equal(var._get_encoded_fill_value(), b'\x01')
+ var._FillValue = b'\x00\x00' # invalid, wrong size
+ assert_equal(var._get_encoded_fill_value(), b'\x00')
+
+
def test_read_example_data():
# read any example data files
for fname in glob(pjoin(TEST_DATA_PATH, '*.nc')):
@@ -319,9 +379,9 @@ def test_open_append():
f.close()
-def test_append_recordDimension():
- dataSize = 100
-
+def test_append_recordDimension():
+ dataSize = 100
+
with in_tempdir():
# Create file with record time dimension
with netcdf_file('withRecordDimension.nc', 'w') as f:
@@ -333,28 +393,28 @@ def test_append_recordDimension():
f.createDimension('y', dataSize)
y = f.createVariable('y', 'd', ('y',))
y[:] = np.array(range(dataSize))
- f.createVariable('testData', 'i', ('time', 'x', 'y'))
+ f.createVariable('testData', 'i', ('time', 'x', 'y'))
f.flush()
- f.close()
-
- for i in range(2):
- # Open the file in append mode and add data
+ f.close()
+
+ for i in range(2):
+ # Open the file in append mode and add data
with netcdf_file('withRecordDimension.nc', 'a') as f:
f.variables['time'].data = np.append(f.variables["time"].data, i)
f.variables['testData'][i, :, :] = np.ones((dataSize, dataSize))*i
f.flush()
-
+
# Read the file and check that append worked
- with netcdf_file('withRecordDimension.nc') as f:
+ with netcdf_file('withRecordDimension.nc') as f:
assert_equal(f.variables['time'][-1], i)
assert_equal(f.variables['testData'][-1, :, :].copy(), np.ones((dataSize, dataSize))*i)
assert_equal(f.variables['time'].data.shape[0], i+1)
assert_equal(f.variables['testData'].data.shape[0], i+1)
-
+
# Read the file and check that 'data' was not saved as a user-defined
# attribute of the testData variable during the append operation
with netcdf_file('withRecordDimension.nc') as f:
- with assert_raises(KeyError) as ar:
+ with assert_raises(KeyError) as ar:
f.variables['testData']._attributes['data']
ex = ar.value
assert_equal(ex.args[0], 'data')

0 comments on commit 306a2f0

Please sign in to comment.