Skip to content

Commit

Permalink
BUG: correctly pad netCDF files with null bytes
Browse files Browse the repository at this point in the history
Per the netCDF spec, variable names and attributes should be padded with null
padding to the nearest 4-byte boundary, but scipy has been padding with b'0'
instead:
http://www.unidata.ucar.edu/software/netcdf/docs/file_format_specifications.html

This surfaced recently after the netCDF-C library added more stringent checks
in version 4.5.0 which rendered it unable to read files written with SciPy:
Unidata/netcdf-c#657

This change to netCDF-C is likely going to be rolled back, but it's still a
good idea for us to write compliant files. Apparently there are other netCDF
implementations that also insist on padding with nulls.
  • Loading branch information
shoyer committed Nov 18, 2017
1 parent 73f723a commit 7f53fc5
Showing 1 changed file with 12 additions and 8 deletions.
20 changes: 12 additions & 8 deletions scipy/io/netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ def _write_att_array(self, attributes):
self._pack_int(len(attributes))
for name, values in attributes.items():
self._pack_string(name)
self._write_values(values)
self._write_att_values(values)
else:
self.fp.write(ABSENT)

Expand Down Expand Up @@ -496,6 +496,10 @@ def _write_var_metadata(self, name):
def _write_var_data(self, name):
var = self.variables[name]

# TODO: for full compliance with the netCDF spec, use _FillValue for
# padding instead of instead of always using using the null byte.
encoded_fill_value = b'\x00'

# Set begin in file header.
the_beguine = self.fp.tell()
self.fp.seek(var._begin)
Expand All @@ -506,7 +510,7 @@ def _write_var_data(self, name):
if not var.isrec:
self.fp.write(var.data.tostring())
count = var.data.size * var.data.itemsize
self.fp.write(b'0' * (var._vsize - count))
self.fp.write(encoded_fill_value * (var._vsize - count))
else: # record variable
# Handle rec vars with shape[0] < nrecs.
if self._recs > len(var.data):
Expand All @@ -529,12 +533,12 @@ def _write_var_data(self, name):
self.fp.write(rec.tostring())
# Padding
count = rec.size * rec.itemsize
self.fp.write(b'0' * (var._vsize - count))
self.fp.write(encoded_fill_value * (var._vsize - count))
pos += self._recsize
self.fp.seek(pos)
self.fp.seek(pos0 + var._vsize)

def _write_values(self, values):
def _write_att_values(self, values):
if hasattr(values, 'dtype'):
nc_type = REVERSE[values.dtype.char, values.dtype.itemsize]
else:
Expand Down Expand Up @@ -576,7 +580,7 @@ def _write_values(self, values):
values = values.byteswap()
self.fp.write(values.tostring())
count = values.size * values.itemsize
self.fp.write(b'0' * (-count % 4)) # pad
self.fp.write(b'\x00' * (-count % 4)) # pad

def _read(self):
# Check magic bytes and version
Expand Down Expand Up @@ -620,7 +624,7 @@ def _read_att_array(self):
attributes = OrderedDict()
for attr in range(count):
name = asstr(self._unpack_string())
attributes[name] = self._read_values()
attributes[name] = self._read_att_values()
return attributes

def _read_var_array(self):
Expand Down Expand Up @@ -732,7 +736,7 @@ def _read_var(self):

return name, dimensions, shape, attributes, typecode, size, dtype_, begin, vsize

def _read_values(self):
def _read_att_values(self):
nc_type = self.fp.read(4)
n = self._unpack_int()

Expand Down Expand Up @@ -774,7 +778,7 @@ def _pack_string(self, s):
count = len(s)
self._pack_int(count)
self.fp.write(asbytes(s))
self.fp.write(b'0' * (-count % 4)) # pad
self.fp.write(b'\x00' * (-count % 4)) # pad

def _unpack_string(self):
count = self._unpack_int()
Expand Down

0 comments on commit 7f53fc5

Please sign in to comment.