# Reading and writing NumPy's file format without np.load and np.save

In the numpy manual there's a nice description of the [.npy fileformat](https://numpy.org/devdocs/reference/generated/numpy.lib.format.html#npy-format), with a note under [capabilities](https://numpy.org/devdocs/reference/generated/numpy.lib.format.html#capabilities) that says:

> Is straightforward to reverse engineer.<br>Datasets often live longer than the programs that created them.<br>A competent developer should be able to create a solution in their preferred programming language to read most .npy files that they have been given without much documentation.

So let's look at a numpy file:

In [1]:
import numpy as np

In [2]:
fn = "my.npy"
# Ten numeric strings plus two words and one non-ASCII character, so the
# round trip exercises more than plain ASCII text.
arr = np.array([str(i) for i in range(100, 200, 11)] + ["hello", "world"] + ["æ"])
print(arr)

np.save(fn, arr)
# Load through the same `fn` that was just saved instead of repeating the
# literal, so the two paths cannot drift apart.
data = np.load(fn)
assert np.all(data == arr)

['100' '111' '122' '133' '144' '155' '166' '177' '188' '199' 'hello'
 'world' 'æ']


No surprises here. With the print statement, we will know if we get the array right at the other end. 

In [3]:
# Dump the raw bytes of the file so the on-disk layout can be inspected.
with open(fn, 'rb') as f:
    print(f.read())

b"\x93NUMPY\x01\x00v\x00{'descr': '<U5', 'fortran_order': False, 'shape': (13,), }                                                           \n1\x00\x00\x000\x00\x00\x000\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x001\x00\x00\x001\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x002\x00\x00\x002\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x003\x00\x00\x003\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x004\x00\x00\x004\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x005\x00\x00\x005\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x006\x00\x00\x006\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x007\x00\x00\x007\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x008\x00\x00\x008\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x009\x00\x00\x009\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00h\x00\x00\x00e\x00\x00\x00l\x00\x00\x00l\x00\x00\x00o\x00\x00\x00w\x00\x00\x00o\x00\x00\x00r\x00\x00\x00l\x00\x00\x00d\x00\x00\x00\xe6\x00\x00\x00\

The header of the binary is also nicely human-readable.

The [documentation](https://numpy.org/devdocs/reference/generated/numpy.lib.format.html#format-version-1-0) tells us that:

> **Format Version 1.0**<br>
> The first 6 bytes are a magic string: exactly \x93NUMPY.<br>
> The next 1 byte is an unsigned byte: the major version number of the file format, e.g. \x01.<br>
> The next 1 byte is an unsigned byte: the minor version number of the file format, e.g. \x00. Note: the version of the file format is not tied to the version of the numpy package.<br>
> The next 2 bytes form a little-endian unsigned short int: the length of the header data HEADER_LEN.<br>
> The next HEADER_LEN bytes form the header data describing the array’s format. It is an ASCII string which contains a Python literal expression of a dictionary. It is terminated by a newline (\n) and padded with spaces (\x20) to make the total of len(magic string) + 2 + len(length) + HEADER_LEN be evenly divisible by 64 for alignment purposes.<br>

Let's write a function for that:

In [4]:
import ast

def read(fn):
    """Read a one-dimensional array from a version 1.x ``.npy`` file.

    The fixed 10-byte prefix (magic string, version, header length) is
    validated before the header is interpreted, so a file that is not
    ``.npy`` is rejected with a clear error rather than failing somewhere
    inside header parsing.

    Args:
        fn: Path of the ``.npy`` file to read.

    Returns:
        A new, writable ``numpy.ndarray`` with the dtype and shape announced
        in the file header.

    Raises:
        ValueError: If the file does not start with the ``.npy`` magic
            string, is not format version 1.x, has a header that is
            malformed or not padded to a multiple of 64 bytes, does not
            describe a one-dimensional array, or carries a payload whose
            item count differs from the shape in the header.
    """
    with open(fn, "rb") as f:
        prefix = f.read(10)
        if len(prefix) < 10 or prefix[:6] != b"\x93NUMPY":
            raise ValueError(f"{fn!r} is not a .npy file (bad magic string)")

        # Indexing bytes yields ints directly, so no ord() is needed.
        major, minor = prefix[6], prefix[7]
        if major != 1:
            raise ValueError(
                f"unsupported .npy format version {major}.{minor}; "
                "only version 1.x is handled"
            )

        header_len = int.from_bytes(prefix[8:10], "little")
        if (len(prefix) + header_len) % 64 != 0:
            raise ValueError("header is not padded to a multiple of 64 bytes")

        header_bytes = f.read(header_len)
        if len(header_bytes) != header_len:
            raise ValueError("file ends inside the header")
        payload = f.read()

    try:
        header = ast.literal_eval(header_bytes.decode("ascii"))
    except (SyntaxError, ValueError) as err:
        raise ValueError("header is not an ASCII Python literal") from err
    if not isinstance(header, dict):
        raise ValueError("header is not a dictionary")

    try:
        descr = header["descr"]
        fortran_order = header["fortran_order"]
        shape = header["shape"]
    except KeyError as err:
        raise ValueError(f"header lacks the {err.args[0]!r} key") from err
    if not isinstance(fortran_order, bool):
        raise ValueError("'fortran_order' must be a bool")
    if not isinstance(shape, tuple) or len(shape) != 1:
        raise ValueError("only one-dimensional arrays are supported")

    dtype = np.dtype(descr)
    values = np.frombuffer(payload, dtype=dtype)
    # Compare shapes explicitly: assigning into a pre-sized array would
    # silently broadcast a payload that holds exactly one item.
    if values.shape != shape:
        raise ValueError(
            f"payload holds {values.shape[0]} items but the header announces "
            f"shape {shape}"
        )
    # frombuffer over bytes gives a read-only view; copy so the caller gets
    # a writable array that owns its data.
    return values.copy()

In [5]:
# Parse the file with the hand-written reader instead of np.load.
new_array = read(fn)
new_array

array(['100', '111', '122', '133', '144', '155', '166', '177', '188',
       '199', 'hello', 'world', 'æ'], dtype='<U5')

In [6]:
# The hand-parsed array must equal the original array element by element.
assert np.all(arr == new_array)

The next question is then whether I can write the file format?

First I'll cheat a little using the numpy array, although I'll not use it later...

In [7]:
def write(fn, arr):
    """Write a numpy array to ``fn`` as a version 1.0 ``.npy`` file.

    The file consists of the magic string, the version bytes, a two-byte
    little-endian header length, the header dictionary padded with spaces
    and terminated by a newline so that the data starts on a 64-byte
    boundary, and finally the raw array bytes.

    Args:
        fn: Path of the file to create or overwrite.
        arr: The ``numpy.ndarray`` to store.

    Raises:
        ValueError: If ``arr`` has an object dtype. Its raw bytes are memory
            addresses, not data, so dumping them would produce a file that
            cannot be read back. The check runs before the target file is
            opened, so an existing file is left untouched.
    """
    if arr.dtype.hasobject:
        raise ValueError("object arrays cannot be stored as raw bytes")

    magic = b"\x93NUMPY"
    version = b"\x01\x00"  # major 1, minor 0
    header = {
        "descr": arr.dtype.str,
        # tobytes() below emits the items in C order, hence always False.
        "fortran_order": False,
        "shape": arr.shape,
    }
    header_str = str(header).encode("ascii")

    # The 2 accounts for the header-length field itself. The padding is
    # always between 1 and 64, so there is always room for the newline.
    prefix_len = len(magic) + len(version) + 2
    padding = 64 - (prefix_len + len(header_str)) % 64
    header_block = header_str + b" " * (padding - 1) + b"\n"

    with open(fn, "wb") as f:
        f.write(magic + version + len(header_block).to_bytes(2, "little"))
        f.write(header_block)
        f.write(arr.tobytes())

In [8]:
# Keep the current file contents, overwrite the same path with the
# hand-written writer, then read the new contents back for comparison.
with open(fn, "rb") as f:
    blob = f.read()  # I read the original numpy file

write(fn, arr)  # I write my file

with open(fn, "rb") as f: # I read my file
    blob2 = f.read()

In [9]:
# Print both byte strings so they can be compared by eye.
print(blob)
print(blob2)

b"\x93NUMPY\x01\x00v\x00{'descr': '<U5', 'fortran_order': False, 'shape': (13,), }                                                           \n1\x00\x00\x000\x00\x00\x000\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x001\x00\x00\x001\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x002\x00\x00\x002\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x003\x00\x00\x003\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x004\x00\x00\x004\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x005\x00\x00\x005\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x006\x00\x00\x006\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x007\x00\x00\x007\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x008\x00\x00\x008\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x001\x00\x00\x009\x00\x00\x009\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00h\x00\x00\x00e\x00\x00\x00l\x00\x00\x00l\x00\x00\x00o\x00\x00\x00w\x00\x00\x00o\x00\x00\x00r\x00\x00\x00l\x00\x00\x00d\x00\x00\x00\xe6\x00\x00\x00\

The blobs look identical aside from an extra comma inserted in the numpy format.

With the assertions below we know for sure:

In [10]:
# Check the hand-written file with both the hand-written reader and numpy.
data2 = read(fn)  # I read my file
assert np.all(data2 == arr)  # I check that the data is the same

data3 = np.load(fn)  # I let numpy load my file
assert np.all(data3 == arr)  # I check that the data is the same


In the numpy documentation we can read that [the internal layout](https://numpy.org/devdocs/reference/arrays.ndarray.html#internal-memory-layout-of-an-ndarray) of an ndarray is:

> a contiguous one-dimensional segment of computer memory ... combined with an indexing scheme that maps N integers into the location of an item in the block.
> The ranges in which the indices can vary is specified by the shape of the array. How many bytes each item takes and how the bytes are interpreted is defined by the data-type object associated with the array.
> A segment of memory is inherently 1-dimensional, and there are many different schemes for arranging the items of an N-dimensional array in a 1-dimensional block.
>  In a strided scheme, the N-dimensional index $(n_0,n_1,...,n_{N-1})$ corresponds to the offset (in bytes): $$n_{offset} = \sum_{k=0}^{N-1}s_{k}n_{k}$$ from the beginning of the memory block associated with the array. Here, $s_k$ are integers which specify the strides of the array.

As I'm mainly concerned with the 1-dimensional case, I can use the Python bytearray as the closest matching data structure without having to worry about n-dimensional strides.

In [31]:
import array

In [32]:
# Typecode 'i' is a C signed int, which lines up with np.int32 on common
# platforms. 'L' (unsigned long) is 4 bytes on Windows but typically 8 bytes
# on 64-bit Linux/macOS, so with 'L' the two buffers only match on some systems.
a1 = array.array('i',[1,3,2,5,4])
a2 = np.array([1,3,2,5,4],dtype=np.int32)
print("array:   ", bytearray(a1))
print("np.array:", bytearray(a2))

print(a1.tobytes())
print(a2.tobytes())

# cast the numpy array to int8: one byte per item, so the zero bytes disappear
print(a2.astype('i1').tobytes())

array:    bytearray(b'\x01\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00\x05\x00\x00\x00\x04\x00\x00\x00')
np.array: bytearray(b'\x01\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00\x05\x00\x00\x00\x04\x00\x00\x00')
b'\x01\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00\x05\x00\x00\x00\x04\x00\x00\x00'
b'\x01\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00\x05\x00\x00\x00\x04\x00\x00\x00'
b'\x01\x03\x02\x05\x04'


In [40]:
text = "text"
bext = text.encode("utf-8")  # UTF-8 bytes of the same text; not used in the cells shown here
# NOTE(review): typecode "u" is deprecated in the array module and its item
# width follows the platform's wchar_t (the output below shows 2 bytes per
# character) — confirm before relying on this layout.
arrt = array.array("u", text)
arrt.tobytes()

b't\x00e\x00x\x00t\x00'

In [48]:
data = [str(i) for i in range(100, 200, 11)] + ["hello", "world"] + ["æ"]
# Width of the longest string, counted in characters (not bytes).
longest = max(len(i) for i in data)
# Right-pad every string with spaces to that character width.
data2 = [i.ljust(longest) for i in data]
# NOTE(review): the padding is by character count but the join is over UTF-8
# bytes, so a multi-byte character such as "æ" yields a record that is longer
# than `longest` bytes — visible in the fixed-width slices printed further down.
bdata = b"".join([bytes(i, "utf-8") for i in data2])
print(bdata)




b'100  111  122  133  144  155  166  177  188  199  helloworld\xc3\xa6    '


In [49]:
# Walk the joined bytes in steps of `longest` and print each fixed-width slice.
for step in range(0,len(bdata), longest):
    print(bdata[step:step+longest])

b'100  '
b'111  '
b'122  '
b'133  '
b'144  '
b'155  '
b'166  '
b'177  '
b'188  '
b'199  '
b'hello'
b'world'
b'\xc3\xa6   '
b' '


In [50]:
# Same fixed-width slices as before, but decoded from UTF-8 back to text.
for step in range(0,len(bdata), longest):
    print(bdata[step:step+longest].decode('utf-8'))

100  
111  
122  
133  
144  
155  
166  
177  
188  
199  
hello
world
æ   
 


## Conclusions?

I really appreciate the beautiful simplicity of the fileformat. 

I think the next step for me is to read/write `.npy` from `nim`.

In [11]:
import pathlib
# Clean up: delete the file at `fn`.
pathlib.Path(fn).unlink()