# DataBlock API foundations

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
#export
from exp.nb_07a import *

In [None]:
datasets.URLs.IMAGENETTE_160

## Image ItemList

Previously we read the whole MNIST dataset into RAM at once by loading it from a pickle file. We can't do that for datasets larger than our RAM capacity, so instead we leave the images on disk and just grab the ones we need for each mini-batch as we use them.

Let's use the [imagenette dataset](https://github.com/fastai/imagenette/blob/master/README.md) and build the data blocks we need along the way.

### Get Images

In [3]:
# `datasets.untar_data` is not defined in this notebook (it arrives through the star-import
# from `exp.nb_07a`); presumably it downloads/extracts the archive and returns the local
# dataset folder — TODO confirm against that module.
path = datasets.untar_data(datasets.URLs.IMAGENETTE_160)
path

NameError: name 'fdatasets' is not defined

To be able to look at what's inside a directory from a notebook, we add the `.ls` method to `Path` with a monkey-patch.

In [4]:
#export
import PIL, os, mimetypes
# Monkey-patch: give every `Path` an `.ls()` method returning its direct children as a list.
Path.ls = lambda self: [*self.iterdir()]

In [5]:
path.ls()

NameError: name 'path' is not defined

In [None]:
(path/'val').ls()

Let's have a look inside a class folder (the first class is tench):

In [None]:
# Validation-set folder of class `n01440764`.
path_tench = path / 'val' / 'n01440764'

In [None]:
# Take the first entry of the tench folder. `.ls()` wraps `Path.iterdir()`, whose order is
# arbitrary, so which file comes first may differ between machines.
img_fn = path_tench.ls()[0]
img_fn

In [None]:
# Open the image file with PIL; ending the cell on `img` makes the notebook display it.
img = PIL.Image.open(img_fn)
img

In [None]:
plt.imshow(img)

In [None]:
# Convert the PIL image to a NumPy array so its shape and raw values can be inspected below.
import numpy
imga = numpy.array(img)

In [None]:
imga.shape

In [None]:
imga[:10, :10, 0]

Just in case there are other files in the directory (models, texts...) we want to keep only the images. Rather than writing the list of image extensions out by hand, let's use what's already on our computer (the MIME types database).

In [None]:
#export
# Extensions (with their leading dot, e.g. '.png') whose registered MIME type is an image type.
# The `startswith('image/')` filter is essential: without it the set would contain every
# extension in the MIME database (text, audio, video, ...) and nothing would be filtered out.
image_extensions = {ext for ext, mime in mimetypes.types_map.items() if mime.startswith('image/')}

In [None]:
# Show the extensions separated by spaces and sorted, so the output is readable and does not
# change with set iteration order between kernel restarts.
' '.join(sorted(image_extensions))

In [None]:
#export
def setify(o):
    "Return `o` itself when it is already a `set`; otherwise build a set from `listify(o)`."
    if isinstance(o, set): return o
    return set(listify(o))

In [None]:
# A bare string is expected to count as one item, not be split into characters.
test_eq(setify('aa'), {'aa'})
# A list is expected to become the set of its elements.
test_eq(setify(['aa',1]), {'aa', 1})
# None is expected to map to the empty set.
test_eq(setify(None), set())
# A lone scalar is expected to be wrapped into a one-element set.
test_eq(setify(1), {1})
# An existing set is expected to come back with the same contents.
test_eq(setify({1}), {1})

Now let's walk through the directories and grab all the images. The first private function grabs all the images inside a given directory and the second one walks (potentially recursively) through all the folders in `path`.

In [6]:
#export
def _get_files(p, fs, extensions=None):
    p = Path(p)# if it's not a path object already then this converts it into one.
    res = [p/f for f in fs if not f.startswith('.')
          and ((not extensions) or f'.{f.split(".")[-1].lower()}' in extensions)]
    return res

In [7]:
# List the bare file names inside the tench folder, then rebuild full paths from that same
# folder. Joining the names onto the dataset root `path` instead would produce paths that
# do not exist on disk.
tench_names = [o.name for o in os.scandir(path_tench)]
t = _get_files(path_tench, tench_names, extensions=image_extensions)
t[:3]

NameError: name 'path_tench' is not defined

In [None]:
#export
def get_files(path, extensions=None, recurse=False, include=None):
    path = Path(path)
    extensions = setify(extensions)
    extensions = {e.lower() for e in extensions}
    if recurse:
        res = []
        for i,(p,d,f) in enumerate(os.walk(path)):# returns (dirpath, dirnames, filenames)
            if include is not None and i==0: d[:] = [o for o in d if o in include]
            else:                            d[:] = [o for o in d if ] 
                