In [None]:
#|default_exp datasets

In [None]:
# pip install datasets

In [None]:
cd /home/admin/projects/learn_miniai/

### Hugging Face datasets

In [None]:
from datasets import load_dataset, load_dataset_builder

In [None]:
name = 'fashion_mnist'
dataset = load_dataset(name)

In [None]:
dataset

In [None]:
dataset_builder = load_dataset_builder(name)

In [None]:
dataset_builder.name

Some of the main attributes and functions of DatasetBuilder include:

* name: the name of the dataset

* info: a DatasetInfo object containing metadata about the dataset

* cache_dir: the directory in which the downloaded and prepared dataset files are cached

* manual_download_instructions: optional instructions for datasets whose files have to be downloaded manually

* download_and_prepare(): a function to download and prepare the dataset

* inspect(): a function to inspect the contents of the dataset

* subsplit(): a function to split the dataset into subsplits

* as_dataset(): a function to return the prepared data as a Hugging Face `Dataset` object (or a `DatasetDict` with one entry per split when no split is given)

Additionally, once the data is loaded, the resulting `Dataset` object (not the DatasetBuilder itself) provides some useful functions to manipulate and explore datasets, including:

* select(): select a subset of rows from the dataset by index (use select_columns() to keep only a subset of features)

* filter(): filter examples in the dataset based on a given condition

* shuffle(): shuffle the dataset

* sort(): sort the dataset

In [None]:
print(dataset_builder.info.description)

In [None]:
dataset_builder.info.features

In [None]:
dataset_builder.info.splits

In [None]:
dataset_builder.info.download_checksums

In [None]:
x, y = dataset_builder.info.features
print(x, y)

In [None]:
train_ds = dataset['train']
test_ds = dataset['test']
# they have all functions of a dataset __len__ and __iter__
len(train_ds), next(iter(train_ds)), train_ds[0]

In [None]:
featy = train_ds.features[y]

In [None]:
train_ds.features

In [None]:
train_ds.features

In [None]:
featy.int2str(train_ds[0][y])

In [None]:
featy.str2int('Ankle boot')

In [None]:
train_ds[0][x]

In [None]:
train_ds[0]

In [None]:
from torch import tensor
import numpy as np
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = (1, 1)

In [None]:
plt.imshow(np.array(train_ds[0][x]), cmap='gray')

In [None]:
#|export
import torchvision.transforms.functional as TF

In [None]:
TF.to_tensor(train_ds[0]['image']).shape

In [None]:
def collate_fn(b):
    """Turn a list of samples into an `(images, labels)` pair of tensors.

    Each sample in `b` is indexed with the keys 'image' and 'label'. Images are
    converted with `TF.to_tensor` and stacked along a new leading dimension;
    labels are gathered into a single tensor.
    """
    imgs, lbls = [], []
    for sample in b:
        imgs.append(TF.to_tensor(sample['image']))
        lbls.append(sample['label'])
    return torch.stack(imgs), tensor(lbls)

In [None]:
import torch
from torch.utils.data import DataLoader

train_dl = DataLoader(dataset=train_ds, batch_size=50,shuffle=True, collate_fn=collate_fn)

In [None]:
xb, yb = next(iter(train_dl))

In [None]:
xb.shape

In [None]:
featy.int2str(yb[0].item())

In [None]:
plt.imshow(xb[0,0], cmap='gray', )
plt.title(featy.int2str(yb[0].item()))
plt.show()

In [None]:
def transforms(x):
    """Replace every entry of `x['image']` with its `TF.to_tensor` conversion and return `x`.

    `x` is mutated: the 'image' entry is rebound to a new list.
    """
    converted = []
    for o in x['image']:
        converted.append(TF.to_tensor(o))
    x['image'] = converted
    return x

In [None]:
next(iter(train_ds))

In [None]:
tds = train_ds.with_transform(transforms)

In [None]:
next(iter(tds))

In [None]:
train_dl = DataLoader(dataset=tds, batch_size=50)

In [None]:
b = next(iter(train_dl))

In [None]:
b['image'].shape, b['label'].shape

### create inplace function to simplify code

In [None]:
#|export
def inplace(f):
    """Decorator for functions that mutate their argument and return nothing.

    The wrapped function is called for its side effect on `b`, and `b` itself is
    returned, so the result can be used where a return value is required.

    f: callable taking one argument and modifying it in place.
    Returns: a callable with the same name/docstring as `f` that returns its argument.
    """
    from functools import wraps

    # wraps() keeps f's __name__/__doc__/__qualname__ on the wrapper, so decorated
    # functions stay identifiable in tracebacks and introspection.
    @wraps(f)
    def _f(b):
        f(b)
        return b
    return _f

In [None]:
# Demo of `inplace`: a mutating function returns None on its own, while the
# decorated version returns the (modified) argument.
# A dedicated name is used for the sample dict so the global `x` (the image
# feature key unpacked earlier in the notebook) is not overwritten.
demo = {'a': 1, 'b': 2}
def _transform(x): x['a'] = 2

@inplace
def transform(x): x['a'] = 2
print(transform(demo))

In [None]:
@inplace
def transforms(x):
    # Mutates `x` only; `inplace` takes care of returning it.
    x['image'] = list(map(TF.to_tensor, x['image']))
    
tds = train_ds.with_transform(transforms)
train_dl = DataLoader(dataset=tds, batch_size=50)

b = next(iter(train_dl))
b['image'].shape, b['label'].shape

In [None]:
#|export
from operator import itemgetter

In [None]:
my_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
list(map(itemgetter(1), my_list))


In [None]:
from torch.utils.data import default_collate

In [None]:
batch = dict(image=[1],label=[2]), dict(image=[3],label=[4])
batch

In [None]:
default_collate(batch)

In [None]:
itemgetter('image', 'label')(default_collate(batch))

In [None]:
itemgetter(*train_ds.features)(default_collate(batch))

In [None]:
def collate_dict(b):
    """Collate a batch of dict samples and return its 'image' and 'label' entries as a tuple."""
    collated = default_collate(b)
    pick = itemgetter('image', 'label')
    return pick(collated)

In [None]:
train_dl = DataLoader(dataset=tds, batch_size=8, collate_fn=collate_dict)

In [None]:
xb, yb = next(iter(train_dl))
xb.shape, yb.shape

## Plotting images

In [None]:
#|export
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (5, 5)

In [None]:
fig, axs = plt.subplots(2, 2)
axs[0, 0].plot([1, 2, 3], [4, 5, 6])
axs[0, 1].scatter([1, 2, 3], [4, 5, 6])
axs[1, 0].bar([1, 2, 3], [4, 5, 6])
axs[1, 1].hist([1, 2, 2, 3, 3, 3, 3])
plt.show()

In [None]:
# show one image
nrows, ncols= 1, 1
fig, axs = plt.subplots(nrows=1, ncols=1)
fig.set_size_inches(nrows, ncols)
axs.imshow(xb[0][0], cmap='gray')
plt.show()

In [None]:
# show two images
nrows, ncols= 1, 2
fig, axs = plt.subplots(nrows, ncols)
fig.set_size_inches(3, 3)
axs[0].imshow(xb[0][0], cmap='gray')
axs[1].imshow(xb[1][0], cmap='gray')
plt.show()

In [None]:
# show four images
nrows, ncols= 2, 2
fig, axs = plt.subplots(nrows, ncols)
fig.set_size_inches(3, 3)

axs[0,0].imshow(xb[0][0], cmap='gray')
axs[0,0].set_title(f'{featy.int2str(yb[0].item())}', fontsize=11)
axs[0,0].axis('off')

axs[0,1].imshow(xb[1][0], cmap='gray')
axs[0,1].set_title(f'{featy.int2str(yb[1].item())}', fontsize=11)
axs[1,0].imshow(xb[2][0], cmap='gray')
axs[1,0].set_title(f'{featy.int2str(yb[2].item())}', fontsize=11)
axs[1,1].imshow(xb[3][0], cmap='gray')
axs[1,1].set_title(f'{featy.int2str(yb[3].item())}', fontsize=11)

fig.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

In [None]:
#|export
def show_image(img, ax, title=None, fontsize=11):
    """Draw `img` on `ax` with the 'gray' colormap and hide the axis.

    img: image data passed straight to `ax.imshow`.
    ax: axes-like object providing `imshow`, `set_title` and `axis`.
    title: optional title; nothing is set when it is None.
    fontsize: font size used for the title.
    """
    ax.imshow(img, cmap='gray')
    has_title = title is not None
    if has_title:
        ax.set_title(title, fontsize=fontsize)
    ax.axis('off')

In [None]:
nrows, ncols= 2, 2
fig, axs = plt.subplots(nrows, ncols)
fig.set_size_inches(3, 3)

show_image(xb[0][0], axs[0,0], title=featy.int2str(yb[0].item()))
show_image(xb[1][0], axs[0,1], title=featy.int2str(yb[1].item()))
show_image(xb[2][0], axs[1,0], title=featy.int2str(yb[2].item()))
show_image(xb[3][0], axs[1,1], title=featy.int2str(yb[3].item()))

fig.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

In [None]:
nrows, ncols= 2, 2
fig, axs = plt.subplots(nrows, ncols)
fig.set_size_inches(3, 3)

for i, ax in enumerate(axs.reshape(-1)):
    show_image(xb[i][0], ax, title=featy.int2str(yb[i].item()))
    
fig.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

In [None]:
nrows, ncols= 3, 3
fig, axs = plt.subplots(nrows, ncols)
fig.set_size_inches(3, 3)

# Pair every image with its own label. Indexing `yb` with a loop variable `i`
# that this loop does not define would reuse a leftover value from another cell
# and give every subplot the same title.
for (img, label), ax in zip(zip(xb, yb), axs.flat):
    show_image(img[0], ax, title=featy.int2str(label.item()))

fig.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

In [None]:
def show_image(img, ax, title=None, fontsize=11):
    """Draw `img` on `ax` with the 'gray' colormap and hide the axis.

    img: object exposing `.shape`; a 3-D image whose first dimension has size 1
        is reduced to `img[0]` before plotting.
    ax: axes-like object providing `imshow`, `set_title` and `axis`.
    title: optional title; nothing is set when it is None.
    fontsize: font size used for the title.
    """
    # Only strip a leading singleton dimension from 3-D input; a 2-D image that
    # happens to have a single row must stay 2-D.
    if len(img.shape) == 3 and img.shape[0] == 1: img = img[0]
    ax.imshow(img, cmap='gray')
    if title is not None: ax.set_title(title, fontsize=fontsize)
    ax.axis('off')

In [None]:
nrows, ncols= 2, 2
fig, axs = plt.subplots(nrows, ncols)
fig.set_size_inches(3, 3)

for i, (img, ax) in enumerate(zip(xb, axs.flat)):
    show_image(img, ax, title=featy.int2str(yb[i].item()))
    
fig.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

In [None]:
titles = None #[featy.int2str(o.item()) for o in yb]
if titles is None: titles = [None] * len(xb)
nrows, ncols= 3, 3
fig, axs = plt.subplots(nrows, ncols)
fig.set_size_inches(3, 3)

for (img, title), ax in zip(zip(xb, titles), axs.flat):
    show_image(img, ax, title=title)
    
fig.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

In [None]:
#|export
def show_images(imgs, titles=None, n_rows=3, n_cols=3):
    """Plot `imgs` on an `n_rows` x `n_cols` grid of axes and show the figure.

    imgs: sized iterable of images; a 3-D image whose first dimension has size 1
        is plotted as `img[0]`.
    titles: optional iterable of titles paired with `imgs` in order; None means
        no titles.
    n_rows, n_cols: grid dimensions. Plotting stops at whichever of images,
        titles or grid cells runs out first.
    """
    if titles is None: titles = [None] * len(imgs)
    # squeeze=False keeps `axs` a 2-D array even for a 1x1 or 1xN grid, so `.flat` always works.
    fig, axs = plt.subplots(n_rows, n_cols, squeeze=False)
    # set_size_inches takes (width, height): width scales with columns, height with rows.
    fig.set_size_inches(n_cols * 2, n_rows * 2)

    axes = list(axs.flat)
    n_drawn = 0
    for (img, title), ax in zip(zip(imgs, titles), axes):
        if len(img.shape) == 3 and img.shape[0] == 1:
            img = img[0]
        show_image(img, ax, title=title)
        n_drawn += 1
    # Hide grid cells that received no image instead of leaving empty ticked axes.
    for ax in axes[n_drawn:]: ax.axis('off')

    fig.subplots_adjust(hspace=0.5, wspace=0.5)
    plt.show()

In [None]:
# Build one class-name title per label in the batch and pass them through, so the
# computed `titles` are actually displayed.
titles = [featy.int2str(o.item()) for o in yb]
show_images(xb, titles, 3, 3)

In [None]:
#|export
from torch.utils.data import default_collate
from miniai.training import get_dls

@inplace
def transforms(x):
    "Replace each entry of `x['image']` with its `TF.to_tensor` conversion (mutates `x`)."
    tensors = []
    for o in x['image']: tensors.append(TF.to_tensor(o))
    x['image'] = tensors

def collate_dict(ds):
    """Build a collate function that returns a batch's entries in the order of `ds.features`.

    ds: object whose `features` attribute iterates over the keys to extract.
    Returns: a function that applies `default_collate` to a batch of dict samples
        and picks those keys out of the collated result.
    """
    pick = itemgetter(*ds.features)
    def _collate(batch):
        collated = default_collate(batch)
        return pick(collated)
    return _collate

class DataLoaders():
    "Container exposing two data loaders as `.train` and `.val`."
    def __init__(self, *args, **kwargs):
        # Only the first two positional arguments are used; any further
        # positional or keyword arguments are accepted and ignored.
        first, second = args[:2]
        self.train = first
        self.val = second

    @classmethod
    def from_dd(cls, dd, bs, **kwargs):
        """Create `DataLoaders` from a dict-like `dd` of datasets.

        The collate function is derived from `dd['train']`; every value of `dd`
        is forwarded to `get_dls` together with `bs` and any extra keyword arguments.
        """
        collate = collate_dict(dd['train'])
        loaders = get_dls(*dd.values(), bs, collate_fn=collate, **kwargs)
        return cls(*loaders)

In [None]:
dd_tf = dataset.with_transform(transforms)
bs = 5

In [None]:
dls = DataLoaders.from_dd(dd_tf, 50, num_workers=4)

In [None]:
xb, yb = next(iter(dls.train))

In [None]:
xb.shape, yb.shape

In [1]:
import nbdev; nbdev.nbdev_export()