## KMNIST Data Setup

[KMNIST dataset description](http://codh.rois.ac.jp/kmnist/)


[Dataset on GitHub](https://github.com/rois-codh/kmnist)

| File            | Examples | Download (MNIST format)    | Download (NumPy format)      |
|-----------------|--------------------|----------------------------|------------------------------|
| Training images | 60,000             | [train-images-idx3-ubyte.gz](http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz) (18MB) | [kmnist-train-imgs.npz](http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-imgs.npz) (18MB)   |
| Training labels | 60,000             | [train-labels-idx1-ubyte.gz](http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz) (30KB) | [kmnist-train-labels.npz](http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-labels.npz) (30KB)  |
| Testing images  | 10,000             | [t10k-images-idx3-ubyte.gz](http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz) (3MB) | [kmnist-test-imgs.npz](http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-imgs.npz) (3MB)   |
| Testing labels  | 10,000             | [t10k-labels-idx1-ubyte.gz](http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz) (5KB)  | [kmnist-test-labels.npz](http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-labels.npz) (5KB) |

In [None]:
from pathlib import Path
import requests
import gzip

import struct
from pathlib import Path
import numpy as np
import pandas as pd
from io import BytesIO
from PIL import Image as pi

# fastai is optional: the data-preparation cells run without it, and every
# training/inspection cell is gated on the `fastai_imported` flag set here.
try:
    from fastai.vision import *
    from fastai.metrics import error_rate
except Exception:
    fastai_imported = False
    # NOTE(review): 'fastapi-cpu' may be a typo for 'fastai-cpu' -- confirm
    # the kernel name before changing this message.
    print('Switch to fastapi-cpu kernel to train model.')
else:
    fastai_imported = True

In [None]:
from pathlib import Path
import requests

# Local directory that holds the raw KMNIST archives and everything later
# extracted from them.
DATA_PATH = Path("../data/raw")
PATH = DATA_PATH / "kmnist"

PATH.mkdir(parents=True, exist_ok=True)

URL = "http://codh.rois.ac.jp/kmnist/dataset/kmnist/"
FILENAMES = ['train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz',
             't10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz']

# Download each archive once; files already present are treated as a cache.
for FILENAME in FILENAMES:
    if not (PATH / FILENAME).exists():
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(URL + FILENAME, timeout=60)
        # Fail loudly on HTTP errors: otherwise an error page would be saved
        # under the archive's name and never re-downloaded.
        response.raise_for_status()
        # write_bytes opens, writes and closes the file in one call, so no
        # file handle is left open.
        (PATH / FILENAME).write_bytes(response.content)

In [None]:
def _read_idx_labels(path):
    """Read a gzip-compressed label file and return its labels as uint8.

    The file starts with two big-endian 32-bit integers (magic number and
    item count), which are printed; the rest is one byte per label.
    """
    with gzip.open(path, 'rb') as fp:
        magic, size = struct.unpack(">II", fp.read(8))
        # Single-byte values have no byte order, so plain uint8 is enough.
        labels = np.frombuffer(fp.read(), dtype=np.uint8)
        print(magic, size)
    return labels


def _read_idx_images(path):
    """Read a gzip-compressed image file into a (count, rows, cols) array.

    The file starts with four big-endian 32-bit integers (magic number,
    image count, rows, columns), which are printed; the rest is one byte
    per pixel.
    """
    with gzip.open(path, 'rb') as fp:
        magic, size = struct.unpack(">II", fp.read(8))
        nrows, ncols = struct.unpack(">II", fp.read(8))
        pixels = np.frombuffer(fp.read(), dtype=np.uint8)
        print(magic, size, nrows, ncols)
    return pixels.reshape((size, nrows, ncols))


def _export_split(split, labels_file, images_file):
    """Write one split's images as PNGs under PATH/<split>.

    Returns a list of ``[relative_png_name, label]`` rows.

    Raises:
        ValueError: if the label and image counts differ.
    """
    labels = _read_idx_labels(PATH / labels_file)
    images = _read_idx_images(PATH / images_file)
    # zip() would silently drop the surplus items on a mismatch.
    if len(labels) != len(images):
        raise ValueError(
            f'{split}: {len(labels)} labels but {len(images)} images')

    out_dir = PATH / split
    out_dir.mkdir(parents=True, exist_ok=True)

    rows = []
    for n, (image, label) in enumerate(zip(images, labels)):
        outfilename = f'{n}.png'
        pi.fromarray(image).save(out_dir / outfilename, format='png')
        rows.append([f'{split}/{outfilename}', label])
    return rows


def extract_kmnist():
    """Unpack the downloaded KMNIST archives into PNG files plus labels.csv.

    The t10k archives are written to PATH/valid and the train archives to
    PATH/train, one ``<index>.png`` per image. PATH/labels.csv is written
    last and lists every image's relative name and label.
    """
    metadata = _export_split('valid',
                             't10k-labels-idx1-ubyte.gz',
                             't10k-images-idx3-ubyte.gz')
    # Now Training data.
    metadata += _export_split('train',
                              'train-labels-idx1-ubyte.gz',
                              'train-images-idx3-ubyte.gz')

    metadata_df = pd.DataFrame(metadata, columns=['name', 'label'])
    metadata_df.to_csv(PATH / 'labels.csv', index=False)

In [None]:
# Extract only when needed. labels.csv is the last file extract_kmnist()
# writes, so requiring it as well as train/ means an interrupted extraction
# is redone instead of being mistaken for a finished one.
if not ((PATH / 'train').exists() and (PATH / 'labels.csv').exists()):
    extract_kmnist()

In [None]:
# Build the fastai data bunch from PATH and preview a 3-row grid of samples.
# Skipped entirely when fastai could not be imported.
if fastai_imported:
    data = ImageDataBunch.from_csv(PATH)
    data.show_batch(rows=3, figsize=(5, 5))

In [None]:
# Create a ResNet-50 learner and reuse a saved checkpoint when one exists;
# otherwise train for 4 epochs and save the result under the same name.
if fastai_imported:
    learn = cnn_learner(data, models.resnet50, metrics=accuracy)
    try:
        # Learner.load returns the learner itself and raises when the
        # checkpoint file is missing, so its result cannot be used as a
        # success flag -- the missing-file case has to be caught.
        learn.load('kmnist-stage-1-50')
    except FileNotFoundError:
        print('Could not load model, training instead.')
        learn.fit(4)
        learn.save('kmnist-stage-1-50')

In [None]:
# Build the interpretation object and check that there is exactly one loss
# and one index per validation item.
if fastai_imported:
    interp = ClassificationInterpretation.from_learner(learn)
    losses, idxs = interp.top_losses()
    # A bare expression inside an `if` block is not echoed by the notebook,
    # so print the check to make its result visible.
    print(len(data.valid_ds) == len(losses) == len(idxs))

In [None]:
# Plot the 9 highest-loss samples (needs `interp` from the previous cell).
if fastai_imported:
    interp.plot_top_losses(9, figsize=(15, 11))

In [None]:
# Plot the confusion matrix (needs `interp` from an earlier cell).
if fastai_imported:
    interp.plot_confusion_matrix(figsize=(12, 12), dpi=60)

In [None]:
# Read the test-image archive straight into memory: a 16-byte big-endian
# header (magic, count, rows, cols) followed by one byte per pixel.
with gzip.open(PATH / 't10k-images-idx3-ubyte.gz', 'rb') as fp:
    header = fp.read(8) + fp.read(8)
    magic, size, nrows, ncols = struct.unpack(">IIII", header)
    data_raw = np.frombuffer(
        fp.read(), dtype=np.dtype(np.uint8).newbyteorder('>')
    ).reshape((size, nrows, ncols))
    print(magic, size, nrows, ncols)

# One PIL image per 2-D frame of the array.
images = [pi.fromarray(frame) for frame in data_raw]

In [None]:
# Convert every frame of data_raw into a PIL image.
images = [pi.fromarray(frame) for frame in data_raw]

In [None]:
# Evaluate the first image as the cell's final expression so that the
# notebook shows it.
images[0]

In [None]:
# Re-encode every frame as in-memory PNG bytes. This replaces the earlier
# list of PIL images.
images = []
for frame in data_raw:
    buffer = BytesIO()
    pi.fromarray(frame).save(buffer, format='png')
    images.append(buffer.getvalue())

In [None]:
from ipywidgets import Image as Image_widget

In [None]:
# Show the first image through the ipywidgets Image widget.
# NOTE(review): assumes images[0] is PNG-encoded bytes from the preceding
# encoding cell, not a PIL image -- confirm cell execution order.
Image_widget(value=images[0])