In [None]:
import struct
from pathlib import Path
import numpy as np
import pandas as pd
from io import BytesIO
from PIL import Image as pi
from ipywidgets import HBox, VBox, Layout, HTML
from ipywidgets import Image as Image_widget

In [None]:
# fastai is optional: the widget/grid cells work without it, and every
# training cell is guarded by the `fastai_imported` flag set here.
try:
    from fastai.vision import *
    from fastai.metrics import error_rate
    fastai_imported = True
except Exception as ex:
    # NOTE(review): the kernel name in the message may be a typo for "fastai-cpu" — confirm.
    print('Switch to fastapi-cpu kernel.')
    # Report the underlying cause so a broken install is not mistaken for a wrong kernel.
    print('fastai import failed:', repr(ex))
    fastai_imported = False

In [None]:
# Load the KMNIST test images. The header is four big-endian unsigned 32-bit
# integers (magic, item count, rows, cols); the rest of the file is pixel bytes.
with open('/Users/ray/data/kmnist/t10k-images-idx3-ubyte', 'rb') as f:
    magic, size, nrows, ncols = struct.unpack(">IIII", f.read(16))
    pixel_dtype = np.dtype(np.uint8).newbyteorder('>')
    data = np.fromfile(f, dtype=pixel_dtype).reshape((size, nrows, ncols))

print(magic, size, nrows, ncols)

In [None]:
# Confirm the reshape: expected (size, nrows, ncols) from the header read above.
data.shape

In [None]:
# One PIL image per 2-D frame of `data`.
# NOTE(review): `images` is rebound to PNG bytes in the following cell, so this list is short-lived.
images = [pi.fromarray(frame) for frame in data]

In [None]:
# Encode every frame as PNG bytes, the format the Image widgets take as `value`.
images = []
for frame in data:
    png_buffer = BytesIO()
    pi.fromarray(frame).save(png_buffer, format='png')
    images.append(png_buffer.getvalue())

In [None]:
# Grid of the first KMNIST test images: `no_lines` rows of `no_boxes_per_line` images.
# `height` replaces the original `length`, which is not a Layout property.
z_container_layout = Layout(border='0px solid  grey', width='50px', height='50px', margin='0px 0px 0px 0px')
a_container_layout = Layout(border='0px solid red')
b_container_layout = Layout(border='0px solid green', justify_content='flex-start')
c_container_layout = Layout(border='0px solid black', width='50%', flex_direction='column', justify_content='space-around')
no_boxes_per_line = 10
no_lines = 13
# Each image sits in its own VBox; each row is an HBox; the rows stack in the outer VBox.
fig1 = VBox(children=[HBox(children=[VBox(children=[Image_widget(value=image, layout=z_container_layout)], layout=a_container_layout)
                     for image in images[no_boxes_per_line*m:no_boxes_per_line*(m+1)]], layout=b_container_layout)
             for m in range(no_lines)], layout=c_container_layout)

In [None]:
# Load the KMNIST test labels. The header is two big-endian unsigned 32-bit
# integers (magic, item count); the rest of the file is one byte per label.
with open('/Users/ray/data/kmnist/t10k-labels-idx1-ubyte', 'rb') as f:
    magic, size = struct.unpack(">II", f.read(8))
    labels = np.fromfile(f, dtype=np.dtype(np.uint8).newbyteorder('>'))

# Catch a truncated or mismatched file before the labels are paired with images.
assert len(labels) == size, 'label count does not match the header'

# Only the values read from this file are printed; the label header has no
# row/column fields, so echoing nrows/ncols would just repeat the image cell.
print(magic, size)

In [None]:
# Grid of labels: the hiragana character for each test label, laid out in the
# same 13 x 10 arrangement as the image grid.
hiragana = ['お', 'き', 'す', 'つ', 'な', 'は', 'ま', 'や', 'れ', 'を']
h_labels = [hiragana[class_index] for class_index in labels]
a_container_layout = Layout(border='0px solid red')
b_container_layout = Layout(border='0px solid green', justify_content='space-between')
c_container_layout = Layout(border='0px solid black', width='50%', flex_direction='column', justify_content='space-around')
no_boxes_per_line = 10

label_rows = []
for m in range(13):
    row_labels = h_labels[no_boxes_per_line*m:no_boxes_per_line*(m+1)]
    row_cells = [HBox(children=[HTML(value=str(label))], layout=a_container_layout)
                 for label in row_labels]
    label_rows.append(HBox(children=row_cells, layout=b_container_layout))

fig2 = VBox(children=label_rows, layout=c_container_layout)

In [None]:
# Show the image grid and the label grid next to each other.
side_by_side_layout = Layout(border='0px solid black', justify_content='space-around')
HBox(children=[fig1, fig2], layout=side_by_side_layout)

In [None]:
# List the raw KMNIST files available in the local data directory.
!ls /Users/ray/data/kmnist/

In [None]:
# Collect the first 30 PNG images for each of labels 0 and 2, label 0 first.
character_images = []
for label in (0, 2):
    subscripts = [index for index, value in enumerate(labels) if value == label]
    for index in subscripts[:30]:
        character_images.append(images[index])

In [None]:
# Grid of the sample images for labels 0 and 2, `no_boxes_per_line` images per row
# (with up to 30 images per label this gives one row per label).
# `height` replaces the original `length`, which is not a Layout property.
z_container_layout = Layout(border='0px solid  grey', width='80px', height='80px', margin='0px 0px 0px 0px')
a_container_layout = Layout(border='0px solid red')
b_container_layout = Layout(border='0px solid green', justify_content='flex-start')
c_container_layout = Layout(border='0px solid black', width='100%', flex_direction='column', justify_content='space-around')
no_boxes_per_line = 30
# Step through character_images one row at a time, so only rows that actually
# contain images are created (a fixed row count produced empty trailing rows).
fig1 = VBox(children=[HBox(children=[VBox(children=[Image_widget(value=image, layout=z_container_layout)], layout=a_container_layout)
                     for image in character_images[start:start+no_boxes_per_line]], layout=b_container_layout)
             for start in range(0, len(character_images), no_boxes_per_line)], layout=c_container_layout)

In [None]:
# Display the grid built in the previous cell.
fig1

In [None]:
# Narrow grid of the sample images for labels 0 and 2, `no_boxes_per_line` images per row.
# `height` replaces the original `length`, which is not a Layout property.
z_container_layout = Layout(border='0px solid  grey', width='80px', height='80px', margin='0px 0px 0px 0px')
a_container_layout = Layout(border='0px solid red')
b_container_layout = Layout(border='0px solid green', justify_content='flex-start')
c_container_layout = Layout(border='0px solid black', width='30%', flex_direction='column', justify_content='space-around')
no_boxes_per_line = 10
# Step through character_images one row at a time, so only rows that actually
# contain images are created (a fixed row count produced empty trailing rows).
fig1 = VBox(children=[HBox(children=[VBox(children=[Image_widget(value=image, layout=z_container_layout)], layout=a_container_layout)
                     for image in character_images[start:start+no_boxes_per_line]], layout=b_container_layout)
             for start in range(0, len(character_images), no_boxes_per_line)], layout=c_container_layout)

In [None]:
# Display the narrow grid built in the previous cell.
fig1

In [None]:
# Open the source JPEG; later cells inspect its size and crop a region from it.
# The original also PNG-encoded the whole image into a buffer that nothing
# read afterwards, so that redundant encode has been dropped.
im = pi.open('images/200014735/image/200014735_00014.jpg')

In [None]:
# Check the image dimensions before choosing the crop box used below.
im.size

In [None]:
# Cut the region of interest out of the source image and keep it as PNG bytes
# so that it can be handed to an Image widget.
box = (1000, 820, 5300, 2950)
region = im.crop(box)
region_buffer = BytesIO()
region.save(region_buffer, format='PNG')
imgByteArr = region_buffer.getvalue()

In [None]:
# Show the character grid beside the cropped region of the source image.
crop_panel = VBox(children=[Image_widget(value=imgByteArr)], layout=Layout(width='60%'))
HBox(children=[fig1, crop_panel], layout=Layout(border='0px solid black', justify_content='space-around'))

In [None]:
# Reference display: the same ten characters as the `hiragana` list, where the
# position in the list is the numeric label.
['お', 'き', 'す', 'つ', 'な', 'は', 'ま', 'や', 'れ', 'を']

In [None]:
# Location of the mnist_sample data under this machine's .fastai data directory
# (absolute, machine-specific path).
DATA_PATH = Path('/Users/ray/.fastai/data/mnist_sample/')

In [None]:
# List the contents of the sample dataset directory.
!ls {DATA_PATH}

In [None]:
# Peek at the first lines of the sample's labels.csv.
PATH = DATA_PATH / 'labels.csv'
!head {PATH}

In [None]:
# Show the full path to labels.csv.
PATH

In [None]:
# NOTE(review): repeats the previous cell's output; likely a leftover that can be deleted.
PATH

In [None]:
# List the raw KMNIST files again before the export step below.
!ls /Users/ray/data/kmnist/

In [None]:
def read_idx_labels(path):
    """Read the labels from an IDX1 file.

    The file starts with two big-endian unsigned 32-bit integers (magic number
    and item count); every remaining byte is read as one uint8 label.

    Args:
        path: location of the label file.

    Returns:
        1-D uint8 array of labels.

    Raises:
        ValueError: if the number of labels read differs from the header count.
    """
    with open(path, 'rb') as f:
        _magic, size = struct.unpack(">II", f.read(8))
        labels = np.fromfile(f, dtype=np.uint8)
    if len(labels) != size:
        raise ValueError('%s: header promises %d labels, found %d' % (path, size, len(labels)))
    return labels


def read_idx_images(path):
    """Read the images from an IDX3 file.

    The file starts with four big-endian unsigned 32-bit integers (magic
    number, item count, rows, cols); the remaining bytes are the pixels.

    Args:
        path: location of the image file.

    Returns:
        uint8 array of shape (item count, rows, cols).

    Raises:
        ValueError: from the reshape, if the pixel count does not match the header.
    """
    with open(path, 'rb') as f:
        _magic, size, nrows, ncols = struct.unpack(">IIII", f.read(16))
        pixels = np.fromfile(f, dtype=np.uint8)
    return pixels.reshape((size, nrows, ncols))


def export_split(images_path, labels_path, out_dir):
    """Write every image of one split as '<out_dir><n>.png'.

    Args:
        images_path: IDX3 image file of the split.
        labels_path: IDX1 label file of the split.
        out_dir: output directory prefix, including the trailing slash
            (for example 'kmnist/valid/'). It is created if missing.

    Returns:
        List of [outfile, label] records, one per exported image.
    """
    # Without this, saving fails on a fresh checkout where the folder does not exist yet.
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    records = []
    frames_and_labels = zip(read_idx_images(images_path), read_idx_labels(labels_path))
    for n, (frame, label) in enumerate(frames_and_labels):
        outfile = out_dir + str(n) + '.png'
        pi.fromarray(frame).save(outfile, format='png')
        records.append([outfile, label])
    return records


# One-off export of the KMNIST IDX files to PNGs plus a labels.csv. It is
# disabled by default; flip the guard to True to regenerate the files.
# The helpers keep their arrays local, so running the export no longer
# rebinds the notebook-level `data` and `labels`.
if False:
    metadata = export_split('/Users/ray/data/kmnist/t10k-images-idx3-ubyte',
                            '/Users/ray/data/kmnist/t10k-labels-idx1-ubyte',
                            'kmnist/valid/')

    # Now Training data.
    metadata += export_split('/Users/ray/data/kmnist/train-images-idx3-ubyte',
                             '/Users/ray/data/kmnist/train-labels-idx1-ubyte',
                             'kmnist/train/')

    metadata_df = pd.DataFrame(metadata, columns=['name', 'label'])
    # Names in the CSV are written relative to the kmnist/ folder; the prefix
    # is matched as a literal string, not as a regular expression.
    metadata_df['name'] = metadata_df.name.str.replace('kmnist/', '', regex=False)
    metadata_df.to_csv('kmnist/labels.csv', index=False)

In [None]:
if fastai_imported:
    # Build the data bunch from the exported kmnist folder, preview a batch,
    # and create a ResNet-50 learner that reports accuracy.
    # Note that `data` is rebound here: from this cell on it is the data
    # bunch, no longer the raw pixel array loaded earlier.
    kmnist_dir = '/Users/ray/scratch/japanese_text_analysis/notebooks/kmnist/'
    data = ImageDataBunch.from_csv(kmnist_dir)
    data.show_batch(rows=3, figsize=(5, 5))
    learn = cnn_learner(data, models.resnet50, metrics=accuracy)

In [None]:
if fastai_imported:
    # Reuse the saved weights when they exist; otherwise train and save them.
    # The original tested the truthiness of learn.load(...). In fastai v1,
    # load returns the learner on success and raises when the weights file is
    # missing, so that test could never reach the training fallback.
    try:
        learn.load('kmnist-stage-1-50')
    except FileNotFoundError:
        print('Could not load model, training instead.')
        learn.fit(4)
        learn.save('kmnist-stage-1-50')

In [None]:
if fastai_imported:
    # Collect the per-item losses of the learner for the plots that follow.
    interp = ClassificationInterpretation.from_learner(learn)
    losses, idxs = interp.top_losses()
    # The original wrote this comparison as a bare expression inside the `if`,
    # where it is neither displayed nor checked; assert it so that a mismatch
    # is actually reported.
    assert len(data.valid_ds) == len(losses) == len(idxs)

In [None]:
if fastai_imported:
    # Plot the nine entries with the highest loss.
    interp.plot_top_losses(9, figsize=(15, 11))

In [None]:
if fastai_imported:
    # Plot the confusion matrix for the learner's predictions.
    interp.plot_confusion_matrix(figsize=(12, 12), dpi=60)