# くずし字
<img src="images/kuzushiji.png" alt="kuzushiji illustration" width="100%"/>

## Imports

In [None]:
from pathlib import Path
import requests
from zipfile import ZipFile

import struct
from pathlib import Path
import numpy as np
import pandas as pd
from io import BytesIO
from PIL import Image as pi
from ipywidgets import HBox, VBox, Layout, HTML
from ipywidgets import Image as Image_widget

# fastai is optional: without it the notebook still runs the download,
# extraction and hentaigana cells, and skips every training cell.
try:
    from fastai.vision import *
    from fastai.metrics import error_rate
except Exception:
    # NOTE(review): 'fastapi-cpu' is presumably meant to be 'fastai-cpu' -
    # confirm the kernel name before changing the message.
    print('Switch to fastapi-cpu kernel to train or use model.')
    fastai_imported = False
else:
    fastai_imported = True

## Kuzushiji-49

くずし字　[Kuzushiji-49 Dataset description](http://codh.rois.ac.jp/kmnist/)

[Resources for hentaigana](https://wakancambridge.files.wordpress.com/2017/05/useful-resources-for-the-study-of-hentaigana-with-recommended1.pdf)
<img src="images/hentaigana.png" alt="hentaigana chart" width="48%" align="left"/>

[Kuzushiji-49](https://github.com/rois-codh/kmnist) contains 270,912 images spanning 49 classes, and is an extension of the Kuzushiji-MNIST dataset.

| File            | Examples |  Download (NumPy format)      |
|-----------------|--------------------|----------------------------|
| Training images | 232,365            | [k49-train-imgs.npz](http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-imgs.npz) (63MB)   |
| Training labels | 232,365            | [k49-train-labels.npz](http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-labels.npz) (200KB)  |
| Testing images  | 38,547             | [k49-test-imgs.npz](http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-imgs.npz) (11MB)   |
| Testing labels  | 38,547             | [k49-test-labels.npz](http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-labels.npz) (50KB) |

Mapping from class indices to characters: [k49_classmap.csv](http://codh.rois.ac.jp/kmnist/dataset/k49/k49_classmap.csv) (1KB)

In [None]:
# Local cache directory for the Kuzushiji-49 files; later cells read from `path`.
path = Path("../data/raw") / "k49"

path.mkdir(parents=True, exist_ok=True)

URL = "http://codh.rois.ac.jp/kmnist/dataset/k49/"
filenames = ['k49-train-imgs.npz', 'k49-train-labels.npz',
             'k49-test-imgs.npz', 'k49-test-labels.npz',
             'k49_classmap.csv']

# Download only the files that are not cached yet, so re-running the cell is cheap.
for fn in filenames:
    target = path / fn
    if target.exists():
        continue
    response = requests.get(URL + fn)
    # Fail loudly on an HTTP error instead of caching an error page as a
    # dataset file (which the exists() check would then never re-download).
    response.raise_for_status()
    # write_bytes opens, writes and closes the file, so no handle is leaked.
    target.write_bytes(response.content)

In [None]:
def _load_first_array(npz_path):
    """Return the first array stored in the ``.npz`` archive at *npz_path*."""
    # The context manager closes the archive once the array has been read.
    with np.load(npz_path) as archive:
        return archive[archive.files[0]]


def _export_split(split, folder):
    """Write one split's images as PNG files and return its metadata rows.

    Reads ``k49-<split>-labels.npz`` and ``k49-<split>-imgs.npz`` from the
    module-level ``path`` directory and saves image ``n`` as
    ``path / folder / '<n>.png'``.

    Returns a list of ``[relative_name, label]`` rows, where ``relative_name``
    is ``'<folder>/<n>.png'``.
    """
    labels = _load_first_array(path / ('k49-' + split + '-labels.npz'))
    images = _load_first_array(path / ('k49-' + split + '-imgs.npz'))

    out_dir = path / folder
    out_dir.mkdir(parents=True, exist_ok=True)

    rows = []
    # zip() stops at the shorter of the two arrays, pairing image n with label n.
    for n, (image, label) in enumerate(zip(images, labels)):
        outfilename = str(n) + '.png'
        pi.fromarray(image).save(out_dir / outfilename, format='png')
        rows.append([folder + '/' + outfilename, label])
    return rows


def extract_k49():
    """Unpack the Kuzushiji-49 ``.npz`` archives into PNG files and a label index.

    The test archives are written to ``path / 'valid'`` and the train archives
    to ``path / 'train'``. Afterwards ``path / 'labels.csv'`` is written with
    the columns ``name`` (image path relative to ``path``) and ``label``,
    listing the ``valid/`` rows first and the ``train/`` rows second.
    """
    metadata = []
    metadata.extend(_export_split('test', 'valid'))
    metadata.extend(_export_split('train', 'train'))

    # Written last, so its presence means both splits were fully exported.
    metadata_df = pd.DataFrame(metadata, columns=['name', 'label'])
    metadata_df.to_csv(path / 'labels.csv', index=False)

In [None]:
# labels.csv is the last file extract_k49() writes, so it marks a completed
# extraction. Checking only for the 'train' directory would skip the repair of
# a run that was interrupted before the label index was written.
if not (path / 'labels.csv').exists():
    extract_k49()

In [None]:
# Build the fastai data bunch from the files under `path` and preview a batch;
# skipped entirely when fastai could not be imported.
if fastai_imported:
    data = ImageDataBunch.from_csv(path)
    data.show_batch(rows=3, figsize=(5, 5))

In [None]:
if fastai_imported:
    learn = cnn_learner(data, models.resnet50, metrics=accuracy)
    # Learner.load returns the learner itself and raises when the weights file
    # is missing, so a truthiness test on its result can never trigger the
    # fallback. Catch the missing-file error instead.
    try:
        learn.load('kmnist-stage-2-50')
    except FileNotFoundError:
        print('Could not load model, training instead.')
        learn.fit(4)
        # Saved as stage 1: this fallback trains only the initial fit, not
        # the fine-tuned stage-2 weights the load above looks for.
        learn.save('kmnist-stage-1-50')

In [None]:
# learn = cnn_learner(data, models.resnet50, metrics=accuracy)
# learn.fit(4, 3e-3)
# learn.save('kmnist-stage-1-50');
# learn.lr_find()
# learn.recorder.plot()
# learn.unfreeze()
# learn.fit_one_cycle(4, slice(3e-5, 3e-4))
# learn.save('kmnist-stage-2-50');

In [None]:
if fastai_imported:
    interp = ClassificationInterpretation.from_learner(learn)
    losses, idxs = interp.top_losses()
    # Sanity check: one loss and one index per validation item. Nested inside
    # the `if`, a bare expression is never displayed, so print the result.
    print(len(data.valid_ds) == len(losses) == len(idxs))

In [None]:
# Show the nine highest-loss items; needs `interp` from the cell above.
if fastai_imported:
    interp.plot_top_losses(9, figsize=(15, 11))

In [None]:
# Plot the confusion matrix; needs `interp` from the interpretation cell.
if fastai_imported:
    interp.plot_confusion_matrix(figsize=(12, 12), dpi=60)

## Hentaigana

In [None]:
# Parse every table with the "wikitable" class from the Hentaigana article
# and keep the last one.
wikitable_frames = pd.read_html(
    'https://en.wikipedia.org/wiki/Hentaigana',
    attrs={"class": "wikitable"},
)
df = wikitable_frames[-1]

In [None]:
# Display the scraped table as the cell output.
df

In [None]:
# Maps a modern hiragana to the list of source kanji entered for it so far;
# each string is split into its individual characters.
hentaigana = {
    'あ': list('安悪亜愛'),
    'お': list('於'),
    'か': list('加閑可我駕賀歌哥香家嘉歟謌佳'),
}

In [None]:
# Display the mapping as the cell output.
hentaigana

In [None]:
# From here on `path` points at the kkanji cache directory, not the k49 one.
path = Path("../data/raw") / "kkanji"
path.mkdir(parents=True, exist_ok=True)

kkanji_url = 'http://codh.rois.ac.jp/kmnist/dataset/kkanji/'
filename = 'kkanji.tar'

archive_path = path / filename
if not archive_path.exists():
    response = requests.get(kkanji_url + filename)
    # Fail loudly on an HTTP error instead of caching an error page as the
    # archive (which the exists() check would then never re-download).
    response.raise_for_status()
    # write_bytes opens, writes and closes the file, so no handle is leaked.
    archive_path.write_bytes(response.content)
    
# IPython shell escape (not plain Python): captures the output lines of
# `tar tvf`, i.e. a verbose listing of the archive, without unpacking it.
# NOTE(review): no visible cell extracts the archive, yet a later cell opens
# the listed members as files under `path` - confirm it is unpacked elsewhere.
pathnames = !tar tvf {path/filename}

In [None]:
# PNG-encoded bytes of every selected image, in the same order as `filenames`.
images = []
filenames = []

# Collect the archive members belonging to each kanji listed for 'か'.
for char in hentaigana['か']:
    # Listing lines are matched on the character's code point, formatted as
    # 'U+' plus at least four uppercase hex digits; computed once per character.
    codepoint_tag = 'U+%04X' % ord(char)
    # The member path is the last whitespace-separated field of a listing line.
    char_filenames = [p.split()[-1] for p in pathnames if codepoint_tag in p]
    # The first match is dropped - presumably the directory entry for the
    # character rather than an image; TODO confirm against the tar listing.
    char_filenames = char_filenames[1:]
    filenames.extend(char_filenames)

for fn in filenames:
    b = BytesIO()
    # The context manager releases the image's file handle after re-encoding.
    with pi.open(path / fn) as im:
        im.save(b, format='png')
    images.append(b.getvalue())

In [None]:
from math import ceil

# Gallery geometry: a fixed number of columns and as many rows as are needed
# to hold every image (the last row may be partially filled).
no_columns = 20
image_count = len(images)
no_rows = ceil(image_count / no_columns)

In [None]:
# Layouts from the innermost widget outwards: image, per-image box, row, gallery.
z_container_layout = Layout(border='0px solid  grey', margin='0px 0px 0px 0px')
a_container_layout = Layout(border='0px solid red')
b_container_layout = Layout(border='0px solid green', justify_content='flex-start')
c_container_layout = Layout(
    border='0px solid black',
    width='90%',
    flex_direction='column',
    justify_content='space-around',
)
no_boxes_per_line = 10


def _gallery_row(row_images):
    """Return an HBox holding one VBox-wrapped image widget per PNG byte string."""
    cells = [
        VBox(children=[Image_widget(value=image, layout=z_container_layout)],
             layout=a_container_layout)
        for image in row_images
    ]
    return HBox(children=cells, layout=b_container_layout)


# Stack the rows vertically; row m shows images [no_columns*m, no_columns*(m+1)).
fig1 = VBox(
    children=[_gallery_row(images[no_columns * m:no_columns * (m + 1)])
              for m in range(no_rows)],
    layout=c_container_layout,
)

In [None]:
# Render the image gallery as the cell output.
fig1