Sorry, I can't speak or write English well.

I first write in Japanese, then append an English translation.

In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# Root directory of the competition data (train.csv, unicode_translation.csv and the
# image folders are all resolved against it), relative to this notebook.
input_path = Path("../input")
%matplotlib inline

# Load Data

In [None]:
# Loading approach follows:
# https://www.kaggle.com/anokas/kuzushiji-visualisation
train_image_path = input_path / "train_images"
test_image_path = input_path / "test_images"

df_train = pd.read_csv(input_path / "train.csv")
unicode_trans = pd.read_csv(input_path / "unicode_translation.csv")

# Lookup table built from the two values of every row of unicode_translation.csv:
# first value (the codepoint string) -> second value (the character).
unicode_map = dict(unicode_trans.values.tolist())

# train.csv contains NaN labels

こちらのdiscussionでも指摘されている通り、**train.csv**にnanデータが含まれている

- https://www.kaggle.com/c/kuzushiji-recognition/discussion/100748#latest-580727

In [None]:
# Number of missing values in each column of train.csv
df_train.isna().sum()

In [None]:
# Show the rows whose "labels" entry is NaN.
# The null mask is computed on the "labels" column only (not on the whole frame)
# and is used directly as a boolean indexer, so no `== True` comparison is needed.
nan_df = df_train[df_train["labels"].isna()]
nan_df.head()

# Check Unique Label
**train.csv**内のユニークなラベルを取り出します。取り出す際の処理は以下のカーネルを参考にしています。

- https://www.kaggle.com/wakamezake/kuzushiji-pytorch-data-preprocessing

In [None]:
# Collect every label occurrence in train.csv (duplicates are kept on purpose;
# uniqueness is obtained later via set() / value_counts()).
# Each "labels" entry is split on whitespace and every `length`-th token,
# starting with the first one, is taken as a label.
length = 5
labels = [
    token
    for label in df_train["labels"]
    if isinstance(label, str)  # missing entries are NaN (not strings) and are skipped
    for token in label.split()[::length]
]

**train.csv**のユニークなラベル数と**unicode_translation.csv**の行数を比べると、その差は**569**個ある

In [None]:
# Compare the number of distinct labels seen in train.csv with the number of
# rows in unicode_translation.csv, and print the absolute gap between the two.
n_train_unique = len(set(labels))
n_translation_rows = unicode_trans.shape[0]

print("Number of unique_label: {}".format(n_train_unique))
print("Number of unique_label(unicode_translation.csv): {}".format(n_translation_rows))
print("diff: {}".format(abs(n_train_unique - n_translation_rows)))

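差集合を使って**unicode_translation.csv**にのみ存在するラベルを取り出したが、569個と数が合わない。件数の差と差集合の大きさが一致するのは、**train.csv**のラベルがすべて**unicode_translation.csv**に含まれ、かつ**unicode_translation.csv**に重複がない場合だけなので、どちらかの前提が成り立っていないと考えられる。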

In [None]:
# Labels listed in unicode_translation.csv that never occur in train.csv.
train_labels = set(labels)
translation_labels = set(unicode_trans["Unicode"])
unicode_trans_only_labels = translation_labels - train_labels
print("Number of unicode_trans_only_label: {}".format(len(unicode_trans_only_labels)))

# Opposite direction: labels used in train.csv but missing from unicode_translation.csv.
# The plain difference of the two sizes equals the set difference above only when this
# set is empty (and the translation file has no duplicate codepoints), so a non-zero
# count here accounts for a mismatch between the two numbers.
train_only_labels = train_labels - translation_labels
print("Number of train_only_label: {}".format(len(train_only_labels)))

In [None]:
# Table of the labels that exist only in unicode_translation.csv, with their characters.
# The set is sorted before building the frame: set iteration order for strings can
# change between kernel sessions, which would make `.head()` show different rows
# on every "Restart & Run All".
unicode_trans_only_df = pd.DataFrame({"Unicode": sorted(unicode_trans_only_labels)})
unicode_trans_only_df["string"] = unicode_trans_only_df["Unicode"].map(unicode_map)
unicode_trans_only_df.head()

# Label Appearance Bar Plot
ラベルの出現頻度を可視化しました。全てのラベルの出現頻度を描画するととても時間がかかるため、Top10とBottom10のみ表示しています。また図中の表示に日本語の文字表示をすることができなかったため、図の下にUnicodeと文字の対応を載せています。

In [None]:
# One row per label occurrence; the per-label frequencies are derived from it.
df_labels = pd.DataFrame(labels, columns=["label"])
# Occurrence count of each label, most frequent first (value_counts default order).
label_count = df_labels["label"].value_counts()

In [None]:
# Disabled on purpose: the notebook text notes that drawing the frequency of every
# label takes a very long time, so only the top 10 / bottom 10 are plotted instead.
# The code is kept here for reference only.
# Reference
# https://www.kaggle.com/tejainece/seaborn-barplot-and-pandas-value-counts
# label_count  = df_labels["label"].value_counts()
# plt.figure(figsize=(16,10))
# sns.barplot(label_count.index, label_count.values, alpha=0.8)
# plt.title('all unicode counts')
# plt.ylabel('Number of count', fontsize=12)
# plt.xlabel('unicode', fontsize=12)
# plt.show()

In [None]:
# Bar chart of the 10 most frequent labels.
# Reference
# https://www.kaggle.com/tejainece/seaborn-barplot-and-pandas-value-counts
# label_count comes from value_counts(), i.e. sorted most-frequent-first, so the
# first 10 entries are the top 10. head() replaces the tuple-style `[:10,]` indexing.
top10 = label_count.head(10)
plt.figure(figsize=(10,5))
# x / y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=top10.index, y=top10.values, alpha=0.8)
plt.title('top 10 unicode counts')
plt.ylabel('Number of count', fontsize=12)
plt.xlabel('unicode', fontsize=12)
plt.show()

In [None]:
# Codepoint -> character table for the 10 most frequent labels
top10_codes = label_count.head(10).index.values
top10_chars = [unicode_map[code] for code in top10_codes]
pd.DataFrame({"unicode": top10_codes, "str": top10_chars})

In [None]:
# Bar chart of the 10 least frequent labels.
# label_count is sorted most-frequent-first, so the last 10 entries are the bottom 10.
# tail(10) is used because the former `[-11:-1,]` slice stopped one element early
# and therefore left out the least frequent label.
bottom10 = label_count.tail(10)
plt.figure(figsize=(10,5))
# x / y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=bottom10.index, y=bottom10.values, alpha=0.8)
plt.title('bottom 10 unicode counts')
plt.ylabel('Number of count', fontsize=12)
plt.xlabel('unicode', fontsize=12)
plt.show()

In [None]:
# Codepoint -> character table for the 10 least frequent labels.
# tail(10) takes the true last 10 entries (the former `[-11:-1,]` slice skipped the
# very last one).
bottom10_codes = label_count.tail(10).index.values
# .get() is used so that a label missing from unicode_translation.csv shows up as
# None in the table instead of aborting the cell with a KeyError.
d = {"unicode": bottom10_codes,
     "str": [unicode_map.get(code) for code in bottom10_codes]}
pd.DataFrame(d)

hiragana（ひらがな）の出現頻度が高いことがわかりました。では、全体の何割程度がhiraganaなのか調べてみます。

In [None]:
# Pattern for labels inside the Unicode Hiragana block.
# Reference
# https://ja.wikipedia.org/wiki/%E5%B9%B3%E4%BB%AE%E5%90%8D_(Unicode%E3%81%AE%E3%83%96%E3%83%AD%E3%83%83%E3%82%AF)
# > U+3040..U+309F
# The trailing `$` makes the pattern cover the whole label: without it, a longer
# codepoint such as "U+30421" would be accepted because only its prefix is checked.
hirakana = re.compile(r"U\+30[4-9][0-9a-fA-F]$")

In [None]:
# Total number of label occurrences (sum of all per-label counts)
label_count.sum()

In [None]:
# Share (in percent) of label occurrences that match / do not match the hiragana
# pattern. The original author observed that hiragana make up roughly 60% of the whole.
is_hiragana_occurrence = df_labels["label"].str.match(hirakana)
hirakana_counts = is_hiragana_occurrence.value_counts()
hirakana_counts / hirakana_counts.sum() * 100

hiraganaの中でもどのhiraganaの出現頻度が高いのか気になります。top10に関してはさきほどと同じ結果になりますので省略します。

In [None]:
# Flag every label occurrence that matches the hiragana pattern, then count
# occurrences per hiragana label.
# NOTE: this adds a column to df_labels and rebinds label_count; the cells that
# follow rely on both.
df_labels["is_hirakana"] = df_labels["label"].str.match(hirakana)
# The boolean column is used directly as the row mask (no `== True` comparison).
label_count = df_labels.loc[df_labels["is_hirakana"], "label"].value_counts()

# Bar chart of the 10 least frequent hiragana labels.
# tail(10) is used because the former `[-11:-1,]` slice stopped one element early
# and left out the least frequent label.
bottom10 = label_count.tail(10)
plt.figure(figsize=(10,5))
# x / y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=bottom10.index, y=bottom10.values, alpha=0.8)
plt.title('hiragana bottom 10 unicode counts')
plt.ylabel('Number of count', fontsize=12)
plt.xlabel('unicode', fontsize=12)
plt.show()

In [None]:
# Codepoint -> character table for the 10 least frequent entries of label_count.
# tail(10) takes the true last 10 entries (the former `[-11:-1,]` slice skipped the
# very last one).
bottom10_codes = label_count.tail(10).index.values
# .get() is used so that a label missing from unicode_translation.csv shows up as
# None in the table instead of aborting the cell with a KeyError.
d = {"unicode": bottom10_codes,
     "str": [unicode_map.get(code) for code in bottom10_codes]}
pd.DataFrame(d)

逆にhiragana以外の出現頻度はどうなっているか気になります。

In [None]:
# Per-label occurrence counts restricted to the rows not flagged as hiragana.
non_hiragana_rows = df_labels["is_hirakana"] == False
label_count = df_labels.loc[non_hiragana_rows, "label"].value_counts()

In [None]:
# Bar chart of the 10 most frequent entries of label_count, which at this point
# holds the non-hiragana labels only.
# head() replaces the tuple-style `[:10,]` indexing.
top10 = label_count.head(10)
plt.figure(figsize=(10,5))
# x / y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=top10.index, y=top10.values, alpha=0.8)
# The title names the subset so the figure is not confused with the all-label top 10.
plt.title('non-hiragana top 10 unicode counts')
plt.ylabel('Number of count', fontsize=12)
plt.xlabel('unicode', fontsize=12)
plt.show()

In [None]:
# Codepoint -> character table for the 10 most frequent entries of label_count.
# head() replaces the tuple-style `[:10,]` indexing.
top10_codes = label_count.head(10).index.values
# .get() is used so that a label missing from unicode_translation.csv shows up as
# None in the table instead of aborting the cell with a KeyError.
d = {"unicode": top10_codes,
     "str": [unicode_map.get(code) for code in top10_codes]}
pd.DataFrame(d)

In [None]:
# Bar chart of the 10 least frequent entries of label_count, which at this point
# holds the non-hiragana labels only.
# tail(10) is used because the former `[-11:-1,]` slice stopped one element early
# and left out the least frequent label.
bottom10 = label_count.tail(10)
plt.figure(figsize=(10,5))
# x / y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
sns.barplot(x=bottom10.index, y=bottom10.values, alpha=0.8)
# Title corrected: this figure shows non-hiragana labels, not hiragana ones.
plt.title('non-hiragana bottom 10 unicode counts')
plt.ylabel('Number of count', fontsize=12)
plt.xlabel('unicode', fontsize=12)
plt.show()

In [None]:
# Codepoint -> character table for the 10 least frequent entries of label_count.
# tail(10) takes the true last 10 entries (the former `[-11:-1,]` slice skipped the
# very last one).
bottom10_codes = label_count.tail(10).index.values
# .get() is used so that a label missing from unicode_translation.csv shows up as
# None in the table instead of aborting the cell with a KeyError.
d = {"unicode": bottom10_codes,
     "str": [unicode_map.get(code) for code in bottom10_codes]}
pd.DataFrame(d)