# jpgをnumpyに変更する

In [13]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import glob
import os,sys
from sklearn.preprocessing import LabelBinarizer

dic_katakana = {"a":0,"i":1,"u":2,"e":3,"o":4,"ka":5,"ki":6,"ku":7,"ke":8,"ko":9,"sa":10,"si":11,"su":12,"se":13,"so":14}
pixel = 28

## 元画像をnumpy形式に変換する

In [14]:
# trainデータのファイルパスを取得
li_fpath = glob.glob("./jpg/train/*/*.png")
li_fpath += glob.glob("./jpg/test/*/*.png")
print(len(li_fpath))
li_fpath[:10]

17500


['./jpg/train/se/se_605.png',
 './jpg/train/se/se_60.png',
 './jpg/train/se/se_375.png',
 './jpg/train/se/se_215.png',
 './jpg/train/se/se_995.png',
 './jpg/train/se/se_765.png',
 './jpg/train/se/se_770.png',
 './jpg/train/se/se_980.png',
 './jpg/train/se/se_200.png',
 './jpg/train/se/se_360.png']

In [15]:
# numpy形式に変換
num_image = len(li_fpath)
channel = 1 # グレースケール
data = np.empty((num_image, channel, pixel, pixel))
li_label = []
li_fname = []
li_num = []

for i, fpath in enumerate(li_fpath):
    
    # ファイル名
    fname = os.path.split(fpath)[1]
    li_fname.append(fname)

    # 番号
    li_num.append(int(fname.split("_")[1].split(".png")[0]))
    
    # カタカナ文字
    label_str = fname.split("_")[0]
    label_int = dic_katakana[label_str]
    li_label.append(label_int)
    
    img_ = Image.open(fpath)
    img_ = np.array(img_).astype(np.float32)
    data[i, 0, :] = img_
    
print("data.shape=", data.shape)
print()

# one hotベクトル化
lb = LabelBinarizer()
label =lb.fit_transform(li_label, ).astype('int32')
print(label)

# ファイル名
print(len(li_fname))

# 番号
print(len(li_num))

data.shape= (17500, 1, 28, 28)

[[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
17500
17500


## trainとtestにわける

In [16]:
import pandas as pd
df_fname = pd.DataFrame(li_fname, columns=["fname"])
df_fname["katakana"] = li_label
# df_fname['label'] = df_fname['katakana'].map(dic_katakana)
df_fname["num"] = li_num
df_fname["is_train"] = False
df_fname.reset_index(inplace=True)

for i in range(15):
    if i<5:
        """
        ア~オ
        """
        np.random.seed(1234)
        train_index = np.random.choice(np.arange(1,1501), size=200, replace=False)
    else:
        """
        カ~ソ
        """
        np.random.seed(1234)
        train_index = np.random.choice(np.arange(1,1001), size=200, replace=False)
        
    df_fname.loc[(df_fname["num"].isin(train_index))&(df_fname["katakana"]==i), "is_train"] = True

display(df_fname.head())

# 集計
df = df_fname.groupby(["katakana","is_train"])[["fname"]].count()
df.reset_index().pivot_table(index="katakana",values="fname", columns="is_train", margins=True, aggfunc="sum")


Unnamed: 0,index,fname,katakana,num,is_train
0,0,se_605.png,13,605,False
1,1,se_60.png,13,60,False
2,2,se_375.png,13,375,False
3,3,se_215.png,13,215,True
4,4,se_995.png,13,995,False


is_train,False,True,All
katakana,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1300,200,1500
1,1300,200,1500
2,1300,200,1500
3,1300,200,1500
4,1300,200,1500
5,800,200,1000
6,800,200,1000
7,800,200,1000
8,800,200,1000
9,800,200,1000


In [17]:
is_train = np.array(df_fname["is_train"])
train_data = data[is_train]
train_label = label[is_train]
test_data = data[np.logical_not(is_train)]
test_label = label[np.logical_not(is_train)]

## 分割したデータを出力

In [18]:
np.save("./train_data.npy", train_data)
np.save("./train_label.npy", train_label)
np.save("./test_data.npy", test_data)
np.save("./test_label.npy", test_label)
df_fname.to_csv("is_train.csv")