In [None]:
#https://www.kaggle.com/huanghaicheng1024/preprocessing-of-data
from PIL import Image, ImageEnhance
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import os
import pandas as pd
import re

# Seed NumPy's global RNG so the random contrast, rotation and shift
# augmentations (and the final shuffle) are reproducible on a fresh run.
np.random.seed(1024)


In [None]:
# List every entry (one per character) found in the train and test roots.
train_path = "/kaggle/input/handwritten-chinese-character-hanzi-datasets/CASIA-HWDB_Train/Train"  # training root
test_path = "/kaggle/input/handwritten-chinese-character-hanzi-datasets/CASIA-HWDB_Test/Test"  # test root
train_chinese = os.listdir(train_path)
test_chinese = os.listdir(test_path)
# Only the last expression of a cell is displayed, so show both counts together.
len(train_chinese), len(test_chinese)

In [None]:
# Characters to include in the dataset; each character is also used as the
# sub-folder name when the image files are listed below.
chinese = "概率生成及采样方法的一些探讨研究"
# chinese = "概"  # commented-out single-character alternative

In [None]:
# Training set: one (character, filename) row per image file.
# Built in a single pass because DataFrame.append was removed in pandas 2.0
# and re-copies the whole frame on every call.
train_records = [
  (char, name)
  for char in chinese
  for name in os.listdir(train_path + '/' + char)
]
df_train = pd.DataFrame(train_records, columns=["chinese", "filename"])

df_train.shape

In [None]:
# Prefix each file name with its character folder ("<char>/<file>").
# Any existing folder prefix is stripped first, so re-running this cell does
# not keep stacking prefixes onto the same column.
df_train["filename"] = (
    df_train["chinese"] + "/" + df_train["filename"].str.rsplit("/", n=1).str[-1]
)
df_train.head()

In [None]:
# Helper functions meant to be applied to every image with map().

# Image loading / preprocessing
def img_map(image):
    """Convert an image to 64x64 grayscale and boost its contrast.

    Args:
        image: a PIL image (anything offering ``convert`` and ``resize``).

    Returns:
        A new PIL image in mode "L", 64x64 pixels, whose contrast was
        enhanced by a factor drawn uniformly from [2, 3) using NumPy's
        global RNG.
    """
    # Convert to grayscale.
    image = image.convert('L')
    # Resize to 64x64. Image.ANTIALIAS was removed in Pillow 10; LANCZOS is
    # the same filter under its current name.
    image = image.resize((64, 64), Image.LANCZOS)
    # Enhance contrast. The factor is drawn as a plain float: handing PIL a
    # 1-element ndarray relies on NumPy's deprecated array-to-scalar conversion.
    factor = float(np.random.uniform(2, 3))
    image = ImageEnhance.Contrast(image).enhance(factor)

    return image

# Turn an image into numeric pixel data
def to_array(image):
    """Return the content of ``image`` as a newly created NumPy array."""
    pixels = np.array(image)
    return pixels

# Image rotation
# Counter-clockwise by a random angle r
def img_rotate_CCW(image):
  """Rotate ``image`` by a random angle in [1, 3) degrees, filling with white.

  The angle is passed to ``image.rotate`` unchanged (Pillow treats a positive
  angle as counter-clockwise) and is drawn from NumPy's global RNG.

  Returns:
    Whatever ``image.rotate`` returns (a new image for PIL images).
  """
  # Draw a scalar float: a 1-element ndarray only works through NumPy's
  # deprecated implicit array-to-scalar conversion.
  r = float(np.random.uniform(1, 3))
  return image.rotate(r, fillcolor = "white")

def img_rotate_CW(image):
  """Rotate ``image`` by ``360 - r`` degrees, r random in [1, 3), filling with white.

  Passing ``360 - r`` to ``image.rotate`` is a clockwise turn of r degrees
  under Pillow's counter-clockwise angle convention. r is drawn from NumPy's
  global RNG.

  Returns:
    Whatever ``image.rotate`` returns (a new image for PIL images).
  """
  # Draw a scalar float: a 1-element ndarray only works through NumPy's
  # deprecated implicit array-to-scalar conversion.
  r = float(np.random.uniform(1, 3))
  return image.rotate(360 - r, fillcolor = "white")

In [None]:
# Draws one value from {1, 2, 3} and displays it; the same expression is used
# inside img_trans to pick the shift size.
# NOTE(review): the result is not stored anywhere, so this looks like a scratch
# check - but it does advance NumPy's global RNG state for the cells that
# follow. Confirm before removing.
np.random.choice([1,2,3],1)[0]

In [None]:
# Add the full image path to the table as a "path" column, built as
# IMG_PATH + "input_<suite_id>_<sample_id>_<code>.jpg".
# NOTE(review): neither data_df nor IMG_PATH is defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel unless they come
# from elsewhere. It looks unrelated to the rest of this pipeline - confirm
# whether it should be removed.
# The id columns are cast to str so they can be concatenated into the path.
data_df['suite_id'] = data_df['suite_id'].astype(str)
data_df['sample_id'] = data_df['sample_id'].astype(str)
data_df['code'] = data_df['code'].astype(str)
data_df["path"] = IMG_PATH+"input_"+data_df["suite_id"]+"_"+data_df["sample_id"]+"_"+data_df["code"]+".jpg"
data_df["path"].head()

In [None]:
# Image translation, used to enlarge the dataset
def img_trans(imgs, direction, px=None):
  """Shift a stack of images by a few pixels, padding the gap with 255.

  Args:
    imgs: 3-D array; "up"/"down" shift along axis 1 and "left"/"right" along
      axis 2 (axis 0 is left untouched).
    direction: one of "left", "right", "up", "down".
    px: shift in pixels. When None (the default) a value is drawn from
      {1, 2, 3} with NumPy's global RNG, as before this parameter existed.
      A shift of at least the image size yields an all-255 result.

  Returns:
    A new float64 array with the same shape as ``imgs``.

  Raises:
    ValueError: if ``direction`` is not one of the four supported values, or
      ``px`` is negative.
  """
  if direction not in ("left", "right", "up", "down"):
    # Previously an unknown direction silently produced an all-255 array.
    raise ValueError("direction must be 'left', 'right', 'up' or 'down', got %r" % (direction,))
  if px is None:
    px = np.random.choice([1, 2, 3], 1)[0]
  if px < 0:
    raise ValueError("px must be non-negative, got %r" % (px,))

  s = imgs.shape
  img1 = np.zeros(shape = s) + 255
  # Number of rows / columns that survive the shift; clamped at 0 so that an
  # oversized shift does not turn into a negative (wrap-around) slice bound.
  keep_rows = max(s[1] - px, 0)
  keep_cols = max(s[2] - px, 0)
  if direction == "left":
    img1[:, :, :keep_cols] = imgs[:, :, px:]
  elif direction == "right":
    img1[:, :, px:] = imgs[:, :, :keep_cols]
  elif direction == "up":
    img1[:, :keep_rows, :] = imgs[:, px:, :]
  else:  # "down"
    img1[:, px:, :] = imgs[:, :keep_rows, :]
  return img1

In [None]:
# Open each training image and preprocess it right away (grayscale, 64x64,
# contrast boost); iterating the Series yields the full file paths.
train_img = [
    img_map(Image.open(img_file))
    for img_file in train_path + "/" + df_train["filename"]
]

# Rotated copies, converted to arrays and stacked. The clockwise set is built
# completely before the counter-clockwise one, each with its own random angle
# per image (see img_rotate_CW / img_rotate_CCW).
train_images_CW = np.array([to_array(img_rotate_CW(img)) for img in train_img])
train_images_CCW = np.array([to_array(img_rotate_CCW(img)) for img in train_img])

# The unrotated images as one stacked array.
train_images = np.array([to_array(img) for img in train_img])

print(train_images.shape, train_images_CW.shape, train_images_CCW.shape)

In [None]:
# One shifted copy of the training stack per direction, generated in the order
# left, right, up, down (each call draws its own random shift size).
train_images_left, train_images_right, train_images_up, train_images_down = (
    img_trans(train_images, side) for side in ("left", "right", "up", "down")
)

In [None]:
# Preview one randomly chosen training sample next to each augmented variant.
# The index is drawn from range(1, N), so sample 0 is never picked.
i = np.random.choice(list(range(1, train_images.shape[0])))

# (panel title, image stack) pairs, in display order.
preview_panels = [
    ("original", train_images),
    ("left", train_images_left),
    ("right", train_images_right),
    ("up", train_images_up),
    ("down", train_images_down),
    ("CW", train_images_CW),
    ("CCW", train_images_CCW),
]
fig, axes = plt.subplots(1, len(preview_panels))
for ax, (label, stack) in zip(axes, preview_panels):
    ax.imshow(stack[i, :, :], cmap = 'gray')
    ax.set_title(label, fontsize = 8)  # label each panel so the figure stands alone
    ax.set_xticks([])
    ax.set_yticks([])

In [None]:
# Stack the original images and all augmented variants into one array.
# NOTE: train_images is overwritten here, so re-running this cell without
# re-running the loading cell would stack the already augmented array again.
train_variants = (train_images, train_images_left,
                  train_images_right, train_images_up,
                  train_images_down, train_images_CW,
                  train_images_CCW)
train_images = np.concatenate(train_variants)
x = np.array(df_train["chinese"])
# Repeat the labels once per variant; deriving the count from the tuple keeps
# labels and images aligned if a variant is added or removed.
y_train = np.tile(x, len(train_variants))
print(train_images.shape, y_train.shape)

In [None]:
# Test set: one (character, filename) row per image file.
# Built in a single pass because DataFrame.append was removed in pandas 2.0
# and re-copies the whole frame on every call.
test_records = [
  (char, name)
  for char in chinese
  for name in os.listdir(test_path + '/' + char)
]
df_test = pd.DataFrame(test_records, columns=["chinese", "filename"])

df_test.shape

In [None]:
# Prefix each file name with its character folder ("<char>/<file>").
# Any existing folder prefix is stripped first, so re-running this cell does
# not keep stacking prefixes onto the same column.
df_test["filename"] = (
    df_test["chinese"] + "/" + df_test["filename"].str.rsplit("/", n=1).str[-1]
)
df_test.head()

In [None]:
# Open each test image and preprocess it right away (grayscale, 64x64,
# contrast boost); iterating the Series yields the full file paths.
test_img = [
    img_map(Image.open(img_file))
    for img_file in test_path + "/" + df_test["filename"]
]

# Rotated copies, converted to arrays and stacked. The clockwise set is built
# completely before the counter-clockwise one, each with its own random angle
# per image (see img_rotate_CW / img_rotate_CCW).
test_images_CW = np.array([to_array(img_rotate_CW(img)) for img in test_img])
test_images_CCW = np.array([to_array(img_rotate_CCW(img)) for img in test_img])

# The unrotated images as one stacked array.
test_images = np.array([to_array(img) for img in test_img])

print(test_images.shape, test_images_CW.shape, test_images_CCW.shape)

In [None]:
# One shifted copy of the test stack per direction, generated in the order
# left, right, up, down (each call draws its own random shift size).
test_images_left, test_images_right, test_images_up, test_images_down = (
    img_trans(test_images, side) for side in ("left", "right", "up", "down")
)

In [None]:
# Stack the original images and all augmented variants into one array.
# NOTE: test_images is overwritten here, so re-running this cell without
# re-running the loading cell would stack the already augmented array again.
test_variants = (test_images, test_images_left,
                 test_images_right, test_images_up,
                 test_images_down, test_images_CW,
                 test_images_CCW)
test_images = np.concatenate(test_variants)
x = np.array(df_test["chinese"])
# Repeat the labels once per variant; deriving the count from the tuple keeps
# labels and images aligned if a variant is added or removed.
y_test = np.tile(x, len(test_variants))
print(test_images.shape, y_test.shape)

In [None]:
# Shuffle both datasets, applying the same permutation to images and labels.

# The second shuffle of each index list adds nothing statistically; it is kept
# so the permutation produced under the fixed seed stays what it was.
train_idx = list(range(0, len(y_train)))
np.random.shuffle(train_idx)
np.random.shuffle(train_idx)
x_train = train_images[train_idx]
y_train = y_train[train_idx]

test_idx = list(range(0, len(y_test)))
np.random.shuffle(test_idx)
np.random.shuffle(test_idx)
x_test = test_images[test_idx]
# Index the *test* labels here: the previous code read y_train[test_idx],
# which paired the test images with training-set labels.
y_test = y_test[test_idx]

In [None]:
# Save the four shuffled arrays into a single archive.
# np.savez appends ".npz" when the name does not already end with it, so the
# file on disk is "handwritten-chinese-character.np.npz".
shuffled_arrays = {
    "x_train": x_train,
    "y_train": y_train,
    "x_test": x_test,
    "y_test": y_test,
}
np.savez("/kaggle/working/handwritten-chinese-character.np", **shuffled_arrays)

In [None]:
# Show the image stored at index 10 of the stacked training array.
sample_image = train_images[10, :, :]
plt.imshow(sample_image, cmap = 'gray')