# 顔文字生成器（サンプル）
LSTMを用いた構成

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd

## 前処理

### データ読み込み

In [2]:
# 特殊文字
sp = {'pad': '<PAD>', 'bos': '<BOS>', 'eos': '<EOS>', 'unk': '<UNK>'}
# pad : padding. 文字列長を一定にするために使う
# bos : begin of sequence. 文頭文字．Decoderの最初の入力
# eos : end of sequence. 文末文字．
# unk : unknown. 出現数が低いものに割り当てる

In [18]:
KAOMOJI_MAX = 20    # 顔文字最大長

kmj_list = []   # 顔文字リスト
len_list = []       # <BOS> から <EOS> までの文字数のリスト
char_list = []      # 顔文字に使用されている文字のリスト

char_list += list(sp.values())
file_name = 'kaomoji_MAX=' + str(KAOMOJI_MAX) + '.txt'

with open(file_name, mode='r') as file:
  for line in file:
    temp = [sp['bos']]
    temp += list(line.replace('\n', ''))
    temp += [sp['eos']]
    len_list.append(len(temp))
    temp += [sp['pad'] for _ in range(KAOMOJI_MAX+2 - len(temp))]
    kmj_list.append(temp)
    char_list += temp

# 重複を消す
char_list = sorted(set(char_list), key=char_list.index)

In [19]:
print(kmj_list[0])

['<BOS>', '(', '✿', '\u3000', '́', '꒳', '`', ')', 'ノ', '°', '+', '.', '*', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [20]:
print('Number of kaomoji  :', len(kmj_list))
print('Number of character:', len(char_list))

Number of kaomoji  : 11587
Number of character: 1106


### 出現数が少ないものを置換

In [21]:
# 最小出現数
MIN_APPEAR = 20

kmj_list = np.array(kmj_list)

cnt = 0
for c in char_list:
  mask = (kmj_list == c)
  if np.sum(mask) < MIN_APPEAR:
    kmj_list[mask] = sp['unk']

char_list = list(sp.values()) + kmj_list.flatten().tolist()
char_list = sorted(set(char_list), key=char_list.index)

In [22]:
print('Number of character:', len(char_list))

Number of character: 271


### 添字検索
顔文字に使われる文字が文字リストの何番目にあるか調べる

In [23]:
kmj_index = []    # 添字リスト

for kmj in kmj_list.tolist():
  temp = [char_list.index(c) for c in kmj]
  kmj_index.append(temp)

In [24]:
print(len_list[0])
kmj_index[0]

14


[1, 4, 5, 6, 7, 3, 8, 9, 10, 11, 12, 13, 14, 2, 0, 0, 0, 0, 0, 0, 0, 0]

### One-hotベクトル化



In [25]:
kmj_num = len(kmj_index)        # 顔文字の総数
kmj_size = len(kmj_index[0])    # 1つの顔文字の長さ
char_num = len(char_list)       # 文字の種類数

# One-hotベクトルリスト
kmj_onehot = np.zeros((kmj_num, kmj_size, char_num))

for i, index in enumerate(kmj_index):
  mask = range(char_num) == np.array(index).reshape((kmj_size, 1))
  kmj_onehot[i][mask] = 1

In [26]:
kmj_onehot.shape

(11587, 22, 271)

### 訓練・検証・テスト用に分ける

In [27]:
dataset = []

for i in range(kmj_num):
  temp = torch.tensor(kmj_onehot[i].astype('float32')), \
         torch.tensor(len_list[i])
  dataset.append(temp)

In [28]:
dataset[0]

(tensor([[0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.]]),
 tensor(14))

In [29]:
train_num = int(len(dataset) * 0.85)
valid_num = int(len(dataset) * 0.10)

np.random.shuffle(dataset)
dataset_train = dataset[:train_num]
dataset_valid = dataset[train_num:train_num+valid_num]
dataset_test  = dataset[train_num+valid_num:]

In [30]:
dataset_train[0]

(tensor([[0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.]]),
 tensor(19))

## モデル

In [31]:
class Encoder(nn.Module):
  def __init__(self, in_dim, emb_dim, hid_dim):
    super().__init__()
    self.emb = nn.Linear(in_dim, emb_dim, bias=False)
    self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=1, batch_first=True)
    #self.linear = nn.Linear(hid_dim, in_dim)

  def forward(self, x, len_seq):
    x = self.emb(x)
    h, _ = self.lstm(x)
    #y = self.linear(h[-1])
    return h[len_seq - 1]

In [32]:
class Decoder(nn.Module):
  def __init__(self, in_dim, emb_dim, hid_dim):
    super().__init__()
    self.emb = nn.Linear(in_dim, emb_dim, bias=False)
    self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=1, batch_first=True)
    self.linear = nn.Linear(hid_dim, in_dim)

  def forward(self, x, len_seq, init_state):
    x = self.emb(x)
    h, _ = self.lstm(x, init_state)
    y = self.linear(h)
    return y

In [33]:
class Generator(nn.Module):
  def __init__(self, in_dim, emb_dim, hid_dim):
    super().__init__()
    self.encoder = Encoder(in_dim, emb_dim, hid_dim)
    self.decoder = Decoder(in_dim, emb_dim, hid_dim)

  def forward(self, x, len_seq):
    h = self.encoder(x[1:], len_seq - 1)
    h = torch.reshape(h, (1, len(h)))
    c = torch.zeros_like(h)
    y = self.decoder(x, len_seq, (h, c))
    return y

## 学習

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [35]:
n_epochs = 5

in_dim = len(char_list)
emb_dim  = 16
hid_dim  = 64

net = Generator(in_dim, emb_dim, hid_dim)
net.to(device)
optimizer = optim.Adam(net.parameters())    # 最適化
criterion = nn.CrossEntropyLoss()           # 損失関数

In [36]:
for epoch in range(n_epochs):
  losses_train = []
  losses_valid = []

  # 訓練
  net.train()
  for x, len_seq in dataset_train:
    net.zero_grad()  # 勾配の初期化

    x = x.to(device)
    len_seq = len_seq.to(device)

    y = net(x, len_seq)

    loss = criterion(y[:-1], x[1:])
    loss.backward()  # 誤差の逆伝播

    torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)

    losses_train.append(loss.tolist())

    optimizer.step()  # パラメータの更新

  # 検証
  net.eval()
  for x, len_seq in dataset_valid:
    x = x.to(device)
    len_seq = len_seq.to(device)

    y = net(x, len_seq)

    loss = criterion(y, x)
    losses_valid.append(loss.tolist())

  if (epoch+1) % 1 == 0:
    print('EPOCH: {:>2}, Train Loss: {:>4.5f}, Valid Loss: {:>4.5f}'.format(
        epoch,
        np.mean(losses_train),
        np.mean(losses_valid),
    ))

EPOCH:  0, Train Loss: 1.88233, Valid Loss: 3.21586
EPOCH:  1, Train Loss: 1.52374, Valid Loss: 3.81598
EPOCH:  2, Train Loss: 1.42176, Valid Loss: 4.04878
EPOCH:  3, Train Loss: 1.36893, Valid Loss: 4.10988
EPOCH:  4, Train Loss: 1.33054, Valid Loss: 4.18398


## 評価

In [37]:
def convert_str(x):
  x = np.array(char_list)[x.argmax(dim=1)]
  x = [c for c in x if c not in sp.values()]

  return ''.join(x)

def generate(base, net, rate=3.0):
  x, len_seq = base[0].to(device), base[1].to(device)
  eps = 2*rate * np.random.rand(hid_dim).astype('float32') - rate

  z = net.encoder(x[1:], len_seq - 1) + torch.tensor(eps).to(device)
  h = torch.reshape(z, (1, len(z)))
  c = torch.zeros_like(h)

  onehot = torch.zeros(kmj_size, char_num)
  bos_index = char_list.index(sp['bos'])
  onehot[0][bos_index] = 1

  for i in range(kmj_size-1):
    y = net.decoder(onehot, len_seq, (h, c))
    onehot[i+1][y[i].argmax()] = 1

  gen = convert_str(onehot.cpu())

  return ''.join(gen)

In [38]:
for i in np.random.randint(0, len(dataset_test), size=10):
  test = dataset_test[i]
  print('base     :', convert_str(test[0]))
  print('generate :', generate(test, net, rate=0))

base     : (o　́∀`)ノ。:..*.ぅ゚:..
generate : (　　̄(エ)　̄)　̄)))))))
base     : ﻿(。-ω-。)ノ　・゚゚・。
generate : !(　　̄(エ)　̄)))))
base     : ┐(-。-;)┌
generate : \(^ー^)/!
base     : ♪(*　　̄)o(◇　̄*)アーン
generate : !(　　̄　　̄)(　̄ー　̄)/
base     : (・。・)ノ
generate : (^-^)/
base     : ||||||~　゙(ノ><)ノ!
generate : ||||||||||||||||
base     : ヽ(*··)ノヽ(··*　)ノ!
generate : ヾ(　́・ω・)ノ(　́∀`)ノ　゙!
base     : (　^　^)/[]__(^　^　)!
generate : (　　̄(エ)　̄)))))))))
base     : (乂ω′)
generate : (^^)/
base     : ((◎―――――ー(°▽°)
generate : ヾ(　́・ω・)ノ　゙・・・


## 感想

このネットワークは，  
Encoder：Embedding＋LSTM  
Decoder：Embedding＋LSTM＋Linear

LSTMは，RNN系のネットワークで系列データを扱える．(言語情報学でもちょっとやったかも)

計算例  
入力：（＾－＾）

Encoder  
入力："（"　　　------>　h0  
入力："＾"，h0　------>　h1  
入力："ー"，h1　------>　h2  
入力："＾"，h2　------>　h3  
入力："）"，h3　------>　h4  
系列データを1つに集約する感じ？

Decoder  
入力："BOS", h4　------>　H0　------>　推論："（"  
入力："（"， H0　------>　H1　------>　推論："＾"  
入力："＾"， H1　------>　H2　------>　推論："ー"  
入力："ー"， H2　------>　H3　------>　推論："＾"  
入力："＾"， H3　------>　H4　------>　推論："（"  

このネットワークは，系列データを1つの潜在ベクトルにして，再構成する．
そのため，顔文字をちゃんと一つの塊として扱える．  
ただ，今のところ性能は良いとはいえない．  
違う顔文字を入力しても同じような出力になっている？