# 顔文字生成器（サンプル）
単純ニューラルネットワークだけのシンプル構成

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd

## 前処理

### データ読み込み

In [4]:
# 特殊文字
sp = {'pad': '<PAD>',
      # 'bos': '<BOS>',
      # 'eos': '<EOS>',
      'unk': '<UNK>'}
# pad : padding. 文字列長を一定にするために使う
# bos : begin of sequence. 文頭文字．Decoderの最初の入力
# eos : end of sequence. 文末文字．
# unk : unknown. 出現数が低いものに割り当てる

In [5]:
sp.values()

dict_values(['<PAD>', '<UNK>'])

In [6]:
KAOMOJI_MAX = 10    # 顔文字最大長

kmj_list = []   # 顔文字リスト
len_list = []       # <BOS> から <EOS> までの文字数のリスト
char_list = []      # 顔文字に使用されている文字のリスト

char_list += list(sp.values())
file_name = 'kaomoji_MAX=' + str(KAOMOJI_MAX) + '.txt'

with open(file_name, mode='r') as file:
  for line in file:
    # temp = [sp['bos']]
    temp = list(line.replace('\n', ''))
    # temp += [sp['eos']]
    len_list.append(len(temp))
    temp += [sp['pad'] for _ in range(KAOMOJI_MAX - len(temp))]
    kmj_list.append(temp)
    char_list += temp

# 重複を消す
char_list = sorted(set(char_list), key=char_list.index)

In [7]:
print(kmj_list[0])

['(', '(', '(', '\u3000', '̄', 'ー', '\u3000', '̄', '\u3000', ')']


In [8]:
print('Number of kaomoji  :', len(kmj_list))
print('Number of character:', len(char_list))

Number of kaomoji  : 5457
Number of character: 719


### 出現数が少ないものを置換

In [9]:
# 最小出現数
MIN_APPEAR = 20

kmj_list = np.array(kmj_list)

cnt = 0
for c in char_list:
  mask = (kmj_list == c)
  if np.sum(mask) < MIN_APPEAR:
    kmj_list[mask] = sp['unk']

char_list = list(sp.values()) + kmj_list.flatten().tolist()
char_list = sorted(set(char_list), key=char_list.index)

In [10]:
print('Number of character:', len(char_list))

Number of character: 137


### 添字検索
顔文字に使われる文字が文字リストの何番目にあるか調べる

In [11]:
kmj_index = []    # 添字リスト

for kmj in kmj_list.tolist():
  temp = [char_list.index(c) for c in kmj]
  kmj_index.append(temp)

In [12]:
print(len_list[0])
kmj_index[0]

10


[2, 2, 2, 3, 4, 5, 3, 4, 3, 6]

### One-hotベクトル化



In [13]:
kmj_num = len(kmj_index)        # 顔文字の総数
kmj_size = len(kmj_index[0])    # 1つの顔文字の長さ
char_num = len(char_list)       # 文字の種類数

# One-hotベクトルリスト
kmj_onehot = np.zeros((kmj_num, kmj_size, char_num))

for i, index in enumerate(kmj_index):
  mask = range(char_num) == np.array(index).reshape((kmj_size, 1))
  kmj_onehot[i][mask] = 1

In [14]:
kmj_onehot.shape

(5457, 10, 137)

### 訓練・検証・テスト用に分ける

In [15]:
dataset = torch.utils.data.TensorDataset(
  torch.tensor(kmj_onehot.astype('float32')),
  torch.tensor(len_list)
)

In [16]:
train_size = int(len(dataset) * 0.85)
valid_size = int(len(dataset) * 0.10)
test_size  = len(dataset) - train_size - valid_size

# indices = np.arange(len(dataset))

# dataset_train = torch.utils.data.Subset(dataset, indices[:train_size])
# dataset_valid = torch.utils.data.Subset(dataset, indices[train_size:train_size+valid_size])
# dataset_test  = torch.utils.data.Subset(dataset, indices[train_size+valid_size:])

split = [train_size, valid_size, test_size]

dataset_train, dataset_valid, dataset_test = torch.utils.data.random_split(dataset, split)

In [17]:
dataset_train[0]

(tensor([[0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.]]),
 tensor(9))

In [18]:
batch_size = 32

dataloader_train = torch.utils.data.DataLoader(
  dataset_train,
  batch_size=batch_size,
  shuffle=True
)

dataloader_valid = torch.utils.data.DataLoader(
  dataset_valid,
  batch_size=batch_size,
  shuffle=True
)

In [19]:
for x, len_seq in dataloader_train:
  print(x.shape, len_seq.shape)
  break

torch.Size([32, 10, 137]) torch.Size([32])


## モデル

### Encoder

In [103]:
class Encoder(nn.Module):
  def __init__(self, N, in_dim, emb_dim, hid_dim):
    super().__init__()
    self.emb = nn.Linear(in_dim, emb_dim, bias=False)
    self.pos_emb = torch.nn.Parameter(torch.randn(1, N, emb_dim))
    self.flat = nn.Flatten()
    self.linear = nn.Linear(N*emb_dim, hid_dim)

  def forward(self, x):
    x = self.emb(x) + self.pos_emb
    x = self.flat(x)
    return self.linear(x)

In [104]:
encoder = Encoder(20, 269, 16, 64)
x = torch.rand(32, 20, 269)
encoder(x).shape

torch.Size([32, 64])

### Decoder

In [105]:
class Decoder(nn.Module):
  def __init__(self, in_dim, out_dim):
    super().__init__()
    self.linear = nn.Linear(in_dim, out_dim)

  def forward(self, x):
    return self.linear(x)

In [106]:
decoder = Decoder(64, 20*269)
x = torch.rand(32, 64)
decoder(x).shape

torch.Size([32, 5380])

### Generator

In [107]:
class Generator(nn.Module):
  def __init__(self, N, in_dim, emb_dim, hid_dim):
    super().__init__()
    self.N = N
    self.in_dim = in_dim
    self.encoder = Encoder(N, in_dim, emb_dim, hid_dim)
    self.decoder = Decoder(hid_dim, N*in_dim)

  def forward(self, x):
    z = self.encoder(x)
    y = self.decoder(z)
    return y.view(-1, self.N, self.in_dim)

In [108]:
generator = Generator(20, 269, 16, 64)
x = torch.rand(32, 20, 269)
generator(x).shape

torch.Size([32, 20, 269])

## 学習

In [77]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [148]:
n_epochs = 5

N = KAOMOJI_MAX
in_dim = len(char_list)
emb_dim = 16
hid_dim = 64

net = Generator(N, in_dim, emb_dim, hid_dim)

optimizer = optim.Adam(net.parameters())    # 最適化
criterion = nn.CrossEntropyLoss()           # 損失関数

In [149]:
for epoch in range(n_epochs):
  losses_train = []
  losses_valid = []

  # 訓練
  net.train()
  for x, len_seq in dataloader_train:
    net.zero_grad()  # 勾配の初期化

    #x.to(device)
    #t.to(device)

    y = net(x)

    loss = criterion(y, x)
    loss.backward()  # 誤差の逆伝播
    losses_train.append(loss.tolist())

    optimizer.step()  # パラメータの更新

  # 検証
  net.eval()
  for x, len_seq in dataloader_valid:
    y = net(x)

    loss = criterion(y, x)
    losses_valid.append(loss.tolist())

  if (epoch+1) % 1 == 0:
    print('EPOCH: {:>2}, Train Loss: {:>4.5f}, Valid Loss: {:>4.5f}'.format(
        epoch,
        np.mean(losses_train),
        np.mean(losses_valid),
    ))

EPOCH:  0, Train Loss: 0.12941, Valid Loss: 0.12502
EPOCH:  1, Train Loss: 0.10755, Valid Loss: 0.09440
EPOCH:  2, Train Loss: 0.08265, Valid Loss: 0.07924
EPOCH:  3, Train Loss: 0.07013, Valid Loss: 0.06876
EPOCH:  4, Train Loss: 0.06252, Valid Loss: 0.06537


## 評価

In [150]:
def convert_str(x):
  x = np.array(char_list)[x.argmax(dim=1)]
  x = [c for c in x if c not in sp.values()]

  return ''.join(x)

def generate(net, base=None, rate=3.0):
  if base is None:
    z = 2*rate * torch.randn(1, hid_dim) - rate
  else:
    z = net.encoder(base.unsqueeze(0))
    eps = 2*rate * torch.rand(1, hid_dim) - rate
    z = z + eps

  y = net.decoder(z)
  gen = convert_str(y.view(KAOMOJI_MAX, len(char_list)))

  return ''.join(gen)

### 次元圧縮テスト
目標はbaseとgenerateで全く同じもの．
次元圧縮して，潜在ベクトルが獲得できているかどうかのテスト．

In [153]:
for i in np.random.randint(0, len(dataset_test), size=10):
  test = dataset_test[i][0]
  print('base     :', convert_str(test))
  print('generate :', generate(net, base=test, rate=0.0))

base     : (*'-')ゞ♪
generate : (*^ω^)ノ!
base     : ∑(　́A`　)/
generate : ∑(　́∀`*)ノ
base     : (゚∀゚)ノ
generate : (≧ω≦)っ≡♪
base     : 。・∀・)ノ　゙
generate : [゚∀^)ノ≡♪
base     : (　　　)ノ!
generate : (≧ω≦)φ≡♪
base     : (;　　Д　;　)
generate : (*　́дω̄●)
base     : (・x・ノ)ノ⌒
generate : (^ω≦メ)ゞ!
base     : (((　́ψψ`)
generate : 【ヾ`　́;Ψぉ)
base     : (/。\)!
generate : (≧ω≦)ノ≡♪
base     : (`\(▼▼メ)
generate : (「φ0≧　̄)


### 生成テスト
潜在ベクトルに乱数を入力しその出力を見る．

In [156]:
for _ in range(10):
  print('generate :', generate(net, base=None, rate=1.0))

generate : [人▽́*\๑♪　
generate : ლ≧^)×」ぉ~」
generate : (*Д+メヽ!♪๑
generate : ̄Uゞ∇{@*┌¬¬
generate : (∀ゝェヘ)m┌̄
generate : 　∩゚ω◕oლo━
generate : :(△]　@оへ
generate : ლ(^ー`・━ゝ,|
generate : (　▽́)・*"⊃
generate : (^　T▽エ゚━彡)


### 類似度テスト
訓練データの中から潜在ベクトルの距離が近いものを見る．

In [146]:
def similar(net, base, num=10):
  z = net.encoder(base.unsqueeze(0))

  size = list(z.shape)
  size[0] = 0
  z_list = torch.empty(size, dtype=torch.float)
  z_list = net.encoder(dataset_train[:][0])

  diff = z_list - z
  norm = enumerate(torch.norm(diff, dim=1))
  norm = sorted(norm, key=lambda x: x[1])

  for i in range(num):
    min_index = norm[i][0]
    str = convert_str(dataset_train[min_index][0])
    print('{} : {}'.format(str, norm[i][1]))

In [147]:
i = np.random.randint(0, len(dataset_test))
test = dataset_test[i][0]
print(convert_str(test), ' : base')
similar(net, test, num=10)

(-_-;)  : base
(--;) : 3.238682985305786
(-ω-;) : 3.382957935333252
(-"-;) : 3.4923157691955566
(~_~;) : 3.7870543003082275
(Θ_Θ;) : 4.026918888092041
(._.;) : 4.112420558929443
(-ε-*) : 4.142392158508301
(-.-;) : 4.15769100189209
(-ω-#) : 4.209686279296875
(>∀<;) : 4.669211387634277


## 感想

このネットワークは，  
Encoder：Embedding+Flatten+Linear  
Decoder：Linear  
の構成

入力系列のそれぞれのEmbeddingを無理矢理，平坦化して計算させる．

活性化関数はない方が良い？

体感，LSTMよりも性能が良さげ

