In [None]:
# Print the current directory, then switch to the notebook's working folder
# (presumably under a mounted Google Drive — confirm the mount exists before running).
!pwd
%cd /content/drive/My Drive/Colab Notebooks/NLP

#第9章: RNN, CNN, Transformer

##課題80. ID番号への変換
問題51で構築した学習データ中の単語にユニークなID番号を付与したい．学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与せよ．そして，与えられた単語列に対して，ID番号の列を返す関数を実装せよ．ただし，出現頻度が2回未満の単語のID番号はすべて0とせよ．

###解説
学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与する。

データを準備する手順。

１．記事データをpandasで読み込み、必要な部分だけ抜き取る。

２．記事Titleに含まれる単語をカウントする。

３．単語からIDを引く辞書を作る。

４．タイトルに含まれる単語から、ID列に変換する補助関数を作っておく。

５．TITLEをID列にして、CATEGORY文字を数字に変換したデータを準備しておく。

６．それをtrain,valid,testに分割する。

###解答例



In [None]:
# Carried over from exercises 50 and 51: download the News Aggregator dataset
# archive and unpack it into the current directory.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
!unzip NewsAggregatorDataset.zip
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
#
# Load the article data with pandas and keep only the parts we need.
#
df = pd.read_csv('./newsCorpora.csv', header=None, sep='\t', 
    names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
# Keep the five target publishers and the two columns used below. The explicit
# .copy() makes df an independent frame, so the TITLE assignments that follow do
# not write into a slice of the raw frame (avoids SettingWithCopyWarning).
df = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), 
            ['TITLE', 'CATEGORY']].copy()
# Normalise titles: lower-case, turn every character outside a-z into a space,
# then collapse runs of spaces.
df['TITLE'] = df['TITLE'].str.lower()
df['TITLE'] = df['TITLE'].str.replace('[^a-z]',' ', regex=True)
df['TITLE'] = df['TITLE'].str.replace(' +',' ', regex=True)
df.to_csv('./news.csv',sep='\t',index=False,header=False)
#
# Count the words that occur in the article titles.
# NOTE(review): the counts cover every filtered title, i.e. they are taken before
# the train/valid/test split performed later in this cell.
#
from collections import Counter  # only this counting step needs it
lex = Counter(w for title in df['TITLE'] for w in title.split())
# most_common() orders by descending count; words with equal counts keep
# first-seen order.
freq = lex.most_common()
for w in freq[:10]: print(w)
#
# Build the word -> ID dictionary: the most frequent word gets 1, the next 2, ...
# Only words seen at least twice get an ID; because freq is sorted by count the
# assigned IDs are contiguous (1..len(word2id)). ID 0 is left for all other words.
#
word2id = { word: i + 1  for i, (word, cnt) in enumerate(freq) if cnt>1 }
print(len(word2id))
for e in list(word2id)[:10]: print(f'({e}, {word2id[e]})')
with open('./word2id.dict', mode='wb') as f:  pickle.dump(word2id,f)
#
# Helper that converts the words of a title into a sequence of IDs.
#
def words2ids(words):
  """Convert a whitespace-separated title into a space-separated string of word IDs.

  Each word is looked up in the module-level ``word2id`` dictionary; words that
  are not in the dictionary are written as 0.
  """
  return ' '.join(str(word2id.get(token, 0)) for token in words.split())

text = df.iloc[0, 0]
print(text)
print(words2ids(text))
#
# TITLEをID列にして、CATEGORY文字を数字に変換したデータを準備しておく。
#
df['TITLE'] = df['TITLE'].map(words2ids)
df['CATEGORY'] = df['CATEGORY'].map({'b': 0, 'e': 1, 't': 2, 'm': 3})
data = pd.DataFrame(df.to_numpy())
print(data)
#
# train、valid, testに分割する
#
train, valid_test = train_test_split(data,  train_size=0.8, shuffle=True, stratify=data[1])
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, stratify=valid_test[1])

train.to_csv('./train.txt', sep='\t', index=False, header=False)
valid.to_csv('./valid.txt', sep='\t', index=False, header=False)
test.to_csv('./test.txt', sep='\t', index=False, header=False)
!wc -l train.txt valid.txt test.txt

In [None]:
from torch.utils.data import Dataset
import torch
from torch import nn

class NewsDataset(Dataset):
  """Dataset pairing ID-encoded titles with their category labels.

  x: indexable collection of strings, each a space-separated list of word IDs.
  y: indexable collection of integer labels aligned with x.
  """

  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    """Return {'inputs': int64 tensor of word IDs, 'labels': int64 scalar tensor}."""
    # The title is stored as text such as "12 0 7"; parse it back into integers.
    word_ids = list(map(int, self.x[idx].split()))
    label = self.y[idx]
    return {
        'inputs': torch.tensor(word_ids, dtype=torch.int64),
        'labels': torch.tensor(label, dtype=torch.int64),
    }

##課題81. RNNによる予測
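ID番号で表現された単語列 $\boldsymbol{x} = (x_1, x_2, \dots, x_T)$ がある．ただし，$T$ は単語列の長さ，$x_t \in \mathbb{R}^{V}$ は単語のID番号のone-hot表記である（$V$ は単語の総数である）．再帰型ニューラルネットワーク（RNN: Recurrent Neural Network）を用い，単語列 $\boldsymbol{x}$ からカテゴリ $y$ を予測するモデルとして，次式を実装せよ．

$$\overrightarrow{h}_0 = 0, \quad \overrightarrow{h}_t = \overrightarrow{\mathrm{RNN}}(\mathrm{emb}(x_t), \overrightarrow{h}_{t-1}), \quad y = \mathrm{softmax}(W^{(yh)} \overrightarrow{h}_T + b^{(y)})$$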

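ただし，$\mathrm{emb}(x) \in \mathbb{R}^{d_w}$ は単語埋め込み（単語のone-hot表記から単語ベクトルに変換する関数），$\overrightarrow{h}_t \in \mathbb{R}^{d_h}$ は時刻 $t$ の隠れ状態ベクトル，$\overrightarrow{\mathrm{RNN}}(x, h)$ は入力 $x$ と前時刻の隠れ状態 $h$ から次状態を計算するRNNユニット，$W^{(yh)} \in \mathbb{R}^{L \times d_h}$ は隠れ状態ベクトルからカテゴリを予測するための行列，$b^{(y)} \in \mathbb{R}^{L}$ はバイアス項である（$d_w, d_h, L$ はそれぞれ，単語埋め込みの次元数，隠れ状態ベクトルの次元数，ラベル数である）．RNNユニット $\overrightarrow{\mathrm{RNN}}(x, h)$ には様々な構成が考えられるが，典型例として次式が挙げられる．

$$\overrightarrow{\mathrm{RNN}}(x, h) = g(W^{(hx)} x + W^{(hh)} h + b^{(h)})$$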

ただし，$W^{(hx)} \in \mathbb{R}^{d_h \times d_w}$，$W^{(hh)} \in \mathbb{R}^{d_h \times d_h}$，$b^{(h)} \in \mathbb{R}^{d_h}$ はRNNユニットのパラメータ，$g$ は活性化関数（例えば $\tanh$ やReLUなど）である．

なお，この問題ではパラメータの学習を行わず，ランダムに初期化されたパラメータで $y$ を計算するだけでよい．次元数などのハイパーパラメータは，$d_w = 300, d_h = 50$ など，適当な値に設定せよ（以降の問題でも同様である）．

###解説
数式が分かりにくいが、PyTorchのクラスRNNなどを使うだけでよい。この課題では、入力データを埋め込み表現にして、RNNに通し、最後に得た隠れ状態ベクトルを、全結合層を介して出力する。


###解答例

In [None]:
import torch
from torch import nn

DEBUG=True

class RNN(nn.Module):
  """Single-layer tanh RNN classifier: embedding -> nn.RNN -> linear layer.

  Args:
    vocab_size: number of rows of the embedding table.
    emb_size: dimension of each word embedding.
    padding_idx: ID passed to nn.Embedding as padding_idx.
    output_size: number of classes (size of the returned score vector).
    hidden_size: dimension of the RNN hidden state.

  forward(x) takes an integer tensor of shape (batch, seq_len) and returns the
  un-normalised class scores of shape (batch, output_size). When the
  module-level DEBUG flag is true, intermediate tensor sizes are printed.
  """

  def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size
    # Embedding table: one row per word ID, one column per embedding dimension.
    self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
    # The recurrent unit.
    self.rnn = nn.RNN(emb_size, hidden_size, num_layers=1, bias=True, nonlinearity='tanh', batch_first=True)
    # Maps the last hidden state to one score per class.
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    self.batch_size = x.size()[0]
    # Zero initial hidden state, created on the same device as the input so the
    # model also works after being moved to a GPU.
    hidden = torch.zeros(1, self.batch_size, self.hidden_size, device=x.device)
    # Look up the embedding vector of every word ID.
    emb = self.emb(x)
    if DEBUG: print(f'emb size {emb.size()}')
    # Run the embedded sequence through the RNN; out holds the hidden state of every step.
    out, hidden = self.rnn(emb, hidden)
    if DEBUG: print(f'rnn out size: {out.size()}')
    if DEBUG: print(f'fc input size: {out[:,-1,:].size()}')
    # Feed the hidden state of the last time step to the linear layer.
    out = self.fc(out[:, -1, :])
    if DEBUG: print(f'fc out size: {out.size()}')
    return out


In [None]:
import torch
from torch import nn
import pandas as pd

torch.manual_seed(1)
with open('./word2id.dict', mode='rb') as f: word2id = pickle.load(f)
train = pd.read_csv('./train.txt', sep='\t', header=None, names=['TITLE','CATEGORY'])
dataset_train = NewsDataset(train['TITLE'], train['CATEGORY'])

# ID layout: 0 = unknown word, 1..len(word2id) = vocabulary words.
# The padding ID therefore has to be len(word2id) + 1; using len(word2id) would
# reuse the ID of the last vocabulary word.
PADDING_IDX = len(word2id) + 1
szVOCAB = PADDING_IDX + 1
szEMB = 300
szOUTPUT = 4
szHIDDEN = 64
model = RNN(szVOCAB, szEMB, PADDING_IDX, szOUTPUT, szHIDDEN)

for i in range(10):
  # Take the i-th word-ID sequence.
  x = dataset_train[i]['inputs']
  if DEBUG: print(f'model input: {x}')
  # Add the mini-batch dimension (batch size 1).
  x = x.unsqueeze(0)
  if DEBUG: print(f'model input unsqueezed: {x}')
  # Feed the ID sequence to the model; the result holds one score per class.
  out = model(x)
  predict = torch.softmax(out, dim=-1)
  if DEBUG: print(f'model output: {predict}')


##課題82. 確率的勾配降下法による学習　[省略]
確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題81で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ

###解説
73相当で、83までの途中段階の課題で、トレーニングと評価のエポックを回す。
時間がかかるので省略する。


###解答例

In [None]:
import matplotlib.pyplot as plt

class Measure():
  """Accumulates loss and accuracy over one epoch and keeps a per-epoch history.

  Per epoch: init_epoch(); record_train(...) for every training batch; train();
  record_valid(...) for every validation batch; valid(); record_epoch(epoch).
  All stored metrics are plain Python numbers, so the history lists can be
  compared and plotted regardless of the device the model runs on.
  """

  def __init__(self):
    # One entry is appended to each list by every record_epoch() call.
    self.loss_train_list = []
    self.loss_valid_list = []
    self.accuracy_train_list = []
    self.accuracy_valid_list = []

  def init_epoch(self):
    """Reset the running counters before a new epoch."""
    self.correct_train = 0
    self.total_train = 0
    self.correct_valid = 0
    self.total_valid = 0
    self.loss_train = 0.0
    self.loss_valid = 0.0

  def accuracy(self, inputs,outputs,labels):
    """Return (batch size, number of correct argmax predictions) as Python ints."""
    total = len(inputs)
    prediction = torch.argmax(outputs, dim=1)
    # .item() moves the count off the device and out of the autograd graph.
    correct = int(torch.sum(prediction==labels).item())
    return total, correct

  def record_train(self,inputs,outputs,labels,lossitem):
    """Add one training batch.

    lossitem is expected to be the mean loss over the batch (the default
    reduction of nn.CrossEntropyLoss); it is weighted by the batch size so that
    train() yields the mean loss per sample for any batch size.
    """
    total, correct = self.accuracy(inputs,outputs,labels)
    self.loss_train += lossitem * total
    self.total_train += total
    self.correct_train += correct

  def train(self):
    """Turn the accumulated training sums into per-sample loss and accuracy."""
    self.loss_train /= self.total_train
    self.accuracy_train = self.correct_train / self.total_train

  def record_valid(self,inputs,outputs,labels, lossitem):
    """Add one validation batch (same weighting as record_train)."""
    total, correct = self.accuracy(inputs,outputs,labels)
    self.loss_valid += lossitem * total
    self.total_valid += total
    self.correct_valid += correct

  def valid(self):
    """Turn the accumulated validation sums into per-sample loss and accuracy."""
    self.loss_valid /= self.total_valid
    self.accuracy_valid = self.correct_valid / self.total_valid

  def record_epoch(self, epoch):
    """Print the metrics of this epoch (epoch is 0-based) and append them to the history."""
    print(f'epoch: {epoch + 1}, loss_train: {self.loss_train:.4f}, loss_valid: {self.loss_valid:.4f}, \
    accuracy_train: {self.accuracy_train:.2f}, accuracy_valid: {self.accuracy_valid:.2f}')
    self.loss_train_list.append(self.loss_train)
    self.loss_valid_list.append(self.loss_valid)
    self.accuracy_train_list.append(self.accuracy_train)
    self.accuracy_valid_list.append(self.accuracy_valid)

  def degrading(self, epoch):
   """True when the validation loss has not decreased over the last two epoch-to-epoch steps."""
   return (epoch > 1 and self.loss_valid_list[epoch - 2] <= self.loss_valid_list[epoch - 1] <= self.loss_valid_list[epoch])

def draw(numEpochs, loss_train_list, loss_valid_list, accuracy_train_list, accuracy_valid_list ):
  """Show two figures over the epochs: train/valid loss, then train/valid accuracy.

  numEpochs is the number of recorded epochs; each list holds one value per epoch.
  """
  epochs = range(numEpochs)
  figures = (
      ((loss_train_list, 'loss_train'), (loss_valid_list, 'loss_valid')),
      ((accuracy_train_list, 'accuracy_train'), (accuracy_valid_list, 'accuracy_valid')),
  )
  for curves in figures:
    plt.figure()
    for values, name in curves:
      plt.plot(epochs, values, label=name)
    plt.legend()
    plt.xlabel('epoch')
    plt.show()

In [None]:
from torch import optim
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

DEBUG = False

torch.manual_seed(0)
train = pd.read_csv('./train.txt', sep='\t', header=None, names=['TITLE','CAT'])
dataset_train = NewsDataset(train['TITLE'], train['CAT'])
valid = pd.read_csv('./valid.txt', sep='\t', header=None, names=['TITLE','CAT'])
dataset_valid = NewsDataset(valid['TITLE'], valid['CAT'])

# ID layout: 0 = unknown word, 1..len(word2id) = vocabulary words, so the padding
# ID must be len(word2id) + 1 (len(word2id) itself is the ID of a real word).
PADDING_IDX = len(word2id) + 1
szVOCAB = PADDING_IDX + 1
szEMB = 300
szOUTPUT = 4
szHIDDEN = 64
LEARNING_RATE = 0.005
szBATCH = 1
numEPOCHS = 10

model = RNN(szVOCAB, szEMB, PADDING_IDX, szOUTPUT, szHIDDEN)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
# No collate function is used here, so szBATCH has to stay 1 (titles differ in length).
dataloader_train = DataLoader(dataset_train, batch_size=szBATCH, shuffle=True)
dataloader_valid = DataLoader(dataset_valid, batch_size=szBATCH, shuffle=False)
# NOTE(review): the step size equals numEPOCHS, so the learning rate is only
# decayed after the final epoch — confirm this is intended.
scheduler = optim.lr_scheduler.StepLR(optimizer, numEPOCHS, gamma=0.8)

measure = Measure()
for epoch in range(numEPOCHS):
  measure.init_epoch()
  model.train()
  for data in dataloader_train:
    optimizer.zero_grad()
    inputs, labels = data['inputs'], data['labels']
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    measure.record_train(inputs,outputs,labels,loss.item())  
  measure.train()  
  model.eval()
  with torch.no_grad():
    for data in dataloader_valid:
      # Unpack the validation labels into `labels`; otherwise the loss and
      # accuracy below would be computed against the last training label.
      inputs, labels = data['inputs'], data['labels']
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      measure.record_valid(inputs,outputs,labels,loss.item())
  measure.valid()
  measure.record_epoch(epoch)
  # Stop early once the validation loss has stopped improving.
  if measure.degrading(epoch): break
  scheduler.step()
# Use the number of recorded epochs so this also works after an early stop
# (and when the loop body never ran).
draw(len(measure.loss_train_list), measure.loss_train_list, measure.loss_valid_list, measure.accuracy_train_list, measure.accuracy_valid_list)

##課題83. ミニバッチ化・GPU上での学習
問題82のコードを改変し，B事例ごとに損失・勾配を計算して学習を行えるようにせよ（Bの値は適当に選べ）．また，GPU上で学習を実行せよ．

###解説
ミニバッチ化して、GPUを利用する、77,78相当。
ミニバッチ化する際、並列実行しやすいように、ミニバッチごとに、単語ID列を最大個数で固定長にする。
素のRNNは、やや非力なので、GRUという変種を使う。

###解答例

ここで、CPUタイプをGPUに変更する.
また、80番を実行する

In [None]:
import matplotlib.pyplot as plt

class Measure():
  """Accumulates loss and accuracy over one epoch and keeps a per-epoch history.

  Per epoch: init_epoch(); record_train(...) for every training batch; train();
  record_valid(...) for every validation batch; valid(); record_epoch(epoch).
  All stored metrics are plain Python numbers, so the history lists can be
  compared and plotted even when the model runs on a GPU.
  """

  def __init__(self):
    # One entry is appended to each list by every record_epoch() call.
    self.loss_train_list = []
    self.loss_valid_list = []
    self.accuracy_train_list = []
    self.accuracy_valid_list = []

  def init_epoch(self):
    """Reset the running counters before a new epoch."""
    self.correct_train = 0
    self.total_train = 0
    self.correct_valid = 0
    self.total_valid = 0
    self.loss_train = 0.0
    self.loss_valid = 0.0

  def accuracy(self, inputs,outputs,labels):
    """Return (batch size, number of correct argmax predictions) as Python ints."""
    total = len(inputs)
    prediction = torch.argmax(outputs, dim=1)
    # .item() moves the count off the device and out of the autograd graph.
    correct = int(torch.sum(prediction==labels).item())
    return total, correct

  def record_train(self,inputs,outputs,labels,lossitem):
    """Add one training batch.

    lossitem is expected to be the mean loss over the batch (the default
    reduction of nn.CrossEntropyLoss); it is weighted by the batch size so that
    train() yields the mean loss per sample for any mini-batch size.
    """
    total, correct = self.accuracy(inputs,outputs,labels)
    self.loss_train += lossitem * total
    self.total_train += total
    self.correct_train += correct

  def train(self):
    """Turn the accumulated training sums into per-sample loss and accuracy."""
    self.loss_train /= self.total_train
    self.accuracy_train = self.correct_train / self.total_train

  def record_valid(self,inputs,outputs,labels, lossitem):
    """Add one validation batch (same weighting as record_train)."""
    total, correct = self.accuracy(inputs,outputs,labels)
    self.loss_valid += lossitem * total
    self.total_valid += total
    self.correct_valid += correct

  def valid(self):
    """Turn the accumulated validation sums into per-sample loss and accuracy."""
    self.loss_valid /= self.total_valid
    self.accuracy_valid = self.correct_valid / self.total_valid

  def record_epoch(self, epoch):
    """Print the metrics of this epoch (epoch is 0-based) and append them to the history."""
    print(f'epoch: {epoch + 1}, loss_train: {self.loss_train:.4f}, loss_valid: {self.loss_valid:.4f}, \
    accuracy_train: {self.accuracy_train:.2f}, accuracy_valid: {self.accuracy_valid:.2f}')
    self.loss_train_list.append(self.loss_train)
    self.loss_valid_list.append(self.loss_valid)
    self.accuracy_train_list.append(self.accuracy_train)
    self.accuracy_valid_list.append(self.accuracy_valid)

  def degrading(self, epoch):
   """True when the validation loss has not decreased over the last two epoch-to-epoch steps."""
   return (epoch > 1 and self.loss_valid_list[epoch - 2] <= self.loss_valid_list[epoch - 1] <= self.loss_valid_list[epoch])

def draw(numEpochs, loss_train_list, loss_valid_list, accuracy_train_list, accuracy_valid_list ):
  """Show two figures over the epochs: train/valid loss, then train/valid accuracy.

  numEpochs is the number of recorded epochs; each list holds one value per epoch.
  """
  x_axis = range(numEpochs)
  for series in (
      ((loss_train_list, 'loss_train'), (loss_valid_list, 'loss_valid')),
      ((accuracy_train_list, 'accuracy_train'), (accuracy_valid_list, 'accuracy_valid')),
  ):
    plt.figure()
    for history, legend_name in series:
      plt.plot(x_axis, history, label=legend_name)
    plt.legend()
    plt.xlabel('epoch')
    plt.show()

In [None]:
#
# Process several samples together as one mini-batch. RNN inputs have variable
# length, which makes them hard to run in parallel, so every sequence of a
# mini-batch is padded to the length of the longest sequence in that batch.
#
class Padsequence():
  """Collate callable that pads the ID sequences of a batch to a common length."""

  def __init__(self, padding_idx):
    # ID written into the padded positions.
    self.padding_idx = padding_idx

  def __call__(self, batch):
    """Take a list of {'inputs', 'labels'} samples and return one batched dict."""
    id_seqs, targets = [], []
    for sample in batch:
      id_seqs.append(sample['inputs'])
      targets.append(sample['labels'])
    padded = torch.nn.utils.rnn.pad_sequence(id_seqs, batch_first=True, padding_value=self.padding_idx)
    return {'inputs': padded, 'labels': torch.LongTensor(targets)}

In [None]:
from torch import optim
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np

class RNN(nn.Module):
  """Single-layer GRU classifier for padded mini-batches: embedding -> GRU -> linear.

  Args:
    vocab_size: number of rows of the embedding table.
    emb_size: dimension of each word embedding.
    padding_idx: ID passed to nn.Embedding as padding_idx.
    output_size: number of classes.
    hidden_size: dimension of the GRU hidden state.

  forward(x) takes an integer tensor of shape (batch, seq_len) and returns class
  scores of shape (batch, output_size).
  """

  def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
    # GRU instead of the plain nn.RNN (nn.RNN with nonlinearity='tanh' is the
    # drop-in alternative). No dropout argument: nn.GRU applies dropout only
    # between stacked layers, so with num_layers=1 it has no effect and merely
    # triggers a warning.
    self.rnn = nn.GRU(emb_size, hidden_size, num_layers=1, bias=True, batch_first=True)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    self.batch_size = x.size()[0]
    # Zero initial state on the input's device; no dependency on a global `device`.
    hidden = torch.zeros(1, self.batch_size, self.hidden_size, device=x.device)
    emb = self.emb(x)
    out, hidden = self.rnn(emb, hidden)
    # NOTE(review): out[:, -1, :] is the state at the last position of the padded
    # batch; for titles shorter than the batch maximum it includes steps over
    # padding tokens.
    out = self.fc(out[:, -1, :])
    return out

In [None]:
torch.manual_seed(0)
with open('./word2id.dict', mode='rb') as f: word2id = pickle.load(f)
train = pd.read_csv('./train.txt', sep='\t', header=None, names=['TITLE','CATEGORY'])
dataset_train = NewsDataset(train['TITLE'], train['CATEGORY'])
valid = pd.read_csv('./valid.txt', sep='\t', header=None, names=['TITLE','CATEGORY'])
dataset_valid = NewsDataset(valid['TITLE'], valid['CATEGORY'])

# ID layout: 0 = unknown word, 1..len(word2id) = vocabulary words. The padding ID
# must not reuse a word ID, so it is len(word2id) + 1 and the embedding table
# needs len(word2id) + 2 rows.
PADDING_IDX = len(word2id) + 1
szVOCAB = PADDING_IDX + 1
szEMB = 300
szOUTPUT = 4
szHIDDEN = 300
LEARNING_RATE = 0.005
numEPOCHS = 30
szBATCH = 32

# Use the GPU when the runtime has one; fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNN(szVOCAB, szEMB, PADDING_IDX, szOUTPUT, szHIDDEN)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
model.to(device)
# Padsequence pads every mini-batch to the length of its longest title.
dataloader_train = DataLoader(dataset_train, batch_size=szBATCH, shuffle=True, 
                              collate_fn=Padsequence(PADDING_IDX))
dataloader_valid = DataLoader(dataset_valid, batch_size=szBATCH, shuffle=False,
                              collate_fn=Padsequence(PADDING_IDX))
# NOTE(review): the step size equals numEPOCHS, so the learning rate is only
# decayed after the final epoch — confirm this is intended.
scheduler = optim.lr_scheduler.StepLR(optimizer, numEPOCHS, gamma=0.5)

def train_loop():
  """Train and validate the module-level model for up to numEPOCHS epochs, then plot.

  Reads these module-level names at call time: model, optimizer, criterion,
  scheduler, dataloader_train, dataloader_valid, device and numEPOCHS. Training
  stops early when Measure.degrading() reports that the validation loss has
  stopped improving.
  """
  measure = Measure()
  for epoch in range(numEPOCHS):
    measure.init_epoch()
    model.train()
    for data in dataloader_train:
      optimizer.zero_grad()
      # Move the batch to the device the model lives on.
      inputs = data['inputs'].to(device)
      labels = data['labels'].to(device)
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      measure.record_train(inputs,outputs,labels,loss.item())
    measure.train()  
    model.eval()
    # Validation: no gradients are computed, so there is nothing to zero here.
    with torch.no_grad():
      for data in dataloader_valid:
        inputs = data['inputs'].to(device)
        labels = data['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        measure.record_valid(inputs,outputs,labels,loss.item())
    measure.valid()
    measure.record_epoch(epoch)
    if measure.degrading(epoch): break
    scheduler.step()
  # Number of recorded epochs; also correct after an early stop or when the
  # loop body never ran.
  draw(len(measure.loss_train_list), measure.loss_train_list, measure.loss_valid_list, measure.accuracy_train_list, measure.accuracy_valid_list)

train_loop()

##課題84. 単語ベクトルの導入 [省略]
事前学習済みの単語ベクトル（例えば，Google Newsデータセット（約1,000億単語）での学習済み単語ベクトル）で単語埋め込みemb(x)を初期化し，学習せよ．

###解説
Googleの埋め込みベクトルを、埋め込み表の初期値として使う。
相性が良くないようで、いい結果が出ない。省略。

###解答例

In [None]:
import pickle
import numpy as np
import torch

with open('./word2id.dict', mode='rb') as f: word2id = pickle.load(f)
# Row layout of the embedding matrix: row 0 = unknown word, rows 1..len(word2id) =
# vocabulary words (row number == word ID), last row = reserved for padding.
szVOCAB = len(word2id) + 2
szEMB = 300

from gensim.models import KeyedVectors
wordModel = KeyedVectors.load_word2vec_format('./GoogleNews.bin.gz', binary=True)

weights = np.zeros((szVOCAB, szEMB))
hit = 0
err = 0
# Write each vector into the row given by the word's ID. Indexing by the
# enumeration position instead would shift every vector by one row, because
# word IDs start at 1.
for word, idx in word2id.items():
  if word in wordModel:
    weights[idx] = wordModel[word]
    hit += 1
  else:
    # Word unknown to the pretrained model: start from a small random vector.
    weights[idx] = np.random.normal(scale=0.4, size=(szEMB,))
    err += 1
embeds = torch.from_numpy(weights.astype((np.float32)))
print(f'hit {hit}, err {err}')

In [None]:
from torch import optim
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np

torch.manual_seed(0)
with open('./word2id.dict', mode='rb') as f: word2id = pickle.load(f)
train = pd.read_csv('./train.txt', sep='\t', header=None, names=['TITLE','CAT'])
dataset_train = NewsDataset(train['TITLE'], train['CAT'])
valid = pd.read_csv('./valid.txt', sep='\t', header=None, names=['TITLE','CAT'])
dataset_valid = NewsDataset(valid['TITLE'], valid['CAT'])

# Take the vocabulary size from the pretrained matrix `embeds` (built in the
# previous cell) so every ID used here is a valid row of that matrix; its last
# row is used for padding.
szVOCAB = embeds.size(0)
szEMB = 300
PADDING_IDX = szVOCAB - 1
szOUTPUT = 4
szHIDDEN = 300
LEARNING_RATE = 0.001
numEPOCHS = 30
szBATCH = 32
# Use the GPU when the runtime has one; fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class RNN(nn.Module):
  """Single-layer GRU classifier whose embedding table starts from pretrained vectors.

  The embedding is initialised from the module-level tensor ``embeds``;
  vocab_size is accepted for signature compatibility, but the table size is
  determined by ``embeds``.

  forward(x) takes an integer tensor of shape (batch, seq_len) and returns class
  scores of shape (batch, output_size).
  """
  def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size
    # freeze=False: from_pretrained freezes the table by default, but the exercise
    # asks to initialise the embeddings with the pretrained vectors and then train them.
    self.emb = nn.Embedding.from_pretrained(embeds, padding_idx=padding_idx, freeze=False)
    # No dropout argument: nn.GRU applies dropout only between stacked layers, so
    # with a single layer it has no effect and merely triggers a warning.
    self.rnn = nn.GRU(emb_size, hidden_size, 1, batch_first=True)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    self.batch_size = x.size()[0]
    # Zero initial state on the input's device; no dependency on a global `device`.
    hidden = torch.zeros(1, self.batch_size, self.hidden_size, device=x.device)
    emb = self.emb(x)
    out, hidden = self.rnn(emb, hidden)
    # Classify from the state at the last position of the (padded) sequence.
    out = self.fc(out[:, -1, :])
    return out

# Build the model, loss and optimiser, then move the model to the target device.
model = RNN(szVOCAB, szEMB, PADDING_IDX, szOUTPUT, szHIDDEN)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
model.to(device)
# Both loaders pad each mini-batch with PADDING_IDX; only the training loader shuffles.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=szBATCH,
    shuffle=True,
    collate_fn=Padsequence(PADDING_IDX),
)
dataloader_valid = DataLoader(
    dataset_valid,
    batch_size=szBATCH,
    shuffle=False,
    collate_fn=Padsequence(PADDING_IDX),
)
scheduler = optim.lr_scheduler.StepLR(optimizer, numEPOCHS, gamma=0.5)

# train_loop() is defined in an earlier cell and reads the names assigned above.
train_loop()

##課題85. 双方向RNN・多層化 [省略]
順方向と逆方向のRNNの両方を用いて入力テキストをエンコードし，モデルを学習せよ．
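$$\overleftarrow{h}_{T+1} = 0, \quad \overleftarrow{h}_t = \overleftarrow{\mathrm{RNN}}(\mathrm{emb}(x_t), \overleftarrow{h}_{t+1}), \quad y = \mathrm{softmax}(W^{(yh)} [\overrightarrow{h}_T; \overleftarrow{h}_1] + b^{(y)})$$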

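ただし，$\overrightarrow{h}_t \in \mathbb{R}^{d_h}, \overleftarrow{h}_t \in \mathbb{R}^{d_h}$ はそれぞれ，順方向および逆方向のRNNで求めた時刻 $t$ の隠れ状態ベクトル，$\overleftarrow{\mathrm{RNN}}(x, h)$ は入力 $x$ と次時刻の隠れ状態 $h$ から前状態を計算するRNNユニット，$W^{(yh)} \in \mathbb{R}^{L \times 2d_h}$ は隠れ状態ベクトルからカテゴリを予測するための行列，$b^{(y)} \in \mathbb{R}^{L}$ はバイアス項である．また，$[a; b]$ はベクトル $a$ と $b$ の連結を表す。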

さらに，双方向RNNを多層化して実験せよ．

###解説
双方向RNNというのは、入力を先頭からRNNに通すのと、末尾から通すのと、両方を行うやり方。
多層化は、RNNを何枚か重ねて、下の層が各時刻に出力する隠れ状態を、上の層の各時刻の入力としてつなげるもの。
この規模のデータだと、単一方向、単層と比べて、大きな改善はないので、省略。

###解答例

In [None]:
# Print the current directory, then switch the working directory.
# NOTE(review): this folder differs from the ".../Colab Notebooks/NLP" folder used by
# the other %cd cells — confirm that word2id.dict, train.txt and valid.txt exist here.
!pwd
%cd /content/drive/My Drive/Colab Notebooks/NLP100Exercises2020後期

from torch import optim
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np

torch.manual_seed(0)
with open('./word2id.dict', mode='rb') as f: word2id = pickle.load(f)
train = pd.read_csv('./train.txt', sep='\t', header=None, names=['TITLE','CATEGORY'])
dataset_train = NewsDataset(train['TITLE'], train['CATEGORY'])
valid = pd.read_csv('./valid.txt', sep='\t', header=None, names=['TITLE','CATEGORY'])
dataset_valid = NewsDataset(valid['TITLE'], valid['CATEGORY'])

# ID layout: 0 = unknown word, 1..len(word2id) = vocabulary words. The padding ID
# must not reuse a word ID, so it is len(word2id) + 1 and the embedding table
# needs len(word2id) + 2 rows.
PADDING_IDX = len(word2id) + 1
szVOCAB = PADDING_IDX + 1
szEMB = 300
szOUTPUT = 4
szHIDDEN = 300
LEARNING_RATE = 0.01
numEPOCHS = 100
szBATCH = 32
BIDIRECTIONAL = True  # run the sequence in both directions
numLAYERS = 3  # number of stacked recurrent layers
# Use the GPU when the runtime has one; fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class RNN(nn.Module):
  """(Optionally bidirectional, multi-layer) GRU classifier: embedding -> GRU -> linear.

  Args:
    vocab_size: number of rows of the embedding table.
    emb_size: dimension of each word embedding.
    padding_idx: ID passed to nn.Embedding as padding_idx.
    output_size: number of classes.
    hidden_size: dimension of the hidden state of one direction.
    num_layers: number of stacked GRU layers.
    bidirectional: whether to run the GRU in both directions.

  forward(x) takes an integer tensor of shape (batch, seq_len) and returns class
  scores of shape (batch, output_size).
  """
  def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size, 
               num_layers,
               bidirectional
               ):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.num_directions = 2 if bidirectional else 1
    self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
    # nn.GRU applies dropout between stacked layers only; with a single layer a
    # non-zero value has no effect and merely triggers a warning.
    self.rnn = nn.GRU(emb_size, hidden_size, num_layers,
                      batch_first=True, 
                      bidirectional=bidirectional,
                      dropout=0.3 if num_layers > 1 else 0.0)
    self.fc = nn.Linear(hidden_size*self.num_directions, output_size)

  def forward(self, x):
    self.batch_size = x.size()[0]
    # Zero initial state for every layer and direction, on the input's device.
    hidden = torch.zeros(self.num_layers*self.num_directions,
                         self.batch_size, self.hidden_size, device=x.device)
    emb = self.emb(x)
    out, hidden = self.rnn(emb, hidden)
    # The returned hidden tensor holds the final state of each layer and direction;
    # its last num_directions entries belong to the top layer. For a bidirectional
    # GRU these are the forward state after the last step and the backward state
    # after the first step, i.e. [h_T(forward); h_1(backward)]. Taking
    # out[:, -1, :] instead would use the backward state after a single token.
    final = torch.cat([hidden[i] for i in range(-self.num_directions, 0)], dim=1)
    out = self.fc(final)
    return out

# Build the bidirectional / multi-layer model, the loss and the optimiser, then
# move the model to the target device.
model = RNN(
    szVOCAB, szEMB, PADDING_IDX, szOUTPUT, szHIDDEN,
    num_layers=numLAYERS,
    bidirectional=BIDIRECTIONAL,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
model.to(device)
# Both loaders pad each mini-batch with PADDING_IDX; only the training loader shuffles.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=szBATCH,
    shuffle=True,
    collate_fn=Padsequence(PADDING_IDX),
)
dataloader_valid = DataLoader(
    dataset_valid,
    batch_size=szBATCH,
    shuffle=False,
    collate_fn=Padsequence(PADDING_IDX),
)
scheduler = optim.lr_scheduler.StepLR(optimizer, numEPOCHS, gamma=0.5)

# train_loop() is defined in an earlier cell and reads the names assigned above.
train_loop()

##課題86. 畳み込みニューラルネットワーク (CNN)
ID番号で表現された単語列 $\boldsymbol{x} = (x_1, x_2, \dots, x_T)$ がある．ただし，$T$ は単語列の長さ，$x_t \in \mathbb{R}^{V}$ は単語のID番号のone-hot表記である（$V$ は単語の総数である）．畳み込みニューラルネットワーク（CNN: Convolutional Neural Network）を用い，単語列 $\boldsymbol{x}$ からカテゴリ $y$ を予測するモデルを実装せよ．

ただし，畳み込みニューラルネットワークの構成は以下の通りとする．

- 単語埋め込みの次元数: $d_w$
- 畳み込みのフィルターのサイズ: 3 トークン
- 畳み込みのストライド: 1 トークン
- 畳み込みのパディング: あり
- 畳み込み演算後の各時刻のベクトルの次元数: $d_h$
- 畳み込み演算後に最大値プーリング（max pooling）を適用し，入力文を $d_h$ 次元の隠れベクトルで表現

すなわち，時刻 $t$ の特徴ベクトル $p_t \in \mathbb{R}^{d_h}$ は次式で表される．

$$p_t = g(W^{(px)} [\mathrm{emb}(x_{t-1}); \mathrm{emb}(x_t); \mathrm{emb}(x_{t+1})] + b^{(p)})$$

ただし，$W^{(px)} \in \mathbb{R}^{d_h \times 3d_w}$，$b^{(p)} \in \mathbb{R}^{d_h}$ はCNNのパラメータ，$g$ は活性化関数（例えば $\tanh$ やReLUなど），$[a; b; c]$ はベクトル $a, b, c$ の連結である．なお，行列 $W^{(px)}$ の列数が $3d_w$ になるのは，3個のトークンの単語埋め込みを連結したものに対して，線形変換を行うためである．

最大値プーリングでは，特徴ベクトルの次元毎に全時刻における最大値を取り，入力文書の特徴ベクトル $c \in \mathbb{R}^{d_h}$ を求める．$c[i]$ でベクトル $c$ の $i$ 番目の次元の値を表すことにすると，最大値プーリングは次式で表される．

$$c[i] = \max_{1 \le t \le T} p_t[i]$$

最後に，入力文書の特徴ベクトル $c$ に行列 $W^{(yc)} \in \mathbb{R}^{L \times d_h}$ とバイアス項 $b^{(y)} \in \mathbb{R}^{L}$ による線形変換とソフトマックス関数を適用し，カテゴリ $y$ を予測する．

$$y = \mathrm{softmax}(W^{(yc)} c + b^{(y)})$$

なお，この問題ではモデルの学習を行わず，ランダムに初期化された重み行列で $y$ を計算するだけでよい．

###解説
系列データに対し、画像処理で定番のCNNをかけてみる。
埋め込みベクトル表現した単語列データを画像のように見なし、ある単語の前後1単語ずつ含めて3単語分の埋め込みベクトルのデータに対し、畳み込みをかける。

###解答例

In [None]:
# Print the current directory, then switch to the notebook's working folder.
!pwd
%cd /content/drive/My Drive/Colab Notebooks/NLP

80の再実行

In [None]:
from torch import optim
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np
from torch.nn import functional as F

DEBUG = True

class CNN(nn.Module):
  """Text classifier: embedding -> one convolution over token windows -> max pooling over the sequence -> linear.

  forward(x) takes an integer tensor of shape (batch, seq_len) and returns class
  scores of shape (batch, output_size). When the module-level DEBUG flag is
  true, the sizes of the intermediate tensors are printed.
  """

  def __init__(self, vocab_size, emb_size, padding_idx, output_size, out_channels, kernel_heights, stride, padding):
    super().__init__()
    self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
    # One input channel is turned into out_channels feature maps. The kernel covers
    # kernel_heights tokens and the full embedding width, so each feature map has
    # width 1; only the token axis is padded.
    self.conv = nn.Conv2d(
        in_channels=1,
        out_channels=out_channels,
        kernel_size=(kernel_heights, emb_size),
        stride=stride,
        padding=(padding, 0),
    )
    self.drop = nn.Dropout(0.3)
    self.fc = nn.Linear(out_channels, output_size)

  def forward(self, x):
    if DEBUG:
      print(f'input: mini-batch size {x.size(0)}, seq len {x.size(1)}')
    # Insert a channel dimension so the embedded sequence can be convolved like an image.
    embedded = self.emb(x).unsqueeze(1)
    if DEBUG:
      print(f'embed: min-batch size {embedded.size(0)}, input channles {embedded.size(1)}, seq len {embedded.size(2)}, embed size {embedded.size(3)}')
    # Convolve, then drop the width-1 axis left by the full-width kernel.
    feature_maps = self.conv(embedded).squeeze(3)
    if DEBUG:
      print(f'conv output: mini-batch size {feature_maps.size(0)}, out_channels {feature_maps.size(1)}, Conv result {feature_maps.size(2)}')
    activated = F.relu(feature_maps)
    # Take the maximum along the sequence axis for every feature map.
    pooled = F.max_pool1d(activated, activated.size(2)).squeeze(2)
    if DEBUG:
      print(f'max pooled: mini-batch size {activated.size(0)}, out_channels {activated.size(1)}')
    logits = self.fc(self.drop(pooled))
    if DEBUG:
      print(f'output {logits.size()}')
    return logits


In [None]:
torch.manual_seed(0)
# Load the vocabulary first: the sizes below are derived from it, so this cell
# does not depend on word2id being left over from an earlier cell.
with open('./word2id.dict', mode='rb') as f: word2id = pickle.load(f)
train = pd.read_csv('./train.txt', sep='\t', header=None, names=['TITLE','CAT'])
dataset_train = NewsDataset(train['TITLE'], train['CAT'])
valid = pd.read_csv('./valid.txt', sep='\t', header=None, names=['TITLE','CAT'])
dataset_valid = NewsDataset(valid['TITLE'], valid['CAT'])

# Parameters.
# ID layout: 0 = unknown word, 1..len(word2id) = vocabulary words, so the padding
# ID is len(word2id) + 1 and the embedding table needs len(word2id) + 2 rows.
PADDING_IDX = len(word2id) + 1
szVOCAB = PADDING_IDX + 1
szEMB = 300
szOUTPUT = 4
OUT_CHANNELS = 256
KERNEL_HEIGHTS = 3
STRIDE = 1
PADDING = 1

# Model definition.
model = CNN(szVOCAB, szEMB, PADDING_IDX, szOUTPUT, OUT_CHANNELS, KERNEL_HEIGHTS, STRIDE, PADDING)

# Predictions for the first 10 training titles. eval() switches dropout off so
# the printed probabilities are deterministic; no gradients are needed here.
model.eval()
with torch.no_grad():
  for i in range(10):
    X = dataset_train[i]['inputs']
    out = model(X.unsqueeze(0))
    print(torch.softmax(out, dim=-1))

##課題87. 確率的勾配降下法によるCNNの学習　[省略]
確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題86で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

###解説
86をミニバッチにして、GPU利用。77,78相当。とても時間がかかるので省略していい。GoogleがフリーのVMに割り当てるGPUのグレードを落としたかな？

###解答例

In [None]:
#
# Process several samples together as one mini-batch: every ID sequence of a
# mini-batch is padded to the length of the longest sequence in that batch, so
# the batch becomes one rectangular tensor.
#
class Padsequence():
  """Collate callable that pads the ID sequences of a batch to a common length."""

  def __init__(self, padding_idx):
    # ID written into the padded positions.
    self.padding_idx = padding_idx

  def __call__(self, batch):
    """Take a list of {'inputs', 'labels'} samples and return one batched dict."""
    token_ids = list(map(lambda sample: sample['inputs'], batch))
    padded_ids = torch.nn.utils.rnn.pad_sequence(
        token_ids, batch_first=True, padding_value=self.padding_idx)
    label_list = list(map(lambda sample: sample['labels'], batch))
    return {'inputs': padded_ids, 'labels': torch.LongTensor(label_list)}

In [None]:
import matplotlib.pyplot as plt

class Measure():
  """Accumulates loss and accuracy over one epoch and keeps a per-epoch history.

  Per epoch: init_epoch(); record_train(...) for every training batch; train();
  record_valid(...) for every validation batch; valid(); record_epoch(epoch).
  All stored metrics are plain Python numbers, so the history lists can be
  compared and plotted even when the model runs on a GPU.
  """

  def __init__(self):
    # One entry is appended to each list by every record_epoch() call.
    self.loss_train_list = []
    self.loss_valid_list = []
    self.accuracy_train_list = []
    self.accuracy_valid_list = []

  def init_epoch(self):
    """Reset the running counters before a new epoch."""
    self.correct_train = 0
    self.total_train = 0
    self.correct_valid = 0
    self.total_valid = 0
    self.loss_train = 0.0
    self.loss_valid = 0.0

  def accuracy(self, inputs,outputs,labels):
    """Return (batch size, number of correct argmax predictions) as Python ints."""
    total = len(inputs)
    prediction = torch.argmax(outputs, dim=1)
    # .item() moves the count off the device and out of the autograd graph.
    correct = int(torch.sum(prediction==labels).item())
    return total, correct

  def record_train(self,inputs,outputs,labels,lossitem):
    """Add one training batch.

    lossitem is expected to be the mean loss over the batch (the default
    reduction of nn.CrossEntropyLoss); it is weighted by the batch size so that
    train() yields the mean loss per sample for any mini-batch size.
    """
    total, correct = self.accuracy(inputs,outputs,labels)
    self.loss_train += lossitem * total
    self.total_train += total
    self.correct_train += correct

  def train(self):
    """Turn the accumulated training sums into per-sample loss and accuracy."""
    self.loss_train /= self.total_train
    self.accuracy_train = self.correct_train / self.total_train

  def record_valid(self,inputs,outputs,labels, lossitem):
    """Add one validation batch (same weighting as record_train)."""
    total, correct = self.accuracy(inputs,outputs,labels)
    self.loss_valid += lossitem * total
    self.total_valid += total
    self.correct_valid += correct

  def valid(self):
    """Turn the accumulated validation sums into per-sample loss and accuracy."""
    self.loss_valid /= self.total_valid
    self.accuracy_valid = self.correct_valid / self.total_valid

  def record_epoch(self, epoch):
    """Print the metrics of this epoch (epoch is 0-based) and append them to the history."""
    print(f'epoch: {epoch + 1}, loss_train: {self.loss_train:.4f}, loss_valid: {self.loss_valid:.4f}, \
    accuracy_train: {self.accuracy_train:.2f}, accuracy_valid: {self.accuracy_valid:.2f}')
    self.loss_train_list.append(self.loss_train)
    self.loss_valid_list.append(self.loss_valid)
    self.accuracy_train_list.append(self.accuracy_train)
    self.accuracy_valid_list.append(self.accuracy_valid)

  def degrading(self, epoch):
   """True when the validation loss has not decreased over the last two epoch-to-epoch steps."""
   return (epoch > 1 and self.loss_valid_list[epoch - 2] <= self.loss_valid_list[epoch - 1] <= self.loss_valid_list[epoch])

def draw(numEpochs, loss_train_list, loss_valid_list, accuracy_train_list, accuracy_valid_list ):
  """Show two figures over the epochs: train/valid loss, then train/valid accuracy.

  numEpochs is the number of recorded epochs; each list holds one value per epoch.
  """
  epoch_axis = range(numEpochs)
  plots = [
      [('loss_train', loss_train_list), ('loss_valid', loss_valid_list)],
      [('accuracy_train', accuracy_train_list), ('accuracy_valid', accuracy_valid_list)],
  ]
  for lines in plots:
    plt.figure()
    for line_label, line_values in lines:
      plt.plot(epoch_axis, line_values, label=line_label)
    plt.legend()
    plt.xlabel('epoch')
    plt.show()

In [None]:
from torch import optim
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import numpy as np

DEBUG=False

torch.manual_seed(0)
with open('./word2id.dict', mode='rb') as f: word2id = pickle.load(f)
train = pd.read_csv('./train.txt', sep='\t', header=None, names=['TITLE','CAT'])
dataset_train = NewsDataset(train['TITLE'], train['CAT'])
valid = pd.read_csv('./valid.txt', sep='\t', header=None, names=['TITLE','CAT'])
dataset_valid = NewsDataset(valid['TITLE'], valid['CAT'])

# ID layout: 0 = unknown word, 1..len(word2id) = vocabulary words. The padding ID
# must not reuse a word ID, so it is len(word2id) + 1 and the embedding table
# needs len(word2id) + 2 rows.
PADDING_IDX = len(word2id) + 1
szVOCAB = PADDING_IDX + 1
szEMB = 300
szOUTPUT = 4
LEARNING_RATE = 0.003
numEPOCHS = 30
szBATCH = 32

OUT_CHANNELS = 256
KERNEL_HEIGHTS = 3
STRIDE = 1
PADDING = 1

# Use the GPU when the runtime has one; fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN(szVOCAB, szEMB, PADDING_IDX, szOUTPUT, OUT_CHANNELS, KERNEL_HEIGHTS, STRIDE, PADDING)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
model.to(device)
# Padsequence pads every mini-batch to the length of its longest title.
dataloader_train = DataLoader(dataset_train, batch_size=szBATCH, shuffle=True, collate_fn=Padsequence(PADDING_IDX)) 
dataloader_valid = DataLoader(dataset_valid, batch_size=szBATCH, shuffle=False, collate_fn=Padsequence(PADDING_IDX))
# NOTE(review): the step size equals numEPOCHS, so the learning rate is only
# decayed after the final epoch — confirm this is intended.
scheduler = optim.lr_scheduler.StepLR(optimizer, numEPOCHS, gamma=0.5)

def train_loop():
  """Train the module-level `model` for up to numEPOCHS epochs and plot the curves.

  Relies on these module-level objects: model, criterion, optimizer, scheduler,
  dataloader_train, dataloader_valid, device and numEPOCHS. Each epoch runs one
  pass over the training loader (with parameter updates) and one pass over the
  validation loader (without gradients), records both in a Measure instance and
  stops early as soon as Measure.degrading reports True.
  """
  measure = Measure()
  epochs_run = 0  # stays 0 when numEPOCHS is 0, so draw() still gets a valid count
  for epoch in range(numEPOCHS):
    measure.init_epoch()

    # Training pass.
    model.train()
    for data in dataloader_train:
      optimizer.zero_grad()
      inputs = data['inputs'].to(device)
      labels = data['labels'].to(device)
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()
      measure.record_train(inputs,outputs,labels,loss.item())
    measure.train()

    # Validation pass: no gradients are computed here, so there is nothing to
    # zero and the optimizer is not touched.
    model.eval()
    with torch.no_grad():
      for data in dataloader_valid:
        inputs = data['inputs'].to(device)
        labels = data['labels'].to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        measure.record_valid(inputs,outputs,labels,loss.item())
    measure.valid()

    measure.record_epoch(epoch)
    epochs_run = epoch + 1
    # Early stopping; the scheduler is only advanced when training continues.
    if measure.degrading(epoch): break
    scheduler.step()
  draw(epochs_run, measure.loss_train_list, measure.loss_valid_list, measure.accuracy_train_list, measure.accuracy_valid_list)

train_loop()


##課題88. パラメータチューニング [省略]
問題85や問題87のコードを改変し，ニューラルネットワークの形状やハイパーパラメータを調整しながら，高性能なカテゴリ分類器を構築せよ．


##課題89. 事前学習済み言語モデルからの転移学習
事前学習済み言語モデル（例えばBERTなど）を出発点として，ニュース記事見出しをカテゴリに分類するモデルを構築せよ．

###解説

https://qiita.com/yamaru/items/63a342c844cff056a549 を参考にさせていただきました。

読みやすいように書き直しています。

また精度が出るように、ネット構成、Optimizer、ロス関数を変えています。

PyTorch の `nn.Transformer` クラスではなく、Hugging Face の `transformers` パッケージを使っています。

In [None]:
# Print the current working directory, then switch to the folder that holds
# this notebook's data files.
# NOTE(review): absolute Colab path; presumably Google Drive must be mounted
# before this cell runs — confirm.
!pwd
%cd /content/drive/My Drive/Colab Notebooks/NLP

In [None]:
# Install the `transformers` package pinned to version 3.
# NOTE(review): BERTClass.forward in this notebook unpacks the BertModel result
# as a 2-tuple, which presumably depends on this pinned version — confirm
# before upgrading.
!pip install transformers==3

In [None]:
import numpy as np
import transformers
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch import optim
from torch import cuda
import time
from matplotlib import pyplot as plt
import pandas as pd

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Parse the raw corpus once and keep only the headline and category of the
# five selected publishers.
df = pd.read_csv('./newsCorpora.csv', header=None, sep='\t',
    names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
df = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']),
            ['TITLE', 'CATEGORY']]

# news.csv receives a cleaned copy of the titles: every non-letter becomes a
# space and runs of spaces are collapsed. `df` itself is left untouched.
# NOTE(review): the train/valid/test split below uses the original (uncleaned)
# titles; only news.csv gets the cleaned text — confirm this is intended.
titles_clean = (df['TITLE']
    .str.replace('[^a-zA-Z]',' ', regex=True)
    .str.replace(' +',' ', regex=True))
df.assign(TITLE=titles_clean).to_csv('./news.csv',sep='\t',index=False,header=False)

# Stratified 80/10/10 split: 20% is held out first, then halved into valid/test.
train, valid_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=123, stratify=df['CATEGORY'])
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, random_state=123, stratify=valid_test['CATEGORY'])
# Renumber rows from 0 so that the datasets can be indexed by position.
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)
print(train.head())


In [None]:
class NewsDataset(Dataset):
  """Dataset that tokenizes one headline per item and pairs it with its label vector.

  Args:
    x: headline strings; a pandas Series or any integer-indexable sequence.
    y: label vectors aligned with `x`, indexable by integer position.
    tokenizer: object exposing `encode_plus` (the notebook passes a BertTokenizer).
    max_len: `max_length` handed to the tokenizer for padding/truncation.
  """
  def __init__(self, x, y, tokenizer, max_len):
    self.x = x
    self.y = y
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of items, i.e. len(y)."""
    return len(self.y)

  def __getitem__(self, index):
    """Return {'ids', 'mask', 'labels'} tensors for the item at position `index`."""
    # Positional lookup for pandas objects, so a non-default index cannot
    # raise KeyError; plain sequences are indexed directly.
    text = self.x.iloc[index] if hasattr(self.x, 'iloc') else self.x[index]
    inputs = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      pad_to_max_length=True,
      truncation=True
    )
    ids = inputs['input_ids']
    # Inputs are processed at a fixed length, so short inputs are zero-padded;
    # the mask marks those positions.
    mask = inputs['attention_mask']

    return {
      'ids': torch.LongTensor(ids),
      'mask': torch.LongTensor(mask),
      # Explicit float32 conversion: accepts int, bool or float label rows and
      # avoids the legacy torch.Tensor constructor.
      'labels': torch.tensor(self.y[index], dtype=torch.float32)
    }

In [None]:
# One-hot encode the gold labels; the column order below fixes the class order.
_label_columns = ['CATEGORY_b', 'CATEGORY_e', 'CATEGORY_t', 'CATEGORY_m']
y_train, y_valid, y_test = [
  pd.get_dummies(split, columns=['CATEGORY'])[_label_columns].values
  for split in (train, valid, test)
]

# Build one dataset per split; all of them share the tokenizer and max length.
max_len = 20
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset_train, dataset_valid, dataset_test = [
  NewsDataset(split['TITLE'], onehot, tokenizer, max_len)
  for split, onehot in ((train, y_train), (valid, y_valid), (test, y_test))
]

# Show the first training item as a quick sanity check.
print(dataset_train[0])

In [None]:
class BERTClass(torch.nn.Module):
  """Pretrained BERT encoder followed by a small two-layer classification head.

  Args:
    drop_rate: kept for interface compatibility; no dropout layer is built,
      so the value is currently unused.
    output_size: number of output logits.
  """
  def __init__(self, drop_rate, output_size):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased')

    # Head: 768 -> 64 -> output_size (768 matches the BERT output width),
    # with Kaiming-normal initialised weights.
    self.fc1 = nn.Linear(768, 64)
    nn.init.kaiming_normal_(self.fc1.weight)
    self.fc2 = nn.Linear(64, output_size)
    nn.init.kaiming_normal_(self.fc2.weight)
    self.bn = nn.BatchNorm1d(64)

  def forward(self, ids, mask):
    """Return the head's logits for token ids `ids` and attention mask `mask`."""
    # Take the second element of the encoder result by index rather than by
    # 2-tuple unpacking, so the code does not depend on the result having
    # exactly two entries or being a plain tuple.
    pooled = self.bert(ids, attention_mask=mask)[1]
    return self.fc2(F.relu(self.bn(self.fc1(pooled))))

In [None]:
import matplotlib.pyplot as plt

class Measure():
  """Accumulates loss and accuracy within an epoch and keeps the per-epoch history.

  Usage per epoch: init_epoch() -> record_train(...) per batch -> train() ->
  record_valid(...) per batch -> valid() -> record_epoch(epoch).

  Losses are averaged per sample: each batch loss is weighted by its batch
  size and the sum is divided by the number of samples seen, so no
  module-level dataset object is needed. This assumes `lossitem` is the mean
  loss of the batch (the default reduction of nn.CrossEntropyLoss).
  All stored metrics are plain Python numbers, never tensors.
  """
  def __init__(self):
    # One entry per finished epoch.
    self.loss_train_list = []
    self.loss_valid_list = []
    self.accuracy_train_list = []
    self.accuracy_valid_list = []

  def init_epoch(self):
    """Reset the running counters at the start of an epoch."""
    self.correct_train = 0
    self.total_train = 0
    self.correct_valid = 0
    self.total_valid = 0
    self.loss_train = 0.0
    self.loss_valid = 0.0

  @staticmethod
  def _ratio(numerator, denominator):
    # An epoch without any recorded batch yields NaN instead of ZeroDivisionError.
    return numerator / denominator if denominator else float('nan')

  def accuracy(self, inputs,outputs,labels):
    """Return (number of items, number of correct predictions) for one batch.

    `labels` holds class indices; predictions are the argmax of `outputs`
    along dim 1. The item count is len(inputs).
    """
    total = len(inputs)
    prediction = torch.argmax(outputs, dim=1)
    correct = int(torch.sum(prediction==labels))
    return total, correct

  def record_train(self,inputs,outputs,labels,lossitem):
    """Add one training batch to the running totals."""
    total, correct = self.accuracy(inputs,outputs,labels)
    self.loss_train += lossitem * total
    self.total_train += total
    # int() also covers subclasses whose accuracy() returns a 0-dim tensor.
    self.correct_train += int(correct)

  def train(self):
    """Finalize the training metrics of the current epoch."""
    self.loss_train = self._ratio(self.loss_train, self.total_train)
    self.accuracy_train = self._ratio(self.correct_train, self.total_train)

  def record_valid(self,inputs,outputs,labels, lossitem):
    """Add one validation batch to the running totals."""
    total, correct = self.accuracy(inputs,outputs,labels)
    self.loss_valid += lossitem * total
    self.total_valid += total
    self.correct_valid += int(correct)

  def valid(self):
    """Finalize the validation metrics of the current epoch."""
    self.loss_valid = self._ratio(self.loss_valid, self.total_valid)
    self.accuracy_valid = self._ratio(self.correct_valid, self.total_valid)

  def record_epoch(self, epoch):
    """Print the metrics of the finished epoch and append them to the history."""
    print(f'epoch: {epoch + 1}, loss_train: {self.loss_train:.4f}, loss_valid: {self.loss_valid:.4f}, \
    accuracy_train: {self.accuracy_train:.2f}, accuracy_valid: {self.accuracy_valid:.2f}')
    self.loss_train_list.append(self.loss_train)
    self.loss_valid_list.append(self.loss_valid)
    self.accuracy_train_list.append(self.accuracy_train)
    self.accuracy_valid_list.append(self.accuracy_valid)

  def degrading(self, epoch):
    """True when the validation loss did not decrease over the last two epochs."""
    return (epoch > 1 and self.loss_valid_list[epoch - 2] <= self.loss_valid_list[epoch - 1] <= self.loss_valid_list[epoch])

def draw(numEpochs, loss_train_list, loss_valid_list, accuracy_train_list, accuracy_valid_list):
  """Plot the per-epoch training curves: one figure for loss, one for accuracy.

  Args:
    numEpochs: number of recorded epochs; the x axis is range(numEpochs).
    loss_train_list: per-epoch training losses.
    loss_valid_list: per-epoch validation losses.
    accuracy_train_list: per-epoch training accuracies.
    accuracy_valid_list: per-epoch validation accuracies.

  Each value is passed through float() before plotting, so the lists may hold
  plain numbers or 0-dim tensors (including tensors living on a GPU, which
  matplotlib cannot convert on its own).
  """
  def _plot_pair(train_values, valid_values, metric):
    # One figure showing the train and valid curve of a single metric.
    plt.figure()
    plt.plot(range(numEpochs), [float(v) for v in train_values], label=f'{metric}_train')
    plt.plot(range(numEpochs), [float(v) for v in valid_values], label=f'{metric}_valid')
    plt.legend()
    plt.xlabel('epoch')
    plt.show()

  _plot_pair(loss_train_list, loss_valid_list, 'loss')
  _plot_pair(accuracy_train_list, accuracy_valid_list, 'accuracy')

In [None]:
class Measure2(Measure):
  """Measure variant for one-hot encoded labels.

  Only accuracy() differs from the base class: the labels are reduced to
  class indices with argmax before being compared with the predictions.
  """
  def accuracy(self, inputs,outputs,labels):
    """Return (number of items, number of correct predictions) for one batch.

    Both `outputs` and the one-hot `labels` are reduced with argmax over the
    last dimension. The count is returned as a Python int, so the accuracies
    derived from it are plain numbers rather than tensors.
    """
    prediction = torch.argmax(outputs, dim=-1)  # predicted class per item
    gold = torch.argmax(labels, dim=-1)         # gold class per item
    total = len(gold)
    correct = int(torch.sum(prediction == gold))
    return total, correct

In [None]:

# Hyper-parameters for fine-tuning.
DROP_RATE = 0.2
szOUTPUT = 4
szBATCH = 32
numEPOCHS = 10
LEARNING_RATE = 0.005

model = BERTClass(DROP_RATE, szOUTPUT)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
device = 'cuda' if cuda.is_available() else 'cpu'
model.to(device)
# Only the training loader is shuffled.
dataloader_train = DataLoader(dataset_train, batch_size=szBATCH, shuffle=True)
dataloader_valid = DataLoader(dataset_valid, batch_size=szBATCH, shuffle=False)
measure = Measure2()
epochs_run = 0  # stays 0 when numEPOCHS is 0, so draw() still gets a valid count
for epoch in range(numEPOCHS):
  measure.init_epoch()

  # Training pass.
  model.train()
  for data in dataloader_train:
    optimizer.zero_grad()
    ids, mask, labels = data['ids'].to(device), data['mask'].to(device), data['labels'].to(device)
    outputs = model(ids, mask)
    # The datasets deliver one-hot float labels; CrossEntropyLoss is given the
    # equivalent class indices, which every PyTorch version accepts.
    loss = criterion(outputs, labels.argmax(dim=-1))
    loss.backward()
    optimizer.step()
    measure.record_train(ids,outputs,labels,loss.item())
  measure.train()

  # Validation pass: no gradients are computed, so the optimizer is not touched.
  model.eval()
  with torch.no_grad():
    for data in dataloader_valid:
      ids, mask, labels = data['ids'].to(device), data['mask'].to(device), data['labels'].to(device)
      outputs = model(ids, mask)
      loss = criterion(outputs, labels.argmax(dim=-1))
      measure.record_valid(ids,outputs,labels,loss.item())
  measure.valid()

  measure.record_epoch(epoch)
  epochs_run = epoch + 1
  # Early stopping. No optimizer step belongs here: parameters are updated
  # once per training batch only.
  if measure.degrading(epoch): break
draw(epochs_run, measure.loss_train_list, measure.loss_valid_list, measure.accuracy_train_list, measure.accuracy_valid_list)
