## 80. ID番号への変換

問題51で構築した学習データ中の単語にユニークなID番号を付与したい．学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与せよ．そして，与えられた単語列に対して，ID番号の列を返す関数を実装せよ．ただし，出現頻度が2回未満の単語のID番号はすべて0とせよ

In [None]:
import os 

cu_path = os.getcwd()
data_path = os.path.join(cu_path,'data','full_data_tokened.txt')

vocab = {}
with open(data_path) as data:
    for line in data:
        text_data = line.split("\t")[0]
        words = text_data.split()

        for word in  words:
            if word in vocab:
                vocab[word] +=1
            else:
                vocab[word] = 1

#あとでサンプルで使う
sample = words

#(単語,頻度)の順にタプルで入ってる
vocab_sorted = sorted(vocab.items(),key=lambda x:x[1])[::-1]

vocab_with_id = {}
#idに変換
for item in vocab_sorted:
    if item[1] >= 2:
        vocab_with_id[item[0]] = len(vocab_with_id.items())
    else:
        vocab_with_id[item[0]] = 0

def word2id(words):
    return [vocab_with_id[word] for word in words]
#sample
print(sample)
print(word2id(sample))

## 81. RNNによる予測

ID番号で表現された単語列x=(x1,x2,…,xT)
がある．ただし，T
は単語列の長さ，xt∈ℝV
は単語のID番号のone-hot表記である（V
は単語の総数である）．再帰型ニューラルネットワーク（RNN: Recurrent Neural Network）を用い，単語列x
からカテゴリy
を予測するモデルとして，次式を実装せよ．

h→0=0,h→t=RNN−→−−(emb(xt),h→t−1),y=softmax(W(yh)h→T+b(y))
ただし，emb(x)∈ℝdw
は単語埋め込み（単語のone-hot表記から単語ベクトルに変換する関数），h→t∈ℝdh
は時刻t
の隠れ状態ベクトル，RNN−→−−(x,h)
は入力x
と前時刻の隠れ状態h
から次状態を計算するRNNユニット，W(yh)∈ℝL×dh
は隠れ状態ベクトルからカテゴリを予測するための行列，b(y)∈ℝL
はバイアス項である（dw,dh,L
はそれぞれ，単語埋め込みの次元数，隠れ状態ベクトルの次元数，ラベル数である）．RNNユニットRNN−→−−(x,h)
には様々な構成が考えられるが，典型例として次式が挙げられる．

RNN−→−−(x,h)=g(W(hx)x+W(hh)h+b(h))
ただし，W(hx)∈ℝdh×dw，W(hh)∈ℝdh×dh,b(h)∈ℝdh
はRNNユニットのパラメータ，g
は活性化関数（例えばtanh
やReLUなど）である．

なお，この問題ではパラメータの学習を行わず，ランダムに初期化されたパラメータでy
を計算するだけでよい．次元数などのハイパーパラメータは，dw=300,dh=50
など，適当な値に設定せよ（以降の問題でも同様である）．

In [2]:
#train dev test 前の章で作ったものを使う
#full_data_tokened.txtは3つの全部が入ってる
#mozes でトークナイズ済み
import os 
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils.rnn 
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence,pad_sequence,pack_sequence
from sklearn.metrics import accuracy_score
import warnings
warnings.simplefilter('ignore')

class SimpleRNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(SimpleRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim,hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim,tagset_size)

    def forward(self,sentence,word_seq_lengths):
        embeds = self.word_embeddings(sentence)  

        #手書きでpaddingしたのでpackしてRNNに入れて戻す
        packed_words = pack_padded_sequence(embeds, word_seq_lengths.cpu().numpy(), batch_first=True)   
        hidden = None
        output, _ = self.rnn(packed_words,hidden)
        # output, _ = self.rnn(embeds)
        output, _ = pad_packed_sequence(output,batch_first=True)
        
        #ここ0の分処理してないので直す
        output=torch.sum(output,dim=1)
        
        tag_space = self.hidden2tag(output)
 
        tag_scores = F.log_softmax(tag_space,dim=1)
        return tag_scores

#vocab 次元の辞書を作る 80番の内容
def make_vocab(data_path):
    vocab = {}
    with open(data_path) as data:
        for line in data:
            text_data = line.split("\t")[0]
            words = text_data.split()

            for word in  words:
                if word in vocab:
                    vocab[word] +=1
                else:
                    vocab[word] = 1

    #(単語,頻度)の順にタプルで入ってる
    vocab_sorted = sorted(vocab.items(),key=lambda x:x[1])[::-1]

    vocab_with_id = {}
    #idに変換
    for item in vocab_sorted:
        if item[1] >= 2:
            vocab_with_id[item[0]] = len(vocab_with_id.items())
        else:
            vocab_with_id[item[0]] = 0

    return vocab_with_id

#train,test,devを読み込む関数
def data_import(mode,vocab_with_id):
    cu_path = os.getcwd()
    name = mode
    data_path = os.path.join(cu_path,"data",name+".txt")

    text_data_list = []
    label_list = []
    with open(data_path) as data:
        for line in data:
            text_data,label  = line.strip().split("\t")
            words = text_data.split()
            text_data_list.append(words)
            label_list.append(label)
    
    return text_data_list,label_list

#単語列 -> id に変換する
def prepare_sequence(seqs,with_id,mode):
    bacth_id_list = []
    for seq in seqs:
        idxs = [with_id[w] for w in seq]
        bacth_id_list.append(idxs)
    # print(bacth_id_list)
    # return torch.tensor(idxs, dtype=torch.long)
    return bacth_id_list

#結局手で書いてしまった
#系列長が違う生でlist > tensor の変換がうまくいかないので0埋めを手動でやった
def sequence2padded_tesnsor(seqs,labels):    
    batch_size = len(seqs)
    word_seq_lengths = torch.LongTensor(list(map(len, seqs)))
    max_seq_len = word_seq_lengths.max().item()
    word_seq_tensor = torch.zeros((batch_size, max_seq_len), requires_grad=True).long()
    label_seq_tensor = torch.zeros((batch_size), requires_grad=True).long()
    for idx, (seq, label,seqlen) in enumerate(zip(seqs, labels,word_seq_lengths)):
        seqlen = seqlen.item()
        word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        label_seq_tensor[idx] = torch.LongTensor(label)
    
    word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
    word_seq_tensor = word_seq_tensor[word_perm_idx]#長さごとにに並べ替え
    label_seq_tensor = label_seq_tensor[word_perm_idx]

    return word_seq_tensor,label_seq_tensor,word_seq_lengths
    
def main():
    #paraameter
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 50
    BATCH_SIZE = 32
     
    cu_path = os.getcwd()
    data_path = os.path.join(cu_path,'data','full_data_tokened.txt')
    vocab_with_id = make_vocab(data_path)
    train_X, train_y = data_import("train",vocab_with_id)

    label_with_id = {"b":0,"t":1,"e":2,"m":3}

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = SimpleRNN(EMBEDDING_DIM, 
                      HIDDEN_DIM, 
                      len(vocab_with_id), 
                      len(label_with_id))
    loss_function = nn.NLLLoss()

    optimizer = optim.SGD(model.parameters(), lr=0.01)

    model.to(device)

    model.eval()
    for epoch in range(1):
        print(epoch)
        train_num = len(train_X)
        batch_size = BATCH_SIZE

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1

        loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])
        
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            #全部一気に載せれなかったのでバッチで少しづつ載せる
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = train_X[start:end]
            tags = train_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets)

            tag_scores = model(sentence_in,word_seq_lengths)
            
            # print(tag_scores)
            
            pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets)
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
            
        if torch.cuda.is_available():
            train_acc = accuracy_score(gold_list.cpu(), pred_list.cpu())
        else:
            train_acc = accuracy_score(gold_list, pred_list)

        print("train_acc :",train_acc)        

if __name__ == '__main__':
    main()

0
train_acc : 0.2926371149511645


##  82. 確率的勾配降下法による学習

確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題81で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

In [8]:
#train dev test 前の章で作ったものを使う
#full_data_tokened.txtは3つの全部が入ってる
#mozes でtトークナイズ済み
import os 
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence,pad_sequence,pack_sequence
from sklearn.metrics import accuracy_score
import random

class SimpleRNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(SimpleRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim,hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim,tagset_size)

    def forward(self,sentence,word_seq_lengths):
        embeds = self.word_embeddings(sentence)  
        #手書きでpaddingしたのでpackしてRNNに入れて戻す
        packed_words = pack_padded_sequence(embeds, word_seq_lengths.cpu().numpy(), batch_first=True)  
        hidden = None
        output, _ = self.rnn(packed_words,hidden)
        # output, _ = self.rnn(embeds)
        output, _ = pad_packed_sequence(output,batch_first=True)
        
        #ここ0の分処理してないので直す
        output=torch.sum(output,dim=1)
        
        tag_space = self.hidden2tag(output)
 
        tag_scores = F.log_softmax(tag_space,dim=1)
        return tag_scores

#vocab 次元の辞書を作る 80番の内容
def make_vocab(data_path):
    vocab = {}
    with open(data_path) as data:
        for line in data:
            text_data = line.split("\t")[0]
            words = text_data.split()

            for word in  words:
                if word in vocab:
                    vocab[word] +=1
                else:
                    vocab[word] = 1

    #(単語,頻度)の順にタプルで入ってる
    vocab_sorted = sorted(vocab.items(),key=lambda x:x[1])[::-1]

    vocab_with_id = {}
    #idに変換
    for item in vocab_sorted:
        if item[1] >= 2:
            vocab_with_id[item[0]] = len(vocab_with_id.items())
        else:
            vocab_with_id[item[0]] = 0

    return vocab_with_id

#train,test,devを読み込む関数
def data_import(mode,vocab_with_id):
    cu_path = os.getcwd()
    name = mode
    data_path = os.path.join(cu_path,"data",name+".txt")

    text_data_list = []
    label_list = []
    with open(data_path) as data:
        for line in data:
            text_data,label  = line.strip().split("\t")
            words = text_data.split()
            text_data_list.append(words)
            label_list.append(label)
    
    return text_data_list,label_list

#単語列 -> id に変換する
def prepare_sequence(seqs,with_id,mode):
    bacth_id_list = []
    for seq in seqs:
        idxs = [with_id[w] for w in seq]
        bacth_id_list.append(idxs)
    # print(bacth_id_list)
    # return torch.tensor(idxs, dtype=torch.long)
    return bacth_id_list

#結局手で書いてしまった
#系列長が違う生でlist > tensor の変換がうまくいかないので0埋めを手動でやった
def sequence2padded_tesnsor(seqs,labels):    
    batch_size = len(seqs)
    word_seq_lengths = torch.LongTensor(list(map(len, seqs)))
    max_seq_len = word_seq_lengths.max().item()
    word_seq_tensor = torch.zeros((batch_size, max_seq_len), requires_grad=True).long()
    label_seq_tensor = torch.zeros((batch_size), requires_grad=True).long()
    # mask = torch.zeros((batch_size, max_seq_len), requires_grad=True).byte()
    for idx, (seq, label,seqlen) in enumerate(zip(seqs, labels,word_seq_lengths)):
        seqlen = seqlen.item()
        word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        label_seq_tensor[idx] = torch.LongTensor(label)
        # mask[idx, :seqlen] = torch.Tensor([1] * seqlen)
    
    word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
    word_seq_tensor = word_seq_tensor[word_perm_idx]#長さごとにに並べ替え
    label_seq_tensor = label_seq_tensor[word_perm_idx]

    return word_seq_tensor,label_seq_tensor,word_seq_lengths
    
def main():
    #paraameter
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 50
    BATCH_SIZE = 32
     
    cu_path = os.getcwd()
    data_path = os.path.join(cu_path,'data','full_data_tokened.txt')
    vocab_with_id = make_vocab(data_path)
    train_X, train_y = data_import("train",vocab_with_id)

    label_with_id = {"b":0,"t":1,"e":2,"m":3}

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = SimpleRNN(EMBEDDING_DIM, 
                      HIDDEN_DIM, 
                      len(vocab_with_id), 
                      len(label_with_id))
    loss_function = nn.NLLLoss()

    optimizer = optim.SGD(model.parameters(), lr=0.01)

    model.to(device)

    for epoch in range(10):
        model.train()
        print(epoch)
        train_num = len(train_X)
        batch_size = BATCH_SIZE

        #shuffle
        combined=list(zip(train_X,train_y))
        random.shuffle(combined)
        train_X,train_y=zip(*combined)

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1
        
        loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])
        
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = train_X[start:end]
            tags = train_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            loss_total += loss.item()
            loss.backward()
            optimizer.step()

            pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets)
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)

        if torch.cuda.is_available():
            train_acc = accuracy_score(gold_list.cpu(), pred_list.cpu())
        else:
            train_acc = accuracy_score(gold_list, pred_list)

        print("train_acc :",train_acc)        
        print("train_loss :",loss_total)

        #========================================================
        #dev
        dev_X, dev_y = data_import("dev",vocab_with_id)
        train_num = len(dev_X)
        batch_size = BATCH_SIZE

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1

        dev_loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])

        model.eval()
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = dev_X[start:end]
            tags = dev_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            dev_loss_total += loss.item()

            pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets)
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        if torch.cuda.is_available():
            dev_acc = accuracy_score(gold_list.cpu(), pred_list.cpu())
        else:
            dev_acc = accuracy_score(gold_list, pred_list)

        print("dev_acc :",dev_acc)        
        print("devloss :",dev_loss_total)

if __name__ == '__main__':
    main()


0
train_acc : 0.6216190833959429
train_loss : 334.00431844592094
dev_acc : 0.6729041916167665
devloss : 37.01860195398331
1
train_acc : 0.7006010518407213
train_loss : 269.1085135638714
dev_acc : 0.6916167664670658
devloss : 34.09345591068268
2
train_acc : 0.7271788129226145
train_loss : 242.58858534693718
dev_acc : 0.7208083832335329
devloss : 32.20902916789055
3
train_acc : 0.7556348610067618
train_loss : 221.2155325114727
dev_acc : 0.7357784431137725
devloss : 30.744750767946243
4
train_acc : 0.7821187077385424
train_loss : 201.87052065134048
dev_acc : 0.7410179640718563
devloss : 30.107810616493225
5
train_acc : 0.7982719759579263
train_loss : 184.1765981465578
dev_acc : 0.7574850299401198
devloss : 28.469183325767517
6
train_acc : 0.818557475582269
train_loss : 168.24360464513302
dev_acc : 0.7544910179640718
devloss : 28.34752196073532
7
train_acc : 0.8386551465063862
train_loss : 153.6498180180788
dev_acc : 0.7702095808383234
devloss : 27.637164175510406
8
train_acc : 0.855935386

## 83. ミニバッチ化・GPU上での学習

問題82のコードを改変し，B
事例ごとに損失・勾配を計算して学習を行えるようにせよ（B
の値は適当に選べ）．また，GPU上で学習を実行せよ．

In [13]:
import os 
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence,pad_sequence,pack_sequence
from sklearn.metrics import accuracy_score
import random


class SimpleRNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(SimpleRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim,hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim,tagset_size)

    def forward(self,sentence,word_seq_lengths):
        embeds = self.word_embeddings(sentence)  
        #手書きでpaddingしたのでpackしてRNNに入れて戻す
        packed_words = pack_padded_sequence(embeds, word_seq_lengths.cpu().numpy(), batch_first=True)   
        hidden = None
        output, _ = self.rnn(packed_words,hidden)
        # output, _ = self.rnn(embeds)
        output, _ = pad_packed_sequence(output,batch_first=True)
        
        #ここ0の分処理してないので直す
        output=torch.sum(output,dim=1)
        
        tag_space = self.hidden2tag(output)
 
        tag_scores = F.log_softmax(tag_space,dim=1)
        return tag_scores

#vocab 次元の辞書を作る 80番の内容
def make_vocab(data_path):
    vocab = {}
    with open(data_path) as data:
        for line in data:
            text_data = line.split("\t")[0]
            words = text_data.split()

            for word in  words:
                if word in vocab:
                    vocab[word] +=1
                else:
                    vocab[word] = 1

    #(単語,頻度)の順にタプルで入ってる
    vocab_sorted = sorted(vocab.items(),key=lambda x:x[1])[::-1]

    vocab_with_id = {}
    #idに変換
    for item in vocab_sorted:
        if item[1] >= 2:
            vocab_with_id[item[0]] = len(vocab_with_id.items())
        else:
            vocab_with_id[item[0]] = 0

    return vocab_with_id

#train,test,devを読み込む関数
def data_import(mode,vocab_with_id):
    cu_path = os.getcwd()
    name = mode
    data_path = os.path.join(cu_path,"data",name+".txt")

    text_data_list = []
    label_list = []
    with open(data_path) as data:
        for line in data:
            text_data,label  = line.strip().split("\t")
            words = text_data.split()
            text_data_list.append(words)
            label_list.append(label)
    
    return text_data_list,label_list

#単語列 -> id に変換する
def prepare_sequence(seqs,with_id,mode):
    bacth_id_list = []
    for seq in seqs:
        idxs = [with_id[w] for w in seq]
        bacth_id_list.append(idxs)
    # print(bacth_id_list)
    # return torch.tensor(idxs, dtype=torch.long)
    return bacth_id_list

#結局手で書いてしまった
#系列長が違う生でlist > tensor の変換がうまくいかないので0埋めを手動でやった
def sequence2padded_tesnsor(seqs,labels,device):    
    batch_size = len(seqs)
    word_seq_lengths = torch.LongTensor(list(map(len, seqs)))
    max_seq_len = word_seq_lengths.max().item()
    word_seq_tensor = torch.zeros((batch_size, max_seq_len), requires_grad=True).long()
    label_seq_tensor = torch.zeros((batch_size), requires_grad=True).long()
    # mask = torch.zeros((batch_size, max_seq_len), requires_grad=True).byte()
    for idx, (seq, label,seqlen) in enumerate(zip(seqs, labels,word_seq_lengths)):
        seqlen = seqlen.item()
        word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        label_seq_tensor[idx] = torch.LongTensor(label)
        # mask[idx, :seqlen] = torch.Tensor([1] * seqlen)
    
    word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
    word_seq_tensor = word_seq_tensor[word_perm_idx]#長さごとにに並べ替え
    label_seq_tensor = label_seq_tensor[word_perm_idx]

    if device == "cuda":
        word_seq_tensor = word_seq_tensor.to(device)
        label_seq_tensor = label_seq_tensor.to(device)
        word_seq_lengths = word_seq_lengths.to(device)

    return word_seq_tensor,label_seq_tensor,word_seq_lengths
    
def main():
    #paraameter
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 50
    BATCH_SIZE = 32
     
    cu_path = os.getcwd()
    data_path = os.path.join(cu_path,'data','full_data_tokened.txt')
    vocab_with_id = make_vocab(data_path)
    train_X, train_y = data_import("train",vocab_with_id)

    label_with_id = {"b":0,"t":1,"e":2,"m":3}

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = SimpleRNN(EMBEDDING_DIM, 
                      HIDDEN_DIM, 
                      len(vocab_with_id), 
                      len(label_with_id))
    loss_function = nn.NLLLoss()

    optimizer = optim.SGD(model.parameters(), lr=0.01)

    model.to(device)

    for epoch in range(10):
        model.train()
        print(epoch)
        train_num = len(train_X)
        batch_size = BATCH_SIZE

        #shuffle
        combined=list(zip(train_X,train_y))
        random.shuffle(combined)
        train_X,train_y=zip(*combined)

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1
        
        loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])
        
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = train_X[start:end]
            tags = train_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            loss_total += loss.item()
            loss.backward()
            optimizer.step()

            if torch.cuda.is_available():
                pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
            else:
                pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets.cpu())
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        train_acc = accuracy_score(gold_list, pred_list)

        print("train_acc :",train_acc)        
        print("train_loss :",loss_total)

        #========================================================
        #dev
        dev_X, dev_y = data_import("dev",vocab_with_id)
        train_num = len(dev_X)
        batch_size = BATCH_SIZE

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1

        dev_loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])

        model.eval()
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = dev_X[start:end]
            tags = dev_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            dev_loss_total += loss.item()
            
            if torch.cuda.is_available():
                pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
            else:
                pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets.cpu())
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        dev_acc = accuracy_score(gold_list, pred_list)

        print("dev_acc :",dev_acc)        
        print("devloss :",dev_loss_total)

if __name__ == '__main__':
    main()


0
train_acc : 0.6132607062359129
train_loss : 334.6142292022705
dev_acc : 0.6796407185628742
devloss : 36.24382966756821
1
train_acc : 0.697877535687453
train_loss : 269.3880696296692
dev_acc : 0.7050898203592815
devloss : 32.87444168329239
2
train_acc : 0.7277422990232908
train_loss : 241.03529497981071
dev_acc : 0.7215568862275449
devloss : 31.17666858434677
3
train_acc : 0.7537565740045079
train_loss : 218.00044417381287
dev_acc : 0.7342814371257484
devloss : 30.350026458501816
4
train_acc : 0.7831517655897822
train_loss : 198.1716799288988
dev_acc : 0.7402694610778443
devloss : 28.892420887947083
5
train_acc : 0.8026859504132231
train_loss : 181.21855601668358
dev_acc : 0.7470059880239521
devloss : 27.663793861865997
6
train_acc : 0.8218444778362134
train_loss : 165.22911374270916
dev_acc : 0.7604790419161677
devloss : 27.004014551639557
7
train_acc : 0.8385612321562734
train_loss : 150.19089603424072
dev_acc : 0.7612275449101796
devloss : 26.666532933712006
8
train_acc : 0.8539631

## 84. 単語ベクトルの導入

事前学習済みの単語ベクトル（例えば，Google Newsデータセット（約1,000億単語）での学習済み単語ベクトル）で単語埋め込みemb(x)
を初期化し，学習せよ．

In [14]:
import os 
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence,pad_sequence,pack_sequence
from sklearn.metrics import accuracy_score
import random
import gensim


class SimpleRNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size,pre_trained_embedding):
        super(SimpleRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight = nn.Parameter(torch.from_numpy(pre_trained_embedding))
        self.rnn = nn.RNN(embedding_dim,hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim,tagset_size)

        # if pre_trained_embedding:


    def forward(self,sentence,word_seq_lengths):
        embeds = self.word_embeddings(sentence)  
        #手書きでpaddingしたのでpackしてRNNに入れて戻す
        packed_words = pack_padded_sequence(embeds, word_seq_lengths.cpu().numpy(), batch_first=True) 
        hidden = None
        output, h_n = self.rnn(packed_words,hidden)
        #h_nは各時刻の情報がpackされている情報がくる
        #そのまま使うとpackの最後の情報がピンポイントで使えて便利
        tag_space = self.hidden2tag(h_n[0])
 
        tag_scores = F.log_softmax(tag_space,dim=1)
        return tag_scores

#vocab 次元の辞書を作る 80番の内容
def make_vocab(data_path):
    vocab = {}
    with open(data_path) as data:
        for line in data:
            text_data = line.split("\t")[0]
            words = text_data.split()

            for word in  words:
                if word in vocab:
                    vocab[word] +=1
                else:
                    vocab[word] = 1

    #(単語,頻度)の順にタプルで入ってる
    vocab_sorted = sorted(vocab.items(),key=lambda x:x[1])[::-1]

    vocab_with_id = {}
    #idに変換
    for item in vocab_sorted:
        #default_dictを使って一行で書く
        if item[1] >= 2:
            vocab_with_id[item[0]] = len(vocab_with_id.items())+1
        else:
            vocab_with_id[item[0]] = 0

    return vocab_with_id

#train,test,devを読み込む関数
def data_import(mode,vocab_with_id):
    cu_path = os.getcwd()
    name = mode
    data_path = os.path.join(cu_path,"data",name+".txt")

    text_data_list = []
    label_list = []
    with open(data_path) as data:
        for line in data:
            text_data,label  = line.strip().split("\t")
            words = text_data.split()
            text_data_list.append(words)
            label_list.append(label)
    
    return text_data_list,label_list

#単語列 -> id に変換する
#ここでID errorの時に0に処理するプログラムに変える
def prepare_sequence(seqs,with_id,mode):
    bacth_id_list = []
    for seq in seqs:
        idxs = [with_id[w] if (w in with_id) else 0 for w in seq]
        bacth_id_list.append(idxs)
    # print(bacth_id_list)
    # return torch.tensor(idxs, dtype=torch.long)
    return bacth_id_list


#torch.saveはnumpyでもできる
#data => torch.saveでsave それをtorch.loadで読み込むと早くなるかも

#データセットをある程度長さごとに固めてバッチを作ると系列長が揃って処理時間が短くなる
#Allen_NLPを使うと便利

#結局手で書いてしまった
#系列長が違う生でlist > tensor の変換がうまくいかないので0埋めを手動でやった
def sequence2padded_tesnsor(seqs,labels,device):    
    batch_size = len(seqs)
    word_seq_lengths = torch.LongTensor(list(map(len, seqs)))
    max_seq_len = word_seq_lengths.max().item()
    word_seq_tensor = torch.zeros((batch_size, max_seq_len), requires_grad=True).long()
    label_seq_tensor = torch.zeros((batch_size), requires_grad=True).long()
    # mask = torch.zeros((batch_size, max_seq_len), requires_grad=True).byte()
    for idx, (seq, label,seqlen) in enumerate(zip(seqs, labels,word_seq_lengths)):
        seqlen = seqlen.item()
        word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        label_seq_tensor[idx] = torch.LongTensor(label)
        # mask[idx, :seqlen] = torch.Tensor([1] * seqlen)
    
    word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
    word_seq_tensor = word_seq_tensor[word_perm_idx]#長さごとにに並べ替え
    label_seq_tensor = label_seq_tensor[word_perm_idx]

    if device == "cuda":
        word_seq_tensor = word_seq_tensor.to(device)
        label_seq_tensor = label_seq_tensor.to(device)
        word_seq_lengths = word_seq_lengths.to(device)

    return word_seq_tensor,label_seq_tensor,word_seq_lengths

def init_embedding(vocab_with_id,EMBEDDING_DIM):
    from gensim.models import KeyedVectors
    path = 'data/GoogleNews-vectors-negative300.bin'
    vectors = KeyedVectors.load_word2vec_format(path,binary=True)
    matrix = np.zeros((len(vocab_with_id),EMBEDDING_DIM))

    for word,id in vocab_with_id.items():
        if word in vectors.vocab:
            matrix[id] = vectors[word]
    matrix[0] = np.zeros(EMBEDDING_DIM)
    

    return matrix
        

def main():
    #paraameter
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 50
    BATCH_SIZE = 32
     
    cu_path = os.getcwd()
    data_path = os.path.join(cu_path,'data','train.txt')
    vocab_with_id = make_vocab(data_path)
    pre_trained_embedding = init_embedding(vocab_with_id,EMBEDDING_DIM)
    
    train_X, train_y = data_import("train",vocab_with_id)

    label_with_id = {"b":0,"t":1,"e":2,"m":3}

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = SimpleRNN(EMBEDDING_DIM, 
                      HIDDEN_DIM, 
                      len(vocab_with_id), 
                      len(label_with_id),
                      pre_trained_embedding).float()
    #型エラーが出たのでfloatにして修正
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    
    model.to(device)



    for epoch in range(10):
        model.train()
        print(epoch)
        train_num = len(train_X)
        batch_size = BATCH_SIZE

        #shuffle
        combined=list(zip(train_X,train_y))
        random.shuffle(combined)
        train_X,train_y=zip(*combined)

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1
        
        loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])
        
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = train_X[start:end]
            tags = train_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            loss_total += loss.item()
            loss.backward()
            optimizer.step()

            if torch.cuda.is_available():
                pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
            else:
                pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets.cpu())
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        train_acc = accuracy_score(gold_list, pred_list)

        print("train_acc :",train_acc)        
        print("train_loss :",loss_total)

        #========================================================
        #dev
        dev_X, dev_y = data_import("dev",vocab_with_id)
        train_num = len(dev_X)
        batch_size = BATCH_SIZE

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1

        dev_loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])

        model.eval()
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = dev_X[start:end]
            tags = dev_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            dev_loss_total += loss.item()
            
            if torch.cuda.is_available():
                pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
            else:
                pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets.cpu())
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        dev_acc = accuracy_score(gold_list, pred_list)

        print("dev_acc :",dev_acc)        
        print("devloss :",dev_loss_total)

if __name__ == '__main__':
    main()


0
train_acc : 0.7383546205860255
train_loss : 242.85468243062496
dev_acc : 0.8038922155688623
devloss : 22.48978614807129
1
train_acc : 0.8225957926371149
train_loss : 167.74377293139696
dev_acc : 0.8473053892215568
devloss : 18.099978432059288
2
train_acc : 0.8524605559729527
train_loss : 141.10449175536633
dev_acc : 0.8637724550898204
devloss : 16.390637129545212
3
train_acc : 0.8672051089406462
train_loss : 131.88817410171032
dev_acc : 0.8510479041916168
devloss : 17.1375330388546
4
train_acc : 0.8752817430503381
train_loss : 122.67428221926093
dev_acc : 0.7305389221556886
devloss : 31.017480820417404
5
train_acc : 0.8855184072126221
train_loss : 112.81891445443034
dev_acc : 0.8308383233532934
devloss : 20.215745612978935
6
train_acc : 0.902610818933133
train_loss : 98.28194688819349
dev_acc : 0.8817365269461078
devloss : 14.844500876963139
7
train_acc : 0.8713373403456048
train_loss : 126.15577337145805
dev_acc : 0.8540419161676647
devloss : 17.645660877227783
8
train_acc : 0.90552

## 85. 双方向RNN・多層化

順方向と逆方向のRNNの両方を用いて入力テキストをエンコードし，モデルを学習せよ．

h⃖ T+1=0,h⃖ t=RNN←−−−(emb(xt),h⃖ t+1),y=softmax(W(yh)[h→T;h⃖ 1]+b(y))
ただし，h→t∈ℝdh,h⃖ t∈ℝdh
はそれぞれ，順方向および逆方向のRNNで求めた時刻t
の隠れ状態ベクトル，RNN←−−−(x,h)
は入力x
と次時刻の隠れ状態h
から前状態を計算するRNNユニット，W(yh)∈ℝL×2dh
は隠れ状態ベクトルからカテゴリを予測するための行列，b(y)∈ℝL
はバイアス項である．また，[a;b]
はベクトルa
とb
の連結を表す。

さらに，双方向RNNを多層化して実験せよ．



In [15]:
#train dev test 前の章で作ったものを使う
#full_data_tokened.txtは3つの全部が入ってる
#mozes でtトークナイズ済み
import os 
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence,pad_sequence,pack_sequence
from sklearn.metrics import accuracy_score
import random
import gensim


class BiRNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size,pre_trained_embedding):
        super(BiRNN, self).__init__()
        self.hidden_dim = hidden_dim
        middle_dim = 200
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight = nn.Parameter(torch.from_numpy(pre_trained_embedding))
        self.birnn = nn.RNN(embedding_dim,hidden_dim,bidirectional=True)
        self.hidden2middle = nn.Linear(hidden_dim*2,middle_dim)
        self.middle2tag = nn.Linear(middle_dim,tagset_size)


    def forward(self,sentence,word_seq_lengths):
        embeds = self.word_embeddings(sentence)  
        #手書きでpaddingしたのでpackしてRNNに入れて戻す
        packed_words = pack_padded_sequence(embeds, word_seq_lengths.cpu().numpy(), batch_first=True) 
        hidden = None
        output, h_n = self.birnn(packed_words,hidden)
        forward_h = h_n[:][0]#前向きの最後
        back_h = h_n[:][1]#後ろ向きの最後
        birnn_out = torch.cat((forward_h,back_h),1)#catしてhidden_dim*2次元
        #h_nは各時刻の情報がpackされている情報がくる
        #そのまま使うとpackの最後の情報がピンポイントで使えて便利
        middle_out  = self.hidden2middle(birnn_out)
        middle_out = F.relu(middle_out)
        tag_space = self.middle2tag(middle_out)
        tag_scores = F.log_softmax(tag_space,dim=1)

        return tag_scores

#vocab 次元の辞書を作る 80番の内容
def make_vocab(data_path):
    vocab = {}
    with open(data_path) as data:
        for line in data:
            text_data = line.split("\t")[0]
            words = text_data.split()

            for word in  words:
                if word in vocab:
                    vocab[word] +=1
                else:
                    vocab[word] = 1

    #(単語,頻度)の順にタプルで入ってる
    vocab_sorted = sorted(vocab.items(),key=lambda x:x[1])[::-1]

    vocab_with_id = {}
    #idに変換
    for item in vocab_sorted:
        #default_dictを使って一行で書く
        if item[1] >= 2:
            vocab_with_id[item[0]] = len(vocab_with_id.items())+1
        else:
            vocab_with_id[item[0]] = 0

    return vocab_with_id

#train,test,devを読み込む関数
def data_import(mode,vocab_with_id):
    cu_path = os.getcwd()
    name = mode
    data_path = os.path.join(cu_path,"data",name+".txt")

    text_data_list = []
    label_list = []
    with open(data_path) as data:
        for line in data:
            text_data,label  = line.strip().split("\t")
            words = text_data.split()
            text_data_list.append(words)
            label_list.append(label)
    
    return text_data_list,label_list

#単語列 -> id に変換する
#ここでID errorの時に0に処理するプログラムに変える
def prepare_sequence(seqs,with_id,mode):
    bacth_id_list = []
    for seq in seqs:
        idxs = [with_id[w] if (w in with_id) else 0 for w in seq]
        bacth_id_list.append(idxs)
    # print(bacth_id_list)
    # return torch.tensor(idxs, dtype=torch.long)
    return bacth_id_list


#torch.saveはnumpyでもできる
#data => torch.saveでsave それをtorch.loadで読み込むと早くなるかも

#データセットをある程度長さごとに固めてバッチを作ると系列長が揃って処理時間が短くなる
#Allen_NLPを使うと便利

#結局手で書いてしまった
#系列長が違う生でlist > tensor の変換がうまくいかないので0埋めを手動でやった
def sequence2padded_tesnsor(seqs,labels,device):    
    batch_size = len(seqs)
    word_seq_lengths = torch.LongTensor(list(map(len, seqs)))
    max_seq_len = word_seq_lengths.max().item()
    word_seq_tensor = torch.zeros((batch_size, max_seq_len), requires_grad=True).long()
    label_seq_tensor = torch.zeros((batch_size), requires_grad=True).long()
    # mask = torch.zeros((batch_size, max_seq_len), requires_grad=True).byte()
    for idx, (seq, label,seqlen) in enumerate(zip(seqs, labels,word_seq_lengths)):
        seqlen = seqlen.item()
        word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        label_seq_tensor[idx] = torch.LongTensor(label)
        # mask[idx, :seqlen] = torch.Tensor([1] * seqlen)
    
    word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
    word_seq_tensor = word_seq_tensor[word_perm_idx]#長さごとにに並べ替え
    label_seq_tensor = label_seq_tensor[word_perm_idx]

    if device == "cuda":
        word_seq_tensor = word_seq_tensor.to(device)
        label_seq_tensor = label_seq_tensor.to(device)
        word_seq_lengths = word_seq_lengths.to(device)

    return word_seq_tensor,label_seq_tensor,word_seq_lengths

def init_embedding(vocab_with_id,EMBEDDING_DIM):
    from gensim.models import KeyedVectors
    #作ったvocabのindexに紐付ける
    matrix = np.zeros((len(vocab_with_id),EMBEDDING_DIM))
    path = 'data/GoogleNews-vectors-negative300.bin'
    vectors = KeyedVectors.load_word2vec_format(path,binary=True)
    for word,id in vocab_with_id.items():
        if word in vectors.vocab:
            matrix[id] = vectors[word]

    matrix[0] = np.zeros(EMBEDDING_DIM)
    
    return matrix
        
def main():
    #paraameter
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 50
    BATCH_SIZE = 32
     
    cu_path = os.getcwd()
    data_path = os.path.join(cu_path,'data','train.txt')
    vocab_with_id = make_vocab(data_path)
    pre_trained_embedding = init_embedding(vocab_with_id,EMBEDDING_DIM)
    
    train_X, train_y = data_import("train",vocab_with_id)

    label_with_id = {"b":0,"t":1,"e":2,"m":3}

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = BiRNN(EMBEDDING_DIM, 
                      HIDDEN_DIM, 
                      len(vocab_with_id), 
                      len(label_with_id),
                      pre_trained_embedding).float()
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.3)
    
    model.to(device)

    for epoch in range(10):
        model.train()
        print(epoch)
        train_num = len(train_X)
        batch_size = BATCH_SIZE

        #shuffle
        combined=list(zip(train_X,train_y))
        random.shuffle(combined)
        train_X,train_y=zip(*combined)

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1
        
        loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])
        
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = train_X[start:end]
            tags = train_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            loss_total += loss.item()
            loss.backward()
            optimizer.step()

            if torch.cuda.is_available():
                pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
            else:
                pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets.cpu())
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        train_acc = accuracy_score(gold_list, pred_list)

        print("train_acc :",train_acc)        
        print("train_loss :",loss_total)

        #========================================================
        #dev
        dev_X, dev_y = data_import("dev",vocab_with_id)
        train_num = len(dev_X)
        batch_size = BATCH_SIZE

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1

        dev_loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])

        model.eval()
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = dev_X[start:end]
            tags = dev_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            dev_loss_total += loss.item()
            
            if torch.cuda.is_available():
                pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
            else:
                pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets.cpu())
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        dev_acc = accuracy_score(gold_list, pred_list)

        print("dev_acc :",dev_acc)        
        print("devloss :",dev_loss_total)

if __name__ == '__main__':
    main()


0
train_acc : 0.7742299023290758
train_loss : 205.08350579440594
dev_acc : 0.8600299401197605
devloss : 17.4151853621006
1
train_acc : 0.8552779864763336
train_loss : 135.75032676011324
dev_acc : 0.843562874251497
devloss : 17.769000574946404
2
train_acc : 0.8909654395191585
train_loss : 102.9591537117958
dev_acc : 0.8607784431137725
devloss : 15.9034653455019
3
train_acc : 0.9167918858001503
train_loss : 77.89074492640793
dev_acc : 0.8697604790419161
devloss : 15.179561972618103
4
train_acc : 0.935762584522915
train_loss : 62.71002884674817
dev_acc : 0.8630239520958084
devloss : 18.299617916345596
5
train_acc : 0.9504132231404959
train_loss : 48.66107478446793
dev_acc : 0.8203592814371258
devloss : 26.735527724027634
6
train_acc : 0.9607438016528925
train_loss : 35.55458526988514
dev_acc : 0.8540419161676647
devloss : 25.607435926795006
7
train_acc : 0.9572689706987227
train_loss : 40.36867264145985
dev_acc : 0.8592814371257484
devloss : 22.24944542348385
8
train_acc : 0.9750187828700

## 86. 畳み込みニューラルネットワーク (CNN)

ID番号で表現された単語列x=(x1,x2,…,xT)
がある．ただし，T
は単語列の長さ，xt∈ℝV
は単語のID番号のone-hot表記である（V
は単語の総数である）．畳み込みニューラルネットワーク（CNN: Convolutional Neural Network）を用い，単語列x
からカテゴリy
を予測するモデルを実装せよ．

ただし，畳み込みニューラルネットワークの構成は以下の通りとする．

単語埋め込みの次元数: dw
畳み込みのフィルターのサイズ: 3 トークン
畳み込みのストライド: 1 トークン
畳み込みのパディング: あり
畳み込み演算後の各時刻のベクトルの次元数: dh
畳み込み演算後に最大値プーリング（max pooling）を適用し，入力文をdh
次元の隠れベクトルで表現
すなわち，時刻t
の特徴ベクトルpt∈ℝdh
は次式で表される．

pt=g(W(px)[emb(xt−1);emb(xt);emb(xt+1)]+b(p))
ただし，W(px)∈ℝdh×3dw,b(p)∈ℝdh
はCNNのパラメータ，g
は活性化関数（例えばtanh
やReLUなど），[a;b;c]
はベクトルa,b,c
の連結である．なお，行列W(px)
の列数が3dw
になるのは，3個のトークンの単語埋め込みを連結したものに対して，線形変換を行うためである．

最大値プーリングでは，特徴ベクトルの次元毎に全時刻における最大値を取り，入力文書の特徴ベクトルc∈ℝdh
を求める．c[i]
でベクトルc
のi
番目の次元の値を表すことにすると，最大値プーリングは次式で表される．

c[i]=max1≤t≤Tpt[i]
最後に，入力文書の特徴ベクトルc
に行列W(yc)∈ℝL×dh
とバイアス項b(y)∈ℝL
による線形変換とソフトマックス関数を適用し，カテゴリy
を予測する．

y=softmax(W(yc)c+b(y))
なお，この問題ではモデルの学習を行わず，ランダムに初期化された重み行列でy
を計算するだけでよい．



## 87. 確率的勾配降下法によるCNNの学習

確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題86で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

In [4]:
import warnings
warnings.simplefilter('ignore')
import os 
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence,pad_sequence,pack_sequence
from sklearn.metrics import accuracy_score
import random
import gensim

class CNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size,pre_trained_embedding):
        super(CNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight = nn.Parameter(torch.from_numpy(pre_trained_embedding))
        self.cnn = nn.Conv1d(embedding_dim,hidden_dim,3,stride=1, padding=2)
        self.hidden2tag = nn.Linear(hidden_dim,tagset_size)

    def forward(self,sentence,word_seq_lengths):
        #sentence (batch_size,max_length) 
        embeds = self.word_embeddings(sentence)  
        #embeds (batch_size,max_length,emb_dim)
        embeds = embeds.permute(0, 2, 1)
        #embeds (batch_size,emb_dim,max_length)
        cnn_out =  F.relu(self.cnn(embeds))
        #cnn_out (batch_size,hidden_dim,max_length-3+1)
        pooled = F.max_pool1d(cnn_out,cnn_out.shape[2]).squeeze(2)
        #pooled (batch.size,hidden_dim)
        tag_space  = self.hidden2tag(pooled)
        tag_scores = F.log_softmax(tag_space,dim=1)
        return tag_scores

#vocab 次元の辞書を作る 80番の内容
def make_vocab(data_path):
    vocab = {}
    with open(data_path) as data:
        for line in data:
            text_data = line.split("\t")[0]
            words = text_data.split()

            for word in  words:
                if word in vocab:
                    vocab[word] +=1
                else:
                    vocab[word] = 1

    #(単語,頻度)の順にタプルで入ってる
    vocab_sorted = sorted(vocab.items(),key=lambda x:x[1])[::-1]

    vocab_with_id = {}
    #idに変換
    for item in vocab_sorted:
        #default_dictを使って一行で書く
        if item[1] >= 2:
            vocab_with_id[item[0]] = len(vocab_with_id.items())+2
        else:
            vocab_with_id[item[0]] = 0

    return vocab_with_id

#train,test,devを読み込む関数
def data_import(mode,vocab_with_id):
    cu_path = os.getcwd()
    name = mode
    data_path = os.path.join(cu_path,"data",name+".txt")

    text_data_list = []
    label_list = []
    with open(data_path) as data:
        for line in data:
            text_data,label  = line.strip().split("\t")
            words = text_data.split()
            text_data_list.append(words)
            label_list.append(label)
    
    return text_data_list,label_list

#単語列 -> id に変換する
#ここでID errorの時に-1に処理するプログラムに変える
def prepare_sequence(seqs,with_id,mode):
    bacth_id_list = []
    for seq in seqs:
        idxs = [with_id[w] if (w in with_id) else 0 for w in seq]
        bacth_id_list.append(idxs)
    # print(bacth_id_list)
    # return torch.tensor(idxs, dtype=torch.long)
    return bacth_id_list

#torch.saveはnumpyでもできる
#data => torch.saveでsave それをtorch.loadで読み込むと早くなるかも

#データセットをある程度長さごとに固めてバッチを作ると系列長が揃って処理時間が短くなる
#Allen_NLPを使うと便利

#結局手で書いてしまった
#系列長が違う生でlist > tensor の変換がうまくいかないので0埋めを手動でやった
#paddingには1番を割り当てる
#unkは0番を割り当てる
def sequence2padded_tesnsor(seqs,labels,device):    
    batch_size = len(seqs)
    word_seq_lengths = torch.LongTensor(list(map(len, seqs)))
    max_seq_len = word_seq_lengths.max().item()
    word_seq_tensor = torch.ones((batch_size, max_seq_len), requires_grad=True).long()
    label_seq_tensor = torch.zeros((batch_size), requires_grad=True).long()
    # mask = torch.zeros((batch_size, max_seq_len), requires_grad=True).byte()
    for idx, (seq, label,seqlen) in enumerate(zip(seqs, labels,word_seq_lengths)):
        seqlen = seqlen.item()
        word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        label_seq_tensor[idx] = torch.LongTensor(label)
        # mask[idx, :seqlen] = torch.Tensor([1] * seqlen)
    
    word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
    word_seq_tensor = word_seq_tensor[word_perm_idx]#長さごとにに並べ替え
    label_seq_tensor = label_seq_tensor[word_perm_idx]

    if device == "cuda":
        word_seq_tensor = word_seq_tensor.to(device)
        label_seq_tensor = label_seq_tensor.to(device)
        word_seq_lengths = word_seq_lengths.to(device)

    return word_seq_tensor,label_seq_tensor,word_seq_lengths

def init_embedding(vocab_with_id,EMBEDDING_DIM):
    from gensim.models import KeyedVectors
    #作ったvocabのindexに紐付ける
    matrix = np.random.uniform(0, 1, size=(len(vocab_with_id)+1, EMBEDDING_DIM))
    path = 'data/GoogleNews-vectors-negative300.bin'
    vectors = KeyedVectors.load_word2vec_format(path,binary=True)
    for word,id in vocab_with_id.items():
        if word in vectors.vocab:
            matrix[id] = vectors[word]
    return matrix
        
def main():
    #paraameter
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 50
    BATCH_SIZE = 32
     
    cu_path = os.getcwd()
    data_path = os.path.join(cu_path,'data','train.txt')
    vocab_with_id = make_vocab(data_path)
    pre_trained_embedding = init_embedding(vocab_with_id,EMBEDDING_DIM)
    
    train_X, train_y = data_import("train",vocab_with_id)

    label_with_id = {"b":0,"t":1,"e":2,"m":3}

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    """
    model = BiRNN(EMBEDDING_DIM, 
                      HIDDEN_DIM, 
                      len(vocab_with_id), 
                      len(label_with_id),
                      pre_trained_embedding).float()
    """
    model = CNN(EMBEDDING_DIM, 
                    HIDDEN_DIM, 
                    len(vocab_with_id), 
                    len(label_with_id),
                    pre_trained_embedding).float()
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.3)
    
    model.to(device)

    for epoch in range(10):
        model.train()
        print(epoch)
        train_num = len(train_X)
        batch_size = BATCH_SIZE

        #shuffle
        combined=list(zip(train_X,train_y))
        random.shuffle(combined)
        train_X,train_y=zip(*combined)

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1
        
        loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])
        
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = train_X[start:end]
            tags = train_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            loss_total += loss.item()
            loss.backward()
            optimizer.step()

            if torch.cuda.is_available():
                pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
            else:
                pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets.cpu())
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        train_acc = accuracy_score(gold_list, pred_list)

        print("train_acc :",train_acc)        
        print("train_loss :",loss_total)

        #========================================================
        #dev
        dev_X, dev_y = data_import("dev",vocab_with_id)
        train_num = len(dev_X)
        batch_size = BATCH_SIZE

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1

        dev_loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])

        model.eval()
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = dev_X[start:end]
            tags = dev_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            dev_loss_total += loss.item()
            
            if torch.cuda.is_available():
                pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
            else:
                pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets.cpu())
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        dev_acc = accuracy_score(gold_list, pred_list)

        print("dev_acc :",dev_acc)        
        print("devloss :",dev_loss_total)

if __name__ == '__main__':
    main()


0
train_acc : 0.8213749060856499
train_loss : 160.4285215958953
dev_acc : 0.875
devloss : 15.079336039721966
1
train_acc : 0.912847483095417
train_loss : 83.39828199520707
dev_acc : 0.8802395209580839
devloss : 14.821674607694149
2
train_acc : 0.9399887302779865
train_loss : 55.47876461967826
dev_acc : 0.8847305389221557
devloss : 14.323731660842896
3
train_acc : 0.9666604057099925
train_loss : 34.23760186415166
dev_acc : 0.8907185628742516
devloss : 14.618648897856474
4
train_acc : 0.9813110443275732
train_loss : 21.260149023262784
dev_acc : 0.8952095808383234
devloss : 14.761813420802355
5
train_acc : 0.9894815927873779
train_loss : 12.155366194667295
dev_acc : 0.8974550898203593
devloss : 15.761664882302284
6
train_acc : 0.9954921111945906
train_loss : 7.967729117721319
dev_acc : 0.8989520958083832
devloss : 17.14870515652001
7
train_acc : 0.9969947407963937
train_loss : 5.800838275579736
dev_acc : 0.8959580838323353
devloss : 17.413154434412718
8
train_acc : 0.9970886551465064
trai

##  88. パラメータチューニングPerma

In [8]:
#88
#CNNの方がベースの性能が高そうだったのでCNNを改造する
#3~5のカーネルサイズの情報を全部使う
class CNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size,filter_sizes,dropout_rate,pre_trained_embedding):
        super(CNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings.weight = nn.Parameter(torch.from_numpy(pre_trained_embedding))
        self.cnn_list = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = hidden_dim, 
                                              kernel_size = fs,
                                              stride=1,
                                              padding = 2)
                                    for fs in filter_sizes
                                    ])
        self.hidden2tag = nn.Linear(len(filter_sizes) * hidden_dim,tagset_size)
        self.dropout =  nn.Dropout(dropout_rate) 

    def forward(self,sentence,word_seq_lengths):
        #sentence (batch_size,max_length) 
        embeds = self.word_embeddings(sentence)  
        #embeds (batch_size,max_length,emb_dim)
        embeds = embeds.permute(0, 2, 1)
        #embeds (batch_size,emb_dim,max_length)
        cnn_out_list =  [F.relu(cnn(embeds)) for cnn in self.cnn_list] 
        #cnn_out [(batch_size,hidden_dim,max_length+)]*len(filter_sizes)
        pooled = [F.max_pool1d(cnn_out,cnn_out.shape[2]).squeeze(2) for cnn_out in cnn_out_list]
        #pooled [(batch.size,hidden_dim)]*len(filter_sizes)
        cat = self.dropout(torch.cat(pooled, dim=1))
        #cat = (batch_size, hiddem_size * len(filter_sizes))
        tag_space  = self.hidden2tag(cat)
        tag_scores = F.log_softmax(tag_space,dim=1)
        return tag_scores

#vocab 次元の辞書を作る 80番の内容
def make_vocab(data_path):
    vocab = {}
    with open(data_path) as data:
        for line in data:
            text_data = line.split("\t")[0]
            words = text_data.split()

            for word in  words:
                if word in vocab:
                    vocab[word] +=1
                else:
                    vocab[word] = 1

    #(単語,頻度)の順にタプルで入ってる
    vocab_sorted = sorted(vocab.items(),key=lambda x:x[1])[::-1]

    vocab_with_id = {}
    #idに変換
    for item in vocab_sorted:
        #default_dictを使って一行で書く
        if item[1] >= 2:
            vocab_with_id[item[0]] = len(vocab_with_id.items())+2
        else:
            vocab_with_id[item[0]] = 0

    return vocab_with_id

#train,test,devを読み込む関数
def data_import(mode,vocab_with_id):
    cu_path = os.getcwd()
    name = mode
    data_path = os.path.join(cu_path,"data",name+".txt")

    text_data_list = []
    label_list = []
    with open(data_path) as data:
        for line in data:
            text_data,label  = line.strip().split("\t")
            words = text_data.split()
            text_data_list.append(words)
            label_list.append(label)
    
    return text_data_list,label_list

#単語列 -> id に変換する
#ここでID errorの時に-1に処理するプログラムに変える
def prepare_sequence(seqs,with_id,mode):
    bacth_id_list = []
    for seq in seqs:
        idxs = [with_id[w] if (w in with_id) else 0 for w in seq]
        bacth_id_list.append(idxs)
    # print(bacth_id_list)
    # return torch.tensor(idxs, dtype=torch.long)
    return bacth_id_list

#torch.saveはnumpyでもできる
#data => torch.saveでsave それをtorch.loadで読み込むと早くなるかも

#データセットをある程度長さごとに固めてバッチを作ると系列長が揃って処理時間が短くなる
#Allen_NLPを使うと便利

#結局手で書いてしまった
#系列長が違う生でlist > tensor の変換がうまくいかないので0埋めを手動でやった
#paddingには1番を割り当てる
#unkは0番を割り当てる
def sequence2padded_tesnsor(seqs,labels,device):    
    batch_size = len(seqs)
    word_seq_lengths = torch.LongTensor(list(map(len, seqs)))
    max_seq_len = word_seq_lengths.max().item()
    word_seq_tensor = torch.ones((batch_size, max_seq_len), requires_grad=True).long()
    label_seq_tensor = torch.zeros((batch_size), requires_grad=True).long()
    # mask = torch.zeros((batch_size, max_seq_len), requires_grad=True).byte()
    for idx, (seq, label,seqlen) in enumerate(zip(seqs, labels,word_seq_lengths)):
        seqlen = seqlen.item()
        word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        label_seq_tensor[idx] = torch.LongTensor(label)
        # mask[idx, :seqlen] = torch.Tensor([1] * seqlen)
    
    word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
    word_seq_tensor = word_seq_tensor[word_perm_idx]#長さごとにに並べ替え
    label_seq_tensor = label_seq_tensor[word_perm_idx]

    if device == "cuda":
        word_seq_tensor = word_seq_tensor.to(device)
        label_seq_tensor = label_seq_tensor.to(device)
        word_seq_lengths = word_seq_lengths.to(device)

    return word_seq_tensor,label_seq_tensor,word_seq_lengths

def init_embedding(vocab_with_id,EMBEDDING_DIM):
    from gensim.models import KeyedVectors
    #作ったvocabのindexに紐付ける
    matrix = np.random.uniform(0, 1, size=(len(vocab_with_id)+1, EMBEDDING_DIM))
    path = 'data/GoogleNews-vectors-negative300.bin'
    vectors = KeyedVectors.load_word2vec_format(path,binary=True)
    for word,id in vocab_with_id.items():
        if word in vectors.vocab:
            matrix[id] = vectors[word]
    return matrix
        
def main():
    #paraameter
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 50
    BATCH_SIZE = 32
    filter_sizes = [3,4,5]
    dropout_rate = 0.1
    epoch_num = 15

    cu_path = os.getcwd()
    data_path = os.path.join(cu_path,'data','train.txt')
    vocab_with_id = make_vocab(data_path)
    pre_trained_embedding = init_embedding(vocab_with_id,EMBEDDING_DIM)
    
    train_X, train_y = data_import("train",vocab_with_id)

    label_with_id = {"b":0,"t":1,"e":2,"m":3}

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    """
    model = BiRNN(EMBEDDING_DIM, 
                      HIDDEN_DIM, 
                      len(vocab_with_id), 
                      len(label_with_id),
                      pre_trained_embedding).float()
    """
    model = CNN(EMBEDDING_DIM, 
                    HIDDEN_DIM, 
                    len(vocab_with_id), 
                    len(label_with_id),
                    filter_sizes,
                    dropout_rate,
                    pre_trained_embedding).float()
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.2)
    
    model.to(device)

    prev_acc = -1

    for epoch in range(epoch_num):
        model.train()
        print(epoch)
        train_num = len(train_X)
        batch_size = BATCH_SIZE

        #shuffle
        combined=list(zip(train_X,train_y))
        random.shuffle(combined)
        train_X,train_y=zip(*combined)

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1
        
        loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])
        
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = train_X[start:end]
            tags = train_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            loss_total += loss.item()
            loss.backward()
            optimizer.step()

            if torch.cuda.is_available():
                pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
            else:
                pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets.cpu())
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        train_acc = accuracy_score(gold_list, pred_list)

        print("train_acc :",train_acc)        
        print("train_loss :",loss_total)

        #========================================================
        #dev
        dev_X, dev_y = data_import("dev",vocab_with_id)
        train_num = len(dev_X)
        batch_size = BATCH_SIZE

        if train_num % batch_size == 0:
            total_batch = train_num // batch_size 
        else:
            total_batch = train_num // batch_size + 1

        dev_loss_total = 0
        gold_list = np.array([-1])
        pred_list = np.array([-1])

        model.eval()
        for batch_id in range(total_batch):
            #それぞれのsizeが違ってdata_loaderの使いかたがわからなかった
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            sentences = dev_X[start:end]
            tags = dev_y[start:end]

            model.zero_grad()

            sentence_in = prepare_sequence(sentences, vocab_with_id,"X")
            targets = prepare_sequence(tags, label_with_id,"y")
            sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)

            tag_scores = model(sentence_in,word_seq_lengths)
            loss = loss_function(tag_scores, targets)
            
            # print(tag_scores)
            dev_loss_total += loss.item()
            
            if torch.cuda.is_available():
                pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
            else:
                pred_y = torch.argmax(tag_scores,dim=1).numpy()
            pred_list = np.insert(pred_list,-1,pred_y)
        
            gold_y = np.array(targets.cpu())
            gold_list = np.insert(gold_list,-1,gold_y)
            
        pred_list = np.delete(pred_list,-1)
        gold_list = np.delete(gold_list,-1)
        
        dev_acc = accuracy_score(gold_list, pred_list)

        print("dev_acc :",dev_acc)        
        print("devloss :",dev_loss_total)

        if dev_acc > prev_acc:
            prev_acc = dev_acc
            model_name = "work/model/"+"best.model"
            torch.save(model.state_dict(), model_name)

            optim_name = "work/model/"+str(epoch)+".opt"
            torch.save(optimizer.state_dict(), optim_name)

    #test
    gold_list = np.array([-1])
    pred_list = np.array([-1])

    if torch.cuda.is_available():
        model.load_state_dict(torch.load(model_name))
    else:
        model.load_state_dict(torch.load(model_name,map_location='cpu'))

    test_X, test_y = data_import("test",vocab_with_id)
    sentence_in = prepare_sequence(test_X, vocab_with_id,"X")
    targets = prepare_sequence(test_y, label_with_id,"y")
    sentence_in, targets, word_seq_lengths = sequence2padded_tesnsor(sentence_in,targets,device)
    tag_scores = model(sentence_in,word_seq_lengths)

    if torch.cuda.is_available():
        pred_y = torch.argmax(tag_scores,dim=1).cpu().numpy()
    else:
        pred_y = torch.argmax(tag_scores,dim=1).numpy()
    pred_list = np.insert(pred_list,-1,pred_y)

    gold_y = np.array(targets.cpu())
    gold_list = np.insert(gold_list,-1,gold_y)

    pred_list = np.delete(pred_list,-1)
    gold_list = np.delete(gold_list,-1)

    test_acc = accuracy_score(gold_list, pred_list)
        
    print("final test score is ",test_acc)

if __name__ == '__main__':
    main()

0
train_acc : 0.7958302028549963
train_loss : 184.97623272240162
dev_acc : 0.8622754491017964
devloss : 17.325629472732544
1
train_acc : 0.8942524417731029
train_loss : 96.6865854114294
dev_acc : 0.8869760479041916
devloss : 13.346505269408226
2
train_acc : 0.9293764087152517
train_loss : 66.28478169068694
dev_acc : 0.8824850299401198
devloss : 14.125717639923096
3
train_acc : 0.9517280240420737
train_loss : 46.61705727502704
dev_acc : 0.8937125748502994
devloss : 12.806960821151733
4
train_acc : 0.9691021788129226
train_loss : 32.09472524607554
dev_acc : 0.8847305389221557
devloss : 13.672567173838615
5
train_acc : 0.9822501878287002
train_loss : 20.12522188294679
dev_acc : 0.8989520958083832
devloss : 13.569524727761745
6
train_acc : 0.9896694214876033
train_loss : 14.226403721026145
dev_acc : 0.8967065868263473
devloss : 14.01552214473486
7
train_acc : 0.9932381667918858
train_loss : 10.39991925819777
dev_acc : 0.8944610778443114
devloss : 14.778943970799446
8
train_acc : 0.99530428