### 무작위로 초기화된 임베딩 벡터 적용

In [1]:
### 문장 분류 모델

from torch import nn


class SentenceClassifier(nn.Module):
    """Binary sentence classifier: embedding -> stacked RNN/LSTM -> linear head.

    Args:
        n_vocab: Number of rows in the embedding table (vocabulary size).
        hidden_dim: Hidden size of the recurrent layer (per direction).
        embedding_dim: Size of each token embedding vector.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability applied between recurrent layers and
            before the linear head.
        bidirectional: Whether the recurrent layer runs in both directions.
        model_type: Which recurrent layer to use, ``"rnn"`` or ``"lstm"``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """

    # Supported recurrent layers, keyed by the ``model_type`` argument.
    _RECURRENT_LAYERS = {"rnn": nn.RNN, "lstm": nn.LSTM}

    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout=0.5,
        bidirectional=True,
        model_type="lstm"
    ):
        super().__init__()

        # Fail fast: previously an unknown type left ``self.model`` undefined
        # and the error only surfaced inside ``forward``.
        if model_type not in self._RECURRENT_LAYERS:
            raise ValueError(
                f"Unsupported model_type {model_type!r}; "
                f"expected one of {sorted(self._RECURRENT_LAYERS)}"
            )

        # Index 0 is reserved as the padding id.
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.model = self._RECURRENT_LAYERS[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout does nothing with a single layer and only
            # makes PyTorch emit a warning, so disable it in that case.
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )

        # A bidirectional layer concatenates both directions' hidden states.
        n_directions = 2 if bidirectional else 1
        self.classifier = nn.Linear(hidden_dim * n_directions, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one raw logit per sequence.

        Args:
            inputs: Integer token ids laid out batch-first, ``(batch, seq)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with un-normalised logits.
        """
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        # Keep only the output at the last time step.
        last_output = output[:, -1, :]
        # Regularise, then hand over to the classification head.
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [2]:
import pandas as pd
from Korpora import Korpora


# Load the NSMC corpus through Korpora and wrap one split in a DataFrame.
# NOTE(review): only the corpus' `test` split is used as the working dataset
# here (it is re-split into train/test further down) — confirm this is the
# intended way to keep the experiment small.
corpus    = Korpora.load("nsmc")
corpus_df = pd.DataFrame(corpus.test)


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at /Users/seoyun/Korpora/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /Users/seoyun

In [3]:
### Load the dataset

# Random 90% sample (fixed seed) for training; the remaining rows form the test set.
# NOTE: the names `train` and `test` are rebound to functions later in this
# file, so every cell that reads these DataFrames must run before those
# definitions.
train = corpus_df.sample(frac=0.9, random_state=42)
test  = corpus_df.drop(train.index)

# Show a few training rows as a markdown table, plus the split sizes.
print(train.head(5).to_markdown())
print("Training Data Size :", len(train))
print("Testing Data Size :", len(test))

|       | text                                                                                     |   label |
|------:|:-----------------------------------------------------------------------------------------|--------:|
| 33553 | 모든 편견을 날려 버리는 가슴 따뜻한 영화. 로버트 드 니로, 필립 세이모어 호프만 영원하라. |       1 |
|  9427 | 무한 리메이크의 소재. 감독의 역량은 항상 그 자리에...                                    |       0 |
|   199 | 신날 것 없는 애니.                                                                       |       0 |
| 12447 | 잔잔 격동                                                                                |       1 |
| 39489 | 오랜만에 찾은 주말의 명화의 보석                                                         |       1 |
Training Data Size : 45000
Testing Data Size : 5000


In [4]:
### 데이터 토큰화 및 단어 사전 구축

from konlpy.tag import Okt
from collections import Counter


def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list: special tokens first, then the most frequent tokens.

    Args:
        corpus: Iterable of token sequences (one sequence per document).
        n_vocab: Maximum number of corpus tokens to keep; the special tokens
            are added on top of this number.
        special_tokens: Tokens placed at the front of the vocabulary. The
            caller's object is left untouched.

    Returns:
        A new list containing ``special_tokens`` followed by up to ``n_vocab``
        tokens in descending order of frequency.
    """
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)
    # Copy instead of aliasing: appending to the caller's list would grow it
    # on every call and leak corpus tokens into the caller's special tokens.
    vocab = list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab


# Split every review into morphemes with the Okt tokenizer.
tokenizer    = Okt()
train_tokens = [tokenizer.morphs(review) for review in train.text]
test_tokens  = [tokenizer.morphs(review) for review in test.text]

# The vocabulary is built from the training tokens only: two special tokens
# followed by up to 5000 of the most frequent tokens.
vocab = build_vocab(corpus=train_tokens, n_vocab=5000, special_tokens=["<pad>", "<unk>"])
# Two-way lookup tables between a token and its position in `vocab`.
token_to_id = {token: idx for idx, token in enumerate(vocab)}
id_to_token = {idx: token for idx, token in enumerate(vocab)}

print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


In [5]:
### 정수 인코딩 및 패딩

import numpy as np


# Truncate or pad every sequence so that all of them have length max_length.
def pad_sequences(sequences, max_length, pad_value):
    """Return the sequences as an array of rows of length ``max_length``.

    Args:
        sequences: Iterable of lists of ids.
        max_length: Target length of every row.
        pad_value: Value appended on the right of rows that are too short.

    Returns:
        ``np.ndarray`` built from the truncated / right-padded rows.
    """

    def _fit(row):
        # Cut off anything beyond max_length, then fill the remainder.
        clipped = row[:max_length]
        missing = max_length - len(clipped)
        return clipped + [pad_value] * missing

    return np.asarray([_fit(row) for row in sequences])


# Map every token to its integer id; tokens missing from the vocabulary
# fall back to the id of "<unk>".
unk_id    = token_to_id["<unk>"]
train_ids = [
    [token_to_id.get(token, unk_id) for token in review] for review in train_tokens
]
test_ids = [
    [token_to_id.get(token, unk_id) for token in review] for review in test_tokens
]

# Bring every review to exactly 32 ids: longer ones are cut, shorter ones are
# right-padded with the id of "<pad>".
max_length = 32
pad_id     = token_to_id["<pad>"]
train_ids  = pad_sequences(train_ids, max_length, pad_id)
test_ids   = pad_sequences(test_ids, max_length, pad_id)

print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [7]:
### 데이터로더 적용

import torch
from torch.utils.data import TensorDataset, DataLoader


# Turn the padded id arrays into tensors.
# NOTE(review): these lines rebind `train_ids` / `test_ids`, so running the
# cell a second time passes tensors to torch.tensor(); that is presumably the
# source of the stray warning text captured below this cell — confirm.
train_ids = torch.tensor(train_ids)
test_ids  = torch.tensor(test_ids)

# Labels are stored as float32 tensors.
train_labels = torch.tensor(train.label.values, dtype=torch.float32)
test_labels  = torch.tensor(test.label.values, dtype=torch.float32)

# Pair each id sequence with its label.
train_dataset = TensorDataset(train_ids, train_labels)
test_dataset  = TensorDataset(test_ids, test_labels)

# Mini-batches of 16; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=16, shuffle=False)

  train_ids = torch.tensor(train_ids)
  test_ids  = torch.tensor(test_ids)


In [8]:
### 손실함수와 최적화함수 정의

from torch import optim


# Model hyperparameters; the embedding table gets one row per vocabulary entry.
n_vocab       = len(token_to_id)
hidden_dim    = 64
embedding_dim = 128
n_layers      = 2

# Use the GPU when one is available, otherwise fall back to the CPU.
device     = "cuda" if torch.cuda.is_available() else "cpu"
classifier = SentenceClassifier(
    n_vocab = n_vocab, hidden_dim = hidden_dim, embedding_dim = embedding_dim, n_layers = n_layers
).to(device)
# The model returns raw logits, so the loss takes logits as input.
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)

In [9]:
### 모델 학습 및 테스트

def train(model, datasets, criterion, optimizer, device, interval):
    """Run one training pass over ``datasets`` and print the running mean loss.

    Args:
        model: Module called with a batch of input ids; returns logits.
        datasets: Iterable yielding ``(input_ids, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the batches are moved to.
        interval: The running mean loss is printed every ``interval`` steps.
    """
    model.train()
    running_losses = []

    for step, batch in enumerate(datasets):
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        # Add a trailing dimension so the labels line up with the model's
        # logits (assumed to be (batch, 1)).
        labels = labels.to(device).unsqueeze(1)

        loss = criterion(model(input_ids), labels)
        running_losses.append(loss.item())

        # Standard update: clear old gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % interval != 0:
            continue
        print(f"Train Loss {step} : {np.mean(running_losses)}")


def test(model, datasets, criterion, device): 
    model.eval()
    losses   = list()
    corrects = list()

    for step, (input_ids, labels) in enumerate(datasets): 
        input_ids = input_ids.to(device)
        labels    = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss   = criterion(logits, labels)
        losses.append(loss.item())
        yhat = torch.sigmoid(logits)>.5
        corrects.extend(
            torch.eq(yhat, labels).cpu().tolist()
        )

    print(f"Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}")


# Train for 5 epochs, printing the running training loss every 500 steps.
epochs   = 5
interval = 500

# Each epoch is one training pass followed by an evaluation on the test loader.
for epoch in range(epochs): 
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)

Train Loss 0 : 0.6906307935714722
Train Loss 500 : 0.6938317327680226
Train Loss 1000 : 0.6850458616679246
Train Loss 1500 : 0.667911120766723
Train Loss 2000 : 0.6565387439483527
Train Loss 2500 : 0.6489575155040638
Val Loss : 0.5854749915698847, Val Accuracy : 0.6994
Train Loss 0 : 0.47263532876968384
Train Loss 500 : 0.5422476583909608
Train Loss 1000 : 0.5240420766703257
Train Loss 1500 : 0.5100836372629632
Train Loss 2000 : 0.4995229266796155
Train Loss 2500 : 0.4893557622653396
Val Loss : 0.43305313458648353, Val Accuracy : 0.7966
Train Loss 0 : 0.30962520837783813
Train Loss 500 : 0.39415990480197644
Train Loss 1000 : 0.3939001073095051
Train Loss 1500 : 0.3927157275765876
Train Loss 2000 : 0.3938581429101687
Train Loss 2500 : 0.3929202196637734
Val Loss : 0.4027989142523787, Val Accuracy : 0.816
Train Loss 0 : 0.2241000086069107
Train Loss 500 : 0.33796470048660765
Train Loss 1000 : 0.32974154361850255
Train Loss 1500 : 0.33788463864711266
Train Loss 2000 : 0.3386702847307262
T

In [10]:
### Extract the embeddings from the trained model

token_to_embedding = dict()
# Detach the embedding weights from the graph and copy them to a NumPy array.
embedding_matrix   = classifier.embedding.weight.detach().cpu().numpy()

# Row i of the matrix is paired with entry i of `vocab`.
for word, emb in zip(vocab, embedding_matrix): 
    token_to_embedding[word] = emb

# Show the learned vector of one sample token.
token = vocab[1000]
print(token, token_to_embedding[token])

보고싶다 [ 4.6381968e-01  4.5853284e-01  1.1042061e+00 -2.8630337e-01
 -2.3868809e+00  1.8186281e+00  5.6016415e-02  1.7269763e+00
 -4.8994631e-01  2.3241314e-01 -6.2607592e-01  1.5056326e+00
  1.4955376e+00 -9.4073707e-01 -2.2985396e+00  4.1764311e-02
 -5.3232628e-01 -5.6993902e-01 -9.5013803e-01 -1.7728597e+00
  1.0996594e+00 -1.1447600e+00  1.3541558e-01 -5.0945348e-01
  1.7277492e-02  9.5294577e-01  5.8853292e-01 -1.3104263e-01
 -7.9997852e-02  5.8746483e-02  8.9699215e-01  1.2525725e+00
 -3.8871145e-01  6.8464339e-01  1.2854266e+00  2.9320785e-01
  8.0701023e-01  1.0742013e+00  5.0337810e-02  8.9767778e-01
 -1.4691917e+00  2.3550589e-01  4.7478580e-01 -1.1573229e+00
 -1.5816892e+00  2.2993772e+00  1.6551023e+00 -7.6844889e-01
  9.7413224e-01  8.2299095e-01 -8.1355536e-01 -1.8070993e+00
 -2.8115697e-03 -4.9761187e-02  9.5386863e-01  2.1064138e+00
 -1.6204534e+00 -1.4959038e+00  2.4197862e-02 -9.8601902e-01
 -6.7798823e-02  7.7265567e-01  9.2326939e-01  1.3727677e+00
  1.7368157e+00 -1.

### 사전 학습된 임베딩 값을 초깃값으로 적용

In [11]:
### 사전 학습된 모델로 임베딩 계층 초기화

from gensim.models import Word2Vec


word2vec        = Word2Vec.load("../models/word2vec.model")
# One row per vocabulary entry; rows that are never filled stay all-zero.
init_embeddings = np.zeros((n_vocab, embedding_dim))

for index, token in id_to_token.items():
    # The special tokens have no pretrained vector.
    if token in ("<pad>", "<unk>"):
        continue
    # Skip tokens the Word2Vec model does not know instead of failing with a
    # KeyError; their rows keep the zero initialisation.
    if token in word2vec.wv:
        init_embeddings[index] = word2vec.wv[token]

# Embedding layer whose weights are the pretrained vectors.
embedding_layer = nn.Embedding.from_pretrained(
    torch.tensor(init_embeddings, dtype=torch.float32)
)

In [12]:
### 사전학습된 임베딩 계층 적용

from torch import nn


class SentenceClassifier(nn.Module):
    """Binary sentence classifier with an optionally pretrained embedding table.

    Pipeline: embedding -> stacked RNN/LSTM -> dropout -> linear head.

    Args:
        n_vocab: Number of rows of the embedding table. Only used when no
            pretrained embedding is given.
        hidden_dim: Hidden size of the recurrent layer (per direction).
        embedding_dim: Size of each token embedding vector; also the input
            size of the recurrent layer.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability applied between recurrent layers and
            before the linear head.
        bidirectional: Whether the recurrent layer runs in both directions.
        model_type: Which recurrent layer to use, ``"rnn"`` or ``"lstm"``.
        pretrained_embedding: Optional 2-D array-like of embedding weights
            (one row per token). When given, the table is created with
            ``nn.Embedding.from_pretrained`` using its default arguments,
            which per the PyTorch documentation keeps the weights frozen.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """

    # Supported recurrent layers, keyed by the ``model_type`` argument.
    _RECURRENT_LAYERS = {"rnn": nn.RNN, "lstm": nn.LSTM}

    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layers,
        dropout=0.5,
        bidirectional=True,
        model_type="lstm",
        pretrained_embedding=None
    ):
        super().__init__()

        # Fail fast: previously an unknown type left ``self.model`` undefined
        # and the error only surfaced inside ``forward``.
        if model_type not in self._RECURRENT_LAYERS:
            raise ValueError(
                f"Unsupported model_type {model_type!r}; "
                f"expected one of {sorted(self._RECURRENT_LAYERS)}"
            )

        # Build the embedding table exactly once: either from the supplied
        # weights or randomly initialised with index 0 reserved for padding.
        if pretrained_embedding is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.tensor(pretrained_embedding, dtype=torch.float32)
            )
        else:
            self.embedding = nn.Embedding(
                num_embeddings=n_vocab,
                embedding_dim=embedding_dim,
                padding_idx=0,
            )

        self.model = self._RECURRENT_LAYERS[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout does nothing with a single layer and only
            # makes PyTorch emit a warning, so disable it in that case.
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )

        # A bidirectional layer concatenates both directions' hidden states.
        n_directions = 2 if bidirectional else 1
        self.classifier = nn.Linear(hidden_dim * n_directions, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one raw logit per sequence.

        Args:
            inputs: Integer token ids laid out batch-first, ``(batch, seq)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with un-normalised logits.
        """
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        # Keep only the output at the last time step.
        last_output = output[:, -1, :]
        # Regularise, then hand over to the classification head.
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [13]:
### 사전 학습된 임베딩을 사용한 모델 학습

from torch import optim


# Use the GPU when one is available, otherwise fall back to the CPU.
device     = "cuda" if torch.cuda.is_available() else "cpu"
# Same hyperparameters as before, but the embedding table is initialised
# from the pretrained `init_embeddings` matrix.
classifier = SentenceClassifier(
    n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, 
n_layers=n_layers, pretrained_embedding=init_embeddings
).to(device)
# The model returns raw logits, so the loss takes logits as input.
criterion  = nn.BCEWithLogitsLoss().to(device)
optimizer  = optim.RMSprop(classifier.parameters(), lr=0.001)

# Train for 5 epochs, printing the running training loss every 500 steps.
epochs   = 5
interval = 500

# Each epoch is one training pass followed by an evaluation on the test loader.
for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)

Train Loss 0 : 0.6896252036094666
Train Loss 500 : 0.6652128881442094
Train Loss 1000 : 0.6565358474121227
Train Loss 1500 : 0.6461410267523017
Train Loss 2000 : 0.6407335017812902
Train Loss 2500 : 0.6421096345297292
Val Loss : 0.6435661743433712, Val Accuracy : 0.668
Train Loss 0 : 0.7451693415641785
Train Loss 500 : 0.602105374405246
Train Loss 1000 : 0.6142179651277049
Train Loss 1500 : 0.6076423599075111
Train Loss 2000 : 0.5885867872263776
Train Loss 2500 : 0.5769679590743907
Val Loss : 0.5091475683469742, Val Accuracy : 0.7548
Train Loss 0 : 0.5844336748123169
Train Loss 500 : 0.5169892067799787
Train Loss 1000 : 0.5144198542827374
Train Loss 1500 : 0.5077311032954889
Train Loss 2000 : 0.5002851151246419
Train Loss 2500 : 0.4962731299341702
Val Loss : 0.46782764916221936, Val Accuracy : 0.7738
Train Loss 0 : 0.5001537799835205
Train Loss 500 : 0.46687558525336714
Train Loss 1000 : 0.4618528429325763
Train Loss 1500 : 0.4606963502395796
Train Loss 2000 : 0.4591790795370914
Train 