#### 문장 분류 모델

In [1]:
from torch import nn

In [2]:
class SentenceClassifier(nn.Module):
    def __init__(self,
                 n_vocab,
                 hidden_dim,
                 embedding_dim,
                 n_layers, 
                 dropout = 0.5,
                 bidirectional = True,
                 model_type = 'lstm'):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim = embedding_dim,
            padding_idx= 0)
        
        if model_type =='rnn':
            self.model = nn.RNN(
                input_size=embedding_dim,
                hidden_size = hidden_dim,
                num_layers= n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True)
    
        elif model_type == "lstm":
            self.model = nn.LSTM(
                input_size=embedding_dim,
                hidden_size = hidden_dim,
                num_layers= n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True )
            
        if bidirectional:
            self.classifier = nn.Linear(hidden_dim * 2, 1)
            
        else:
            self.classifier = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        embeddings =self.embedding(inputs)
        output, _ = self.model(embeddings)
        last_output = output[:, -1, :]
        last_output = self. dropout(last_output)
        logits = self. classifier(last_output)
        return logits

#### 데이터셋 불러오기

In [3]:
import pandas as pd
from Korpora import Korpora

In [4]:
corpus = Korpora.load("nsmc")
corpusDF = pd.DataFrame(corpus.test)


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP-23\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\KD

In [5]:
train = corpusDF.sample(frac=0.9, random_state=42)
test= corpusDF.drop(train.index)

In [6]:
print(train.head(5).to_markdown())

|       | text                                                                                     |   label |
|------:|:-----------------------------------------------------------------------------------------|--------:|
| 33553 | 모든 편견을 날려 버리는 가슴 따뜻한 영화. 로버트 드 니로, 필립 세이모어 호프만 영원하라. |       1 |
|  9427 | 무한 리메이크의 소재. 감독의 역량은 항상 그 자리에...                                    |       0 |
|   199 | 신날 것 없는 애니.                                                                       |       0 |
| 12447 | 잔잔 격동                                                                                |       1 |
| 39489 | 오랜만에 찾은 주말의 명화의 보석                                                         |       1 |


In [7]:
print('Training Data Size : ', len(train))
print('Testing Data Size : ', len(test))

Training Data Size :  45000
Testing Data Size :  5000


#### 데이터 토큰화 및 단어사전 구축

In [8]:
from konlpy.tag import *
from collections import Counter

In [9]:
def build_vocab(corpus, n_vocab, special_tokens):
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)
    vocab = special_tokens
    for token, count in counter.most_common(n_vocab):
        vocab.append(token)
    return vocab

In [10]:
tokenizer = Okt()
train_tokens = [tokenizer.morphs(review) for review in train.text] 
test_tokens = [tokenizer.morphs(review) for review in test.text]

In [11]:
len(train_tokens)

45000

In [12]:
vocab = build_vocab(corpus=train_tokens, n_vocab=5000, special_tokens=["<pad>", "<unk>"])
token_to_id ={token: idx for idx, token in enumerate(vocab)}
id_to_token ={idx: token for idx, token in enumerate(vocab)}

In [13]:
print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


#### 정수 인코딩 및 패딩

In [14]:
import numpy as np

In [15]:
def pad_sequences(sequences, max_length, pad_value):
    result = list()
    for sequence in sequences:
        sequence = sequence[:max_length]
        pad_length = max_length - len(sequence)
        padded_sequence = sequence +[pad_value] * pad_length
        result.append(padded_sequence)
    return np.asarray(result)

In [16]:
unk_id =  token_to_id['<unk>']
train_ids = [
    [token_to_id.get(token, unk_id) for token in review] for review in train_tokens]

test_ids = [
    [token_to_id.get(token, unk_id) for token in review] for review in test_tokens]

In [17]:
max_length = 32
pad_id = token_to_id['<pad>']
train_ids = pad_sequences(train_ids, max_length, pad_id)
test_ids = pad_sequences(test_ids, max_length, pad_id)

In [18]:
print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [19]:
type(train_ids[0])

numpy.ndarray

#### 데이터로더 적용

In [20]:
import torch
from torch.utils.data import TensorDataset, DataLoader

In [21]:
train_ids = torch.tensor(train_ids)
test_ids = torch.tensor(test_ids)

In [22]:
train_labels = torch.tensor(train.label.values, dtype=torch.float32)
test_labels = torch.tensor(test.label.values, dtype=torch.float32)

In [23]:
train_dataset = TensorDataset(train_ids, train_labels)
test_dataset = TensorDataset(test_ids, test_labels)

In [24]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [25]:
from torch import optim

In [26]:
cnt_vocab = len(token_to_id)
hidden_dim = 64
embedding_dim = 128
n_layers =2

device = "cuda" if torch.cuda.is_available() else 'cpu'
classifier = SentenceClassifier(n_vocab=cnt_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_layers=n_layers).to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)

In [27]:
def train(model, dataset, criterion, optimizer, device, interval):
    model.train()
    losses= list()

    for step, (input_ids, labels) in enumerate(dataset):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % interval ==0:
            print(f'Train Loss {step} : {np.mean(losses)}')


def test(model, dataset, criterion, device):
    model.eval()
    losses= list()
    corrects = list()

    for step, (input_ids, labels) in enumerate(dataset):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        yhat = torch.sigmoid(logits)>.5
        corrects.extend(torch.eq(yhat, labels).cpu().tolist())
        
    print(f'Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}')

epochs = 5
interval = 500
for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, train_loader, criterion, device)

Train Loss 0 : 0.6880030035972595
Train Loss 500 : 0.6941480021514816
Train Loss 1000 : 0.688730467449535
Train Loss 1500 : 0.6745151617104495
Train Loss 2000 : 0.6626327075343439
Train Loss 2500 : 0.6530778975021548
Val Loss : 0.5756861278161518, Val Accuracy : 0.7074444444444444
Train Loss 0 : 0.7166152596473694
Train Loss 500 : 0.570579332208443
Train Loss 1000 : 0.5619320194799822
Train Loss 1500 : 0.5580665143349424
Train Loss 2000 : 0.5450983932171983
Train Loss 2500 : 0.5335616979514156
Val Loss : 0.4245760769944491, Val Accuracy : 0.8066444444444445
Train Loss 0 : 0.4047014117240906
Train Loss 500 : 0.4467914657678433
Train Loss 1000 : 0.4360128542134812
Train Loss 1500 : 0.4361931407386108
Train Loss 2000 : 0.4320067875664393
Train Loss 2500 : 0.4291237010533025
Val Loss : 0.3722901458031251, Val Accuracy : 0.8352
Train Loss 0 : 0.2805987000465393
Train Loss 500 : 0.3820060052855048
Train Loss 1000 : 0.39141541018293097
Train Loss 1500 : 0.3929347401307393
Train Loss 2000 : 0.

#### 학습된 모델로부터 임베딩 추출

In [28]:
token_to_embedding = dict()
embedding_matrix = classifier.embedding.weight.detach().cpu().numpy()

for word, emb in zip(vocab, embedding_matrix):
    token_to_embedding[word] = emb

token = vocab[1000]
print( token, token_to_embedding[token])

보고싶다 [-0.44044474  1.9632758   0.41487783  0.2231063   0.63580036 -0.54615927
 -0.7899087  -0.70094746  2.3760195  -0.27559242  0.1922664   0.61469215
 -1.246909   -0.208986    1.7938346  -1.0357165   0.19075765 -0.11249615
 -0.53565645 -0.47509468 -0.85269433  1.7697381  -0.39388093  1.9232157
  2.1198893   0.9804483  -1.3769224  -3.2657049   1.0480913   1.2467566
 -1.2173885   0.18797366  0.4779758   0.1686798   1.1648282  -0.8910298
 -1.7237079  -2.8126023  -1.0814131   0.3408776  -0.18635152  0.20313086
 -0.04737411 -1.0321995   0.9766377  -1.18542     0.23155864 -0.61769384
 -0.4495899  -0.22784503 -1.1482393   0.12849554 -0.29890183 -0.91810066
 -0.13539153 -0.4875324  -0.26028097  0.08725326  0.70289356 -0.5981444
 -0.7482934  -0.32465085  1.5417975   0.78487426 -1.1451414   0.5749128
 -2.2202199   1.0255811  -0.38335544  0.42853782  0.21496303  1.5771028
  0.7076303   0.28464982  0.7622117  -1.9952075   0.940368   -0.44545466
  1.6530398   0.87928736 -0.82973963  0.29890323 -0.

#### Word2Ves 모델 학습

In [29]:
from gensim.models import Word2Vec

In [30]:
corpusDF_1 = pd.DataFrame(corpus.test)
tokenizer = Okt()
token= [tokenizer.morphs(review) for review in corpusDF_1.text]

In [31]:
word2vec = Word2Vec(
    sentences = token,
    vector_size =128,
    window = 5,
    min_count=1,
    sg=1,
    epochs=3,
    max_final_vocab=10000)

word2vec.save('../models/word2vec.model')

In [32]:
word2vec= Word2Vec.load('../models/word2vec.model')
init_embeddings = np.zeros((cnt_vocab, embedding_dim))

In [33]:
for index, token in id_to_token.items():
    if token not in ['<pad>','<unk>']:
        init_embeddings[index] = word2vec.wv[token]

In [34]:
embedding_layer = nn.Embedding.from_pretrained(
    torch.tensor(init_embeddings, dtype=torch.float32)
)

In [36]:
class SentenceClassifier1(nn.Module):
    def __init__(self,
                 n_vocab,
                 hidden_dim,
                 embedding_dim,
                 n_layers, 
                 dropout = 0.5,
                 bidirectional = True,
                 model_type = 'lstm',
                 pretrained_embedding = None):
        
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim = embedding_dim,
            padding_idx= 0)
        
        if model_type =='rnn':
            self.model = nn.RNN(
                input_size=embedding_dim,
                hidden_size = hidden_dim,
                num_layers= n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True)
    
        elif model_type == "lstm":
            self.model = nn.LSTM(
                input_size=embedding_dim,
                hidden_size = hidden_dim,
                num_layers= n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True )
            
        if bidirectional:
            self.classifier = nn.Linear(hidden_dim * 2, 1)
            
        else:
            self.classifier = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

        if pretrained_embedding is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.tensor(pretrained_embedding, dtype=torch.float32))
        
        
        else :
            self.embedding = nn.Embedding(
                num_embeddings=cnt_vocab,
                embedding_dim=embedding_dim,
                padding_idx= 0)

    def forward(self, inputs):
        embeddings =self.embedding(inputs)
        output, _ = self.model(embeddings)
        last_output = output[:, -1, :]
        last_output = self. dropout(last_output)
        logits = self. classifier(last_output)
        return logits

In [37]:
classifier = SentenceClassifier1(n_vocab=cnt_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_layers=n_layers, pretrained_embedding=init_embeddings).to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)

In [38]:
epochs= 5
interval = 500

In [39]:
for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, train_loader, criterion, device)

Train Loss 0 : 0.688694179058075
Train Loss 500 : 0.6459737810902967
Train Loss 1000 : 0.5867796676172005
Train Loss 1500 : 0.5668385048594815
Train Loss 2000 : 0.5562820314973786
Train Loss 2500 : 0.5466975319247301
Val Loss : 0.48883435874382797, Val Accuracy : 0.7687111111111111
Train Loss 0 : 0.4954495131969452
Train Loss 500 : 0.5033940049464594
Train Loss 1000 : 0.4953794547846982
Train Loss 1500 : 0.4929087327062329
Train Loss 2000 : 0.49022016446406336
Train Loss 2500 : 0.4868011046908513
Val Loss : 0.45339400471951924, Val Accuracy : 0.7868888888888889
Train Loss 0 : 0.3293130099773407
Train Loss 500 : 0.4766405849994538
Train Loss 1000 : 0.46595476960742865
Train Loss 1500 : 0.4640176440480548
Train Loss 2000 : 0.4642110054401205
Train Loss 2500 : 0.4614615861783739
Val Loss : 0.43429906841882643, Val Accuracy : 0.7923111111111111
Train Loss 0 : 0.38643699884414673
Train Loss 500 : 0.45593318305983993
Train Loss 1000 : 0.4515436663851514
Train Loss 1500 : 0.4476499797392892
T