In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import torchtext

from konlpy.tag import Kkma
from torchtext.data import Field, Iterator, Example, TabularDataset

In [2]:
# Display the installed torchtext version.
torchtext.__version__

'0.2.3'

In [4]:
# Field objects declare how each column is tokenized, preprocessed and
# converted to tensors.
tagger = Kkma()
tokenize = tagger.morphs  # the tagger's morphs method is used as the tokenizer


def preprocessing(x):
    """Encode a raw label: "FOOD" becomes 0, any other value becomes 1."""
    if x == "FOOD":
        return 0
    return 1


TEXT = Field(
    tokenize=tokenize,
    use_vocab=True,
    lower=True,
    include_lengths=True,  # batch.TEXT is unpacked later as (token ids, lengths)
    batch_first=True,
)

LABEL = Field(
    sequential=False,
    use_vocab=False,  # labels are already integers after preprocessing
    preprocessing=preprocessing,
)

In [25]:
# Load the train/test TSV files from the current directory; the columns are
# bound, in order, to the TEXT and LABEL fields.
train_data, test_data = TabularDataset.splits(
    fields=[('TEXT', TEXT), ('LABEL', LABEL)],
    format='tsv',
    path='./',
    train='torchtext_train.txt',
    test='torchtext_test.txt',
)

In [31]:
# Inspect the first training example: its TEXT attribute, then its LABEL.
print(train_data.examples[0].TEXT, train_data.examples[0].LABEL, sep="\n")

['배고프', '다', '밥', '주', '어']
0


In [32]:
# Build the TEXT vocabulary from the training split.
TEXT.build_vocab(train_data)

In [33]:
# Number of entries in the vocabulary.
len(TEXT.vocab)

54

In [48]:
# stoi: defaultdict instance mapping token strings to numerical identifiers.
TEXT.vocab.stoi

defaultdict(<function torchtext.vocab._default_unk_index>,
            {'<pad>': 1,
             '<unk>': 0,
             '?': 6,
             'ㄴ': 7,
             '거': 8,
             '고': 9,
             '고등': 20,
             '과': 21,
             '근처': 22,
             '기': 23,
             '나': 24,
             '냐': 25,
             '는': 3,
             '다': 10,
             '다시': 26,
             '드라마': 27,
             '랩': 28,
             '만': 29,
             '만하': 30,
             '맛': 31,
             '맛있': 32,
             '먹': 4,
             '뭐': 11,
             '밥': 12,
             '배고프': 33,
             '보': 13,
             '보여주': 14,
             '볼만': 34,
             '삼겹살': 35,
             '신': 36,
             '싶': 15,
             '알려주': 37,
             '어': 2,
             '없': 38,
             '영상': 39,
             '영화': 16,
             '예능': 40,
             '요즘': 41,
             '을': 42,
             '음식': 43,
             '이': 44,
             '있': 4

In [None]:
# itos: list of token strings indexed by their numerical identifiers.
TEXT.vocab.itos

In [49]:
# freqs: Counter object holding the frequencies of tokens in the data used
# to build the vocab.
TEXT.vocab.freqs

Counter({'?': 2,
         'ㄴ': 2,
         '거': 2,
         '고': 2,
         '고등': 1,
         '과': 1,
         '근처': 1,
         '기': 1,
         '나': 1,
         '냐': 1,
         '는': 3,
         '다': 2,
         '다시': 1,
         '드라마': 1,
         '랩': 1,
         '만': 1,
         '만하': 1,
         '맛': 1,
         '맛있': 1,
         '먹': 3,
         '뭐': 2,
         '밥': 2,
         '배고프': 1,
         '보': 2,
         '보여주': 2,
         '볼만': 1,
         '삼겹살': 1,
         '신': 1,
         '싶': 2,
         '알려주': 1,
         '어': 8,
         '없': 1,
         '영상': 1,
         '영화': 2,
         '예능': 1,
         '요즘': 1,
         '을': 1,
         '음식': 1,
         '이': 1,
         '있': 1,
         '재밌': 2,
         '점': 1,
         '좀': 3,
         '주': 2,
         '줄거리': 1,
         '지': 1,
         '집': 1,
         '추천': 2,
         '푸': 1,
         '하': 1,
         '하이라이트': 1,
         '함께': 1})

In [51]:
# Iterator declaration: loads batches of data from a dataset.
# args
#    - dataset : dataset object to load batches from
#    - batch_size : batch size (mini-batch)
#    - device : -1 means CPU, None means the active GPU device
#    - sort_key : key used to sort the examples, so that examples of similar
#                 length are batched together and padding is minimized
train_iter, test_iter = Iterator.splits(
    (train_data, test_data),
    sort_key=lambda example: len(example.TEXT),  # sort by length of TEXT
    sort_within_batch=True,
    batch_size=3,
    repeat=False,
    device=-1,
)

In [None]:
# NOTE: this cell previously held an unfinished `from IPython.config import`
# statement, which is a SyntaxError and stops "Restart & Run All". Nothing in
# the notebook uses it, so it has been removed. (IPython.config is also
# deprecated; its contents now live in traitlets.config.)

In [52]:
# Peek at the first training batch only: the TEXT field (a pair, since the
# field was declared with include_lengths=True) followed by the LABEL tensor.
for batch in train_iter:
    print(batch.TEXT, batch.LABEL, sep="\n")
    break

(Variable containing:
   20    28    50     2    26    13    23     5
   35     4     9    15     2     1     1     1
   12    18     2     1     1     1     1     1
[torch.LongTensor of size 3x8]
, 
 8
 5
 3
[torch.LongTensor of size 3]
)
Variable containing:
 1
 0
 0
[torch.LongTensor of size 3]



# modeling

In [106]:
class EmbedClassifier(nn.Module):
    """Bag-of-embeddings sentence classifier.

    Each sentence is represented by the mean of its token embeddings, and a
    single linear layer maps that sentence vector to class scores.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: dimensionality of each token embedding.
        output_size: number of output classes (width of the score vector).
    """

    def __init__(self, vocab_size, embedding_size, output_size):
        super(EmbedClassifier, self).__init__()

        # Averaging the embeddings of the words gives a sentence-level
        # embedding; 'mean' is spelled out so the reduction is explicit.
        self.sentence_embed = nn.EmbeddingBag(vocab_size, embedding_size, mode='mean')
        self.linear = nn.Linear(embedding_size, output_size)

    def forward(self, inputs, offsets=None):
        """Compute class scores for a batch of sentences.

        Args:
            inputs: token ids. Either a 2-D tensor with one sentence per row
                (every column, padding included, takes part in the mean), or,
                when ``offsets`` is given, a 1-D tensor holding all sentences
                concatenated back to back.
            offsets: optional 1-D tensor with the start position of each
                sentence inside a 1-D ``inputs``. Supplying it lets callers
                feed unpadded sentences so no padding id is averaged in.
                Leave as ``None`` for 2-D ``inputs``.

        Returns:
            Tensor of shape (number of sentences, output_size) with the
            unnormalized class scores.
        """
        outputs = self.sentence_embed(inputs, offsets)
        outputs = self.linear(outputs)
        return outputs

54

In [113]:
# Training configuration. Every tunable value is named here instead of being
# inlined in the constructor call.
STEP = 50            # number of passes over train_iter in the training loop
LR = 0.1             # learning rate handed to SGD
SEED = 0             # seed for torch's random number generator
EMBEDDING_SIZE = 20  # dimensionality of the token embeddings
NUM_CLASSES = 2      # size of the classifier output

# Seed before constructing the model so its initial weights are the same on
# every "Restart & Run All".
torch.manual_seed(SEED)

model = EmbedClassifier(len(TEXT.vocab), EMBEDDING_SIZE, NUM_CLASSES)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LR)

In [114]:
# Train for STEP passes over train_iter, printing the mean batch loss of every
# 10th pass.
for step in range(STEP):
    losses = []  # per-batch losses of the current pass only
    for batch in train_iter:
        # TEXT was declared with include_lengths=True, so it unpacks into
        # (token ids, lengths); the lengths are not needed by the model.
        # NOTE(review): `inputs` is padded, and the model's mean pooling
        # includes those padded positions - confirm this is acceptable.
        inputs, _lengths = batch.TEXT
        targets = batch.LABEL
        model.zero_grad()
        preds = model(inputs)
        loss = loss_function(preds, targets)
        # `loss.data[0]` only works while the loss is a 1-element tensor;
        # newer PyTorch returns a 0-dim tensor, for which `.item()` is the
        # supported accessor. Use whichever the installed version offers.
        losses.append(loss.item() if hasattr(loss, "item") else loss.data[0])
        loss.backward()
        optimizer.step()
    if step % 10 == 0:
        print(np.mean(losses))

0.6451420068740845
0.47487186789512636
0.31591417491436
0.18547815084457397
0.19009195640683174


# TEST

### numericalize

문장 -> 인덱스에 맞는 numerical vector (LongTensor)로 변환

In [115]:
# Classify each test example one at a time and print "predicted actual".
for test in test_data.examples:
    # numericalize expects (batch of token lists, batch of lengths) because
    # TEXT was declared with include_lengths=True; a batch of one is built here.
    # The result is deliberately not named `input`, which would shadow the
    # builtin for the rest of the kernel session.
    token_ids, _length = TEXT.numericalize(([test.TEXT,], [len(test.TEXT)]), train=False, device=-1)
    pred = model(token_ids)
    pred = pred.max(1)[1]  # index of the highest score = predicted class
    print(pred.data[0], test.LABEL)

0 0
0 0
1 1
1 1
