In [7]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### Bag of Words로 Classification 해보기

In [9]:
# Task: classify whether a sentence is Spanish or English.

data = [
    ("me gusta comer en la cafeteria".split(), "SPANISH"),
    ("Give it to me".split(), "ENGLISH"),
    ("No creo que sea una buena idea".split(), "SPANISH"),
    ("No it is not a good idea to get lost at sea".split(), "ENGLISH"),
]

test_data = [
    ("Yo creo que si".split(), "SPANISH"),
    ("it is lost on me".split(), "ENGLISH"),
]

# Map every word (from both the training and the test sentences) to an
# integer id, assigned in first-seen order.
word_to_ix = {}
for sent, _ in data + test_data:
    for word in sent:
        # setdefault only inserts when the word is new, so ids stay dense.
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2


class BoWClassifier(nn.Module):  # inheriting from nn.Module!
    """Single-layer bag-of-words classifier.

    Applies one affine map from a vocabulary-sized count vector to
    `num_labels` scores and returns their log-probabilities.

    Args:
        num_labels: number of output classes.
        vocab_size: length of the bag-of-words input vector.
    """

    def __init__(self, num_labels, vocab_size):
        # Don't forget: nn.Module subclasses must call the parent constructor
        # before registering any layers.
        super(BoWClassifier, self).__init__()

        # A minimal model with a single linear layer.
        # Because the input is a bag of words, the vocabulary size is the
        # length of the input vector.
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels for `bow_vec`.

        The softmax is taken over the last axis (the label axis produced by
        the linear layer). Naming the axis explicitly avoids the deprecated
        implicit-dim behaviour, which picks axis 0 for 3-D inputs.
        """
        return F.log_softmax(self.linear(bow_vec), dim=-1)


def make_bow_vector(sentence, word_to_ix):
    """Build a bag-of-words count vector for `sentence`.

    Args:
        sentence: iterable of words; every word must be a key of `word_to_ix`.
        word_to_ix: mapping from word to its column index.

    Returns:
        A float tensor of shape (1, len(word_to_ix)) holding per-word counts.

    Raises:
        KeyError: if a word is missing from `word_to_ix`.
    """
    counts = torch.zeros(len(word_to_ix))
    for token in sentence:
        column = word_to_ix[token]
        counts[column] += 1
    # Add a leading axis of size 1 so the result is a single-row matrix.
    return counts.view(1, -1)


def make_target(label, label_to_ix):
    """Return the class index of `label` as a one-element LongTensor.

    Raises:
        KeyError: if `label` is not a key of `label_to_ix`.
    """
    class_index = label_to_ix[label]
    return torch.LongTensor([class_index])


# Instantiate the bag-of-words classifier with freshly initialised weights.
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

for param in model.parameters():
    print(param) # Prints the values of W and then b, in that order.

# Run the untrained model on the first training sentence as a smoke test.
sample = data[0]
bow_vector = make_bow_vector(sample[0], word_to_ix)
print(bow_vector[0])
log_probs = model(autograd.Variable(bow_vector))
print(log_probs)

{'en': 3, 'No': 9, 'buena': 14, 'it': 7, 'at': 22, 'sea': 12, 'cafeteria': 5, 'Yo': 23, 'la': 4, 'to': 8, 'creo': 10, 'is': 16, 'a': 18, 'good': 19, 'get': 20, 'idea': 15, 'que': 11, 'not': 17, 'me': 0, 'on': 25, 'gusta': 1, 'lost': 21, 'Give': 6, 'una': 13, 'si': 24, 'comer': 2}
Parameter containing:

Columns 0 to 9 
-0.1342 -0.1708  0.0709  0.1419 -0.1424  0.0092 -0.0419 -0.0775 -0.0104  0.0061
-0.1135 -0.1588 -0.0201 -0.1071 -0.0779 -0.0400 -0.1216 -0.0747 -0.1461 -0.0343

Columns 10 to 19 
 0.0550  0.1906  0.1876  0.0392  0.1406 -0.0721 -0.0235  0.1739  0.0083  0.0809
 0.0849 -0.1783 -0.1777 -0.0166 -0.0925  0.0495  0.0466  0.0962 -0.0931  0.1070

Columns 20 to 25 
 0.1462  0.0792  0.0019 -0.0234  0.1682 -0.1821
 0.1481  0.0124 -0.1238 -0.1128 -0.1079 -0.0308
[torch.FloatTensor of size 2x26]

Parameter containing:
-0.1329
 0.0340
[torch.FloatTensor of size 2]


 1
 1
 1
 1
 1
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 26]

Variable con

In [10]:
# Integer class index for each language label.
label_to_ix = {"SPANISH": 0, "ENGLISH": 1}

In [13]:
# Run the test data through the model before training, as a baseline.
for instance, label in test_data:
    print(make_bow_vector(instance, word_to_ix))
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)

# Print the column of the first parameter (the linear weight matrix) that
# corresponds to "creo", before training.
print(next(model.parameters())[:, word_to_ix["creo"]])

# Use the negative log-likelihood loss (the model already outputs log-probabilities).
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# 100 epochs is usually not appropriate for real data; roughly 5-30 epochs
# are typical. It is fine here because the dataset is tiny.
for epoch in range(100):
    for instance, label in data:
        # PyTorch accumulates gradients, so they must be cleared at every step.
        model.zero_grad()

        # Wrap the input vector and the label in Variables.
        bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
        target = autograd.Variable(make_target(label, label_to_ix))

        # Forward pass.
        log_probs = model(bow_vec)

        # Compute the loss, back-propagate, and let the optimizer update
        # the parameters.
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

# Feed the test data through the trained model again.
for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)

# "creo" is Spanish; its weight towards the Spanish label should have gone up,
# which indicates the model learned something sensible.
print(next(model.parameters())[:, word_to_ix["creo"]])
print(next(model.parameters()))



Columns 0 to 12 
    0     0     0     0     0     0     0     0     0     0     1     1     0

Columns 13 to 25 
    0     0     0     0     0     0     0     0     0     0     1     1     0
[torch.FloatTensor of size 1x26]

Variable containing:
-0.0827 -2.5335
[torch.FloatTensor of size 1x2]



Columns 0 to 12 
    1     0     0     0     0     0     0     1     0     0     0     0     0

Columns 13 to 25 
    0     0     0     1     0     0     0     0     1     0     0     0     1
[torch.FloatTensor of size 1x26]

Variable containing:
-3.4761 -0.0314
[torch.FloatTensor of size 1x2]

Variable containing:
 0.5057
-0.3658
[torch.FloatTensor of size 2]

Variable containing:
-0.0712 -2.6770
[torch.FloatTensor of size 1x2]

Variable containing:
-3.6942 -0.0252
[torch.FloatTensor of size 1x2]

Variable containing:
 0.5414
-0.4015
[torch.FloatTensor of size 2]

Parameter containing:

Columns 0 to 9 
-0.2517  0.3950  0.6368  0.7077  0.4234  0.5750 -0.7252 -1.0683 -1.0012  0.1849
 0.0039 -

### Word Embedding 후 n-gram LM 만들어보기

In [14]:
word_to_ix = {"hello": 0, "world": 1}

# Create the embedding table; this plays the same role as
# tf.get_variable("embedding_table", [vocab_size, embedding_size]).
# The arguments are, in order, the number of unique vocabulary entries and
# the dimension of each embedding vector.
embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)

# Looking up an embedding is just indexing the table with a LongTensor of ids.
lookup_tensor = torch.LongTensor(
    [word_to_ix["hello"]]
)
hello_embed = embeds(
    autograd.Variable(lookup_tensor)
)
print(hello_embed)

Variable containing:
-0.8690  1.6708 -0.1012  0.6869 -1.1280
[torch.FloatTensor of size 1x5]



In [15]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

# Corpus: a Shakespeare sonnet, split on whitespace (punctuation stays
# attached to the words).
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

# Build ([word two back, word one back], target word) pairs by walking three
# staggered views of the corpus in lockstep.
trigrams = [
    ([two_back, one_back], target)
    for two_back, one_back, target in zip(
        test_sentence, test_sentence[1:], test_sentence[2:]
    )
]


# Print only the first three pairs.
print(trigrams[:3])

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]


In [16]:
# set() keeps only the unique tokens of the corpus.
vocab = set(test_sentence)
print(vocab)
# Enumerate the vocabulary in sorted order so the word -> index mapping is the
# same on every run; plain set iteration order can differ between interpreter
# sessions, which would make the trained embeddings' row assignment
# irreproducible after a kernel restart.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

set(['all', 'brow,', 'being', 'couldst', 'treasure', 'Proving', 'to', 'field,', 'worth', 'his', 'thine!', 'lies,', 'Where', 'dig', 'succession', 'small', 'praise.', 'where', 'old', 'fair', 'see', 'Thy', "deserv'd", 'sum', 'shall', 'forty', 'new', 'be', 'asked,', 'days;', 'This', 'Were', 'say,', 'by', 'on', 'thou', 'of', 'thine', 'own', 'gazed', 'within', 'When', 'art', 'now,', 'trenches', "feel'st", 'much', 'more', 'held:', 'count,', 'it', 'warm', "beauty's", 'child', 'an', "youth's", 'And', 'made', 'How', "'This", 'praise', 'were', 'eyes,', 'my', 'old,', 'and', 'use,', 'mine', 'deep', 'livery', 'To', 'shame,', 'in', 'Then', 'all-eating', 'sunken', 'Shall', 'make', 'when', 'thriftless', 'answer', 'lusty', 'beauty', 'besiege', 'weed', 'Will', "excuse,'", 'blood', 'winters', 'a', 'thy', 'proud', 'cold.', "totter'd", 'so', 'the', 'If'])


In [36]:
class TrigramClassifier(nn.Module):
    """Feed-forward n-gram language model over word embeddings.

    The context word ids are embedded, flattened into a single row, passed
    through a 128-unit ReLU layer, and projected to one score per vocabulary
    entry. The returned scores are unnormalised (no softmax is applied here).

    Args:
        vocab_size: number of distinct words (embedding rows and output scores).
        embedding_size: dimension of each word embedding.
        context_size: number of context words fed in per example.
    """

    def __init__(self, vocab_size, embedding_size, context_size):
        super(TrigramClassifier, self).__init__()
        # Width of the flattened context: all context embeddings side by side.
        flattened_size = context_size * embedding_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc = nn.Linear(flattened_size, 128)
        self.fc2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return a (1, vocab_size) row of scores for one context of word ids."""
        # Flatten every looked-up embedding into a single row, i.e. one example.
        context_row = self.embedding(inputs).view((1, -1))
        hidden = F.relu(self.fc(context_row))
        return self.fc2(hidden)
        
# Build the n-gram model sized to the corpus vocabulary; note this rebinds the
# notebook-level name `model`.
model = TrigramClassifier(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)

In [39]:
# CrossEntropyLoss takes the raw scores returned by the model together with
# the index of the target word.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    # Sum of the per-example losses over one pass through the trigrams.
    loss_per_epoch = 0.0
    for _input, label in trigrams:
        # Gradients accumulate across backward() calls, so clear them first.
        model.zero_grad()
        # Convert the context words to a tensor of vocabulary indices.
        _input = autograd.Variable(torch.LongTensor([word_to_ix[w] for w in _input]))

        prob = model(_input)
        # Convert the target word to a one-element tensor holding its index.
        label = autograd.Variable(torch.LongTensor([word_to_ix[label]]))

        loss = loss_function(prob, label)

        loss.backward()
        optimizer.step()

        # .item() extracts the Python float from the scalar loss; indexing
        # with `.data[0]` fails on 0-dim tensors in current PyTorch.
        loss_per_epoch += loss.item()

    print(loss_per_epoch)
    


2.80429686469
2.79989315488
2.81764824557
2.80762083901
2.80451477984
2.80253405706
2.80112398949
2.80018622932
2.79969908038
2.79953677681


### LSTM model 만들어보기

In [40]:
# A standalone LSTM with input size 3 and hidden size 3.
lstm = nn.LSTM(3,3)

In [42]:
# Five random inputs, each a 1x3 tensor wrapped in a Variable.
inputs = [autograd.Variable(torch.randn((1,3))) for _ in range(5)]

In [43]:
# Display the generated list of random inputs.
inputs

[Variable containing:
  0.4438  0.3946  1.4681
 [torch.FloatTensor of size 1x3], Variable containing:
 -0.1383 -0.7589 -0.7654
 [torch.FloatTensor of size 1x3], Variable containing:
 -0.6630 -0.6874 -0.5107
 [torch.FloatTensor of size 1x3], Variable containing:
  1.8368 -0.2794 -0.8937
 [torch.FloatTensor of size 1x3], Variable containing:
 -0.6421  0.4721 -2.9939
 [torch.FloatTensor of size 1x3]]

In [45]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of symbols into a Variable of integer indices.

    Args:
        seq: iterable of keys (e.g. words or tags).
        to_ix: mapping from each key to its integer index.

    Returns:
        An autograd.Variable wrapping a LongTensor with one index per item.

    Raises:
        KeyError: if an item of `seq` is missing from `to_ix`.
    """
    indices = torch.LongTensor([to_ix[item] for item in seq])
    return autograd.Variable(indices)


# Tiny part-of-speech tagging corpus: (words, tags) pairs of equal length.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Assign each distinct word an integer id in first-seen order.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# Dimensions are kept small so that changes in the weights are easy to inspect.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'Everybody': 5, 'ate': 2, 'apple': 4, 'that': 7, 'read': 6, 'dog': 1, 'book': 8, 'the': 3, 'The': 0}


In [61]:
class LSTMTagger(nn.Module):
    """LSTM sequence tagger: embedding -> LSTM -> linear -> log-softmax.

    Processes one sentence at a time (the batch axis is fixed to 1). The
    recurrent state is stored on the instance as `self.hidden` and is updated
    by every forward pass, so callers that want each sentence to start from a
    zero state must assign `model.hidden = model.init_hidden()` beforehand.

    Args:
        vocab_size: number of distinct input words.
        embedding_size: dimension of the word embeddings (LSTM input size).
        num_units: LSTM hidden size.
        target_size: number of output tags.
    """

    def __init__(self, vocab_size, embedding_size, num_units, target_size):
        # Always call the nn.Module constructor first.
        super(LSTMTagger, self).__init__()
        # LSTM hidden dimension; also needed by init_hidden().
        self.num_units = num_units
        # Embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        # The LSTM itself: (input dimension, hidden dimension).
        self.lstm = nn.LSTM(embedding_size, num_units)
        # Output linear layer used for classification.
        self.output = nn.Linear(num_units, target_size)

        # The recurrent state is kept explicitly and must be initialised up front.
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh all-zero (hidden state, cell state) pair.

        Each tensor has shape (1, 1, num_units).
        """
        return (autograd.Variable(torch.zeros(1, 1, self.num_units)),
                autograd.Variable(torch.zeros(1, 1, self.num_units)))

    def forward(self, inputs):
        """Return per-position log-probabilities over the tags.

        Args:
            inputs: sequence of word indices for a single sentence.

        Returns:
            Tensor with one row per input position and `target_size` columns.
        """
        # Look up the embedding of every word index.
        embed = self.embedding(inputs)

        # Reshape to (seq_len, 1, embedding_size) for the LSTM and carry the
        # recurrent state forward on the instance.
        output, self.hidden = self.lstm(embed.view(len(inputs), 1, -1), self.hidden)

        # Project each step to tag scores, then normalise over the tag axis.
        # The axis is named explicitly because implicit-dim log_softmax is
        # deprecated.
        output = self.output(output.view(len(inputs), -1))
        output = F.log_softmax(output, dim=1)

        return output
        

In [62]:
# Build the tagger sized to the tagging vocabulary and tag set; note this
# rebinds the notebook-level name `model`.
model = LSTMTagger(len(word_to_ix), EMBEDDING_DIM, HIDDEN_DIM, len(tag_to_ix))

In [64]:
# The tagger outputs log-probabilities, so pair it with the negative
# log-likelihood loss.
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(300):
    for inputs, targets in training_data:
        # Convert the words and the tags into tensors of indices.
        inputs = prepare_sequence(inputs, word_to_ix)
        targets = prepare_sequence(targets, tag_to_ix)
        
        # Always clear the accumulated gradients before a new step.
        model.zero_grad()
        
        # Reset the LSTM state so each sentence starts from zeros; forward()
        # overwrites model.hidden on every call. (The author flagged this
        # line as an open question.)
        model.hidden = model.init_hidden()

        output = model(inputs)

        loss = loss_function(output, targets)
        loss.backward()
        optimizer.step()