In [1]:
# 사전 훈련된 임베딩을 사용하지 않는 경우
import numpy as np
from collections import Counter
import gensim

In [2]:
# Toy sentiment-classification data (positive: 1, negative: 0).
# y_train[k] is the label of sentences[k].

sentences = [
    'nice great best amazing',
    'stop lies',
    'pitiful nerd',
    'excellent work',
    'supreme quality',
    'bad',
    'highly respectable',
]
y_train = [
    1,  # nice great best amazing
    0,  # stop lies
    0,  # pitiful nerd
    1,  # excellent work
    1,  # supreme quality
    0,  # bad
    1,  # highly respectable
]

In [3]:
# Whitespace-tokenize every sentence into a list of words.
tokenized_sentences = list(map(str.split, sentences))
print('단어 토큰화된 결과 :', tokenized_sentences)

단어 토큰화된 결과 : [['nice', 'great', 'best', 'amazing'], ['stop', 'lies'], ['pitiful', 'nerd'], ['excellent', 'work'], ['supreme', 'quality'], ['bad'], ['highly', 'respectable']]


In [6]:
# Flatten the tokenized sentences into a single list of tokens, keeping order.
word_list = [token for tokens in tokenized_sentences for token in tokens]
word_list

['nice',
 'great',
 'best',
 'amazing',
 'stop',
 'lies',
 'pitiful',
 'nerd',
 'excellent',
 'work',
 'supreme',
 'quality',
 'bad',
 'highly',
 'respectable']

In [7]:
# Count occurrences of each token; the number of keys is the number of
# distinct words.
word_counts = Counter(word_list)
print('총 단어수 :', len(word_counts))

총 단어수 : 15


In [8]:
# Vocabulary ordered by descending frequency; words with equal counts keep
# the order in which they were first counted.

vocab = [word for word, _ in word_counts.most_common()]
vocab

['nice',
 'great',
 'best',
 'amazing',
 'stop',
 'lies',
 'pitiful',
 'nerd',
 'excellent',
 'work',
 'supreme',
 'quality',
 'bad',
 'highly',
 'respectable']

In [9]:
# Reserve id 0 for padding and id 1 for out-of-vocabulary tokens;
# real words receive ids starting at 2, in vocab order.
word_to_index = {'<PAD>': 0, '<UNK>': 1}
word_to_index.update((word, position) for position, word in enumerate(vocab, start=2))

In [10]:
# Number of entries in the mapping, including <PAD> and <UNK>.
len(word_to_index)

17

In [11]:
# Vocabulary size including the two special tokens (<PAD>, <UNK>);
# used later as the number of rows of the embedding tables.
vocab_size = len(word_to_index)
print('패딩 토큰, UNK 토큰 고려한 단어 집합 크기 :', vocab_size)

패딩 토큰, UNK 토큰 고려한 단어 집합 크기 : 17


In [12]:
# Show the full word -> id mapping.
print(word_to_index)

{'<PAD>': 0, '<UNK>': 1, 'nice': 2, 'great': 3, 'best': 4, 'amazing': 5, 'stop': 6, 'lies': 7, 'pitiful': 8, 'nerd': 9, 'excellent': 10, 'work': 11, 'supreme': 12, 'quality': 13, 'bad': 14, 'highly': 15, 'respectable': 16}


In [13]:
def texts_to_sequences(tokenized_x_data, word_to_index) :
    """Integer-encode tokenized sentences.

    Every token is replaced by its id in ``word_to_index``. A token that is
    missing from the mapping is replaced by the id stored under ``'<UNK>'``.

    Args:
        tokenized_x_data: iterable of sentences, each an iterable of tokens.
        word_to_index: mapping from token to integer id. It must contain
            ``'<UNK>'`` whenever an unknown token is encountered.

    Returns:
        A list holding one list of ids per input sentence.
    """
    def encode(token) :
        # '<UNK>' is only looked up when the token itself is missing.
        try :
            return word_to_index[token]
        except KeyError :
            return word_to_index['<UNK>']

    return [[encode(token) for token in tokens] for tokens in tokenized_x_data]



In [15]:
# Integer-encode the tokenized training sentences with the vocabulary mapping.
x_encoded = texts_to_sequences(tokenized_sentences, word_to_index)

In [16]:
# Length of the longest encoded sentence.
max(map(len, x_encoded))

4

In [17]:
# Longest encoded sentence; used below as the padded sequence length.
max_len = max(map(len, x_encoded))
print('최대길이 :', max_len)

최대길이 : 4


In [18]:
def pad_sequences(sentences, max_len) :
    """Right-pad (or truncate) integer sequences to a fixed length.

    Args:
        sentences: sequence of integer-id sequences.
        max_len: number of columns of the result.

    Returns:
        Integer ndarray of shape (len(sentences), max_len). Each row holds the
        first ``max_len`` ids of the corresponding sequence, followed by
        zeros when the sequence is shorter.
    """
    padded = np.zeros((len(sentences), max_len), dtype = int)

    for row, ids in enumerate(sentences) :
        if not len(ids) :
            # Empty sequence: the row stays all zeros.
            continue
        kept = np.array(ids)[:max_len]
        # The slice is clamped to the row width, so over-long input is truncated.
        padded[row, :len(ids)] = kept

    return padded


In [19]:
# Preview the padded matrix (shorter sentences are zero-filled on the right).
pad_sequences(x_encoded, max_len)

array([[ 2,  3,  4,  5],
       [ 6,  7,  0,  0],
       [ 8,  9,  0,  0],
       [10, 11,  0,  0],
       [12, 13,  0,  0],
       [14,  0,  0,  0],
       [15, 16,  0,  0]])

In [20]:
# Build the training arrays: padded id matrix and label vector.
x_train = pad_sequences(x_encoded, max_len = max_len)
# NOTE: rebinds y_train from a Python list to a NumPy array.
y_train = np.array(y_train)

print('패딩 결과 :')
print(x_train)

패딩 결과 :
[[ 2  3  4  5]
 [ 6  7  0  0]
 [ 8  9  0  0]
 [10 11  0  0]
 [12 13  0  0]
 [14  0  0  0]
 [15 16  0  0]]


In [21]:
# 모델링

import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

In [22]:
class SimpleModel(nn.Module) :
    """Embedding -> flatten -> linear -> sigmoid binary classifier.

    The embedding table is randomly initialised and learned from scratch.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        seq_len: fixed length of the padded input sequences. When omitted,
            the notebook-level ``max_len`` is used (previous behaviour).
    """

    def __init__(self, vocab_size, embedding_dim, seq_len=None) :
        super(SimpleModel, self).__init__()
        if seq_len is None :
            # Backward-compatible fallback to the notebook global.
            seq_len = max_len
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.flatten = nn.Flatten()
        # The linear layer sees all token embeddings of a sentence concatenated.
        self.fc = nn.Linear(embedding_dim * seq_len, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x) :
        """Return a probability in [0, 1] per sample, shape (batch size, 1)."""
        # embedded.shape == (batch size, sequence length, embedding dim)
        embedded = self.embedding(x)

        # flattened.shape == (batch size, sequence length * embedding dim)
        flattened = self.flatten(embedded)

        # output.shape == (batch size, 1)
        output = self.fc(flattened)

        return self.sigmoid(output)
    
    
        

In [23]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
# Instantiate the model with 100-dimensional embeddings and move it to the
# selected device.
simple_model = SimpleModel(vocab_size, 100).to(device)

In [29]:
criterion = nn.BCELoss()
# BCELoss: binary cross-entropy, the loss for binary classification.
# It is applied to the sigmoid output of the model.

# Adam with its default hyper-parameters, updating all model parameters.
optimizer = Adam(simple_model.parameters())

# Inputs are integer ids (long) for the embedding lookup; targets are
# float32 labels for BCELoss.
train_dataset = TensorDataset(torch.tensor(x_train, dtype = torch.long), 
                              torch.tensor(y_train, dtype = torch.float32))

In [30]:
# Iterate over the dataset in mini-batches of 2 samples (no shuffle argument
# is passed).
train_dataloader = DataLoader(train_dataset, batch_size=2)

In [31]:
# Number of mini-batches per epoch.
print(len(train_dataloader))

4


In [32]:
# Train simple_model for 10 epochs over the mini-batches of train_dataloader.
for epoch in range(10) :
    for inputs, targets in train_dataloader :
        # inputs.shape == (batch size, sentence length)
        # targets.shape == (batch size,)
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()
        
        # outputs.shape == (batch size,)
        outputs = simple_model(inputs).view(-1)
        # view(-1) flattens the (batch size, 1) model output into a 1-D
        # tensor so that it lines up with targets.
        
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    # NOTE: this prints the loss of the last mini-batch of the epoch,
    # not an average over the epoch.
    print(f'Epoch : {epoch+1} \t Loss : {loss.item()}')

Epoch : 1 	 Loss : 0.5270470380783081
Epoch : 2 	 Loss : 0.5010523200035095
Epoch : 3 	 Loss : 0.44444406032562256
Epoch : 4 	 Loss : 0.3799958825111389
Epoch : 5 	 Loss : 0.3184640407562256
Epoch : 6 	 Loss : 0.26483941078186035
Epoch : 7 	 Loss : 0.22059763967990875
Epoch : 8 	 Loss : 0.18523289263248444
Epoch : 9 	 Loss : 0.1573697179555893
Epoch : 10 	 Loss : 0.1354503184556961


In [37]:
# 사전 훈련된 임베딩을 사용할 경우
# !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Av37IVBQAAntSe1X3MOAl5gvowQzd2_j' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Av37IVBQAAntSe1X3MOAl5gvowQzd2_j" -O GoogleNews-vectors-negative300.bin.gz && rm -rf /tmp/cookies.txt

In [34]:
# Load Google's pre-trained word2vec vectors from the local binary file
# (the file fetched by the commented-out wget command).

word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [35]:
# One all-zero row per vocabulary id, 300 columns. The column count must
# match the dimensionality of the pre-trained vectors copied in later.
embedding_matrix = np.zeros((vocab_size, 300))
print('임베딩 행렬 크기 :', embedding_matrix.shape)

임베딩 행렬 크기 : (17, 300)


In [36]:
def get_vector(word) :
    """Return the vector stored for ``word`` in ``word2vec_model``.

    Returns None when the word is not contained in the model.
    """
    return word2vec_model[word] if word in word2vec_model else None

In [38]:
# Copy the pre-trained vector of every real vocabulary word into its row of
# embedding_matrix. Ids 0 (<PAD>) and 1 (<UNK>) are special tokens rather
# than words, so they are skipped; real words start at id 2.
# (The previous condition `i > 2` also skipped id 2, the first real word.)
for word, i in word_to_index.items() :
    if i >= 2 :
        temp = get_vector(word)
        # Words missing from the pre-trained model keep their all-zero row.
        if temp is not None :
            embedding_matrix[i] = temp

In [43]:
# print(embedding_matrix[0])

In [42]:
# Look up the id assigned to 'great'.
word_to_index['great']

3

In [44]:
# Check that the embedding vector of 'great' in word2vec_model
# matches row 3 of embedding_matrix (3 is word_to_index['great']).

np.all(word2vec_model['great'] == embedding_matrix[3])
# np.all() is True only when every element of the comparison is True.

True

In [52]:
class PreTrainedEmbeddingModel(nn.Module) :
    """Binary classifier whose embedding layer starts from pre-trained vectors.

    Architecture: embedding -> flatten -> linear -> sigmoid. The embedding
    weights are initialised from a pre-trained matrix and stay trainable.

    Args:
        vocab_size: expected number of rows of the pre-trained matrix.
        embedding_dim: expected number of columns of the pre-trained matrix.
        pretrained_weights: array-like or tensor of shape
            (vocab_size, embedding_dim). Defaults to the notebook-level
            ``embedding_matrix`` (previous behaviour).
        seq_len: fixed length of the padded input sequences. Defaults to the
            notebook-level ``max_len`` (previous behaviour).

    Raises:
        ValueError: if the pre-trained matrix does not have shape
            (vocab_size, embedding_dim).
    """

    def __init__(self, vocab_size, embedding_dim, pretrained_weights=None, seq_len=None) :
        super(PreTrainedEmbeddingModel, self).__init__()
        if pretrained_weights is None :
            pretrained_weights = embedding_matrix
        if seq_len is None :
            seq_len = max_len

        # Private float32 copy, so training never mutates the caller's matrix.
        weights = torch.as_tensor(pretrained_weights, dtype = torch.float32).detach().clone()
        if tuple(weights.shape) != (vocab_size, embedding_dim) :
            raise ValueError(
                f'pretrained weights have shape {tuple(weights.shape)}, '
                f'expected {(vocab_size, embedding_dim)}'
            )

        # freeze=False keeps the pre-trained vectors trainable (fine-tuning).
        self.embedding = nn.Embedding.from_pretrained(weights, freeze = False)
        self.flatten = nn.Flatten()
        # The linear layer sees all token embeddings of a sentence concatenated.
        self.fc = nn.Linear(embedding_dim * seq_len, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x) :
        """Return a probability in [0, 1] per sample, shape (batch size, 1)."""
        # embedded.shape == (batch size, sequence length, embedding dim)
        embedded = self.embedding(x)
        # flattened.shape == (batch size, sequence length * embedding dim)
        flattened = self.flatten(embedded)
        # output.shape == (batch size, 1)
        output = self.fc(flattened)
        return self.sigmoid(output)
        

In [53]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [54]:
# Instantiate the model with 300-dimensional embeddings (the column count of
# embedding_matrix) and move it to the selected device.
pretrained_embedding_model = PreTrainedEmbeddingModel(vocab_size, 300).to(device)

In [55]:
# Display the layer structure of the model.
pretrained_embedding_model

PreTrainedEmbeddingModel(
  (embedding): Embedding(17, 300)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc): Linear(in_features=1200, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [56]:
# Binary cross-entropy loss, applied to the sigmoid output of the model.
criterion = nn.BCELoss()
# Adam with its default hyper-parameters, now bound to the parameters of
# pretrained_embedding_model.
optimizer = Adam(pretrained_embedding_model.parameters())

# Inputs are integer ids (long) for the embedding lookup; targets are
# float32 labels for BCELoss.
train_dataset = TensorDataset(torch.tensor(x_train, dtype = torch.long), 
                              torch.tensor(y_train, dtype = torch.float32))

# Mini-batches of 2 samples (no shuffle argument is passed).
train_dataloader = DataLoader(train_dataset, batch_size = 2)

In [57]:
# Number of mini-batches per epoch.
print(len(train_dataloader))

4


In [58]:
# Train pretrained_embedding_model for 10 epochs over the mini-batches of
# train_dataloader.
for epoch in range(10) :
    for inputs, targets in train_dataloader :
        # inputs.shape == (batch size, sentence length)
        # targets.shape == (batch size,)
        inputs, targets = inputs.to(device), targets.to(device)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()
        
        # outputs.shape == (batch size,)
        # view(-1) flattens the (batch size, 1) model output to 1-D.
        outputs = pretrained_embedding_model(inputs).view(-1)
        
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    
    # NOTE: this prints the loss of the last mini-batch of the epoch,
    # not an average over the epoch.
    print(f'Epoch : {epoch +1} \t Loss : {loss.item()}')

Epoch : 1 	 Loss : 0.695076048374176
Epoch : 2 	 Loss : 0.6320176720619202
Epoch : 3 	 Loss : 0.5693480968475342
Epoch : 4 	 Loss : 0.5104113817214966
Epoch : 5 	 Loss : 0.4561007022857666
Epoch : 6 	 Loss : 0.4066222310066223
Epoch : 7 	 Loss : 0.36190178990364075
Epoch : 8 	 Loss : 0.3217328190803528
Epoch : 9 	 Loss : 0.28584054112434387
Epoch : 10 	 Loss : 0.2539149224758148
