# 1. Preparations

### 1-1. Import Libraries
- 데이터셋 다운로드와 전처리를 쉽게 하는 torchtext 라이브러리를 import 합니다.


In [1]:
!pip install torch==1.11.0
!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.11.0
  Downloading torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl (750.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 2.0.1+cu118
    Uninstalling torch-2.0.1+cu118:
      Successfully uninstalled torch-2.0.1+cu118
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.0.2+cu118 requires torch==2.0.1, but you have torch 1.11.0 which is incompatible.
torchdata 0.6.1 requires torch==2.0.1, but you have torch 1.11.0 which is incompatible.
torchtext 0.15.2 requires torch==2.0.1, but you have torch 1.11.0 which is incompatible.
torchvision 0.15.2+cu118 req

In [2]:
import os
import random
import time
import sys

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data, datasets
import random
import time
import spacy
import numpy as np
from torch import Tensor

### 1-2. Load data
- Field 를 정의합니다.
- IMDB 데이터를 다운받습니다.
- Train,valid,test 데이터셋으로 split 합니다.

In [3]:
TEXT = data.Field(include_lengths=True)
LABEL = data.LabelField(dtype = torch.float) 

In [4]:
# Download IMDB data (about 14mins)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 9.37MB/s]


In [5]:
# Idle Function for maintaining Runtime Session
while True:
  pass

KeyboardInterrupt: ignored

In [6]:
# Set the random seed
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [7]:
# Split train and valid data
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [8]:
print('Number of training examples: {}'.format(len(train_data)))
print('Number of validation examples: {}'.format(len(valid_data)))
print('Number of testing examples: {}'.format(len(test_data)))

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [9]:
# print example
print(vars(train_data.examples[0]))
print(' '.join(vars(train_data.examples[0])['text']))

{'text': ['Seriously,', 'I', 'can', 'easily', 'stomach', 'a', 'lot', 'of', 'on', 'screen', 'blood,', 'gore', 'and', 'repulsiveness,', 'but', 'what', 'really', 'makes', 'this', 'film', 'disturbing', '&', 'uncomfortable', 'to', 'watch', 'is', 'how', 'the', 'doctor', 'character', 'keeps', 'on', 'rambling', 'about', 'the', 'physical', 'damage', 'done', 'to', 'raped', 'women.', 'He,', 'John', 'Cassavetes', 'of', '"Rosemary\'s', 'Baby",', 'talks', 'about', 'ruptured', 'uterus,', 'dry', 'intercourse', 'and', 'massive', 'loads', 'of', 'reddish', '(?)', 'sperm', 'like', 'they', 'are', 'the', 'most', 'common', 'little', 'ailments', 'in', 'the', 'world', 'of', 'medicine.', 'That', 'being', 'said,', '"Incubus"', 'is', 'an', 'ultimately', 'STRANGE', 'horror', 'effort.', 'It', "isn't", 'necessarily', 'awful', '\x96', 'although', 'it', "isn't", 'very', 'good,', 'neither', '\x96', 'but', 'just', 'plain', 'weird.', 'The', 'muddled', '&', 'incoherent', 'script', 'initially', 'revolves', 'on', 'the', 'hu

### 1-3. Cuda Setup
- GPU 사용을 위한 Cuda 설정
- Colab 페이지 상단 메뉴>수정>노트설정에서 GPU 사용 설정이 선행되어야 합니다.


In [10]:
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda:0" if USE_CUDA else "cpu")

##2. Preprocess data
- Vocab (단어장) 을 만듭니다.
- Iterator 를 만듭니다. (Iterator 를 통해 batch training 을 위한 batching 과 padding, 그리고 데이터 내 단어들의 인덱스 변환이 이루어집니다.)  

In [11]:
# Load pre-trained word vectors (about 7mins)
TEXT.build_vocab(train_data, vectors = "glove.6B.100d")

.vector_cache/glove.6B.zip: 862MB [02:41, 5.33MB/s]                           
100%|█████████▉| 399999/400000 [00:14<00:00, 28354.70it/s]


In [12]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")

Unique tokens in TEXT vocabulary: 225481


In [13]:
MAX_VOCAB_SIZE = 25000
# 쓸모없는 건 필요없으니까 25000개만 넣음.
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_                 
                 )
LABEL.build_vocab(train_data)

In [14]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [15]:
TEXT.vocab.itos[:10]  #itos – A list of token strings indexed by their numerical identifiers.

['<unk>', '<pad>', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'I']

In [16]:
word_dict = TEXT.vocab.stoi # stoi – A collections.defaultdict instance mapping token strings to numerical identifiers.
print(len(word_dict))
print(word_dict['<unk>'], word_dict['<pad>'])
print(list(word_dict)[-1], word_dict[list(word_dict)[-1]])

25002
0 1
THINK 25001


In [17]:
print(TEXT.vocab.vectors.shape)
print(TEXT.vocab.vectors)

torch.Size([25002, 100])
tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 1.0003,  1.1731, -0.7108,  ...,  0.1007, -0.5921, -0.5974],
        [-0.3989,  0.5126,  0.6494,  ...,  0.0551, -0.6197, -0.1320],
        [-0.3997, -0.9111, -0.3559,  ...,  0.6468,  0.1014, -1.3005]])


In [18]:
# Batching - construct iterator << 반복적 학습 시, 
BATCH_SIZE = 32   

train_iterator = data.Iterator(
    train_data, 
    batch_size = BATCH_SIZE,
    device = device)

# shape: BATCH_SIZE x maximum length of sentence 

for batch in train_iterator:
    break
print(batch.text)
print(len(batch.text[0]))
print(batch.label)

(tensor([[   18,    49,  1925,  ..., 24625,    49,  1646],
        [ 1901,    21,     0,  ...,   139,    21,     5],
        [    5,   521,   160,  ...,  9606,     0, 16252],
        ...,
        [    1,     1,    70,  ...,     1,     1,     1],
        [    1,     1,    12,  ...,     1,     1,     1],
        [    1,     1,   726,  ...,     1,     1,     1]], device='cuda:0'), tensor([779, 393, 901, 756, 232, 312, 284, 130, 128, 293, 171, 213, 744, 154,
        181, 228, 127,  79, 128, 148, 199, 114, 400, 132, 267, 341, 140, 123,
        128, 204, 277, 328], device='cuda:0'))
901
tensor([0., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 0.,
        0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1.],
       device='cuda:0')


In [19]:
# BucketIterator

train_iterator = data.BucketIterator(
    train_data, 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

for batch in train_iterator:
    break
print(batch.text)
print(len(batch.text[0]))
print(batch.label)

(tensor([[    9,    49,  6605,  ..., 14702,   321,    49],
        [  161,     7,  4308,  ...,    29,    33,     7],
        [   12,     3,     4,  ...,     2,     5,   348],
        ...,
        [ 1344,    13,   112,  ...,     0,  5425,  1073],
        [ 2162,  5128,    30,  ...,     1,     1,     1],
        [ 4458,   471,     0,  ...,     1,     1,     1]], device='cuda:0'), tensor([115, 115, 115, 115, 115, 115, 114, 114, 114, 114, 114, 114, 114, 114,
        114, 114, 114, 114, 114, 114, 113, 113, 113, 113, 113, 113, 113, 113,
        113, 113, 113, 113], device='cuda:0'))
115
tensor([0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0.],
       device='cuda:0')


In [20]:
# Batching - construct iterator

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_sizes = (BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
    sort_within_batch = True,
    device = device)

##3. Build Model
- Embedding layer, RNN layer, Dropout layer, Fully-connected layer 로 이루어진 모델을 만듭니다.
- 미리 학습된 워드 임베딩을 임베딩 레이어에 올립니다.

In [51]:
class Model(nn.Module):  # Custom model 정의 
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super().__init__()

        # Define parameters
        # TO-DO
        # hidden_dim, n_layers
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Define Layers
        # Embedding layer
        # TO-DO
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)

        # RNN layer
        # TO-DO
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional, dropout=dropout)

        # Fully connected layer
        # TO-DO
        self.fc=nn.Linear(hidden_dim * 2, output_dim)

        # Dropout layer
        # TO-DO
        self.dropout = nn.Dropout(dropout)

        
    def forward(self, text):

        # [input]text = [sent len, batch size]
        # TO-DO
        # embedding
        
        embedded = self.embedding(text)
        
        # embedded = [sent len, batch size, emb dim]
        # embedded = [batch size, sent len, emb dim] if batch_first = True
        output, (hidden, cell) = self.lstm(embedded)

        # TO-DO
        # forward 
        hidden = self.dropout(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim =1))

        return self.fc(hidden.squeeze(0))




---

## (추가설명)
### Bidirectional RNN 의 "concatenation" 에 대하여
* `batch_size = 3, hidden_dim = 10, n_layers = 1, bidirectional = True` 일때 
* RNN 모델은 forward layer 와 backward layer 총 2개 레이어를 가지게 됩니다.
* 편의상 forward layer 의 hidden state 의 모든 unit 이 0이 되고,
backward layer 의 경우 모두 1이 된다고 가정하겠습니다.


In [52]:
# 한개의 input 이 들어왔을때, 마지막 타임 스텝에서의 forward/backward hidden state 는 각각 다음과 같은 형태가 될 것입니다.
h_forward = torch.zeros(1,10) 
h_backward = torch.ones(1,10)
print(h_forward)
print(h_backward)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])


 


*   Torch.nn 제공 RNN 모듈은 2개의 아웃풋 중 하나로 hidden state 을 출력하며,
> `output, hidden = self.rnn(embedded)`
*   `hidden`은 모델에 들어있는 **모든 레이어**의 last hidden state 을 출력합니다.
*   따라서 `hidden` 의 형태는 `[num_layers x num_directions, batch_size, hidden_size]`가 됩니다.

* 모델에서 총 n개의 layer 를 사용할 경우, 순서대로 _1번째 forward, 1번째 backward, 2번째 forward, 2번째 backward, ..., n번째 forward, n번째 backward_ 가 표시됩니다.
* 이 예제에서는 `n_layers = 1`이므로 `hidden` 의 shape 은 `[2,3,10]` 이 됩니다.

In [53]:
# hidden 은 아래와 같이 생기게 됩니다.
hidden = torch.cat([h_forward.unsqueeze(0).repeat([1,3,1]),h_backward.unsqueeze(0).repeat([1,3,1])])

print(hidden)
print(hidden.shape)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]])
torch.Size([2, 3, 10])



*   우리는 forward 와 backward layer 각각에서 나온 last hidden state를 나란히 합치고자 합니다.
*  `hidden[-2,:,:]` 는 forward layer 의 last hidden state 을 나타내고
*  `hidden[-1,:,:]` 는 backward layer 의 last hidden state 을 나타냅니다.
* 이 두개를 hidden_size 를 나타내는 dimension=1 의 방향으로 concatenate 합니다.




In [54]:
# 최종적으로 사용하는 Bidirectional RNN 모델의 아웃풋은 다음과 같은 형태를 가집니다.
h_concat = torch.cat([hidden[-2,:,:],hidden[-1,:,:]],dim=1)
print(h_concat)
print(h_concat.shape)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1.]])
torch.Size([3, 20])


## (추가설명 종료)
---


In [55]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = Model(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)    # Make model instance


In [56]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)  # Count number of elements of all parameters

print('The model has {:,} trainable parameters'.format(count_parameters(model)))

The model has 4,810,857 trainable parameters


In [57]:
# load pretrained embeddings
pretrained_embeddings = TEXT.vocab.vectors
print(type(pretrained_embeddings))
model.embedding.weight.data.copy_(pretrained_embeddings);

<class 'torch.Tensor'>


## 4. Train model

In [58]:
# TO-DO
# optimizer : Adam
optimizer = torch.optim.Adam(model.parameters())

In [59]:
# TO-DO
# criterion : Binarcy Cross Entropy with Logit Loss
criterion = nn.BCEWithLogitsLoss()

In [60]:
model = model.to(device)  #모델을 GPU 로 이동
criterion = criterion.to(device)

In [61]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [65]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        # TO-DO
        # General Training Scheme
        
        optimizer.zero_grad()
        prediction = model(batch.text[0]).squeeze(1)
        loss = criterion(prediction, batch.label)
        acc = binary_accuracy(prediction, batch.label)
        
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [76]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            # TO-DO
            # General Evaluation Scheme
            
            predicitons = model(batch.text[0]).squeeze(1)
            eval_loss = criterion(model(batch.text[0]).squeeze(1), batch.label)
            eval_acc = binary_accuracy(model(batch.text[0]).squeeze(1), batch.label)

            
            epoch_loss += eval_loss.item()
            epoch_acc += eval_acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [None]:
#오류로 criterion(predictions, batch.label) 에 predictions 대신 model(batch.text[0]).squeeze(1)로 했습니다.

### *Do Training!*

In [77]:
N_EPOCHS = 5  # about 13 mins

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'rnn-model.pt')
        #나은 성능이면 저장
    
    print('Epoch: {:02}'.format(epoch+1))
    print('\tTrain Loss: {:.3f} | Train Acc: {:.2f}%'.format(train_loss, train_acc*100))
    print('\t Val. Loss: {:.3f} |  Val. Acc: {:.2f}%'.format(valid_loss, valid_acc*100))

Epoch: 01
	Train Loss: 0.163 | Train Acc: 94.50%
	 Val. Loss: 0.357 |  Val. Acc: 86.08%
Epoch: 02
	Train Loss: 0.114 | Train Acc: 96.40%
	 Val. Loss: 0.430 |  Val. Acc: 86.41%
Epoch: 03
	Train Loss: 0.074 | Train Acc: 97.98%
	 Val. Loss: 0.497 |  Val. Acc: 86.16%
Epoch: 04
	Train Loss: 0.051 | Train Acc: 98.76%
	 Val. Loss: 0.541 |  Val. Acc: 85.96%
Epoch: 05
	Train Loss: 0.038 | Train Acc: 99.11%
	 Val. Loss: 0.627 |  Val. Acc: 85.08%


In [78]:
model.load_state_dict(torch.load('rnn-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print('Test Loss: {:.3f} | Test Acc: {:.2f}%'.format(test_loss, test_acc*100))

Test Loss: 0.394 | Test Acc: 84.18%


## 5. Test model
우리가 직접 예문을 작성해서 트레인된 모델에서 예문을 어떻게 평가하는지 확인합니다.



In [79]:
# 토크나이저로 spacy 를 사용합니다.
nlp = spacy.load('en_core_web_sm')

# 사용자가 입력한 sentence 를 훈련된 모델에 넣었을때의 결과값을 확인합니다.
def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  # Tokenization
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]   # 위에서 만든 vocab 에 부여된 index 로 indexing
    tensor = torch.LongTensor(indexed).to(device)   # indexing 된 sequence 를 torch tensor 형태로 만들어줌.
    tensor = tensor.unsqueeze(1)   # 입력 텐서에 batch 차원을 만들어줌.
    prediction = torch.sigmoid(model(tensor))  # 모델에 입력한 후 확률값 도출을 위한 sigmoid 적용 
    return prediction.item() # prediction 값 출력

In [80]:
predict_sentiment(model, "This film is terrible") #아주 낮은 값의 확률이 도출되는 것을 확인할 수 있습니다.(부정)

0.011125648394227028

In [81]:
predict_sentiment(model, "This film is great") #아주 높은 값의 확률이 도출되는 것을 확인할 수 있습니다. (긍정)

0.9912176132202148