## [IMDB](https://torchtext.readthedocs.io/en/latest/datasets.html#imdb)

In [None]:
# https://pytorch.org/get-started/previous-versions/

In [None]:
# CUDA 11.8
#conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=11.8 -c pytorch -c nvidia
# CUDA 12.1
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1
#conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 cpuonly -c pytorch

In [None]:
!pip install torchtext

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from typing import List, Tuple



In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [3]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [None]:
!pip install torchdata portalocker

In [4]:
tokenizer = get_tokenizer("basic_english")
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

In [5]:
train_iter, test_iter = IMDB(split=('train', 'test'))
text_vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
text_vocab.set_default_index(text_vocab["<unk>"])

################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################



In [6]:
train_iter, test_iter = IMDB(split=('train', 'test'))

In [7]:
for label, text in train_iter:
    print(label)
    print(text)
    break

1
I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [8]:
train_iter, test_iter = IMDB(split=('train', 'test'))
labels = [a for a, b in list(train_iter)]
labels = set(labels)
labels


{1, 2}

In [9]:
def process_text(text, tokenizer):
    return torch.tensor(text_vocab(tokenizer(text)), dtype=torch.long)

In [10]:
# collate 함수에서 text를 procesor로 tensor 변환
def collate_batch(batch):
    label_list, text_list = [], []
    for _label, _text in batch:
        label_list.append(int(_label))
        text_list.append(process_text(_text, tokenizer))

    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True, padding_value=text_vocab['<pad>'])
    label_list = torch.tensor(label_list, dtype=torch.int64)
    return text_list, label_list

In [11]:
batch_size = 4

# IteratableDataset이므로 그대로 전달해도 됨, 다만 len()는 사용할 수 없음
train_loader = DataLoader(train_iter, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_iter, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [None]:
#print('훈련 샘플의 개수 : {}'.format(len(train_loader)))
#print('테스트 샘플의 개수 : {}'.format(len(test_loader)))

In [12]:
for text, label in train_loader:
    print(text, label)
    break

tensor([[  13, 5111,  443,  ...,    0,    0,    0],
        [  88,  121,    3,  ...,    0,    0,    0],
        [  12,   96,    8,  ...,    0,    0,    0],
        [ 107,    5, 4702,  ...,  406, 5868,    2]]) tensor([1, 1, 1, 1])


In [13]:
print(text.shape, label.shape)

torch.Size([4, 437]) torch.Size([4])


In [14]:
embed_dim = 128
n_layers = 3
n_vocab = len(text_vocab.get_stoi())

hidden_size = 256
output_size = len(labels)

In [15]:
class IMDBModel(nn.Module):
    def __init__(self, embed_dim, hidden_size, output_size, n_layers, device):
        super(IMDBModel, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.device = device

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.rnn = nn.GRU(input_size=embed_dim, hidden_size=hidden_size, num_layers=n_layers, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, output_size)
        )        
    def forward(self, x):
        batch_size = x.shape[0]
        hidden = torch.zeros(n_layers, batch_size, self.hidden_size).to(device)  
        x = self.embed(x)
        x, hidden = self.rnn(x, hidden)
        x = x[:,-1,:]
        output = self.fc(x)
        return output

In [16]:
import numpy as np

model = IMDBModel(embed_dim, hidden_size, output_size, n_layers, device).to(device)
X = torch.LongTensor(text).to(device)
with torch.no_grad():
    y_pred = model(X)
    print(y_pred)
    print(np.argmax(y_pred.cpu(), axis=1), label)

tensor([[-0.1571, -0.0586],
        [-0.1571, -0.0586],
        [-0.1571, -0.0586],
        [-0.1330,  0.0577]], device='cuda:0')
tensor([1, 1, 1, 1]) tensor([1, 1, 1, 1])


In [17]:
batch_size = 120
model = IMDBModel(embed_dim, hidden_size, output_size, n_layers, device).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss().to(device)

n_epochs = 10 # 10회 학습

list_training_loss = []
list_test_loss = []


for epoch in range(n_epochs):
    n_train = 0
    train_loss = 0

    train_loader = DataLoader(train_iter, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    test_loader = DataLoader(test_iter, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

    model.train()
    for text, label in train_loader:
        X_train = torch.LongTensor(text).to(device)
        y_train = torch.LongTensor(label - 1).to(device)
        y = model(X_train)
        del X_train
        loss = criterion(y, y_train).sum()
        train_loss += loss.data.cpu().numpy()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        n_train += len(y_train)
        del y_train

#    if (epoch + 1) % 10 == 0:
    if True:
        model.eval()
        n_test = 0
        test_loss = 0
        correct = 0
        total = 0
        
        for text, label in test_loader:
            X_test = torch.LongTensor(text).to(device)
            y_test = torch.LongTensor(label - 1).to(device)
            y_pred = model(X_test)
            idx_pred = torch.max(y_pred, 1)[1]
            del X_test
            loss = criterion(y_pred, y_test).sum()
            test_loss += loss.data.cpu().numpy()
            n_test += len(y_test)
            correct += (idx_pred == y_test).sum()
            del y_test

        accuracy = correct * 100 / n_test
        print('epoch {}th training loss: {} test loss: {}, accuracy: {}'.format(
                epoch + 1, train_loss / n_train, test_loss / n_test,
                accuracy
            ))
        list_training_loss.append(train_loss / n_train)
        list_test_loss.append(test_loss / n_test)

epoch 1th training loss: 0.004702317092817757 test loss: 0.006076825556755066, accuracy: 50.0
epoch 2th training loss: 0.004572136507864343 test loss: 0.006340211351513862, accuracy: 62.619998931884766


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

plt.plot(list_training_loss, label='training')
plt.plot(list_test_loss, label='test')
plt.legend()
plt.show()