In [1]:
import os
import torch
import torch.nn as nn
import torchtext

In [3]:
!pip install torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp38-cp38-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 8.2 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp38-cp38-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 15 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.13.0+cu116
    Uninstalling torch-1.13.0+cu116:
      Successfully uninstalled torch-1.13.0+cu116
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.14.0
    Uninstalling torchtext-0.14.0:
      Successfully uninstalled torchtext-0.14.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.14.0+cu1

In [2]:
from torchtext.legacy import data, datasets 
from google.colab import drive
drive.mount('/content/drive')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

batch_size = 64
learning_rate = 0.001
training_epochs = 5

Mounted at /content/drive
cuda


In [3]:
# IMDB Dataset

TEXT = data.Field(sequential=True, batch_first=True, lower=True)
LABEL = data.Field(sequential=False, batch_first=True)
trainset, testset = datasets.IMDB.splits(TEXT, LABEL)

TEXT.build_vocab(trainset, min_freq=5)  # TEXT 데이터를 기반으로 Vocab 생성
LABEL.build_vocab(trainset)             # LABEL 데이터를 기반으로 Vocab 생성

# 학습용 데이터를 학습셋 80% 검증셋 20% 로 나누기
trainset, valset = trainset.split(split_ratio=0.8)

# 매 배치마다 비슷한 길이에 맞춰 줄 수 있도록 iterator 정의
train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (trainset, valset, testset), batch_size=batch_size,
        shuffle=True, repeat=False)

vocab_size = len(TEXT.vocab)
n_classes = 2                           # Positive, Negative Class가 두 개

print("[TrainSet]: %d [ValSet]: %d [TestSet]: %d [Vocab]: %d [Classes] %d"
      % (len(trainset),len(valset), len(testset), vocab_size, n_classes))

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 22.9MB/s]


[TrainSet]: 20000 [ValSet]: 5000 [TestSet]: 25000 [Vocab]: 46159 [Classes] 2


In [4]:
class BasicGRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(BasicGRU, self).__init__()          
        self.n_layers = n_layers 

        self.embed = nn.Embedding(n_vocab, embed_dim)

        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout_p)

        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)
        
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        
        x = self.embed(x)
        x, _ = self.gru(x)  
        h_t = x[:,-1,:]
        self.dropout(h_t)

        out = self.out(h_t)  
        return out

In [5]:
# Model
model = BasicGRU(1, 256, vocab_size, 128, n_classes, 0.5).to(device)

# cost/loss & optimizer
criterion = torch.nn.CrossEntropyLoss().to(device)    # Softmax
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train
for epoch in range(training_epochs):
  avg_cost = 0
  for batch in train_iter:
    X, Y = batch.text.to(device), batch.label.to(device)
    Y.data.sub_(1)
    optimizer.zero_grad()
    hypothesis = model(X)
    cost = criterion(hypothesis, Y)
    cost.backward()
    optimizer.step()
    avg_cost += cost / batch_size
  print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, avg_cost))
print('Learning Finished!')

# Model Save
torch.save(model.state_dict(), '/content/drive/MyDrive/AI_Final/model_s1.pt')

[Epoch:    1] cost = 3.42060065
[Epoch:    2] cost = 3.39446473
[Epoch:    3] cost = 3.39399147
[Epoch:    4] cost = 3.21248937
[Epoch:    5] cost = 1.98488772
Learning Finished!


In [6]:
# Model load
model_new = BasicGRU(1, 256, vocab_size, 128, n_classes, 0.5).to(device)
model_new.load_state_dict(torch.load('/content/drive/MyDrive/AI_Final/model_s1.pt'))

corrects = 0
for batch in val_iter:
  x,y = batch.text.to(device), batch.label.to(device)
  y.data.sub_(1)
  hypothesis = model_new(x)
  corrects += (hypothesis.max(1)[1].view(y.size()).data == y.data).sum() 

print('accuracy = ', corrects/len(val_iter.dataset)*100.0)

accuracy =  tensor(83.8000, device='cuda:0')


In [9]:
input_text = testset[2].text
print(input_text)
print(TEXT.vocab[input_text[0]])

['when,', 'oh,', 'when', 'will', 'someone', 'like', 'anchor', 'bay', 'or', 'blue', 'underground', 'release', 'this', 'on', 'widescreen', 'dvd???', 'le', 'orme,', 'which', 'i', 'only', 'know', 'because', 'of', 'my', 'rare/vintage', 'video', 'collecting', 'habit,', 'is', 'a', 'film', 'in', 'my', 'collection', 'that', 'i', 'would', 'not', 'only', 'sit', 'through,', 'but', 'actually', 'enjoy', 'watching.', 'the', 'fact', 'that', 'klaus', 'kinski', 'is', 'top', 'billed,', 'but', 'is', 'only', 'in', 'small', 'parts', 'of', 'the', 'film,', 'means', 'little', 'to', 'me.', '(though', 'several', 'comments', 'expressed', 'disappointment', 'in', 'his', 'rather', 'limited', 'screen', 'time.)', 'i', 'cannot', 'say', 'that', 'this', 'is', 'a', 'good', 'horror', 'film,', 'a', 'good', 'mystery,', 'a', 'sci-fi', 'epic', 'or', 'anything', 'of', 'that', 'nature.', 'it', 'is', 'simply', 'unclassifiable', 'in', 'the', '"genre"', 'sense', 'of', 'things.', 'it', 'is', 'more', 'like', 'a', 'confusing,', 'frigh

In [1]:
input_text = testset[3].text
import numpy as np

temp = []

for i in testset[3].text:
  temp.append(TEXT.vocab[i])
print(temp)

X_t = torch.LongTensor(np.array(temp))
X_t = X_t.unsqueeze(dim=0)
hypothesis = model_new(X_t.to(device))

print(hypothesis)

NameError: ignored