In [1]:
import torch
import torchtext
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import time

In [2]:
start = time.time()
TEXT = torchtext.legacy.data.Field(lower=True, fix_length=200, batch_first=False)
LABEL = torchtext.legacy.data.Field(sequential=False)

In [3]:
from torchtext.legacy import datasets
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [4]:
print(vars(train_data.examples[0]))

{'text': ['i', 'went', 'into', 'this', 'movie', 'perhaps', 'a', 'bit', 'jaded', 'by', 'the', 'hack-and-slash', 'films', 'rampant', 'on', 'the', 'screen', 'these', 'days.', 'boy,', 'was', 'i', 'surprised.', 'this', 'little', 'treasure', 'was', 'pleasantly', 'paced', 'with', 'a', 'somber,', 'dark', 'atmosphere.', 'more', 'surprising', 'yet', 'was', 'the', 'very', 'limited', 'amount', 'of', 'blood', 'actually', 'shown.', 'as', 'with', 'most', 'good', 'movies,', 'this', 'one', 'leaves', 'something', 'to', 'the', 'imagination,', 'and', 'bill', 'paxton', 'did', 'a', 'superb', 'job', 'at', 'directing.', 'scenes', 'shot', 'inside', 'the', 'car', 'as', 'are', 'well', 'done', 'and,', 'after', 'watching', 'the', '"anatomy', 'of', 'a', 'scene"', 'episode', 'at', 'the', 'end', 'of', 'the', 'video', 'tape,', 'it', 'was', 'good', 'to', 'see', 'that', 'some', 'of', 'the', 'subtle,', 'yet', 'wonderful', 'things', 'i', 'had', 'noticed', 'were', 'intentional', 'and', 'not', 'just', 'an', '"oh,', 'that', 

In [5]:
import string

for example in train_data.examples:
    text = [x.lower() for x in vars(example)['text']]
    text = [x.replace("<br","")for x in text]
    text = [''.join(c for c in s if c not in string.punctuation) for s in text]
    text = [s for s in text if s]
    vars(example)['text'] = text
    
for example in test_data.examples:
    text = [x.lower() for x in vars(example)['text']]
    text = [x.replace("<br", "") for x in text]
    text = [''.join(c for c in s if c not in string.punctuation) for s in text]
    text = [s for s in text if s]
    vars(example)['text'] = text

In [6]:
import random
train_data, valid_data = train_data.split(random_state = random.seed(0), split_ratio = 0.8)

In [7]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000


In [8]:
TEXT.build_vocab(train_data, max_size=10000, min_freq=10, vectors=None)
LABEL.build_vocab(train_data)

print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

Unique tokens in TEXT vocabulary: 10002
Unique tokens in LABEL vocabulary: 3


In [9]:
print(LABEL.vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x7f87e4ea9410>>, {'<unk>': 0, 'pos': 1, 'neg': 2})


In [10]:
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else "cpu"
device = 'cpu' #cuda version error

embedding_dim = 10
hidden_size = 300
train_iterator, valid_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = batch_size,
    device = device)

In [11]:
class RNNCell_Encoder(nn.Module):
    def __init__(self, input_dim, hidden_size):
        super(RNNCell_Encoder, self).__init__()
        self.rnn = nn.RNNCell(input_dim, hidden_size)
        self.hidden_size = hidden_size
        
    def forward(self, inputs):
        bz = inputs.shape[1]
        ht = torch.zeros((bz, self.hidden_size)).to(device)
        
        for word in inputs:
            ht = self.rnn(word, ht)
        return ht

class Net(nn.Module):
    def __init__(self, embedding_dim, hidden_size):
        super(Net, self).__init__()
        self.em = nn.Embedding(len(TEXT.vocab.stoi), embedding_dim)
        self.rnn = RNNCell_Encoder(embedding_dim, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(256, 3)
        
    def forward(self, x):
        x = self.em(x)
        x = self.rnn(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [12]:
model = Net(embedding_dim = 10, hidden_size = 300)
model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [13]:
def training(epoch, model, trainloader, validloader):
    correct = 0
    total = 0
    running_loss = 0
    
    model.train()
    for b in trainloader:
        x, y = b.text, b.label
        y_pred = model(x.to(device))
        loss = loss_fn(y_pred, y.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            y_pred = torch.argmax(y_pred, dim=1)
            correct += (y_pred == y).sum().item()
            total += y.size(0)
            running_loss += loss.item()
    epoch_loss = running_loss / len(trainloader.dataset)
    epoch_acc = correct / total
    
    valid_correct = 0
    valid_total = 0
    valid_running_loss = 0
    
    model.eval()
    with torch.no_grad():
        for b in validloader:
            x, y = b.text, b.label
            y_pred = model(x.to(device))
            loss = loss_fn(y_pred, y.to(device))
            y_pred = torch.argmax(y_pred, dim=1)
            valid_correct += (y_pred == y).sum().item()
            valid_total += y.size(0)
            valid_running_loss += loss.item()
            
    epoch_valid_loss = valid_running_loss / len(validloader.dataset)
    epoch_valid_acc = valid_correct / valid_total
    
    
    print('epoch: ', epoch,
          'loss: ', round(epoch_loss, 3),
          'accuaracy: ', round(epoch_acc, 3),
          'valid_loss: ', round(epoch_valid_loss, 3),
          'valid_accuarcy: ', round(epoch_valid_acc, 3))
    
    return epoch_loss, epoch_acc, epoch_valid_loss, epoch_valid_acc

In [14]:
epochs = 5
train_loss = []
train_acc = []
valid_loss = []
valid_acc = []

for epoch in range(epochs):
    epoch_loss, epoch_acc, epoch_valid_loss, epoch_valid_acc \
    = training(epoch,model,train_iterator,valid_iterator)
    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
    valid_loss.append(epoch_valid_loss)
    valid_acc.append(epoch_valid_acc)

end = time.time()
print(end-start)

epoch:  0 loss:  0.011 accuaracy:  0.492 valid_loss:  0.011 valid_accuarcy:  0.505
epoch:  1 loss:  0.011 accuaracy:  0.496 valid_loss:  0.011 valid_accuarcy:  0.498
epoch:  2 loss:  0.011 accuaracy:  0.499 valid_loss:  0.011 valid_accuarcy:  0.495
epoch:  3 loss:  0.011 accuaracy:  0.502 valid_loss:  0.011 valid_accuarcy:  0.495
epoch:  4 loss:  0.011 accuaracy:  0.501 valid_loss:  0.011 valid_accuarcy:  0.505
951.3515832424164


In [15]:
def evaluate(epoch, model, testloader):    
    test_correct = 0
    test_total = 0
    test_running_loss = 0
    
    model.eval()
    with torch.no_grad():
        for b in testloader:
            x, y = b.text, b.label
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            y_pred = torch.argmax(y_pred, dim=1)
            test_correct += (y_pred == y).sum().item()
            test_total += y.size(0)
            test_running_loss += loss.item()

    epoch_test_loss = test_running_loss / len(testloader.dataset)
    epoch_test_acc = test_correct / test_total

    print('epoch: ', epoch,
          'test_loss： ', round(epoch_test_loss, 3),
          'test_accuracy:', round(epoch_test_acc, 3)
          )
    return epoch_test_loss, epoch_test_acc

In [16]:
epochs = 5
test_loss = []
test_acc = []

for epoch in range(epochs):
    epoch_test_loss, epoch_test_acc = evaluate(epoch,
                                               model,
                                               test_iterator)
    test_loss.append(epoch_test_loss)
    test_acc.append(epoch_test_acc)

end = time.time()
print(end-start)

epoch:  0 test_loss：  0.011 test_accuracy: 0.501
epoch:  1 test_loss：  0.011 test_accuracy: 0.501
epoch:  2 test_loss：  0.011 test_accuracy: 0.501
epoch:  3 test_loss：  0.011 test_accuracy: 0.501
epoch:  4 test_loss：  0.011 test_accuracy: 0.501
1132.333551645279
