In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
from torchtext import data, datasets
from torchtext.vocab import Vocab

import time


In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [3]:
from torchtext.datasets import IMDB

# Raw IMDB train/test splits rooted at ./data; later cells unpack each sample as (label, text).
train_iter, test_iter = IMDB(root='./data', split=('train', 'test'))

In [5]:
from torchtext.data.utils import get_tokenizer

# Tokenizer shared by vocabulary construction and text encoding in later cells.
tokenizer = get_tokenizer('basic_english')

In [6]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data_iter):
    """Lazily yield the token list of the text in every (label, text) sample of ``data_iter``."""
    yield from (tokenizer(review) for _label, review in data_iter)

def get_vocab(train_datapipe):
    """Build the vocabulary (capped at 20000 tokens) from the training samples.

    The special tokens ``<UNK>``, ``<PAD>``, ``<BOS>`` and ``<EOS>`` are registered,
    and any token missing from the vocabulary resolves to the ``<UNK>`` index.
    """
    special_tokens = ['<UNK>', '<PAD>', '<BOS>', '<EOS>']
    token_stream = yield_tokens(train_datapipe)
    built = build_vocab_from_iterator(token_stream, specials=special_tokens, max_tokens=20000)
    unk_index = built['<UNK>']
    built.set_default_index(unk_index)
    return built

# Vocabulary built from the training split. NOTE: `trian_vocab` is a misspelling of
# "train_vocab"; the name is kept because later cells refer to it.
trian_vocab = get_vocab(train_iter)

In [11]:
# Sanity-check the vocabulary through its token->index and index->token views.
new_stoi = trian_vocab.get_stoi()
new_itos = trian_vocab.get_itos()
print("The length of the new vocab is", len(trian_vocab))
print("The index of '' is", new_stoi['<PAD>'])
print("The token at index 2 is", new_itos[2])
     

The length of the new vocab is 20000
The index of '' is 1
The token at index 2 is <BOS>


In [12]:
def text_transform(x):
    """Encode raw text ``x`` as a list of vocabulary indices wrapped in ``<BOS>`` ... ``<EOS>``."""
    token_ids = [trian_vocab[token] for token in tokenizer(x)]
    return [trian_vocab['<BOS>']] + token_ids + [trian_vocab['<EOS>']]


def label_transform(x):
    """Map a raw IMDB label to a class index: 1 for positive, 0 otherwise.

    Both label encodings are accepted: the strings ``'pos'`` / ``'neg'`` and the
    integers ``2`` (positive) / ``1`` (negative). Matching only ``'pos'`` would
    silently turn every integer-labelled sample into class 0.
    """
    return 1 if x in ('pos', 2) else 0

In [13]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """Collate raw ``(label, text)`` samples into a pair of tensors.

    Parameters
    ----------
    batch : iterable of (label, text) pairs as yielded by the dataset.

    Returns
    -------
    (labels, texts) : ``labels`` is a 1-D int64 tensor of class indices;
        ``texts`` is the ``pad_sequence`` output over the encoded reviews.
        ``batch_first`` is left at its default, so the layout is
        (max_len, batch).
    """
    label_list, text_list = [], []

    for _label, _text in batch:
        label_list.append(label_transform(_label))
        text_list.append(torch.tensor(text_transform(_text), dtype=torch.long))

    # Pad with the dedicated <PAD> index. The previous hard-coded 3.0 is the
    # <EOS> index, which made padding indistinguishable from end-of-sequence.
    pad_index = trian_vocab['<PAD>']
    labels = torch.tensor(label_list, dtype=torch.long)
    return labels, pad_sequence(text_list, padding_value=pad_index)

# Materialize the training split into a list so the loader can shuffle it;
# each batch is assembled by collate_batch.
train_dataloader = DataLoader(
    list(train_iter),
    collate_fn=collate_batch,
    batch_size=64,
    shuffle=True,
)

In [14]:
# Hyper-parameters read by the RNNCell-based model defined below.
batch_size = 64
embedding_dim, hidden_size = 100, 300

# GPU when CUDA is usable, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [15]:
class RNNCell_Encoder(nn.Module):
    """Encode a time-major sequence into its final hidden state using one ``nn.RNNCell``."""

    def __init__(self, input_dim, hidden_size):
        super().__init__()
        self.rnn = nn.RNNCell(input_dim, hidden_size)

    def forward(self, inputs):
        """Step the cell over dimension 0 of ``inputs`` and return the last hidden state.

        Parameters
        ----------
        inputs : tensor of shape (seq_len, batch, input_dim).

        Returns
        -------
        Tensor of shape (batch, hidden_size); all zeros when ``seq_len`` is 0.
        """
        bz = inputs.shape[1]
        # Size the initial state from the cell itself and allocate it next to the
        # input, so the module does not depend on the notebook globals
        # `hidden_size` / `device` agreeing with how it was constructed.
        ht = inputs.new_zeros((bz, self.rnn.hidden_size))
        for word in inputs:
            ht = self.rnn(word, ht)
        return ht

class Net(nn.Module):
    """Embedding -> RNNCell encoder -> two-layer classifier head.

    Parameters
    ----------
    vocab_size : int, optional
        Number of rows in the embedding table. When omitted, it falls back to
        ``len(vocab)`` using the notebook-level ``vocab``, as before.
    num_classes : int, optional
        Width of the output layer. Defaults to 3 to keep the original output
        shape. NOTE(review): the label transform in this notebook only produces
        classes 0 and 1, so 2 would be sufficient.

    ``embedding_dim`` and ``hidden_size`` are still read from notebook-level globals.
    """

    def __init__(self, vocab_size=None, num_classes=3):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(vocab)
        self.em = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = RNNCell_Encoder(embedding_dim, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return unnormalized class scores of shape (batch, num_classes) for token indices ``x``."""
        x = self.em(x)
        x = self.rnn(x)  # final hidden state of the encoder
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [16]:
# Expose the vocabulary under the name Net reads, then build the model on `device`.
vocab = trian_vocab
model = Net().to(device)

# Optimizer and classification loss used by the training function.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = nn.CrossEntropyLoss()

In [20]:
def training(epoch, model, trainloader, criterion=None, opt=None):
    """Train ``model`` for one epoch and report sample-averaged loss and accuracy.

    Parameters
    ----------
    epoch : value printed as the epoch label.
    model : module mapping a text batch to class scores of shape (batch, classes).
    trainloader : iterable of ``(labels, texts)`` batches, the order in which
        ``collate_batch`` returns them.
    criterion : loss callable with mean reduction. Defaults to the
        notebook-level ``loss_fn``.
    opt : optimizer. Defaults to the notebook-level ``optimizer``.

    Returns
    -------
    (epoch_loss, epoch_acc) : mean loss per sample and fraction of correct
        predictions; both are 0.0 when the loader yields nothing.
    """
    criterion = loss_fn if criterion is None else criterion
    opt = optimizer if opt is None else opt

    # Send batches to wherever the model lives rather than to a global `device`.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    correct = 0
    total = 0
    running_loss = 0.0

    model.train()
    # Each batch is a (labels, texts) tuple; it has no .text/.label attributes.
    for y, x in trainloader:
        x, y = x.to(target), y.to(target)

        y_pred = model(x)
        loss = criterion(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()

        with torch.no_grad():
            batch_n = y.size(0)
            correct += (torch.argmax(y_pred, dim=1) == y).sum().item()
            total += batch_n
            # `loss` is a batch mean; weight it by batch size so the epoch
            # figure is a true per-sample average.
            running_loss += loss.item() * batch_n

    epoch_loss = running_loss / total if total else 0.0
    epoch_acc = correct / total if total else 0.0

    print(
        'epoch:' , epoch,
        'loss:' , round(epoch_loss, 3),
        'accuracy:' , round(epoch_acc, 3)
    )
    return epoch_loss, epoch_acc

In [None]:
# Train for a fixed number of epochs, keeping per-epoch metrics and timing the whole run.
epochs = 5
train_loss, train_acc = [], []

start = time.time()
for epoch in range(epochs):
    epoch_loss, epoch_acc = training(epoch, model, train_dataloader)
    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
end = time.time()

print(end - start)

In [4]:
class BasicRNN(nn.Module):
    """Embedding -> (multi-layer) ``nn.RNN`` -> linear classifier on the last time step.

    Parameters
    ----------
    n_layers : number of stacked RNN layers.
    hidden_dim : hidden state width.
    n_vocab : number of rows in the embedding table.
    embed_dim : embedding width.
    n_classes : number of output scores per sample.
    dropout_p : dropout probability applied to the last hidden state.
    """

    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p = 0.2):
        super().__init__()
        self.n_layers = n_layers
        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout_p)
        self.rnn = nn.RNN(embed_dim, self.hidden_dim, num_layers = self.n_layers, batch_first = True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        """Return unnormalized class scores of shape (batch, n_classes).

        ``x`` holds token indices laid out (batch, seq_len), since the RNN is
        built with ``batch_first=True``.
        """
        x = self.embed(x)
        h_0 = self._init_state(batch_size = x.size(0))
        x, _ = self.rnn(x, h_0)
        h_t = x[:, -1, :]  # hidden state at the last time step
        # nn.Dropout is not in-place: its result must be kept or dropout is a no-op.
        h_t = self.dropout(h_t)
        # Return raw scores. Cross-entropy applies log-softmax itself, so a
        # sigmoid here would squash the logits into (0, 1) and cap how
        # confident the model can become.
        return self.out(h_t)

    def _init_state(self, batch_size = 1):
        """Zero initial hidden state (n_layers, batch_size, hidden_dim) matching the parameters' dtype and device."""
        weight = next(self.parameters())
        return weight.new_zeros(self.n_layers, batch_size, self.hidden_dim)


In [8]:
# Number of output classes and embedding-table size passed to BasicRNN.
n_classes = 2
vocab_size = len(trian_vocab)

In [9]:
# Rebind `model` to a single-layer BasicRNN placed on `device`.
model = BasicRNN(
    n_layers=1,
    hidden_dim=256,
    n_vocab=vocab_size,
    embed_dim=128,
    n_classes=n_classes,
    dropout_p=0.5,
).to(device)

# Fresh optimizer bound to the new model's parameters, plus the loss object.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = nn.CrossEntropyLoss()

In [13]:
def train(model, optimizer, train_iter, epoch=None):
    """Run one training epoch over ``train_iter``, logging every 50th batch.

    Parameters
    ----------
    model : module mapping ``batch.text`` to class scores.
    optimizer : optimizer stepping ``model``'s parameters.
    train_iter : sized iterable of batches exposing ``.text`` and ``.label``,
        with a ``.dataset`` attribute supporting ``len``. Labels are shifted
        down by one before the loss, so they are expected to start at 1.
    epoch : label printed in the progress line. When omitted, the
        notebook-level loop variable ``e`` is used if it exists, else 0.
    """
    if epoch is None:
        epoch = globals().get('e', 0)

    # Send batches to wherever the model lives rather than to a global `device`.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for b, batch in enumerate(train_iter):
        x = batch.text.to(target)
        # Out-of-place shift: an in-place `sub_` would also modify the batch's
        # own label tensor when no device copy is made, corrupting labels if
        # the same batch object is seen again.
        y = batch.label.to(target) - 1
        optimizer.zero_grad()

        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

        if b % 50 == 0:
            # The template has five fields; the percentage argument was missing.
            print('Train Epoch: {} [{}/{} ({:.0f}%)] \t Loss: {:.6f}'.format(
                epoch, b * len(x), len(train_iter.dataset),
                100.0 * b / len(train_iter), loss.item()))


In [14]:
def evaluate(model, test_iter):
    """Compute mean loss and accuracy of ``model`` over ``test_iter``.

    Parameters
    ----------
    model : module mapping ``batch.text`` to class scores.
    test_iter : iterable of batches exposing ``.text`` and ``.label``. Labels
        are shifted down by one before scoring, so they are expected to start at 1.

    Returns
    -------
    (avg_loss, avg_accuracy) : Python floats averaged over the samples actually
        seen; both are 0.0 when ``test_iter`` is empty.
    """
    # Send batches to wherever the model lives rather than to a global `device`.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    corrects, total, total_loss = 0, 0, 0.0

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch in test_iter:
            x = batch.text.to(target)
            # Out-of-place shift so repeated evaluation sees unchanged labels.
            y = batch.label.to(target) - 1
            logit = model(x)
            total_loss += F.cross_entropy(logit, y, reduction='sum').item()
            total += y.size(0)
            # .item() keeps the counter a plain int, so the accuracy is a float
            # rather than a tensor.
            corrects += (logit.argmax(dim=1) == y).sum().item()

    if total == 0:
        return 0.0, 0.0
    return total_loss / total, corrects / total

In [None]:
# Settings for the BasicRNN run.
# NOTE(review): BATCH_SIZE and lr are assigned here but not read by the loop below.
BATCH_SIZE, lr, epochs = 100, 0.001, 5

# NOTE(review): `train` reads batch.text / batch.label, while `train_iter` elsewhere in
# this notebook is unpacked as (label, text) pairs — confirm the intended iterator.
for e in range(1, 1 + epochs):
    train(model, optimizer, train_iter)