In [1]:
import torch
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe

In [2]:
# torch.cuda.empty_cache()

In [2]:
import random

In [3]:
from tqdm import tqdm_notebook as tqdm

In [4]:
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
NESTED = data.Field()
TEXT = data.NestedField(NESTED)
LABEL = data.LabelField(dtype=torch.float)

In [6]:
fields = {'text': ('t',TEXT), 'label': ('l',LABEL)}

In [7]:
train_data, test_data = data.TabularDataset.splits(
                            path = '/homes/du113/scratch/han_data',
                            train = 'imdb_train.json',
                            test = 'imdb_test.json',
                            format = 'json',
                            fields = fields
)

In [8]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [9]:
print(vars(test_data.examples[0]))

{'t': [['Even', 'though', 'I', 'saw', 'this', 'film', 'when', 'I', 'was', 'very', 'young', ',', 'I', 'already', 'knew', 'the', 'story', 'of', 'Wild', 'the', 'Thief', '-', 'Taker', 'and', 'Shepherd', 'who', 'famously', 'escaped', 'from', 'Newgate', 'prison.<br', '/><br'], ['/>Apart', 'from', 'the', 'liberty', 'taken', 'right', 'at', 'the', 'end', ',', 'the', 'film', 'more', 'or', 'less', 'faithfully', 'follows', 'the', 'true', 'story', '.'], ['The', 'temptation', 'to', 'bend', 'the', 'facts', 'which', 'is', 'the', 'hallmark', 'of', 'so', 'many', 'so', '-', 'called', 'historical', 'films', 'is', 'resisted', 'in', 'this', 'film', 'and', 'the', 'film', 'makers', 'must', 'be', 'praised', 'for', 'that.<br', '/><br'], ['/>Of'], ['the', 'performances', ',', 'There', 'is', 'scarcely', 'a', 'poor', 'performance', ',', 'and', 'Tommy', 'Steele', 'is', 'ideally', 'cast', '.'], ['Also', 'good', 'is', 'Stanley', 'Baker', 'as', 'the', 'Thief', '-', 'Taker', 'and', 'Alan', 'Badel', 'is', 'good', 'as', 

In [10]:
vec = torchtext.vocab.Vectors('glove100d.txt', cache='/homes/du113/scratch')

In [15]:
TEXT.build_vocab(train_data,
                 max_size = 25000, 
                 vectors = vec, 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [16]:
print(f"Unique tokens in NESTED vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in NESTED vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [17]:
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
defaultdict(<function _default_unk_index at 0x7f2087c332f0>, {'neg': 0, 'pos': 1})


In [18]:
BATCH_SIZE = 8

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data), 
    batch_size=BATCH_SIZE,
    device=device)

In [19]:
import torch.nn as nn
import torch.nn.functional as F

In [17]:
# class Attention(nn.Module):
#     def __init__(self, input_dim, hidden_dim):
#         super(Attention, self).__init__()
        
#         self.linear_in = nn.Linear(in_features=input_dim, out_features=hidden_dim)
#         self.context = nn.Linear(in_features=hidden_dim, out_features=1, bias=False)
        
#     def forward(self, input):
#         # input has size [batch, seq_len, dimension]
#         input = self.linear_in(input)
#         scores = self.context(input)
#         # unnormalized score, dimensions [batch, seq_len, 1]
#         scores = F.softmax(scores, dim=1) # normalize across sequence dimension
#         input = input * scores
#         # [batch, seq_len, dimension]
#         return input.sum(dim=1)
    
        
class HAN(nn.Module):
    def __init__(self, vocab_size, emb_dim, word_dim, word_attn_dim, sent_dim, sent_attn_dim):
        super(HAN, self).__init__()
        self.embed_dim = emb_dim

        self.embeds = nn.Embedding(vocab_size, emb_dim)

        self.word_rnn = nn.LSTM(input_size=emb_dim, hidden_size=word_dim, batch_first=True, bidirectional=True)
        
#         self.word_attn = Attention(2*word_dim, word_attn_dim)
        
        self.sent_rnn = nn.LSTM(input_size=2*word_dim, hidden_size=sent_dim, batch_first=True, bidirectional=True)
        
#         self.sent_attn = Attention(2*sent_dim, sent_attn_dim)

        self.classifier = nn.Linear(in_features=2*sent_dim, out_features=1)

    def forward(self, input):
        # input will have dimension [batch_size, num_sents, num_words]
        batch_size, num_sents, num_words = input.shape

        input = self.embeds(input)
        
        # shape: [batch_size, num_sents, num_words, emb_dim]

        input = input.view(-1, num_words, self.embed_dim)
        
        word_out, _ = self.word_rnn(input)

#         # shape: [batch_size x num_sents, num_words, 2*word_out]
        
#         word_out = self.word_attn(word_out)
        
        # shape: [batch_size x num_sents, 2*word_out]
        
        word_out = word_out.view(batch_size, num_sents, -1)
        
        sent_out, _ = self.sent_rnn(word_out)
        
        sent_out = self.sent_attn(sent_out)
        
        # shape: [batch_size, 2*sent_out]
        
        return self.classifier(sent_out)

In [18]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
WORD_DIM = 50
WORD_ATTN_DIM = 100
SENT_DIM = 25
SENT_ATTN_DIM = 50

model = HAN(INPUT_DIM, EMBEDDING_DIM, WORD_DIM, WORD_ATTN_DIM, SENT_DIM, SENT_ATTN_DIM)

model.embeds = model.embeds.from_pretrained(TEXT.vocab.vectors)

In [19]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-2)

In [20]:
criterion = nn.BCEWithLogitsLoss()

In [21]:
model = model.to(device)
criterion = criterion.to(device)

In [22]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in tqdm(iterator):
        
        optimizer.zero_grad()
                
        predictions = model(batch.t).squeeze(1)
        
        loss = criterion(predictions, batch.l)
        
        acc = binary_accuracy(predictions, batch.l)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.t).squeeze(1)
            
            loss = criterion(predictions, batch.l)
            
            acc = binary_accuracy(predictions, batch.l)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
#     test_loss, test_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
#           | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

HBox(children=(IntProgress(value=0, max=3125), HTML(value='')))


| Epoch: 01 | Train Loss: 0.693 | Train Acc: 49.74%


HBox(children=(IntProgress(value=0, max=3125), HTML(value='')))


| Epoch: 02 | Train Loss: 0.693 | Train Acc: 49.66%


HBox(children=(IntProgress(value=0, max=3125), HTML(value='')))


| Epoch: 03 | Train Loss: 0.693 | Train Acc: 49.36%


HBox(children=(IntProgress(value=0, max=3125), HTML(value='')))


| Epoch: 04 | Train Loss: 0.693 | Train Acc: 49.45%


HBox(children=(IntProgress(value=0, max=3125), HTML(value='')))


| Epoch: 05 | Train Loss: 0.693 | Train Acc: 49.77%
