In [26]:
import torch
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe

In [2]:
import random

In [7]:
from tqdm import tqdm_notebook as tqdm

In [4]:
SEED = 1234

# Seed every random number generator the notebook relies on so that a
# Restart & Run All is repeatable.
random.seed(SEED)  # Python's own RNG: imported in an earlier cell but previously never seeded
torch.manual_seed(SEED)
# torch.cuda.manual_seed() only seeds the *current* CUDA device, while this
# notebook trains on 'cuda:1' when it is available, so seed all visible GPUs.
# The call is silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

In [11]:
import spacy
# Load spaCy's small English pipeline; tokenizer() in the next cell uses it
# to split documents into sentences.
nlp = spacy.load('en_core_web_sm')

In [16]:
def tokenizer(input):
    """Split a raw document string into a list of sentence strings.

    Sentence boundaries come from the module-level spaCy pipeline ``nlp``;
    each detected sentence span is converted back to plain text.
    """
    doc = nlp(input)
    sentences = []
    for span in doc.sents:
        sentences.append(str(span))
    return sentences

In [17]:
# Sanity check: expect one string per sentence.
tokenizer('fat cat jumps. Big dog runs')

['fat cat jumps.', 'Big dog runs']

In [18]:
# Inner field: tokenizes a single sentence string into words with spaCy.
NESTED = data.Field(tokenize='spacy')
# Outer field: splits a document into sentences with tokenizer(), then hands
# each sentence to NESTED, giving a list of sentences, each a list of words.
TEXT = data.NestedField(NESTED, tokenize=tokenizer)
# Labels are produced as float tensors.
LABEL = data.LabelField(dtype=torch.float)

In [20]:
# train_data, test_data = tqdm(datasets.IMDB.splits(TEXT, LABEL))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [53]:
import pickle

# Persist both splits as pickled lists of examples.
#
# The train file is opened for *writing*, so the data has to be dumped here:
# calling pickle.load() on a 'wb' handle truncates the cache file and then
# raises. Loading would also rebind `train_data` to a plain list, while the
# following cells need the dataset object (`train_data.examples`,
# `build_vocab(train_data, ...)`).
with open('/homes/du113/scratch/han_data/imdb_train.pkl', 'wb') as f:
    pickle.dump(list(train_data), f)

with open('/homes/du113/scratch/han_data/imdb_test.pkl', 'wb') as f:
    pickle.dump(list(test_data), f)

In [94]:
# Read the pickled training examples back and wrap them in an iterator.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load caches this notebook wrote itself.
with open('/homes/du113/scratch/han_data/imdb_train.pkl', 'rb') as f:
    train_data_iter = iter(pickle.load(f))

In [98]:
import jsonlines

# Export the training split as JSON Lines: one object per example, holding
# the attributes stored on that example.
with jsonlines.open('/homes/du113/scratch/han_data/imdb_train.json', mode='w') as writer:
    writer.write_all(list(map(vars, train_data.examples)))

In [99]:
# Export the test split as JSON Lines: one object per example, holding the
# attributes stored on that example.
with jsonlines.open('/homes/du113/scratch/han_data/imdb_test.json', mode='w') as writer:
    writer.write_all(list(map(vars, test_data.examples)))

In [97]:
# Show the attribute dict of the first training example (sentences -> words).
vars(train_data.examples[0])

{'text': [['This',
   'is',
   'a',
   'totally',
   'delightful',
   'and',
   'unexpected',
   'film',
   '.'],
  ['You',
   'start',
   'by',
   'following',
   'a',
   'young',
   'person',
   'who',
   'hopes',
   'to',
   'get',
   'into',
   'the',
   'qualifying',
   'football',
   'game',
   'between',
   'Bahrain',
   'and',
   'Iran',
   '.'],
  ['If',
   'Iran',
   'win',
   'they',
   'will',
   'get',
   'into',
   'the',
   '2006',
   'World',
   'Cup',
   'in',
   'Germany',
   '.'],
  ['The',
   'problem',
   'is',
   'the',
   'young',
   'person',
   'is',
   'a',
   'girl',
   'and',
   'girls',
   '(',
   'or',
   'women',
   'for',
   'that',
   'matter',
   ')',
   'are',
   'not',
   'allowed',
   'into',
   'football',
   'matches',
   'to',
   '"',
   'sit',
   'with',
   'men',
   '"',
   '.'],
  ['What',
   'follows',
   'is',
   'a',
   'wonderful',
   'comedy',
   ',',
   'played',
   'with',
   'consummate',
   'skill',
   'by',
   'a',
   'small',
   'en

In [22]:
# Report how many examples each split contains.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [24]:
# Print the attribute dict of the first test example.
print(vars(test_data.examples[0]))

{'text': [['Even', 'though', 'I', 'saw', 'this', 'film', 'when', 'I', 'was', 'very', 'young', ',', 'I', 'already', 'knew', 'the', 'story', 'of', 'Wild', 'the', 'Thief', '-', 'Taker', 'and', 'Shepherd', 'who', 'famously', 'escaped', 'from', 'Newgate', 'prison.<br', '/><br'], ['/>Apart', 'from', 'the', 'liberty', 'taken', 'right', 'at', 'the', 'end', ',', 'the', 'film', 'more', 'or', 'less', 'faithfully', 'follows', 'the', 'true', 'story', '.'], ['The', 'temptation', 'to', 'bend', 'the', 'facts', 'which', 'is', 'the', 'hallmark', 'of', 'so', 'many', 'so', '-', 'called', 'historical', 'films', 'is', 'resisted', 'in', 'this', 'film', 'and', 'the', 'film', 'makers', 'must', 'be', 'praised', 'for', 'that.<br', '/><br'], ['/>Of'], ['the', 'performances', ',', 'There', 'is', 'scarcely', 'a', 'poor', 'performance', ',', 'and', 'Tommy', 'Steele', 'is', 'ideally', 'cast', '.'], ['Also', 'good', 'is', 'Stanley', 'Baker', 'as', 'the', 'Thief', '-', 'Taker', 'and', 'Alan', 'Badel', 'is', 'good', 'as

In [27]:
# Load pre-trained word vectors from 'glove100d.txt', using the given cache directory.
# NOTE(review): presumably 100-dimensional GloVe vectors, which would match
# EMBEDDING_DIM = 100 used when the model is built; confirm against the file.
vec = torchtext.vocab.Vectors('glove100d.txt', cache='/homes/du113/scratch')

In [29]:
# Build the word vocabulary (with the pre-trained vectors attached) and the
# label vocabulary.
# NOTE(review): both calls also receive test_data, so test-set tokens shape
# the vocabulary; consider building from train_data only to avoid leaking
# test information into preprocessing.
TEXT.build_vocab(train_data, test_data, vectors=vec)
LABEL.build_vocab(train_data, test_data)

In [31]:
# Report vocabulary sizes; the first line reads the vocabulary stored on TEXT.
print(f"Unique tokens in NESTED vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in NESTED vocabulary: 176835
Unique tokens in LABEL vocabulary: 2


In [32]:
# Peek at the first ten index-to-token entries and at the label-to-index mapping.
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']
defaultdict(<function _default_unk_index at 0x7f1b623442f0>, {'neg': 0, 'pos': 1})


In [44]:
BATCH_SIZE = 64

# Prefer the second GPU (index 1) when the machine has one; otherwise fall
# back to the first GPU, and to the CPU when CUDA is unavailable. A hard-coded
# 'cuda:1' is an invalid device on single-GPU hosts even though
# torch.cuda.is_available() is True there.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

# Batches from both splits are created directly on `device`.
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

In [35]:
import torch.nn as nn
import torch.nn.functional as F

In [66]:
class Attention(nn.Module):
    """Attention pooling over the sequence dimension.

    Every time step is projected to ``hidden_dim``, passed through ``tanh``
    and scored against a learned context vector. The softmax-normalised
    scores then weight the *original* inputs, so the pooled vector keeps the
    input's feature size (``input_dim``). The projection is used for scoring
    only.

    Args:
        input_dim: feature size of each time step of the input.
        hidden_dim: size of the hidden projection used to compute scores.
    """

    def __init__(self, input_dim, hidden_dim):
        super(Attention, self).__init__()

        self.linear_in = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.context = nn.Linear(in_features=hidden_dim, out_features=1, bias=False)

    def forward(self, input):
        """Pool a sequence into a single vector.

        Args:
            input: tensor of size [batch, seq_len, input_dim].

        Returns:
            Tensor of size [batch, input_dim]: the attention-weighted sum of
            the time steps.
        """
        # Hidden representation used for scoring: [batch, seq_len, hidden_dim].
        # Without the non-linearity the two linear layers would collapse into
        # a single linear map.
        hidden = torch.tanh(self.linear_in(input))
        # Unnormalised scores: [batch, seq_len, 1]
        scores = self.context(hidden)
        # Normalise across the sequence dimension so the weights sum to 1.
        weights = F.softmax(scores, dim=1)
        # Weight the original inputs (not the projection) so the output has
        # input_dim features, which is what the layers consuming this module
        # are sized for.
        return (input * weights).sum(dim=1)
    
        
class HAN(nn.Module):
    """Hierarchical attention network producing one logit per document.

    Words are embedded and encoded per sentence by a bidirectional GRU, then
    pooled into one vector per sentence by ``word_attn``. The sentence vectors
    are encoded by a second bidirectional GRU, pooled into one vector per
    document by ``sent_attn`` and mapped to a single output by ``classifier``.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_dim: embedding size.
        word_dim: hidden size of the word-level GRU (per direction).
        word_attn_dim: hidden size passed to the word-level attention.
        sent_dim: hidden size of the sentence-level GRU (per direction).
        sent_attn_dim: hidden size passed to the sentence-level attention.
    """

    def __init__(self, vocab_size, emb_dim, word_dim, word_attn_dim, sent_dim, sent_attn_dim):
        super(HAN, self).__init__()
        self.embed_dim = emb_dim

        self.embeds = nn.Embedding(vocab_size, emb_dim)

        self.word_rnn = nn.GRU(input_size=emb_dim, hidden_size=word_dim, batch_first=True, bidirectional=True)

        self.word_attn = Attention(2*word_dim, word_attn_dim)

        # sent_rnn consumes the pooled word-level vectors and is sized for
        # 2*word_dim features per sentence.
        self.sent_rnn = nn.GRU(input_size=2*word_dim, hidden_size=sent_dim, batch_first=True, bidirectional=True)

        self.sent_attn = Attention(2*sent_dim, sent_attn_dim)

        # The classifier is sized for 2*sent_dim features per document.
        self.classifier = nn.Linear(in_features=2*sent_dim, out_features=1)

    def forward(self, input):
        """Compute one output per document.

        Args:
            input: index tensor of size [batch_size, num_sents, num_words].

        Returns:
            Tensor of size [batch_size, 1] (raw classifier output).
        """
        batch_size, num_sents, num_words = input.shape

        # shape: [batch_size, num_sents, num_words, emb_dim]
        input = self.embeds(input)

        # Fold sentences into the batch dimension so the word-level GRU sees
        # one sentence per row: [batch_size x num_sents, num_words, emb_dim]
        input = input.view(-1, num_words, self.embed_dim)

        # shape: [batch_size x num_sents, num_words, 2*word_dim]
        word_out, _ = self.word_rnn(input)

        # One vector per sentence (first dimension: batch_size x num_sents).
        word_out = self.word_attn(word_out)

        # Unfold back to documents: [batch_size, num_sents, features]
        word_out = word_out.view(batch_size, num_sents, -1)

        # shape: [batch_size, num_sents, 2*sent_dim]
        sent_out, _ = self.sent_rnn(word_out)

        # One vector per document (first dimension: batch_size).
        sent_out = self.sent_attn(sent_out)

        return self.classifier(sent_out)

In [72]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
WORD_DIM = 256
WORD_ATTN_DIM = 256
SENT_DIM = 128
SENT_ATTN_DIM = 128

model = HAN(
    vocab_size=INPUT_DIM,
    emb_dim=EMBEDDING_DIM,
    word_dim=WORD_DIM,
    word_attn_dim=WORD_ATTN_DIM,
    sent_dim=SENT_DIM,
    sent_attn_dim=SENT_ATTN_DIM,
)

# Replace the freshly initialised embedding layer with one built from the
# vectors attached to the vocabulary.
model.embeds = model.embeds.from_pretrained(TEXT.vocab.vectors)

In [46]:
import torch.optim as optim

# Plain SGD over the model's parameters with a learning rate of 1e-3.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [73]:
# Binary cross-entropy that takes raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

In [74]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [42]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, not 8).

    `preds` are raw logits: they are squashed with a sigmoid and rounded to
    the nearest integer before being compared element-wise with `y`.
    """
    hard_preds = torch.sigmoid(preds).round()
    hits = hard_preds.eq(y).float()  # float so the ratio below is a real-valued division
    return hits.sum() / len(hits)

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    For every batch: reset gradients, run the model on `batch.text`, compute
    the loss and accuracy against `batch.label`, back-propagate and step the
    optimizer.

    Returns:
        (mean batch loss, mean batch accuracy), each averaged over the number
        of batches in `iterator`.
    """
    model.train()

    loss_total, acc_total = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        # The model returns one value per example in a trailing dimension of
        # size 1; drop it to match the label tensor.
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating it.

    The model is switched to eval mode and all batches are processed with
    gradient tracking disabled.

    Returns:
        (mean batch loss, mean batch accuracy), each averaged over the number
        of batches in `iterator`.
    """
    model.eval()

    loss_total, acc_total = 0, 0

    with torch.no_grad():
        for batch in iterator:
            # Drop the trailing size-1 dimension to match the label tensor.
            logits = model(batch.text).squeeze(1)
            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)

            loss_total += batch_loss.item()
            acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [90]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # This notebook only creates `train_iterator` and `test_iterator`; the
    # previously referenced `valid_iterator` was never defined and raised a
    # NameError. The held-out numbers below therefore come from the test split.
    # NOTE(review): carve a real validation split out of train_data if these
    # numbers are ever used for model selection.
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)

    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

RuntimeError: CUDA out of memory. Tried to allocate 1.41 GiB (GPU 1; 11.91 GiB total capacity; 10.92 GiB already allocated; 185.06 MiB free; 238.26 MiB cached)

In [92]:
# Ask PyTorch's caching allocator to release cached CUDA memory that is not in use.
# NOTE(review): memory of tensors that are still referenced (model, optimizer
# state, ...) is not released by this call.
torch.cuda.empty_cache()

In [91]:
# Drop the notebook's references to the iterators, the model and the loss module.
del train_iterator
del test_iterator
del model
del criterion

In [93]:
del optimizer

# The optimizer keeps references to the parameters it was built from, so GPU
# memory can only be reclaimed once it is gone as well. Collect garbage and
# release the allocator's cached blocks *after* the last reference is dropped;
# an empty_cache() issued before the deletions cannot free that memory.
import gc

gc.collect()
torch.cuda.empty_cache()