<a href="https://colab.research.google.com/github/zeroxenator/rug-ltp-project/blob/master/SentimentAnalysisBERT_Local.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preparing Data**

In [1]:
!pip install pytorch_pretrained_bert
!pip install torch torchvision
!pip install pytorch-nlp



In [0]:
import torch

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Load IMDB dataset from torchnlp

In [0]:
import random as rn
from torchnlp.datasets import imdb_dataset

rn.seed(321)

org_train_data, org_test_data = imdb_dataset(train=True, test=True)
rn.shuffle(org_train_data)
rn.shuffle(org_test_data)

In [6]:
train_data = org_train_data[:1000]
valid_data = org_train_data[1001:1501]
test_data = org_test_data[:100]

print(len(train_data))
print(len(valid_data))
print(len(test_data))

print(train_data[0])
print(valid_data[0])
print(test_data[0])


1000
500
100
{'text': "I stumbled upon this movie on cable and was totally hooked. The story of a group of surfers who ride the big waves, waves that are monstrously huge, waves that would make any rational person run away in terror is a one that manages to be spectacular and make you understand why people spend their lives chasing waves. There is nothing special about the film, other than it brings together some very interesting people who are are in love with what they do and lets them talk. Sure there are scenes of them surfing, but what makes this movie so special is the people. Here are a bunch of guys who are so enthusiastic about what they do that it crosses over to the people watching. Half way into this movie you'll want to go off and learn to surf as well. Few documentaries have ever managed to covey the passion that these people have and its the films ability to make us feel it that makes this a great film. See it.", 'sentiment': 'pos'}
{'text': "Joshua Seftel's first film -

Using pre-trained tokens from Bert

In [9]:
from pytorch_pretrained_bert import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), train_data)))
valid_texts, valid_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), valid_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), test_data)))

print(len(train_texts), len(train_labels), len(valid_texts), len(valid_labels), len(test_texts), len(test_labels))

train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], train_texts))
valid_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], valid_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], test_texts))

len(train_tokens), len(valid_tokens), len(test_tokens)

1000 1000 500 500 100 100


(1000, 500, 100)

In [10]:
from keras.preprocessing.sequence import pad_sequences
train_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
valid_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, valid_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, test_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")

train_tokens_ids.shape, valid_tokens_ids.shape, test_tokens_ids.shape

Using TensorFlow backend.


((1000, 512), (500, 512), (100, 512))

In [11]:
import numpy as np
train_y = np.array(train_labels) == 'pos'
valid_y = np.array(valid_labels) == 'pos'
test_y = np.array(test_labels) == 'pos'
train_y.shape, valid_y.shape, test_y.shape, np.mean(train_y), np.mean(valid_y), np.mean(test_y)

((1000,), (500,), (100,), 0.489, 0.496, 0.5)

In [0]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
valid_masks = [[float(i > 0) for i in ii] for ii in valid_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

In [0]:
train_tokens_ids[0]

In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
train_iterator

from torchtext import data
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_texts, valid_texts, test_texts), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

AttributeError: ignored

# Build the Model

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, embedding_matrix, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        super().__init__()
        
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float))
        #self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden.squeeze(0))

In [0]:
INPUT_DIM = len(train_tokens_ids)
EMBEDDING_DIM = 512
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
#PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(train_tokens_ids, INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT)

In [20]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,154,433 trainable parameters


In [25]:
pretrained_embeddings = torch.Tensor(train_tokens_ids)

print(pretrained_embeddings.shape)

torch.Size([1000, 512])


In [26]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 101., 1045., 9845.,  ...,    0.,    0.,    0.],
        [ 101., 1000., 2028.,  ...,    0.,    0.,    0.],
        [ 101., 1037., 2980.,  ...,    0.,    0.,    0.],
        ...,
        [ 101., 2672., 2025.,  ...,    0.,    0.,    0.],
        [ 101., 2292., 1005.,  ...,    0.,    0.,    0.],
        [ 101., 1000., 2265.,  ...,    0.,    0.,    0.]])

Set the unknown parameters to zero

In [31]:
t = pretrained_embeddings
print (model.embedding.weight.data[(t == 101.).nonzero()])

# UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
# model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# print(model.embedding.weight.data)

tensor([[[ 101., 1045., 9845.,  ...,    0.,    0.,    0.],
         [ 101., 1045., 9845.,  ...,    0.,    0.,    0.]],

        [[ 101., 1000., 2028.,  ...,    0.,    0.,    0.],
         [ 101., 1045., 9845.,  ...,    0.,    0.,    0.]],

        [[ 101., 1037., 2980.,  ...,    0.,    0.,    0.],
         [ 101., 1045., 9845.,  ...,    0.,    0.,    0.]],

        ...,

        [[ 101., 2672., 2025.,  ...,    0.,    0.,    0.],
         [ 101., 1045., 9845.,  ...,    0.,    0.,    0.]],

        [[ 101., 2292., 1005.,  ...,    0.,    0.,    0.],
         [ 101., 1045., 9845.,  ...,    0.,    0.,    0.]],

        [[ 101., 1000., 2265.,  ...,    0.,    0.,    0.],
         [ 101., 1045., 9845.,  ...,    0.,    0.,    0.]]])


# Train the Model

In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [0]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [0]:

def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|█████████▉| 399622/400000 [00:30<00:00, 23879.92it/s]

Epoch: 01 | Epoch Time: 0m 41s
	Train Loss: 0.670 | Train Acc: 58.50%
	 Val. Loss: 0.700 |  Val. Acc: 50.75%
Epoch: 02 | Epoch Time: 0m 42s
	Train Loss: 0.658 | Train Acc: 60.35%
	 Val. Loss: 0.605 |  Val. Acc: 66.71%
Epoch: 03 | Epoch Time: 0m 42s
	Train Loss: 0.559 | Train Acc: 71.37%
	 Val. Loss: 0.433 |  Val. Acc: 80.81%
Epoch: 04 | Epoch Time: 0m 43s
	Train Loss: 0.464 | Train Acc: 79.12%
	 Val. Loss: 0.351 |  Val. Acc: 85.36%
Epoch: 05 | Epoch Time: 0m 43s
	Train Loss: 0.338 | Train Acc: 85.87%
	 Val. Loss: 0.307 |  Val. Acc: 87.27%


In [0]:

model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.325 | Test Acc: 86.43%
