## References

1. https://github.com/bentrevett/pytorch-sentiment-analysis

In [14]:
import numpy as np
import torch

## Load data

TEXT field has tokenize='spacy' as an argument. This defines that the "tokenization" is done using the spaCy tokenizer. If no tokenize argument is passed, the default is simply splitting the string on spaces.

LABEL is defined by a LabelField, a special subset of the Field class specifically used for handling labels.

In [15]:
from torchtext import data, datasets

MAX_SEQ_LEN=500

TEXT = data.Field(tokenize='spacy', fix_length=MAX_SEQ_LEN)
LABEL = data.LabelField(dtype=torch.float)
d_train_all, d_test = datasets.IMDB.splits(TEXT, LABEL)

In [16]:
d_train, d_vali = d_train_all.split(split_ratio=0.8)

In [17]:
print(f'Number of training examples: {len(d_train)}')
print(f'Number of validation examples: {len(d_vali)}')
print(f'Number of testing examples: {len(d_test)}')

Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000


## Encoding

The number of unique words in our training set is over 100,000, which means that our one-hot vectors will have over 100,000 dimensions! This will make training slow and possibly won't fit onto your GPU (if you're using one).

There are two ways effectively cut down our vocabulary, we can either only take the top $n$ most common words or ignore words that appear fewer than $m$ times. We'll do the former, only keeping the top 25,000 words.

In [18]:
MAX_VOCAB_SIZE = 5000

TEXT.build_vocab(d_train, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(d_train)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 5002
Unique tokens in LABEL vocabulary: 2


### Most common words

In [19]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 230705), (',', 219180), ('.', 189079), ('and', 124685), ('a', 124505), ('of', 114833), ('to', 106574), ('is', 86976), ('in', 70030), ('I', 61766), ('it', 60943), ('that', 55969), ('"', 50066), ("'s", 49268), ('this', 48275), ('-', 41880), ('/><br', 40363), ('was', 39931), ('as', 34632), ('with', 34133)]


### Index to string map

In [20]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


### String to index map

In [21]:
print(LABEL.vocab.stoi)

defaultdict(None, {'pos': 0, 'neg': 1})


## Training

In [22]:
BATCH_SIZE = 64

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cuda'

d_train_iter, d_vali_iter, d_test_iter = data.BucketIterator.splits(
    (d_train, d_vali, d_test), 
    batch_size = BATCH_SIZE,
    device = device)

### RNN

In [23]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        #text = [sent len, batch size]
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        return self.fc(hidden.squeeze(0))

class LSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(hidden)
        return self.fc(hidden.squeeze(0))

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 32
HIDDEN_DIM = 100
OUTPUT_DIM = 1

model = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [24]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 213,765 trainable parameters


### Optimizer

In [25]:
import torch.optim as optim

# optimizer = optim.SGD(model.parameters(), lr=1e-3)
optimizer = optim.Adam(model.parameters())

### Criterion

In [26]:
criterion = nn.BCEWithLogitsLoss()

Using `.to` to place the model and the criterion on the GPU (if we have one).

In [27]:
model = model.to(device)
criterion = criterion.to(device)

In [28]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [29]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()

        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        
        loss.backward()
        optimizer.step()
        
        acc = binary_accuracy(predictions, batch.label)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [30]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [31]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [32]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, d_train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, d_vali_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 13s
	Train Loss: 0.695 | Train Acc: 49.82%
	 Val. Loss: 0.693 |  Val. Acc: 49.70%
Epoch: 02 | Epoch Time: 0m 13s
	Train Loss: 0.694 | Train Acc: 50.24%
	 Val. Loss: 0.693 |  Val. Acc: 51.15%
Epoch: 03 | Epoch Time: 0m 13s
	Train Loss: 0.693 | Train Acc: 50.30%
	 Val. Loss: 0.693 |  Val. Acc: 49.82%
Epoch: 04 | Epoch Time: 0m 13s
	Train Loss: 0.693 | Train Acc: 50.88%
	 Val. Loss: 0.693 |  Val. Acc: 51.15%
Epoch: 05 | Epoch Time: 0m 13s
	Train Loss: 0.692 | Train Acc: 51.19%
	 Val. Loss: 0.693 |  Val. Acc: 49.72%


In [34]:
for batch in d_train_iter:
    # print(batch)
    # print(batch.text)
    # print(batch.text[:,0])
    # print(batch.text.shape)
    # print(batch.label[0])
    break