# 1 - Simple Sentiment Analysis


## Preparing Data

In [1]:
import torch
from torchtext import data

# Single seed shared by every seeded step in this notebook.
SEED = 1234

# Seed PyTorch's CPU and CUDA generators and ask cuDNN for deterministic kernels.
# Python's `random` module is not seeded in this cell.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT: phrases are tokenized with spaCy.
TEXT = data.Field(tokenize='spacy')
# LABEL: no vocabulary lookup (use_vocab=False); labels are converted to float tensors.
LABEL = data.LabelField(use_vocab=False,dtype=torch.float)

Read data into torchtext dataset.

In [3]:
from torchtext.data import TabularDataset
 
# Column layout of the TSV files as (column name, field) pairs, in file order.
# A field of None means the column is not needed and is skipped.
tv_datafields = [
    ("PhraseId", None),
    ("SentenceId", None),
    ("Phrase", TEXT),
    ("Sentiment", LABEL),
]

# Load every labelled phrase from one TSV file. `path` is the file itself, and
# skip_header=True keeps the header row from being parsed as an example.
trn = TabularDataset(
    path='data/train.tsv',
    format='tsv',
    skip_header=True,
    fields=tv_datafields,
)

In [4]:
import random

# Seed Python's RNG and hand its state to split() so the held-out test set is
# the same on every kernel restart. Without this the split depends on whatever
# state the unseeded global `random` generator happens to be in.
random.seed(SEED)
train_data, test_data = trn.split(random_state=random.getstate())

We can see how many examples are in each split by checking their length.

In [5]:
# Report the size of each split produced by the train/test split above.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 109242
Number of testing examples: 46818


We can also check an example.

In [6]:
# Show the first training example's attributes (tokenized Phrase and raw Sentiment) as a dict.
print(vars(train_data.examples[0]))

{'Phrase': ['like', 'a', 'living', '-', 'room', 'War', 'Of', 'The', 'Worlds', ',', 'gaining', 'most', 'of', 'its', 'unsettling', 'force', 'from', 'the', 'suggested', 'and', 'the', 'unknown'], 'Sentiment': '3'}


In [7]:
import random

# random.seed() returns None, so passing its result as random_state hands the
# splitter None. Seed first, then pass the seeded generator state explicitly.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

We'll view how many examples are in each split.

In [8]:
# Report the sizes after carving the validation set out of the training data.
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 76469
Number of validation examples: 32773
Number of testing examples: 46818


In [9]:
# Build the token vocabulary from the training split only, with max_size=25000
# and the pretrained "glove.6B.100d" vectors attached to it.
TEXT.build_vocab(train_data, max_size=25000,vectors="glove.6B.100d")
# LABEL was created with use_vocab=False; this vocabulary is only inspected in the next cell.
# NOTE(review): presumably not used when numericalizing labels — confirm.
LABEL.build_vocab(train_data)

In [10]:
# Number of entries in each vocabulary built above.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 16853
Unique tokens in LABEL vocabulary: 5


We can also view the most common words in the vocabulary. 

In [11]:
# The 20 most frequent training tokens together with their counts.
print(TEXT.vocab.freqs.most_common(20))

[('the', 22982), (',', 20568), ('a', 16641), ('of', 16009), ('and', 15741), ('-', 11241), ('to', 11008), ('.', 8743), ("'s", 8284), ('in', 6714), ('is', 6596), ('that', 5934), ('it', 5115), ('as', 4085), ('with', 3623), ('for', 3608), ('its', 3421), ('film', 3281), ('movie', 3051), ('an', 2882)]


We can also inspect the vocabulary directly through its `stoi` (**s**tring **to** **i**nt) mapping or its `itos` (**i**nt **to** **s**tring) list. Both are attributes of the vocabulary, not methods.

In [12]:
# First ten entries of the index-to-token list.
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', 'a', 'of', 'and', '-', 'to', '.']


In [14]:
# Vectors attached to the TEXT vocabulary by build_vocab(vectors=...).
pretrained_embeddings = TEXT.vocab.vectors

# Shape check; this tensor is later copied into the model's embedding layer.
print(pretrained_embeddings.shape)

torch.Size([16853, 100])


In [16]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 64

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
print(torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, each placing its batches on `device`.
# sort_key exposes every example's token count to the iterator.
# NOTE(review): BucketIterator presumably uses it to batch phrases of similar
# length together — confirm against the torchtext docs.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=lambda x: len(x.Phrase),
    batch_size=BATCH_SIZE,
    device=device)

False


## Build the RNN Model


In [17]:
import torch.nn as nn

class RNN(nn.Module):
    """Embedding -> (bi)LSTM -> dropout -> linear head over batches of token ids.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (the LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of values produced for each example.
        n_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions when True.
        dropout: dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # The classifier input is one final hidden state per direction.
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Inter-layer dropout only exists when layers are stacked; passing 0
        # for a single layer avoids PyTorch's "non-zero dropout" warning.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map token ids of shape [sent len, batch size] to [batch size, output_dim]."""

        #x = [sent len, batch size]

        embedded = self.dropout(self.embedding(x))

        #embedded = [sent len, batch size, emb dim]

        output, (hidden, cell) = self.rnn(embedded)

        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            #concat the top layer's final forward (hidden[-2]) and backward (hidden[-1]) states
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            #only one direction: the top layer's final state is hidden[-1]
            final_hidden = hidden[-1, :, :]

        final_hidden = self.dropout(final_hidden)

        #final_hidden = [batch size, hid dim * num directions]

        # No squeeze here: squeezing dim 0 would drop the batch axis for a
        # batch of one and break callers that index or squeeze dim 1.
        return self.fc(final_hidden)

Define parameters for RNN.

In [18]:
# Hyperparameters for the LSTM model.
INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100           # width of the embedding layer
HIDDEN_DIM = 256              # LSTM hidden size per direction
OUTPUT_DIM = 1                # a single sentiment score per phrase
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

# Keyword arguments make it explicit which constant feeds which constructor slot.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [19]:
# Overwrite the embedding weights in place with the vectors taken from TEXT.vocab.
# copy_ returns the updated tensor, which the notebook echoes as this cell's output.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.0774,  0.2873,  0.6158,  ...,  0.1669, -0.3281,  0.3940],
        [-0.1639, -0.4033,  0.2748,  ...,  0.3425, -0.4674,  0.2989],
        [-0.2805,  0.1506,  0.3955,  ...,  0.6393,  0.0779,  0.7722]])

## Train the Model

Create an optimizer.

In [20]:
import torch.optim as optim

# Adam over every model parameter; no arguments are passed, so the optimizer's defaults apply.
optimizer = optim.Adam(model.parameters())

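Loss function. The model outputs a single number per phrase and the labels are the scores 0–4, so we treat the task as regression and use mean squared error.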

In [21]:
# Mean squared error: the model emits one value per phrase (OUTPUT_DIM = 1) and
# the float Sentiment label is used as the regression target.
criterion = nn.MSELoss()

We can place the model and the criterion on the GPU (if we have one). 

In [22]:
# Move the model and the loss module to the device selected when the iterators were built.
model = model.to(device)
criterion = criterion.to(device)

Helper function to calculate the accuracy of a batch of predictions.

In [23]:
def calculate_accuracy(preds, y):
    """
    Fraction of predictions in the batch that match the labels after rounding.

    The result is a ratio, not a count: 8 correct out of 10 gives 0.8.
    """
    # Round each prediction to its nearest integer, then mark exact matches as 1.0.
    hits = torch.eq(torch.round(preds), y).to(torch.float)
    return hits.sum() / len(hits)

Functions to train and evaluate model.

In [24]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch is scored, the loss is back-propagated and the optimizer takes
    one step. Returns (mean batch loss, mean batch accuracy), each averaged
    over the number of batches.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()

        # The model is expected to return [batch size, 1]; drop the trailing axis.
        outputs = model(batch.Phrase).squeeze(1)
        targets = batch.Sentiment

        batch_loss = criterion(outputs, targets)
        batch_acc = calculate_accuracy(outputs, targets)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [25]:
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating it.

    The model is put in eval mode and gradients are disabled. Returns
    (mean batch loss, mean batch accuracy), each averaged over the number
    of batches.
    """
    model.eval()

    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for batch in iterator:
            # The model is expected to return [batch size, 1]; drop the trailing axis.
            outputs = model(batch.Phrase).squeeze(1)
            targets = batch.Sentiment

            running_loss += criterion(outputs, targets).item()
            running_acc += calculate_accuracy(outputs, targets).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

We then train the model through multiple epochs, an epoch being a complete pass through all examples in the split.

In [26]:
# Number of full passes over the training split.
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    # One optimisation pass over the training data, then a gradient-free
    # pass over the validation data to monitor generalisation.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # Epochs are shown 1-based; accuracies are converted to percentages.
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.618 | Train Acc: 55.14% | Val. Loss: 0.767 | Val. Acc: 45.12% |
| Epoch: 02 | Train Loss: 0.459 | Train Acc: 60.35% | Val. Loss: 0.524 | Val. Acc: 53.60% |
| Epoch: 03 | Train Loss: 0.405 | Train Acc: 62.57% | Val. Loss: 0.463 | Val. Acc: 56.20% |
| Epoch: 04 | Train Loss: 0.365 | Train Acc: 64.51% | Val. Loss: 0.449 | Val. Acc: 57.76% |
| Epoch: 05 | Train Loss: 0.343 | Train Acc: 65.69% | Val. Loss: 0.403 | Val. Acc: 61.78% |


Test loss and accuracy.

In [27]:
# Final check on the held-out test split, using the same evaluate() as for validation.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.403 | Test Acc: 62.13% |


Save model

In [29]:
import os

# torch.save does not create missing directories, so make sure the target folder exists first.
os.makedirs('models', exist_ok=True)
# Persist only the learned weights (state_dict), not the whole module object.
torch.save(model.state_dict(),'models/ltsentiment.pt')

In [None]:
# Rebuild the architecture with the same hyperparameters used for training,
# then restore the weights saved to 'models/ltsentiment.pt'.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
model.load_state_dict(torch.load('models/ltsentiment.pt', map_location=device))
model = model.to(device)
# Switch to eval mode so dropout is disabled for inference.
model.eval()

Test custom instances written in instance.tsv file

In [62]:
# Custom phrases to score. The file is parsed with the same column layout as
# the training data (tv_datafields), so it needs the same columns and a header row.
instance_data = TabularDataset(
    path='data/instance.tsv',
    format='tsv',
    skip_header=True,
    fields=tv_datafields)

# A plain Iterator with sorting and shuffling switched off yields the phrases
# in file order, so the printed predictions line up with the rows of
# instance.tsv. (BucketIterator defaults to train=True, which shuffles.)
inst_iterator = data.Iterator(
    instance_data,
    batch_size=64,
    device=device,
    train=False,
    sort=False,
    shuffle=False)

# The sentiment labels are:
# 0 - negative
# 1 - somewhat negative
# 2 - neutral
# 3 - somewhat positive
# 4 - positive

# Inference only: disable dropout and skip gradient tracking.
model.eval()
with torch.no_grad():
    for batch in inst_iterator:
        # Flatten to one score per phrase regardless of the batch size.
        predictions = model(batch.Phrase).reshape(-1)
        # Round to the nearest label and keep the result inside the valid 0-4 range.
        print(torch.round(predictions).clamp(0, 4))

tensor([0.], grad_fn=<RoundBackward>)
