### Train a sentiment model using transformer transfer learning, with later layers that will actually be trained are from a bidirectional GRU model. Thanks to @bentrevett for the modeling code, this is mainly an excercise in deploying NLP model as a REST API

In [5]:
import torch

import random
import numpy as np
import pickle

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
!ls

 get_custom_dataset_cleaned_and_prepared.ipynb
 pickled_yelp_reviews.pkl
 sentiment_train_on_yelp.ipynb
'used on imdb dataset just to learn code.ipynb'


In [7]:
!pwd

/home/ubuntu/Projects/pytorch_sentiment_api/notebooks


In [8]:
data_file = "../data/yelp_sample.csv"

In [9]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
len(tokenizer.vocab)

30522

In [11]:
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')

print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [12]:
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


In [13]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [14]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [15]:
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [16]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [17]:
from torchtext import data

TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

LABEL = data.LabelField(dtype = torch.float)

In [18]:
pos = data.TabularDataset(
    path=data_file,format='csv',
    fields=[('text', TEXT),
           ('label',LABEL)]
      ) 

In [19]:
# with open("pickled_yelp_reviews.pkl", "wb") as f:
#     pickle.dump(pos, f)
    

In [20]:
len(pos)

89031

In [21]:
train_data, test_data = pos.split(split_ratio = 0.8,
                                  random_state = random.seed(SEED))

In [22]:
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [23]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 49858
Number of validation examples: 21367
Number of testing examples: 17806


In [24]:
print(vars(train_data.examples[2]))

{'text': [1045, 2428, 2066, 2023, 2173, 2009, 2038, 1037, 2307, 3528, 1997, 6350, 7047, 2130, 2065, 1057, 2024, 3110, 2066, 2070, 6265, 2027, 2031, 1037, 2307, 28046, 9587, 3597, 2007, 1037, 25628, 24165, 24665, 11431, 2100, 2074, 9805, 18879, 1996, 2069, 2518, 1045, 2052, 2360, 2009, 1005, 1055, 1037, 2978, 4030, 2006, 2326, 2021, 2040, 2064, 7499, 2068, 2027, 2024, 2467, 2428, 5697, 2061, 2022, 4810, 2000, 3524, 2021, 2009, 1005, 1055, 4276, 2009], 'label': 'pos'}


In [25]:
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[0])['text'])
tokens

['jack',
 'took',
 'in',
 'my',
 'wheel',
 'and',
 'brand',
 'new',
 'tire',
 'monday',
 'evening',
 ',',
 'no',
 'problem',
 'should',
 'take',
 'a',
 'day',
 '.',
 'after',
 'two',
 'days',
 'no',
 'phone',
 'call',
 '.',
 'we',
 'have',
 'called',
 'and',
 'called',
 '.',
 'his',
 'shop',
 'locked',
 'up',
 'tight',
 '.',
 'he',
 'has',
 'my',
 'brand',
 'new',
 'tire',
 'and',
 'my',
 'wheel',
 '.',
 'yet',
 'he',
 'doesn',
 "'",
 't',
 'think',
 'he',
 'needs',
 'to',
 'call',
 'back',
 'apparently',
 '.',
 'we',
 'have',
 'taken',
 'the',
 'day',
 'off',
 'work',
 'to',
 'stand',
 'outside',
 'his',
 'shop',
 'in',
 'case',
 'he',
 'rear',
 '##s',
 'his',
 'head',
 'so',
 'we',
 'can',
 'get',
 'our',
 'property',
 'back',
 '.',
 'no',
 'hope',
 'he',
 'actually',
 'fixed',
 'this',
 '.',
 'by',
 'the',
 'way',
 'this',
 'was',
 'a',
 'warrant',
 '##y',
 'deal',
 '.',
 'he',
 'fixed',
 'the',
 'cracked',
 'wheel',
 'about',
 'six',
 'months',
 'ago',
 '.',
 'said',
 'that',
 'it'

In [26]:
LABEL.build_vocab(train_data)

In [27]:
print(LABEL.vocab.stoi)

defaultdict(None, {'pos': 0, 'neg': 1, 'text': 2})


In [46]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device,
    sort_key = lambda x: len(x.text),
    sort_within_batch=False,
    repeat=False)

### Build out the Model Class

In [47]:
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

In [48]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
                
        with torch.no_grad():
            embedded = bert(text)[0]
                
        #embedded = [batch size, sent len, emb dim]
        
        _, hidden = self.rnn(embedded)
        
        #hidden = [n layers * n directions, batch size, emb dim]
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim]
        
        output = self.out(hidden)
        
        #output = [batch size, out dim]
        
        return output

In [49]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [50]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [51]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [52]:

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [53]:
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


## Train the Model

In [54]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [55]:
criterion = nn.BCEWithLogitsLoss()

In [56]:
model = model.to(device)
criterion = criterion.to(device)

In [57]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds==y).float()
    acc = correct.sum() / len(correct)
    return acc

In [58]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [59]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [60]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [61]:
device

device(type='cuda')

In [62]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'yelp_sample_model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 66m 50s
	Train Loss: 0.199 | Train Acc: 91.89%
	 Val. Loss: 0.095 |  Val. Acc: 96.45%
Epoch: 02 | Epoch Time: 66m 42s
	Train Loss: 0.105 | Train Acc: 95.85%
	 Val. Loss: 0.088 |  Val. Acc: 96.77%
Epoch: 03 | Epoch Time: 67m 2s
	Train Loss: 0.089 | Train Acc: 96.61%
	 Val. Loss: 0.094 |  Val. Acc: 96.48%
Epoch: 04 | Epoch Time: 67m 0s
	Train Loss: 0.076 | Train Acc: 97.08%
	 Val. Loss: 0.091 |  Val. Acc: 96.62%
Epoch: 05 | Epoch Time: 66m 51s
	Train Loss: 0.065 | Train Acc: 97.52%
	 Val. Loss: 0.095 |  Val. Acc: 96.36%


In [63]:
model.load_state_dict(torch.load('yelp_sample_model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.087 | Test Acc: 96.82%


In [64]:
torch.save(model, 'yelp_entire_model.pt') 

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


### Inference

In [65]:
def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

## 0 is Positive Class, and 1 is negative class

In [66]:
predict_sentiment(model, tokenizer, "This restaurant is terrible")


0.9918892979621887

In [67]:
predict_sentiment(model, tokenizer, "This restaurant is great")


0.03485602140426636

#### Just wanted to check if loading the model from file worked correctly

In [68]:
model2  = torch.load("yelp_entire_model.pt")

In [71]:
predict_sentiment(model2, tokenizer, "Worst chinese ever")

0.9887314438819885