# CNN-based sentiment classifier
The code below illustrates a working example of a CNN-based sentiment classifier. The data used to train the model is from coursework assignment 1.

In [1]:
# load data and take a quick look
import pandas as pd
from sklearn.utils import shuffle
raw_data = pd.read_csv('../class8/coursework1_train.csv') # put the dataset from CW1 under the same directory
raw_data = shuffle(raw_data)

In [32]:
# check the size of the data and its class distribution

# only use a small fraction of the data to speed up training and testing
# training on the full dataset requires a large RAM (>30GB) or a GPU card
all_text = raw_data['text'].tolist()[:40000] 
all_raw_labels = raw_data['sentiment'].tolist()[:40000]
labels_list = ['pos','neg']
all_labels = [labels_list.index(ll) for ll in all_raw_labels]

print('entry num', len(all_text))
print(len([ll  for ll in  all_labels if ll==1]))

entry num 40000
20000


In [33]:
# data split. 
# Feel free to use differnt raios to split the data.
train_docs = all_text[:35000]
train_labels = all_labels[:35000]
dev_docs = all_text[35000:]
dev_labels = all_labels[35000:]

print('neg in train', len([ll for ll in train_labels if ll==1]))
print('neg in dev', len([ll for ll in dev_labels if ll==1]))

neg in train 17484
neg in dev 2516


In [7]:
dev_docs[19]

'Now I love Bela Lugosi,don\'t get me wrong,he is one of the most interesting people to ever make a movie but he certainly did his share of clunkers.This is just another one of those.<br /><br />Lugosi plays Dr.Lorenz,a doctor who has had his medical license pulled for unexplained reasons.He is however doing experiments to keep his wife young and beautiful.It\'s revealed that she is 70-80 years old yet Lugosi looks to be in his mid 50\'s so why he is married to this old woman is never really explained.<br /><br />Anyway these treatments or experiments involved giving brides who are at the altar being married some sort of sweet smelling substance whereby they pass out but are thought to be dead.Then Lugosi and some of his assistants steal the body on its way to the morgue and take it back to his lab where it\'s kept in some sort of suspended animation or catatonic state.Then the stolen brides have a needle rammed somewhere in their bodies,maybe the neck,and then the needle is rammed int

In [8]:
# load pre-trained glove embeddings
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# specify the loaction of the downloaded glove file
path_of_downloaded_files = "/home/cim/staff/uhac002/Library/Embeddings/GloVe/glove.6B.300d.txt"
# path_of_downloaded_files = "/Users/yg211/Embeddings/Glove/glove.6B.300d.txt"
glove_file = datapath(path_of_downloaded_files)
word2vec_glove_file = get_tmpfile("glove.6B.300d.txt")
glove2word2vec(glove_file, word2vec_glove_file)
word_vectors = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [34]:
# then we define the RNN-based classifier
import torch
import torch.nn as nn

class RNN_Classifier(nn.Module):
    def __init__(self, embd_dim, hidden_dim, model_type, cls_num, pooler_type, dropout, gpu):
        super(RNN_Classifier, self).__init__()
        assert model_type in ['rnn','lstm','bilstm','gru']
        assert pooler_type in ['max','avg']
        # rnn type
        if model_type == 'rnn':
            self.rnn = nn.RNN(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, dropout=dropout)
        elif model_type == 'lstm':
            self.rnn = nn.LSTM(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, dropout=dropout)
        elif model_type == 'bilstm':
            self.rnn = nn.LSTM(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, bidirectional=True, dropout=dropout)
        else: # model_type == 'gru'
            self.rnn = nn.GRU(hidden_size=hidden_dim, batch_first=True, input_size=embd_dim, dropout=dropout)
        # map from rnn output to logits
        if model_type == 'bilstm':
            self.fc = nn.Linear(2*hidden_dim, cls_num)
        else:
            self.fc = nn.Linear(hidden_dim, cls_num)
        # pooler type
        self.pooler_type = pooler_type
        # gpu or not
        self.gpu = gpu
        if gpu: self.to('cuda')
            
    def forward(self, input_matrix):
        token_num = input_matrix.shape[1]
        hidden_vecs = self.rnn(input_matrix)[0]
        if self.pooler_type == 'max':
            pooler = nn.MaxPool1d(token_num)
        else: 
            pooler = nn.AvgPool1d(token_num)
        if self.gpu: pooler.to('cuda')
        pooled_hidden = pooler(torch.transpose(hidden_vecs,1,2)).squeeze()
        return self.fc(pooled_hidden)

In [35]:
# define functions that build mini-batches
from nltk.tokenize import word_tokenize
import numpy as np

embd_dim = 300
hidden_dim = 300
rnn_type = 'bilstm'
pooler_type = 'avg'
dropout = 0.5
gpu = True

oov_vec = oov_vec = np.random.rand(embd_dim)

def get_sent_word_vecs(word_vectors, sent_words, largest_len):
    vecs = []
    for ww in sent_words:
        if ww in word_vectors:
            vecs.append(word_vectors[ww])
        else:
            vecs.append(oov_vec)
    return np.array(vecs)

def build_mini_batch(sent_list, word_vectors):
    tokenized_sents = [word_tokenize(ss.lower()) for ss in sent_list]
    largest_len = np.max([len(tokens) for tokens in tokenized_sents])
    text_vecs = []
    for ts in tokenized_sents:
        vv = get_sent_word_vecs(word_vectors, ts, largest_len)
        text_vecs.append(vv)
    # print('mini batch shape',np.array(text_vecs).shape)
    return np.array(text_vecs)

def make_batch_prediction(sent_list, word_vectors, model, use_gpu=False):
    batch = build_mini_batch(sent_list, word_vectors)
    batch_logits = torch.tensor([])
    if use_gpu: batch_logits = batch_logits.to('cuda')
    for i in range(batch.shape[0]):
        input_sents = torch.from_numpy(batch[i]).float()
        if use_gpu: input_sents = input_sents.to('cuda')
        logits = model(input_sents.unsqueeze(0))
        batch_logits = torch.cat( (batch_logits, logits) )
    return batch_logits.view(batch.shape[0],-1)
  
# sanity check 
model = RNN_Classifier(embd_dim, hidden_dim, rnn_type, len(labels_list), pooler_type, dropout, gpu)
batch_pred = make_batch_prediction(
    ['hello world!','hello','another test sentence this is'],
    word_vectors, model, gpu)
print(batch_pred)

tensor([[ 0.0008, -0.0544],
        [ 0.0256, -0.0171],
        [ 0.0205, -0.0428]], device='cuda:0', grad_fn=<ViewBackward>)


In [36]:
loss_fnc = torch.nn.CrossEntropyLoss() # cross entropy loss

# hyper parameters
n_epochs = 20 # number of epoch (i.e. number of iterations)
batch_size = 50
lr = 0.001 # initial learning rate

# init optimizer and scheduler (lr adjustor)
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.999) # after each epoch, the learning rate is discounted to its 95%

In [37]:
# training the CNN model

best_f1 = -1.
best_model = None
import copy
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from nltk.tokenize import word_tokenize
from tqdm import tqdm

for epoch_i in tqdm(range(n_epochs)):
    # the inner loop is over the batches in the dataset
    model.train() # let pytorch know that gradients should be computed, so as to update the model
    ep_loss = []
    for idx in range(0,len(train_docs),batch_size):
        # Step 0: Get the data
        sents = train_docs[idx:idx+batch_size]
        if len(sents) == 0: break
        y_target = torch.tensor([train_labels[idx:idx+batch_size]], dtype=torch.int64).squeeze()
        if gpu:
            y_target = y_target.to('cuda')
        
        # Step 1: Clear the gradients 
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        y_pred = make_batch_prediction(sents, word_vectors, model, gpu)
        pred_labels = [np.argmax(entry) for entry in y_pred.cpu().detach().numpy()]
        # print('pred labels', pred_labels)
        # print('true labels', y_target)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)
        # print(loss)
        ep_loss.append(loss.cpu().detach().numpy())

        # Step 4: Propagate the loss signal backward
        loss.backward()
        
        # Step 4+: clip the gradient, to avoid gradient explosion
        nn.utils.clip_grad_value_(model.parameters(), clip_value=3.)

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()
    
    print('\n======epoch {} loss======'.format(epoch_i),np.mean(ep_loss))
    
    # after each epoch, we can test the model's performance on the dev set
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
        predictions = []
        test_docs = dev_docs
        test_labels = dev_labels
        
        for idx in range(0,len(test_docs),batch_size):
            y_pred = make_batch_prediction(
                test_docs[idx:idx+batch_size], word_vectors, model, gpu)
            pred_labels = [np.argmax(entry) for entry in y_pred.cpu().detach().numpy()]
            predictions += pred_labels
        pre, rec, f1, _ = precision_recall_fscore_support(test_labels, predictions,average='macro')
        print('\n---> after epoch {} the macro-F1 on dev set is {}'.format(epoch_i, f1))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])
        
        # save the best model
        if f1 > best_f1:
            best_f1 = f1
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best macro-F1',f1)
    
    # (optional) adjust learning rate according to the scheduler
    scheduler.step()







  0%|          | 0/20 [00:00<?, ?it/s][A[A[A[A[A[A










  5%|▌         | 1/20 [13:57<4:25:21, 837.96s/it][A[A[A[A[A[A


---> after epoch 0 the macro-F1 on dev set is 0.8705735709787179
learning rate 0.001
best model updated; new best macro-F1 0.8705735709787179









 10%|█         | 2/20 [27:56<4:11:24, 838.01s/it][A[A[A[A[A[A


---> after epoch 1 the macro-F1 on dev set is 0.8987578670739615
learning rate 0.000999
best model updated; new best macro-F1 0.8987578670739615









 15%|█▌        | 3/20 [41:53<3:57:21, 837.74s/it][A[A[A[A[A[A


---> after epoch 2 the macro-F1 on dev set is 0.9089950165671072
learning rate 0.000998001
best model updated; new best macro-F1 0.9089950165671072









 20%|██        | 4/20 [55:52<3:43:29, 838.06s/it][A[A[A[A[A[A


---> after epoch 3 the macro-F1 on dev set is 0.9081835134316718
learning rate 0.000997002999









 25%|██▌       | 5/20 [1:09:48<3:29:26, 837.73s/it][A[A[A[A[A[A


---> after epoch 4 the macro-F1 on dev set is 0.9077976949423736
learning rate 0.000996005996001









 30%|███       | 6/20 [1:23:47<3:15:31, 837.98s/it][A[A[A[A[A[A


---> after epoch 5 the macro-F1 on dev set is 0.9091886086190651
learning rate 0.000995009990004999
best model updated; new best macro-F1 0.9091886086190651









 35%|███▌      | 7/20 [1:37:45<3:01:32, 837.85s/it][A[A[A[A[A[A


---> after epoch 6 the macro-F1 on dev set is 0.8900502542567806
learning rate 0.000994014980014994









 40%|████      | 8/20 [1:51:43<2:47:36, 838.07s/it][A[A[A[A[A[A


---> after epoch 7 the macro-F1 on dev set is 0.9023969391680123
learning rate 0.000993020965034979









 45%|████▌     | 9/20 [2:05:40<2:33:35, 837.81s/it][A[A[A[A[A[A


---> after epoch 8 the macro-F1 on dev set is 0.8982878888425979
learning rate 0.000992027944069944









 50%|█████     | 10/20 [2:19:39<2:19:39, 837.98s/it][A[A[A[A[A[A


---> after epoch 9 the macro-F1 on dev set is 0.8984325803281097
learning rate 0.000991035916125874









 55%|█████▌    | 11/20 [2:33:36<2:05:39, 837.70s/it][A[A[A[A[A[A


---> after epoch 10 the macro-F1 on dev set is 0.8900746914388564
learning rate 0.0009900448802097482









 60%|██████    | 12/20 [2:47:35<1:51:45, 838.13s/it][A[A[A[A[A[A


---> after epoch 11 the macro-F1 on dev set is 0.8995189560091588
learning rate 0.0009890548353295385









 65%|██████▌   | 13/20 [3:01:31<1:37:43, 837.64s/it][A[A[A[A[A[A


---> after epoch 12 the macro-F1 on dev set is 0.9053983312265628
learning rate 0.000988065780494209









 70%|███████   | 14/20 [3:15:29<1:23:45, 837.52s/it][A[A[A[A[A[A


---> after epoch 13 the macro-F1 on dev set is 0.9063898751288939
learning rate 0.0009870777147137147









 75%|███████▌  | 15/20 [3:29:27<1:09:48, 837.73s/it][A[A[A[A[A[A


---> after epoch 14 the macro-F1 on dev set is 0.9029313180834231
learning rate 0.000986090636999001









 80%|████████  | 16/20 [3:43:25<55:50, 837.75s/it]  [A[A[A[A[A[A


---> after epoch 15 the macro-F1 on dev set is 0.9028727205278886
learning rate 0.000985104546362002









 85%|████████▌ | 17/20 [3:57:23<41:53, 837.87s/it][A[A[A[A[A[A


---> after epoch 16 the macro-F1 on dev set is 0.8999000441881245
learning rate 0.00098411944181564









 90%|█████████ | 18/20 [4:11:20<27:55, 837.62s/it][A[A[A[A[A[A


---> after epoch 17 the macro-F1 on dev set is 0.9002589901499143
learning rate 0.0009831353223738245









 95%|█████████▌| 19/20 [4:25:17<13:57, 837.52s/it][A[A[A[A[A[A


---> after epoch 18 the macro-F1 on dev set is 0.9095779949008391
learning rate 0.0009821521870514505
best model updated; new best macro-F1 0.9095779949008391









100%|██████████| 20/20 [4:39:14<00:00, 837.71s/it][A[A[A[A[A[A


---> after epoch 19 the macro-F1 on dev set is 0.8949937476105054
learning rate 0.000981170034864399





## Exercises
* Optimize the hyper parameters, e.g. using different RNN architectures, dropout rates and hidden state dimensions.
* (Optional) You may have noticed that the current implementation sends one sentence to the RNN model at a time (see function *make_batch_prediction*). To speed up the training we may want to let the model make predictions for multiple sentences at once. Consider to how to implement the batch prediction. *Hint*: you may need to zero-pad the shorter sentences.