In [1]:
%%html
<style> table {float:left} </style>

In [2]:
#!pip install torch tqdm lazyme nltk gensim
#!python -m nltk.downloader punkt



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\monis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import numpy as np
from tqdm import tqdm #tqdm for progress bar

from gensim.corpora import Dictionary

import torch
from torch import nn, optim, tensor, autograd
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

In [4]:
try: # Use the default NLTK tokenizer.
    from nltk import word_tokenize, sent_tokenize 
    # Testing whether it works. 
    # Sometimes it doesn't work on some machines because of setup issues.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except: # Use a naive sentence tokenizer and toktok.
    import re
    from nltk.tokenize import ToktokTokenizer
    # See https://stackoverflow.com/a/25736515/610569
    sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = word_tokenize = toktok.tokenize

# Classification 

Text Categorization (textcat) is a common task in NLP. As long as, we have labelled data and we want to assign a discrete label to every input data point, it's a classification problem. E.g. 

| Tasks | Possible Labels | 
|:-|:-|
| Sentiment analysis | Positive, Negative, Neutral | 
| Tweetstorm detection | True, False |
| Author profiling | Author1, Author2, ... | 
| Language Identification | EN, ZH, DE, JA, FR, ...|

There are various datasets for sentiment classification, previously we looked at the movie reviews dataset in `nltk`. There's also this other popular IMDB movie reviews dataset from Stanford. Lets use that.

Download the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and put it in the same directory as where you're running this jupyter notebook.

# Munge the data!

As always we have to preprocess the data.

In [9]:
from lazyme import find_files
import io

def tokenize_data(path_to_dir, file_ext):
    for filename in tqdm(find_files(path_to_dir, file_ext)):
        with io.open(filename,encoding='utf8') as fin:
            yield word_tokenize(fin.read())
        
X_train_pos = list(tokenize_data('aclImdb/train/pos/', '*.txt'))
X_train_neg = list(tokenize_data('aclImdb/train/neg/', '*.txt'))
X_test_pos = list(tokenize_data('aclImdb/test/pos/', '*.txt'))
X_test_neg = list(tokenize_data('aclImdb/test/neg/', '*.txt'))

12500it [00:40, 305.35it/s]
12500it [00:55, 225.68it/s]
12500it [01:02, 200.55it/s]
12500it [00:47, 262.59it/s]


In [10]:
X_train = X_train_pos + X_train_neg
X_test = X_test_pos + X_test_neg

y_train = ['pos'] * len(X_train_pos) + ['neg'] * len(X_train_neg)
y_test = ['pos'] * len(X_test_pos) + ['neg'] * len(X_test_neg)

# Create our IMDB PyTorch Dataset 

Although we have a binary class problem, we will demonstrate a multi-class solution issue that can be also used on binary classification. 


First trick is to convert the "human" labels to a one-hot encoding.

For example, if we have 

| Text Index | Label |
|:-|:-|
|0 | pos|
|1 |neg|
|2 |pos|
|3 |neu|



If we use the 

 - first position of the label vector to represent negative  
 - second to represent positive
 - third to represent neutral


we should represent the labels as such:

| Text Index | Label | One-hot |
|:-|:-|:-|
|0 | 1|[0, 1, 0]|
|1 | 0|[1, 0, 0]|
|2 | 1|[0, 1, 0]|
|3 | 2|[0, 0, 1]|



In [11]:
# To get the one-hot encoding:
labels = [1, 0, 1, 2]
torch.eye(max(labels)+1)[labels] #torch.eye gives identity matrix

tensor([[0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [12]:
# In PyTorch version 1.0.1, simply use this:
labels = [1, 0, 1, 2]
torch.one_hot([1, 0, 1, 2])

#no problem if there is attribute error in this step

AttributeError: module 'torch' has no attribute 'one_hot'

In [2]:
import gensim
gensim.__version__

ImportError: cannot import name 'smart_open'

In [18]:
!pip install -U pip
!pip install --upgrade gensim>=3.7.0

Requirement already up-to-date: pip in c:\users\monis\anaconda3\lib\site-packages (19.0.2)


In [396]:
class IMDBDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.vocab = Dictionary(texts)
        # Vectorize labels
        label_set = {'neg':0, 'pos':1}
        labels = [label_set[l] for l in labels]
        self.labels = torch.tensor(labels).long()
        # Keep track of how many data points.
        self._len = len(texts)
        
    def __getitem__(self, index):
        vectorized_sent = self.vectorize(self.texts[index])
        return {'x':vectorized_sent, 
                'y':self.labels[index], 
                'x_len':len(vectorized_sent)}
    
    def __len__(self):
        return self._len
    
    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized. 
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx 
        # Lets just cast list of indices into torch tensors directly =)
        return torch.tensor(self.vocab.doc2idx(tokens))
    
    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.vocab[i] for i in indices]

In [397]:
imdb_data = IMDBDataset(X_train, y_train)

In [398]:
print(imdb_data[0]) # First data point.

{'x': tensor([ 4, 17, 33, 43, 27, 34, 38, 44, 42, 21, 17, 31, 35, 32, 37, 30, 24, 45,
        26,  2,  6, 17, 33, 46,  7, 10, 28, 19, 25,  0,  8, 13, 28, 17, 39, 41,
         2, 14,  9, 23, 28, 20, 18, 40,  2, 15, 24,  3, 16, 14, 12,  1,  5, 29,
        22, 17, 36, 11,  2]), 'y': tensor(1), 'x_len': 59}


# PyTorch DataLoader

The [`torch.utils.data.DataLoader` object](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader)  will help us easily create batches from the `torch.utils.data.Dataset` so that we can do mini-batch SGD and fully utilize GPU/CPU computation during gradient optimization.

The `DataLoader` requires the following function to be implemented in the `Dataset`:

 - `__getitem__`: Return the dictionary of inputs 
 - `__len__`: Return the no. of indices that `__getitem__` can fetch
 

In [74]:
batch_size = 1
dataloader = DataLoader(dataset=imdb_data, batch_size=batch_size, shuffle=True)

In [76]:
for data_dict in dataloader:
    # Sort indices of data in batch by lengths.
    sorted_indices = np.array(data_dict['x_len']).argsort()[::-1].tolist() #sorts in reverse
    data_batch = {name:_tensor[sorted_indices]
                  for name, _tensor in data_dict.items()}
    print(data_batch)
    break

{'x': tensor([[ 1072,    53, 20332,    53,  2573,     2,  5698, 12440,  3131,    43,
           722,   149,    33,    35,   149,  2299,     2,   608,    28,  7202,
            17,  2778,    46,   149,    33,   409,   372,   332,   270,   535,
           267,   149, 25242,    35,   149,  2299,     2,    14,   949,   183,
            35,   149,    33,   194,   149,  2689,    53,   990,     2,    14,
          4449,   183,   194,   149,  1072,   366, 13299,    35,   149,   477,
           447,  5192,   162,  4215,  3505, 32134,     2,    14,  4352,    28,
          2867,    84, 83691,     2,  1815,    53,  6976,    45,    53,   149,
           477,  1115,   155,  1236,   267,  2223,  2367,   152,   269,    17,
          1245,  1560,   162,    17,  2906,  1284,     2,   237,   194,  3213,
            43,   523,  2136,   341, 55701,    44,   194,  3098,    17,  1268,
            35, 80433, 22509,  6791,   257,   149,  2136,   455,  1216,   341,
           149,  5128,   139, 14451, 55701,   

# Lets try batch of size > 1

In [None]:
batch_size = 5
dataloader = DataLoader(dataset=imdb_data, batch_size=batch_size, shuffle=True)

for data_dict in dataloader:
    # Sort indices of data in batch by lengths.
    sorted_indices = np.array(data_dict['x_len']).argsort()[::-1].tolist()
    data_batch = {name:_tensor[sorted_indices]
                  for name, _tensor in data_dict.items()}
    print(data_batch)
    break
    
#You will get an error here because they are not fixed size tensors

# Gotcha! Everything should be a fixed-size tensor

To use the `DataLoader` to generate batches, one thing that we need to keep consistent is the size of the tensors for our inputs and outputs. 

For the outputs (`y`), it shouldn't be much of a problem since they are already in fixed size one-hot encoding.

It's the inputs (`x`), that has variable length and we need to somehow fix it. 

There are a couple of ways to accomplish the fixed-size inputs:

 - Set the size of `x` tensors to a certain size and cut-off extra words after that
 - Set the size of `x` tensors to the max length seen in the train data and pad the other data points with lower length with a special `<pad>` symbol. 
 
 
Lets do both:

 - Set a max size limit
 - For sentences that has length > max, we cut the rest of the sentence off
 - For sentences that has length < max, we pad till we reach the max

In [123]:
# Here's a clean way to pad 1-Dimensional tensors in PyTorch
a = torch.randn(10)
print(a.shape)
print(a)

torch.Size([10])
tensor([-1.2528,  0.1755, -0.4251,  0.9689,  1.3231, -0.1331,  0.5136,  2.1149,
         1.2246, -0.7345])


In [124]:
max_len = 15
pad_left = 0
pad_right = max_len - len(a)
b = F.pad(a, (pad_left, pad_right), 'constant') #for text we only pad left and right
print(b.shape)
print(b)

torch.Size([15])
tensor([-1.2528,  0.1755, -0.4251,  0.9689,  1.3231, -0.1331,  0.5136,  2.1149,
         1.2246, -0.7345,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000])


Now we have to rewrite the `IMDBDataset` to account for fixed-length `x` tensors. 

In [400]:
class IMDBDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        
        # Remember the `patch_with_special_tokens` from gensim?
        # Now we can put it into good use.
        special_tokens = {'<pad>': 0, '<unk>':1}
        self.vocab = Dictionary(texts)
        self.vocab.patch_with_special_tokens(special_tokens)
        # Keep track of vocab size.
        self.vocab_size = len(self.vocab)
        
        # Vectorize labels
        label_set = {'neg':0, 'pos':1}
        labels = [label_set[l] for l in labels]
        # Keep track of num of labels.
        self.num_labels = max(labels)+1
        self.labels = torch.tensor(labels).long()
        self.labels_onehot = torch.eye(self.num_labels)[labels].long()
        
        # Keep track of how many data points.
        self._len = len(texts)
        
        # Find the longest text in the data.
        self.max_len = max(len(txt) for txt in texts)
        
    def __getitem__(self, index):
        vectorized_sent = self.vectorize(self.texts(index))
        # To pad the sentence:
        # Pad left = 0; Pad right = max_len - len of sent.
        pad_dim = (0, self.max_len - len(vectorized_sent))
        vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant')
        return {'x':vectorized_sent, 
                'y':self.labels[index], 
                'x_len':len(vectorized_sent)}
    
    
    def __len__(self):
        return self._len
    
    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized. 
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx 
        # Lets just cast list of indices into torch tensors directly =)
        return torch.tensor(self.vocab.doc2idx(tokens, unknown_word_index=1))
    
    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.vocab[i] for i in indices]

In [401]:
imdb_data = IMDBDataset(X_train, y_train)

In [402]:
imdb_data.vocab.token2id['the']

149

In [403]:
imdb_data[0]

{'x': tensor([ 4, 17, 33,  ...,  0,  0,  0]), 'y': tensor(1), 'x_len': 2818}

In [404]:
batch_size = 5
dataloader = DataLoader(dataset=imdb_data, batch_size=batch_size, shuffle=True)

for data_dict in dataloader:
    # Sort indices of data in batch by lengths.
    sorted_indices = np.array(data_dict['x_len']).argsort()[::-1].tolist()
    data_batch = {name:_tensor[sorted_indices]
                  for name, _tensor in data_dict.items()}
    print(data_batch)
    break

{'x': tensor([[  165,  1771,   625,  ...,     0,     0,     0],
        [  163,   156,  1202,  ...,     0,     0,     0],
        [   14,   174,    28,  ...,     0,     0,     0],
        [  667, 30633,    84,  ...,     0,     0,     0],
        [  608,   777,  1197,  ...,     0,     0,     0]]), 'y': tensor([1, 1, 0, 1, 0]), 'x_len': tensor([2818, 2818, 2818, 2818, 2818])}


# Training a model with Feed-Forward Net

Now that we have everything about the data in place, we can make use of all the knowledge we've gained thus far:

 - **Multi-Layered Perceptron**, aka. **Feed-Forward Network** that we've learnt from the previous XOR examples
   - *Linear* layers
   - *Activation function*, which?
   - *Criterion* which?
   - *Optimizer*, Adam vs SGD
 

In [405]:
output_dim = 2
max_len = 2818
class FFNet(nn.Module):
    def __init__(self, max_len, num_labels, vocab_size, embedding_size, hidden_dim):
        super(FFNet, self).__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                       embedding_dim=embedding_size, 
                                       padding_idx=0)
        # The no. of inputs to the linear layer is the 
        # no. of tokens in each input * embedding_size
        self.linear1 = nn.Linear(2818 * embedding_size, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, inputs):
        # We want to flatten the inputs so that we get the matrix of shape.
        # batch_size x no. of tokens in each input * embedding_size
        batch_size, max_len = inputs.shape
        embedded = self.embeddings(inputs).view(batch_size, -1) # Change the size of the embedded matrix.
        hid = F.relu(self.linear1(embedded))
        out = self.linear2(hid)
        return F.sigmoid(out) #sigmoid is used for binary classification
        

# The Training Routine

In [406]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embedding_size = 100
learning_rate = 0.003
hidden_size = 100

# Initialize the dataset.
batch_size = 5
imdb_data = IMDBDataset(X_train,y_train)
dataloader = DataLoader(dataset=imdb_data, batch_size=batch_size, shuffle=True)

criterion = nn.CrossEntropyLoss() # because we use sigmoid in the above block of code, we should use use binary cross entropy loss
# Hint: the CBOW model object you've created.
model = FFNet(imdb_data.max_len, 
              imdb_data.num_labels, 
              imdb_data.vocab_size, 
              embedding_size=embedding_size, 
              hidden_dim=hidden_size).to(device)


optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#model = nn.DataParallel(model)

losses = []
num_epochs = 5
for _e in range(num_epochs):
    epoch_loss = []
    for batch in tqdm(dataloader):
        x = batch['x'].to(device)
        y = batch['y'].to(device)
        # Zero gradient.
        optimizer.zero_grad()
        # Feed forward.
        predictions = model(x)
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        epoch_loss.append(float(loss))
        #break
    print(sum(epoch_loss)/len(epoch_loss))
    #break
    losses.append(sum(epoch_loss)/len(epoch_loss))
     

  0%|          | 0/5000 [00:00<?, ?it/s]

0.6863493323326111





In [407]:
print(predictions.shape)

torch.Size([5, 2])


In [408]:
torch.max(predictions, 1)  # Predictions of the last batch.

(tensor([0.5063, 0.5023, 0.5161, 0.5047, 0.5059], grad_fn=<MaxBackward0>),
 tensor([1, 0, 0, 1, 1]))

# Prediction with the model

In [409]:
print(X_test[0]) # First test review.

['Based', 'on', 'an', 'actual', 'story', ',', 'John', 'Boorman', 'shows', 'the', 'struggle', 'of', 'an', 'American', 'doctor', ',', 'whose', 'husband', 'and', 'son', 'were', 'murdered', 'and', 'she', 'was', 'continually', 'plagued', 'with', 'her', 'loss', '.', 'A', 'holiday', 'to', 'Burma', 'with', 'her', 'sister', 'seemed', 'like', 'a', 'good', 'idea', 'to', 'get', 'away', 'from', 'it', 'all', ',', 'but', 'when', 'her', 'passport', 'was', 'stolen', 'in', 'Rangoon', ',', 'she', 'could', 'not', 'leave', 'the', 'country', 'with', 'her', 'sister', ',', 'and', 'was', 'forced', 'to', 'stay', 'back', 'until', 'she', 'could', 'get', 'I.D', '.', 'papers', 'from', 'the', 'American', 'embassy', '.', 'To', 'fill', 'in', 'a', 'day', 'before', 'she', 'could', 'fly', 'out', ',', 'she', 'took', 'a', 'trip', 'into', 'the', 'countryside', 'with', 'a', 'tour', 'guide', '.', '``', 'I', 'tried', 'finding', 'something', 'in', 'those', 'stone', 'statues', ',', 'but', 'nothing', 'stirred', 'in', 'me', '.', '

In [1]:
max_len = 2818
def vectorize_test_inputs(inputs):
    # Process the input text in the same way as you did with the training data.
    vectorized_sent = imdb_data.vectorize(inputs)
    pad_dim = (0, max_len - len(vectorized_sent))
    vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant')
    return vectorized_sent.unsqueeze(0)

print('Input tensor:', vectorize_test_inputs(X_test[0]))
label_set = {'neg':0, 'pos':1}
print('Label:', label_set[y_test[0]])

SyntaxError: invalid syntax (<ipython-input-1-ab8beda5162c>, line 4)

In [411]:
# Apply the model to the inputs.
with torch.no_grad():
    predictions = model(vectorize_test_inputs(X_test[0])).unsqueeze(0)
    print(predictions)
    print(F.softmax(predictions))

tensor([0.5093, 0.5479])
tensor([0.4903, 0.5097])


  """
