# Using Sentencepiece BPE for Document Classification

### Base code is the original code from the book

## Imports

In [66]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string
import tqdm
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

## Data Vectorization classes

### The Vocabulary

In [67]:
####################################################################### 
# No changes made to this class
####################################################################### 

class Vocabulary(object):
    """Class to process text and extract vocabulary for mapping"""

    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
        """

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
      
    
    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx}

    
    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    
    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
         
        
    def add_many(self, tokens):
        """Add a list of tokens into the Vocabulary
        
        Args:
            tokens (list): a list of string tokens
        Returns:
            indices (list): a list of indices corresponding to the tokens
        """
        return [self.add_token(token) for token in tokens]

    
    def lookup_token(self, token):
        """Retrieve the index associated with the token 
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        """
        return self._token_to_idx[token]

    
    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [68]:
####################################################################### 
# Changed class name to bpeSequenceVocabulary
####################################################################### 

class bpeSequenceVocabulary(Vocabulary):
    def __init__(self, 
                 token_to_idx=None, 
                 unk_token="<UNK>",
                 mask_token="<MASK>", 
                 begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(bpeSequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

        
    def to_serializable(self):
        contents = super(bpeSequenceVocabulary, self).to_serializable()
        
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    
    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

### The Vectorizer

In [69]:
####################################################################### 
# Changed names: title_bpe_vocab
#
# Added functions:
#
# To train BPE vocabulary:
# - concat_titles_sentencepiece
# - create_tdict
# - get_pairs_for_vocab
# - merge_pieces_for_vocab
# - merge
# Code adapted from:
# https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/bpe_toy.py
#
# To encode each title input.
# - create tokens
# - get_pairs_for_data
# - find_best_pair
# - merge_pieces_for_data
####################################################################### 

class NewsVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""    
    
    def __init__(self, title_bpe_vocab=None, category_vocab=None):
        self.title_bpe_vocab = title_bpe_vocab
        self.category_vocab = category_vocab

        
    ####################################################################### 
    # Functions for BPE 
    # Called from the class method from_dataframe()
    ####################################################################### 

    @staticmethod
    def concat_titles_sentencepiece(df_titles):
        """
        Concatenate all the titles in the dataset to 
        be parsed for creation of sentencepiece BPE vocabulary
        Args:
            df_titles (str): the titles in the dataframe
        Returns:
            index (str): all titles concatenated
        """
        all_titles = ''
        for i, title in enumerate(df_titles):
            title = title.translate(str.maketrans('', '', string.punctuation))         # remove punctuation
            title = title.replace(' ', '_')
            if i == 0:
                all_titles = title
            else:
                all_titles = all_titles + '_' + title           
        return all_titles
    
    
    # Create tokens dictionary for BPE algorithm
    @staticmethod
    def create_tdict(text):
        """
        Create a dictionary mapping token to frequency
        Args:
            text (str): title text
        Returns:
            tdict (collections.Counter): token dictionary
        """
        tdict = Counter()
        tokens = text.split(" ")
        for t in tokens:
            tdict[' '.join(list(t))] += 1    # dict of tokens (pieces of characters) with spaces between characters 
        return tdict    
        
        
    @staticmethod
    def get_pairs_for_vocab(tdict):
        """
        Get pairs of pieces.
        Args:
            tdict (collections.Counter): tokens dictionary
        Returns:
            pairs (collections.Counter): pairs of pieces 
        """
        pairs = Counter()
        for w, freq in tdict.items():
            t = w.split()
  
            for i in range(len(t)-1):
                pairs[t[i], t[i+1]] += freq
          
        return pairs

        
    @staticmethod
    def merge_pieces_for_vocab(pair, v_in):
        """
        Merge wordpieces.
        Args:
            pair (tuple): the pair to be merged
            v_in (collections.Counter): tokens dictionary
        Returns:
            v_out (collections.Counter): tokens dictionary
        """
        v_out = Counter()
        bigram = re.escape(' '.join(pair))
        p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        for w in v_in:
            w_out = p.sub(''.join(pair), w)
            v_out[w_out] = v_in[w]   
        return v_out
    
            
    @staticmethod
    def merge(tdict, num_merges, vocab_size):
        """
        Function to process merging.
        Args:
            num_merges (int): the maximum number of merges
            tdict (collections.Counter): tokens dictionary
            vocab_size (int): target vocab size
        Returns:
            bpe_vocab (list): list of BPE vocab
            bpe_dict (collections.Counter): BPE dictionary
        """
        # Assume the titles data contains all the letters in the alphabet
        bpe_vocab = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
        bpe_dict = Counter()
        
        count = 0
        for i in range(num_merges):
            pairs = NewsVectorizer.get_pairs_for_vocab(tdict)
            try:
                best = max(pairs, key=pairs.get)
                
                if pairs[best] > 1:
                    vocab_token = best[0] + best[1] 
                    bpe_vocab.append(vocab_token)
                    bpe_dict[vocab_token] = pairs[best]
                
                    count += 1
                    
                if count + 26 >= vocab_size:
                    break
                        
            except ValueError:
                break
                    
            if pairs[best] < 2:
                print('Number of merges: ', i)
                sys.stderr.write('No pair has freq > 1. Stopping.')
                break
                    
            tdict = NewsVectorizer.merge_pieces_for_vocab(best, tdict)        
      
        return bpe_vocab, bpe_dict
    
    ###############################################################################
    # Functions to split a word into pieces which are found in BPE vocabulary
    # Called in vectorize()
    ###############################################################################   

    @staticmethod
    def create_tokens(text):
        """
        Create a string of character tokens.
        Args:
            text (str): title text
        Returns:
            tokens (str): title text with spaces between characters        
        """
        tokens = ' '.join(list(text))
        return tokens

    @staticmethod
    def get_pairs_for_data(text):
        """
        Get pairs of pieces.
        Args:
            text (str): title text
        Returns:
            pairs (list of tuples): pairs of pieces 
        """
        pairs = []
        t = text.split(" ")
        for i in range(len(t)-1):
            pairs.append((t[i], t[i+1]))       
        return pairs

    @staticmethod
    def find_best_pair(pairs, bpe_dict):
        """
        Find the pair with the highest piece frequency in the vocabulary
        Args:
            pairs (list of tuples): piece pairs
            bpe_dict(collections.Counter): piece dictionary
        Returns:
            best (tuples): highest freq pair 
        """
        pdict = Counter()
        best = None
        best_freq = 0
        
        for p in pairs:
            piece = p[0] + p[1]
            
            if bpe_dict[piece] != 0:
                pdict[piece] = bpe_dict[piece]
                
                if best_freq < bpe_dict[piece]: 
                    best = p
                    best_freq = bpe_dict[piece]
        return best
    
    @staticmethod
    def merge_pieces_for_data(best, text):
        """
        Merge wordpieces.
        Args:
            best (tuple): highest freq pair
            text (str): title text
        Returns:
            v_out (str): title text in pieces
        """
        bigram = re.escape(' '.join(best))
        p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        v_out = p.sub(''.join(best), text)
        return v_out

    ###############################################################################
    # Newly defined vectorize()
    ###############################################################################
        
    def vectorize(self, title, vector_length=-1):
        """
        Args:
            title (str): the string of words separated by a space
            vector_length (int): an argument for forcing the length of index vector
        Returns:
            the vetorized title (numpy.array)
        """
        
        bpe_dict = NewsVectorizer.bpe_dict
            
        ###############################################################################
        # Convert title text to pieces for encoding
        ###############################################################################    

        title2 = title.replace(' ', '_')    # title2 = title with underscores
        
        w1 = NewsVectorizer.create_tokens(title2)
        pairs = NewsVectorizer.get_pairs_for_data(w1)
        best = NewsVectorizer.find_best_pair(pairs, bpe_dict)
    
        while best is not None:
            w2 = NewsVectorizer.merge_pieces_for_data(best, w1)
            pairs = NewsVectorizer.get_pairs_for_data(w2)
            best = NewsVectorizer.find_best_pair(pairs, bpe_dict)
            w1 = w2
            
        title3 = w1   # title3 = title tokenized into BPE pieces
        #print(title3)
        
        indices = [self.title_bpe_vocab.begin_seq_index]      
        indices.extend(self.title_bpe_vocab.lookup_token(token) for token in title3.split(" "))
        indices.append(self.title_bpe_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.title_bpe_vocab.mask_index
            
        #print(out_vector)    
        return out_vector

    
    @classmethod
    def from_dataframe(cls, news_df, cutoff=1):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            news_df (pandas.DataFrame): the target dataset
            cutoff (int): frequency threshold for including in Vocabulary 
        Returns:
            an instance of the NewsVectorizer
        """
        category_vocab = Vocabulary()   
        
        for category in sorted(set(news_df.category)):
            category_vocab.add_token(category)

        #######################################################################    
        # Train vocabulary. Vocab_size to be set at args.vocab_size
        #######################################################################    
 
        all_titles = NewsVectorizer.concat_titles_sentencepiece(news_df.title)
        print('Using sentencepiece BPE.')
                   
        tdict = NewsVectorizer.create_tdict(all_titles)

        bpe_vocab, bpe_dict = NewsVectorizer.merge(tdict, args.num_merges, args.vocab_size)
        print('BPE vocabulary created. Pre-set maximum vocabulary size = ', args.vocab_size)
        
        #print('\nSentencepiece BPE vocabulary:\n',bpe_vocab)
        print('\nSentencepiece BPE vocabulary (>= 2 characters, frequency >= 2):\n',bpe_dict)

        title_bpe_vocab = bpeSequenceVocabulary()
        title_bpe_vocab.add_many(bpe_vocab)
     
        self = cls()
        cls.bpe_dict = bpe_dict            # to access bpe_dict outside of this class method
        
        return cls(title_bpe_vocab, category_vocab)        
    
 
    #######################################################################    
    # Name changed to title_bpe_vocab
    #######################################################################
    
    @classmethod
    def from_serializable(cls, contents):
        title_bpe_vocab = bpeSequenceVocabulary.from_serializable(contents['title_bpe_vocab'])
        category_vocab = Vocabulary.from_serializable(contents['category_vocab'])

        return cls(title_bpe_vocab=title_bpe_vocab, 
                   category_vocab=category_vocab)  

      
    def to_serializable(self):
        return {'title_bpe_vocab': self.title_bpe_vocab.to_serializable(),
                'category_vocab': self.category_vocab.to_serializable()} 

### The Dataset

In [70]:
####################################################################### 
# No changes made to this class (original code from the book)
####################################################################### 

class NewsDataset(Dataset):
    def __init__(self, news_df, vectorizer):
        """
        Args:
            news_df (pandas.DataFrame): the dataset
            vectorizer (NewsVectorizer): vectorizer instatiated from dataset
        """
        self.news_df = news_df
        self._vectorizer = vectorizer

        # +1 if only using begin_seq, +2 if using both begin and end seq tokens
        measure_len = lambda context: len(list(context))
        self._max_seq_length = max(map(measure_len, news_df.title)) + 2
        
        self.train_df = self.news_df[self.news_df.split=='train']
        self.train_size = len(self.train_df)
        
        self.val_df = self.news_df[self.news_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.news_df[self.news_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

        
        # Class weights
        class_counts = news_df.category.value_counts().to_dict()
        
        def sort_key(item):
            return self._vectorizer.category_vocab.lookup_token(item[0])
        
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv):
        """Load dataset and make a new vectorizer from scratch
        
        Args:
            surname_csv (str): location of the dataset
        Returns:
            an instance of SurnameDataset
        """
        news_df = pd.read_csv(news_csv)                                     # READ FROM CSV
        
        train_news_df = news_df[news_df.split=='train']
        
        return cls(news_df, NewsVectorizer.from_dataframe(train_news_df))

    
    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        """Load dataset and the corresponding vectorizer. 
        Used in the case in the vectorizer has been cached for re-use
        
        Args:
            surname_csv (str): location of the dataset
            vectorizer_filepath (str): location of the saved vectorizer
        Returns:
            an instance of SurnameDataset
        """
        news_df = pd.read_csv(news_csv)                                     # READ FROM CSV
        
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        
        return cls(news_csv, vectorizer)

    
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """a static method for loading the vectorizer from file
        
        Args:
            vectorizer_filepath (str): the location of the serialized vectorizer
        Returns:
            an instance of SurnameVectorizer
        """
        with open(vectorizer_filepath) as fp:
            return NameVectorizer.from_serializable(json.load(fp))

        
    def save_vectorizer(self, vectorizer_filepath):
        """saves the vectorizer to disk using json
        
        Args:
            vectorizer_filepath (str): the location to save the vectorizer
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

            
    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    
    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

        
    def __len__(self):
        return self._target_size

    
    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        title_vector = self._vectorizer.vectorize(row.title, self._max_seq_length)
        
        category_index = self._vectorizer.category_vocab.lookup_token(row.category)

        return {'x_data': title_vector,
                'y_target': category_index}

    
    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

In [71]:
####################################################################### 
# No changes made to this function
####################################################################### 

def generate_batches(dataset, 
                     batch_size, 
                     shuffle=True,
                     drop_last=True, 
                     device="cpu"): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, 
                            batch_size=batch_size,
                            shuffle=shuffle, 
                            drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
            
        yield out_data_dict

## The Model: NewsClassifier

In [72]:
###################################################################################
# No changes (from original book code)
###################################################################################

class NewsClassifier(nn.Module):
    def __init__(self, 
                 embedding_size, 
                 num_embeddings, 
                 num_channels, 
                 hidden_dim, 
                 num_classes, 
                 dropout_p, 
                 pretrained_embeddings=None, 
                 padding_idx=0):
        """
        Args:
            embedding_size (int): size of the embedding vectors
            num_embeddings (int): number of embedding vectors
            filter_width (int): width of the convolutional kernels
            num_channels (int): number of convolutional kernels per layer
            hidden_dim (int): the size of the hidden dimension
            num_classes (int): the number of classes in classification
            dropout_p (float): a dropout parameter 
            pretrained_embeddings (numpy.array): previously trained word embeddings
                default is None. If provided, 
            padding_idx (int): an index representing a null position
        """
        super(NewsClassifier, self).__init__()

        if pretrained_embeddings is None:

            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx)        
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=embedding_size,
                                    num_embeddings=num_embeddings,
                                    padding_idx=padding_idx,
                                    _weight=pretrained_embeddings)
        
            
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=embedding_size, 
                   out_channels=num_channels, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(in_channels=num_channels, out_channels=num_channels, 
                   kernel_size=3),
            nn.ELU()
        )

        self._dropout_p = dropout_p
        self.fc1 = nn.Linear(num_channels, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

        
    def forward(self, x_in, apply_softmax=False):
        """The forward pass of the classifier
        
        Args:
            x_in (torch.Tensor): an input data tensor. 
                x_in.shape should be (batch, dataset._max_seq_length)
            apply_softmax (bool): a flag for the softmax activation
                should be false if used with the Cross Entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch, num_classes)
        """
        
        # embed and permute so features are channels
        x_embedded = self.emb(x_in).permute(0, 2, 1)

        features = self.convnet(x_embedded)

        # average and remove the extra dimension
        remaining_size = features.size(dim=2)
        features = F.avg_pool1d(features, remaining_size).squeeze(dim=2)
        features = F.dropout(features, p=self._dropout_p)
        
        # mlp classifier
        intermediate_vector = F.relu(F.dropout(self.fc1(features), p=self._dropout_p))
        prediction_vector = self.fc2(intermediate_vector)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

## Training Routine

### Helper functions

In [73]:
###################################################################################
# No changes (from original book code)
###################################################################################

def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}


def update_train_state(args, model, train_state):
    """Handle the training state updates.

    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= train_state['early_stopping_best_val']:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state


def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

#### general utilities

In [74]:
###################################################################################
# No changes (from original book code)
###################################################################################

def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

        
def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
        
        
def load_glove_from_file(glove_filepath):
    """
    Load the GloVe embeddings 
    
    Args:
        glove_filepath (str): path to the glove embeddings file 
    Returns:
        word_to_index (dict), embeddings (numpy.ndarary)
    """

    word_to_index = {}
    embeddings = []
    with open(glove_filepath, "r") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0] 
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)


def make_embedding_matrix(glove_filepath, words):
    """
    Create embedding matrix for a specific set of words.
    
    Args:
        glove_filepath (str): file path to the glove embeddigns
        words (list): list of words in the dataset
    """
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]
    
    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i

    return final_embeddings

### Settings and some prep work

In [75]:
from argparse import Namespace

In [76]:
####################################################################### 
# Changes made to the names of hyperparameters:
# bpe_embedding_size, bpe_kernel_size
#
# Added args for BPE:
#    num_merges = 11000
#    vocab_size = 1000, 3000, 10000
#######################################################################

args = Namespace(
    # Data and Path hyper parameters
    news_csv="data/ag_news/news_with_splits.csv",
    vectorizer_file="vectorizer_sp1k.json",
    model_state_file="model_sp1k.pth",
    save_dir="./model/",
    
    # Model hyper parameters
    glove_filepath='data/glove/glove.6B.100d.txt', 
    use_glove=False,
    embedding_size=100, 
    hidden_dim=100, 
    num_channels=100, 
    
    # Training hyper parameter
    seed=1337, 
    learning_rate=0.001, 
    dropout_p=0.1, 
    batch_size=256, 
    num_epochs=100, 
    early_stopping_criteria=5, 
    
    # Runtime option
    cuda=True, 
    catch_keyboard_interrupt=True, 
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
    
    # For BPE
    num_merges = 11000,        # max number of merge operations during creation of BPE vocabulary
    vocab_size = 1000         # max vocabulary size
) 

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
    
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	./model/vectorizer_sp1k.json
	./model/model_sp1k.pth
Using CUDA: True


### Initializations

In [77]:
args.use_glove = False

In [78]:
####################################################################### 
# Changes made only to the names:
# title_bpe_vocab
#######################################################################

if args.reload_from_files:
    # training from a checkpoint
    dataset = NewsDataset.load_dataset_and_load_vectorizer(args.news_csv,
                                                              args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = NewsDataset.load_dataset_and_make_vectorizer(args.news_csv)
    dataset.save_vectorizer(args.vectorizer_file)

vectorizer = dataset.get_vectorizer()



# Use GloVe or randomly initialized embeddings
if args.use_glove:
    words = vectorizer.title_bpe_vocab._token_to_idx.keys()
    embeddings = make_embedding_matrix(glove_filepath=args.glove_filepath, 
                                       words=words)
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

    
classifier = NewsClassifier(embedding_size=args.embedding_size, 
                            num_embeddings=len(vectorizer.title_bpe_vocab),
                            num_channels=args.num_channels,
                            hidden_dim=args.hidden_dim, 
                            num_classes=len(vectorizer.category_vocab), 
                            dropout_p=args.dropout_p,
                            pretrained_embeddings=embeddings,
                            padding_idx=0)

Using sentencepiece BPE.
BPE vocabulary created. Pre-set maximum vocabulary size =  1000

Sentencepiece BPE vocabulary (>= 2 characters, frequency >= 2):
 Counter({'s_': 122723, 'e_': 65329, 'n_': 57651, 't_': 56216, 'er': 46314, 'in': 40382, 'd_': 33211, 'or': 31299, 're': 29151, 'ar': 27717, 'o_': 26871, 'es_': 25826, 'y_': 25305, 'al': 24138, 'an': 24075, '__': 22826, 'st': 21755, 'en': 18884, 'on_': 18624, 'ro': 18351, 'to_': 18176, 'er_': 18106, 'th': 17893, 'a_': 17522, 'in_': 16739, 'at': 16115, 'g_': 15393, 'ic': 15282, 'on': 15245, 'il': 15209, 'ch': 14365, 'it': 13100, 'el': 12933, 'ing_': 12320, 'ra': 12209, 'ou': 12094, 'of': 12011, 'for': 11798, '_s': 11716, 'is': 11109, 'ap': 10870, 'ed_': 10858, 'an_': 10720, 'ers_': 10582, 'ac': 10171, 'ol': 9742, 'as': 9662, '_p': 9346, 'am': 9296, 'ts_': 9112, 'de': 8788, 'ur': 8766, 'le': 8572, 'om': 8274, '_c': 8231, 'ne': 8207, '_g': 7666, '_b': 7321, 'up': 7312, 'ec': 7300, '_m': 7229, 'us': 7150, 'ig': 7057, 'lo': 6853, 'ion_': 6

### Vocab size 3000

### Vocab size 10,000

### Training loop

In [79]:
####################################################################### 
# No changes made
#######################################################################

classifier = classifier.to(args.device)

dataset.class_weights = dataset.class_weights.to(args.device)


loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm.notebook.tqdm(desc='training routine', 
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm.notebook.tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm.notebook.tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)


try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset
        # setup: batch generator, set loss and acc to 0, set train mode on

        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
                
        running_loss = 0.0
        running_acc = 0.0
        
        classifier.train()
                

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = classifier(batch_dict['x_data'])
            
             
            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset
        
        

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()


        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred =  classifier(batch_dict['x_data'])

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

            
        print(f'Epoch {epoch_index}: Val Loss: {running_loss}, Val Acc: {running_acc}')    
        
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        
        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
        
except KeyboardInterrupt:
    print("Exiting loop")


HBox(children=(FloatProgress(value=0.0, description='training routine', style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='split=train', max=328.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='split=val', max=70.0, style=ProgressStyle(description_wid…

Epoch 0: Val Loss: 0.943820606810706, Val Acc: 61.87499999999998
Epoch 1: Val Loss: 0.7728732075010027, Val Acc: 70.35156250000001
Epoch 2: Val Loss: 0.7092193390641894, Val Acc: 73.43191964285712
Epoch 3: Val Loss: 0.6709229230880737, Val Acc: 75.27901785714286
Epoch 4: Val Loss: 0.6543399214744567, Val Acc: 75.76450892857144
Epoch 5: Val Loss: 0.6234502473047803, Val Acc: 77.55022321428574
Epoch 6: Val Loss: 0.6118793725967409, Val Acc: 77.92968749999997
Epoch 7: Val Loss: 0.6311793182577408, Val Acc: 78.00781249999999
Epoch 8: Val Loss: 0.6342359917504445, Val Acc: 78.35379464285718
Epoch 9: Val Loss: 0.6430682867765427, Val Acc: 79.05691964285714
Epoch 10: Val Loss: 0.6792250335216521, Val Acc: 78.54352678571429
Epoch 11: Val Loss: 0.7129932037421632, Val Acc: 78.42633928571426
Epoch 12: Val Loss: 0.7377261970724377, Val Acc: 78.17522321428572
Epoch 13: Val Loss: 0.7790042749473028, Val Acc: 78.28124999999999
Epoch 14: Val Loss: 0.7934795277459281, Val Acc: 78.23660714285711
Epoch 

In [80]:
# compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)



dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.

classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

    
    
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc


In [81]:
print("Train Loss: {}".format(train_state['train_loss']))
print("Train Accuracy: {}".format(train_state['train_acc']))

print('\n')

print("Validation Loss: {}".format(train_state['val_loss']))
print("Validation Accuracy: {}".format(train_state['val_acc']))

print('\n')

print("Test Loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Train Loss: [1.1672936412619388, 0.8284978559467855, 0.7088255286216737, 0.63465384339414, 0.5799343991206911, 0.5286481644867395, 0.4833020735986348, 0.4462238373007721, 0.41234009767450935, 0.34347068863671026, 0.3148596952782897, 0.2705242386678369, 0.254379994122357, 0.22932672041763616, 0.22132965808779725, 0.20825096381055877, 0.203900808218594, 0.1961357186253116, 0.19468294640594147, 0.19108683409214755, 0.19038123194491724, 0.18850932885869964, 0.18710850733445916, 0.1872254588709373, 0.18642044610275732, 0.1855012944149899]
Train Accuracy: [46.919064405487795, 67.6602991615854, 73.2743426067073, 76.43626143292678, 78.79072980182926, 80.81054687500007, 82.60885099085365, 83.9010099085365, 85.25152439024393, 87.97160823170725, 88.99104420731706, 90.70836509146338, 91.40029535060981, 92.42568597560977, 92.65910823170736, 93.25814596036588, 93.41296684451216, 93.69045350609757, 93.7928734756098, 93.97627667682927, 93.91792111280486, 94.0084317835366, 94.0239138719512, 94.05964176

### Results (Vocab size 3000)

### Results (Vocab size 10,000)

### Inference

In [82]:
# Preprocess the reviews for predict_category()

def preprocess_text(text):

    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)

    return text

In [83]:
def predict_category(title, classifier, vectorizer, max_length):
    """Predict a News category for a new title
    
    Args:
        title (str): a raw title string
        classifier (NewsClassifier): an instance of the trained classifier
        vectorizer (NewsVectorizer): the corresponding vectorizer
        max_length (int): the max sequence length
            Note: CNNs are sensitive to the input data tensor size. 
                  This ensures to keep it the same size as the training data
    """
    title = preprocess_text(title)
    
    #print('* ',max_length)
    
    vectorized_title = \
        torch.tensor(vectorizer.vectorize(title, vector_length=max_length))
    
    result = classifier(vectorized_title.unsqueeze(0), apply_softmax=True)
    
    probability_values, indices = result.max(dim=1)
    predicted_category = vectorizer.category_vocab.lookup_index(indices.item())

    return {'category': predicted_category, 
            'probability': probability_values.item()}

In [84]:
def get_samples():
    samples = {}
    for cat in dataset.val_df.category.unique():
        samples[cat] = dataset.val_df.title[dataset.val_df.category==cat].tolist()[:5]
    return samples

val_samples = get_samples()

In [85]:
#title = input("Enter a news title to classify: ")
classifier = classifier.to("cpu")

for truth, sample_group in val_samples.items():
    print(f"True Category: {truth}")
    print("="*30)
    
    for sample in sample_group:
        prediction = predict_category(sample, classifier, 
                                      vectorizer, dataset._max_seq_length + 1)
        
        print("Prediction: {} (p={:0.2f})".format(prediction['category'],
                                                  prediction['probability']))
        print("\t + Sample: {}".format(sample))

    print("-"*30 + "\n")

True Category: Business
Prediction: Business (p=0.97)
	 + Sample: az suspends marketing of cancer drug
Prediction: Business (p=0.53)
	 + Sample: business world has mixed reaction to perez move
Prediction: Sports (p=0.93)
	 + Sample: betting against bombay
Prediction: Business (p=0.73)
	 + Sample: malpractice insurers face a tough market
Prediction: Sci/Tech (p=0.73)
	 + Sample: nvidia is vindicated
------------------------------

True Category: Sci/Tech
Prediction: Sci/Tech (p=0.98)
	 + Sample: spies prize webcam s eyes
Prediction: Sci/Tech (p=0.92)
	 + Sample: sober worm causes headaches
Prediction: Sci/Tech (p=0.47)
	 + Sample: local search missing pieces falling into place
Prediction: Sci/Tech (p=1.00)
	 + Sample: hackers baiting internet users with beckham pix
Prediction: Sci/Tech (p=1.00)
	 + Sample: nokia adds blackberry support to series handsets
------------------------------

True Category: Sports
Prediction: Sports (p=0.97)
	 + Sample: is meyer the man to get irish up ? 
Pred