## Very Deep CNN for Text Classification Tasks

Based on this paper: https://arxiv.org/pdf/1606.01781.pdf - Very Deep Convolutional Networks
for Text Classification, Conneau et al.

Following this implementation: https://github.com/cjiang2/VDCNN - I have opened a pull request for an error in that implementation; the error is pointed out in a comment at the relevant place in this file.

The implementation above is broken into 3 parts: utility functions (Tokenizer), the VD-CNN code in TensorFlow, and a training loop. I wanted to code out the VD-CNN myself to get a good understanding of the architecture.

Sometimes fetching the dataset from TensorFlow Datasets fails. Re-running the Training Set-Up section usually resolves it.




Things to try:

1. Paper suggests trying tasks with a lot more labels, possible to find other datasets? Movie genre classification possible?

2. Running the training on a TPU

3. Lot of hyperparameters / settings to tweak in this build.

~Samyukt Sriram

### Tokenizer

In [3]:
#Tokenizer. This is in the utils.py file in the implementation.

#Paper uses character based tokenizer, thought of as the atomic level of representation of text, similar to pixels for images.

import numpy as np

class Tokenizer(object):
    '''Character-level tokenizer mapping characters to integer IDs and back.

    ID 0 is reserved for zero padding. Every character of `chars` gets the
    ID 1, 2, ... in the order it appears. ID 69 is used for the unknown token
    when `unk_token` is enabled.

    Args:
        chars: String holding every character of the vocabulary.
        unk_token: If truthy, `char_2_idx` returns 69 for characters outside
            the vocabulary and `idx_2_chars` returns '<UNK>' for IDs above
            the vocabulary; otherwise they return None and '' respectively.
    '''

    def __init__(
        self,
        chars = 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’"/|_#$%ˆ&*˜‘+=<>()[]{} ',
        unk_token = True
    ):

        self.chars = chars
        self.unk_token = 69 if unk_token else None
        self.build()

    def build(self):
        '''Build the char2idx / idx2char lookup tables from self.chars.'''
        self.char2idx = {}
        self.idx2char = {}

        #IDs start at 1 because 0 is reserved for zero padding
        for idx, char in enumerate(self.chars, start=1):
            self.char2idx[char] = idx
            self.idx2char[idx] = char

        #Next free ID, kept as an attribute as in the original build loop
        self.idx = len(self.chars) + 1

    def char_2_idx(self, c):
        '''Return the integer ID for the character c.

        Characters outside the vocabulary give the unknown-token ID, or None
        when the unknown token is disabled.
        '''
        return self.char2idx.get(c, self.unk_token)

    def idx_2_chars(self, idx):
        '''Return the character for the ID idx.

        IDs above the vocabulary give '<UNK>' (or '' when the unknown token is
        disabled); the padding ID 0 gives ''.
        '''
        if idx > len(self.idx2char):
            return '' if self.unk_token is None else '<UNK>'

        if idx == 0:
            return ''

        return self.idx2char[idx]

    def __len__(self):
        '''Return the number of distinct characters in the vocabulary.'''
        return len(self.char2idx)

    def text_to_sequence(self, text, maxlen = 1014):
        '''Encode text as a fixed-length integer array of character IDs.

        The text is lower-cased, truncated to `maxlen` characters and
        right-padded with 0. Characters outside the vocabulary are also
        encoded as 0 (the unknown-token ID is not used here).

        Args:
            text: The string to encode.
            maxlen: Length of the returned array.

        Returns:
            A 1-D numpy integer array of length `maxlen`.
        '''
        text = text.lower()
        data = np.zeros(maxlen).astype(int)

        #Slicing caps the index at maxlen - 1; the previous `i > maxlen`
        #guard let i == maxlen through and indexed past the end of `data`.
        for i, char in enumerate(text[:maxlen]):
            idx = self.char2idx.get(char)
            if idx is not None:
                data[i] = idx

        return data

    def sequence_to_text(self, seq):
        '''Decode an iterable of IDs back into a string.'''
        return ''.join(self.idx_2_chars(idx) for idx in seq)

### VD-CNN

In [4]:
#VD-CNN model

import tensorflow as tf
from tensorflow.keras import Model, layers

#This dictionary is only used in class VDCNN
#Maps the network depth to the number of ConvBlocks in each of the four stages
#(64, 128, 256 and 512 filters, in that order).
#Each ConvBlock holds 2 convolutions and the model adds 1 initial convolution,
#so depth = 2 * sum(blocks) + 1, e.g. 2 * (1+1+1+1) + 1 = 9.
N_BLOCKS = {
    9: (1,1,1,1),
    17: (2,2,2,2),
    29: (5,5,2,2),
    49: (8,8,5,3)
}

class KMaxPooling(layers.Layer):

  '''K-Max Pooling layer that keeps the k highest activations along the
  sequence axis (2nd dimension) of a rank-3 input, separately for every
  position of the last axis.

  Args:
    k: Number of activations to keep. If None, half of the input sequence
      length is used (rounded half-to-even), which needs a statically known
      sequence length.
    sorted: Forwarded to tf.nn.top_k.
      NOTE(review): with sorted=False the order of the k values is presumably
      unspecified - confirm against the tf.nn.top_k documentation.
  '''

  def __init__(self,
               k=None,
               sorted=False):

    super(KMaxPooling, self).__init__()

    self.k = k
    self.sorted = sorted

  def _resolve_k(self, seq_len):
    '''Return the k to use for an input whose sequence axis has length seq_len.

    Returns None when k was not given and seq_len is unknown.
    '''
    if self.k is not None:
      return self.k
    if seq_len is None:
      return None

    #Plain Python arithmetic: the shape is static, so there is no need to go
    #through a tensor op (which cannot be turned into an int while tracing).
    #Python's round() is half-to-even, the same rule as tf.round.
    return int(round(seq_len / 2))

  def compute_output_shape(self, input_shape):
    return (input_shape[0], self._resolve_k(input_shape[1]), input_shape[2])

  def call(self, inputs):
    k = self._resolve_k(inputs.shape[1])
    if k is None:
      raise ValueError(
          'KMaxPooling needs either an explicit k or a statically known '
          'sequence length (inputs.shape[1]).')

    #top_k works along the last axis, so move the sequence axis there
    shifted_inputs = tf.transpose(inputs, [0,2,1])

    #top_k returns (values, indices); only the values are needed
    top_k = tf.nn.top_k(shifted_inputs, k=k, sorted=self.sorted)[0]

    #Restore the original axis order
    return tf.transpose(top_k, [0,2,1])


class Pooling(layers.Layer):

  '''Thin wrapper that picks one of two pooling layers by name.

  Args:
    pool_type: 'max' for MaxPooling1D(pool_size=3, strides=2, padding='same'),
      'k_max' for KMaxPooling() with its default k.
    name: Optional layer name.
  '''
  #Background on max pooling:
  #https://www.youtube.com/watch?v=ZjM_XQa5s6s&ab_channel=deeplizard

  def __init__(self,
               pool_type = 'max',
               name = None):
    super(Pooling, self).__init__(name=name)

    assert pool_type in ['max', 'k_max']

    self.pool_type = pool_type

    if pool_type == 'k_max':
      #KMaxPooling is defined earlier in this file
      self.pool = KMaxPooling()
    elif pool_type == 'max':
      self.pool = layers.MaxPooling1D(pool_size=3, strides=2, padding='same')

  def call(self, x):
    pooled = self.pool(x)
    return pooled

class ZeroPadding(layers.Layer):

  '''Pads the last axis of a rank-3 tensor with zeros.

  Args:
    values: Pair (before, after): number of zeros added in front of and behind
      the last axis. The first two axes are left untouched.
    name: Optional layer name.
  '''
  #General background on zero padding:
  #https://www.youtube.com/watch?v=qSTv_m-KFk0&ab_channel=deeplizard
  #NOTE(review): the last axis is presumably the channel axis (Conv1D default
  #layout) - confirm.

  def __init__(self,
               values,
               name=None):
    super(ZeroPadding, self).__init__(name=name)
    self.values = values

  def call(self,
           x):
    before, after = self.values[0], self.values[1]
    #One [before, after] pair per axis; only the last axis is padded
    paddings = [[0,0], [0,0], [before, after]]
    return tf.pad(x, paddings, mode = 'CONSTANT', constant_values= 0)

class Conv1D_BN(layers.Layer):
  '''A Conv1D layer (he_normal initialised) followed by BatchNormalization.

  Args:
    filters: Number of output filters of the convolution.
    kernel_size: Convolution kernel size (default 3).
    strides: Convolution stride (default 2).
    padding: Convolution padding mode.
    use_bias: Whether the convolution has a bias term.
    name: Optional layer name.
  '''
  def __init__(self,
               filters,
               kernel_size = 3,
               strides = 2,
               padding = 'same',
               use_bias = True,
               name=None):
    super(Conv1D_BN, self).__init__(name=name)
    self.filters = filters
    self.use_bias = use_bias

    self.conv = layers.Conv1D(
        filters,
        kernel_size,
        strides=strides,
        padding=padding,
        use_bias=use_bias,
        kernel_initializer='he_normal',
    )
    self.bn = layers.BatchNormalization()

  def call(self, x):
    #Convolution first, then normalisation
    return self.bn(self.conv(x))

class ConvBlock(layers.Layer):
  '''Two-convolution block with optional skip connection, downsampling and
  projection.

  Forward pass:
    conv1 -> bn1 -> relu -> conv2 -> bn2 -> [pool] -> [+ residual] -> relu -> [proj]

  Args:
    filters: Number of filters of both convolutions.
    kernel_size: Kernel size of both convolutions.
    use_bias: Whether the convolutions use a bias term.
    shortcut: If True, add the block input (skip connection) before the final
      relu. The projection is only applied when shortcut is True.
    pool_type: How the sequence axis is downsampled:
      None    - no downsampling;
      'conv'  - conv1 uses stride 2;
      'max' / 'k_max' - a Pooling layer is applied after bn2.
      Whenever downsampling is active and shortcut is True, the residual goes
      through a stride-2 Conv1D_BN so it can be added to the output.
    proj_type: How the last axis is widened to 2 * filters after the block:
      None       - no projection;
      'conv'     - Conv1D_BN with kernel size 1 and 2 * filters filters;
      'identity' - ZeroPadding adding `filters` zeros in total.
    name: Optional layer name.
  '''
  def __init__(
      self,
      filters,
      kernel_size=3,
      use_bias=True,
      shortcut=True,
      pool_type=None,
      proj_type=None,
      name=None,
  ):
    super(ConvBlock, self).__init__(name=name)
    self.filters = filters
    self.kernel_size = kernel_size
    self.use_bias = use_bias
    self.shortcut = shortcut
    self.pool_type = pool_type
    self.proj_type = proj_type

    assert pool_type in ['max', 'k_max', 'conv', None]
    assert proj_type in ['identity', 'conv', None]

    #Downsampling and pooling.
    #Every attribute is assigned on every path so that call() never depends on
    #which configuration was chosen.
    if pool_type in ('max', 'k_max'):
      self.pool = Pooling(pool_type)
    else:
      self.pool = None

    #Only the 'conv' variant downsamples inside conv1 itself
    strides = 2 if pool_type == 'conv' else 1

    if shortcut and pool_type is not None:
      #The residual has to shrink like the main path; this always uses
      #kernel size 3 and stride 2, independent of `strides` above.
      self.downsample = Conv1D_BN(filters, 3, strides = 2, padding = 'same', use_bias = use_bias)
    else:
      self.downsample = None

    #Main path
    self.conv1 = layers.Conv1D(filters, kernel_size, strides = strides, padding='same', use_bias=use_bias, kernel_initializer='he_normal')
    self.bn1 = layers.BatchNormalization()
    self.conv2 = layers.Conv1D(filters, kernel_size, strides = 1, padding='same', use_bias=use_bias, kernel_initializer='he_normal')
    self.bn2 = layers.BatchNormalization()

    #Projection to 2 * filters, only used together with the shortcut
    if shortcut and proj_type == 'conv':
      #Kernel-size-1 convolution
      self.proj = Conv1D_BN(filters *2, 1, strides = 1, padding='same', use_bias=use_bias)
    elif shortcut and proj_type == 'identity':
      #Zero padding: filters//2 in front, the remainder behind
      self.proj = ZeroPadding([int(filters//2), filters - int(filters//2)])
    else:
      self.proj = None

  def call(self, x):

    residual = x #Kept for the skip connection

    out = self.conv1(x)
    out = self.bn1(out)
    out = tf.nn.relu(out)

    out = self.conv2(out)
    out = self.bn2(out)

    if self.pool is not None:
      out = self.pool(out)

    if self.shortcut:
      if self.downsample is not None:
        residual = self.downsample(residual)
      out += residual

    out = tf.nn.relu(out)

    #self.proj is only set when shortcut is True and proj_type is given
    if self.proj is not None:
      out = self.proj(out)

    return out

class VDCNN(Model):
  '''Very Deep CNN for text classification.

  Pipeline: character embedding -> Conv1D(64) -> four stages of ConvBlocks
  (64, 128, 256, 512 filters) -> k-max pooling (k=8) -> flatten ->
  Dense(2048) -> Dense(2048) -> Dense(num_classes).

  Args:
    num_classes: Number of classes for the classification task
    depth: depth of the VDCNN - must be one of [9,17,29,49]
    vocab_size: length of the vocabulary
    seqlen: sequence length
    embed_dim: dimension for character embedding
    shortcut: Boolean, use skip connections
    pool_type: Pooling operations, must be one of ['max', 'k_max', 'conv']
    proj_type: Operation to increase dim for dotted skip connections, one of ['identity', 'conv']
    use_bias: Use bias for all layers or not
    logits: If True, call() returns the raw outputs of the last Dense layer;
      if False, it returns softmax probabilities.
  '''

  def __init__(self,
               num_classes,
               depth=9,
               vocab_size=69,
               seqlen=None,
               embed_dim=16,
               shortcut=True,
               pool_type='max',
               proj_type = 'conv',
               use_bias = True,
               logits = True
               ):

    super(VDCNN, self).__init__()

    self.num_classes = num_classes
    self.depth = depth
    self.vocab_size = vocab_size
    self.seqlen = seqlen
    self.embed_dim = embed_dim
    self.shortcut = shortcut
    self.pool_type = pool_type
    self.proj_type = proj_type
    self.use_bias = use_bias
    #The original implementation hard-codes True here instead of using the argument
    self.logits = logits

    assert pool_type in ['max', 'k_max', 'conv']
    assert proj_type in ['conv', 'identity']

    self.n_blocks = N_BLOCKS[depth]

    self.embed_char = layers.Embedding(vocab_size, embed_dim, input_length=seqlen)
    self.conv = layers.Conv1D(64, 3, strides=1, padding='same', use_bias=use_bias, kernel_initializer='he_normal')

    def make_stage(filters, count):
      #A stage is (count - 1) plain blocks followed by one block that also
      #pools and projects; only that last block changes the tensor shape.
      stage = [ConvBlock(filters, 3, use_bias, shortcut) for _ in range(count - 1)]
      stage.append(ConvBlock(filters, 3, use_bias, shortcut, pool_type=pool_type, proj_type=proj_type))
      return stage

    n_64, n_128, n_256, n_512 = (self.n_blocks[i] for i in range(4))

    self.conv_block_64 = make_stage(64, n_64)
    self.conv_block_128 = make_stage(128, n_128)
    self.conv_block_256 = make_stage(256, n_256)
    self.conv_block_512 = make_stage(512, n_512)

    #Final k-max pooling applied after all conv stages in call().
    #NOTE(review): k=8 presumably follows the paper - confirm.
    self.k_maxpool = KMaxPooling(k=8)

    self.flatten = layers.Flatten()

    #Dense layers
    self.fc1 = layers.Dense(2048, activation='relu')
    self.fc2 = layers.Dense(2048, activation='relu')

    self.out = layers.Dense(num_classes)

  def call(self, x):

    x = self.conv(self.embed_char(x))

    #Run the four stages in order of increasing filter count
    stages = (
        self.conv_block_64,
        self.conv_block_128,
        self.conv_block_256,
        self.conv_block_512,
    )
    for stage in stages:
      for block in stage:
        x = block(x)

    x = self.flatten(self.k_maxpool(x))

    x = self.fc2(self.fc1(x))

    out = self.out(x)

    return out if self.logits else tf.nn.softmax(out)

### Small test for the Model

In [5]:
#Testing VDCNN
#Smoke test: build a depth-9 model with 10 classes, max pooling and identity
#(zero-padding) projection, run one forward pass and print the layer summary.

if __name__ == '__main__':

  #Dummy batch: 4 sequences of 1014 zeros (0 is the padding ID of the Tokenizer)
  x = tf.zeros([4, 1014])

  #logits = False, so `out` holds softmax probabilities
  model = VDCNN(10, depth=9, shortcut=True, pool_type='max', proj_type='identity', logits = False)
  #NOTE(review): the forward pass presumably has to come before summary() so
  #that the weights exist - confirm.
  out = model(x)
  model.summary()


Model: "vdcnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  1104      
                                                                 
 conv1d (Conv1D)             multiple                  3136      
                                                                 
 conv_block (ConvBlock)      multiple                  37824     
                                                                 
 conv_block_1 (ConvBlock)    multiple                  149376    
                                                                 
 conv_block_2 (ConvBlock)    multiple                  593664    
                                                                 
 conv_block_3 (ConvBlock)    multiple                  2366976   
                                                                 
 k_max_pooling (KMaxPooling)  multiple                 0     

### Training Set-Up

In [6]:
#Training

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

#Hyperparameters

#Model settings, all forwarded to the VDCNN constructor below
#(MAXLEN is passed as seqlen)
MAXLEN = 1014
DEPTH = 9
EMBED_DIM = 16
SHORTCUT = True
POOL_TYPE = 'k_max'
PROJ_TYPE = 'identity'
USE_BIAS = True

#Input pipeline and optimisation settings
BATCH_SIZE = 128
SHUFFLE_BUFFER = 1024
#Learning rate of the SGD optimizer
LR = 1e-2
EPOCHS = 5
#Each gradient tensor is clipped to this norm in train_step; None disables clipping
CLIP_NORM = 7.0

#Name understood by prepare_data
DATASET_NAME = 'ag_news'

#Directory used by the CheckpointManager
CHECKPOINT_PATH = './checkpoints'
#Training progress is printed every DISPLAY_EVERY steps
DISPLAY_EVERY = 20


#Helper Functions
#Some functions and objects used in these functions are only defined later. Might be a good idea to reorder this


def prepare_data(dataset_name = 'ag_news', split='train'):
  '''Load one split of a supported dataset from tensorflow_datasets.

  Args:
    dataset_name: Name of the dataset; currently only 'ag_news' is supported.
    split: Split to load. Files are shuffled only for the 'train' split.

  Returns:
    (ds, num_classes): the loaded dataset and its number of classes.

  Raises:
    ValueError: If dataset_name is not supported.
  '''
  #dataset_name -> (tensorflow_datasets name, number of classes)
  supported = {
      'ag_news': ('ag_news_subset', 4),
  }

  if dataset_name not in supported:
    #Previously this fell through to the return statement and failed with an
    #unhelpful UnboundLocalError.
    raise ValueError(
        f'Unsupported dataset {dataset_name!r}; expected one of {sorted(supported)}')

  tfds_name, num_classes = supported[dataset_name]
  shuffle_files = split == 'train'

  ds = tfds.load(tfds_name, split=split, shuffle_files=shuffle_files)

  return ds, num_classes

@tf.function
def train_step(inputs, labels):
  '''Run one optimisation step on a batch and update the training metrics.

  Uses the module-level model, loss_object, optimizer, train_loss and
  train_accuracy objects, which are created further down in this cell.
  '''

  #Forward pass, recorded for differentiation
  with tf.GradientTape() as tape:
    logits = model(inputs, training=True)
    loss = loss_object(labels, logits)

  #Backward pass
  variables = model.trainable_variables
  gradients = tape.gradient(loss, variables)

  #Clip every gradient tensor separately to stabilise training
  if CLIP_NORM is not None:
    gradients = [tf.clip_by_norm(g, CLIP_NORM) for g in gradients]

  optimizer.apply_gradients(zip(gradients, variables))

  #Metrics: running mean of the loss and accuracy on softmax probabilities
  train_loss(loss)
  train_accuracy(labels, tf.nn.softmax(logits))

@tf.function
def test_step(inputs, labels):
  '''Evaluate one batch in inference mode and update the test metrics.

  Uses the module-level model, loss_object, test_loss and test_accuracy
  objects, which are created further down in this cell.
  '''

  logits = model(inputs, training = False)
  batch_loss = loss_object(labels, logits)

  #Metrics: running mean of the loss and accuracy on softmax probabilities
  test_loss(batch_loss)
  test_accuracy(labels, tf.nn.softmax(logits))


#Training Prep
#Creates the module-level objects that train_step / test_step rely on.

#Dataset
#Training data is shuffled with a buffer of SHUFFLE_BUFFER examples, then batched;
#test data is only batched.

ds_train, num_classes = prepare_data(DATASET_NAME, 'train')
ds_train = ds_train.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

ds_test, _ = prepare_data(DATASET_NAME, 'test')
ds_test = ds_test.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

#Tokenizer
#Default character vocabulary and unknown-token handling

tokenizer = Tokenizer()

#Model
#vocab_size is hard-coded to 69 here rather than derived from the tokenizer.
#logits is left at its default (True), matching from_logits = True in the loss below.
model = VDCNN(num_classes = num_classes,
              depth = DEPTH,
              vocab_size = 69,
              seqlen = MAXLEN,
              embed_dim=EMBED_DIM,
              shortcut=SHORTCUT,
              pool_type=POOL_TYPE,
              proj_type=PROJ_TYPE,
              use_bias=USE_BIAS)

#Optimizer
#Plain SGD without momentum. Could experiment with a different algorithm.
optimizer = tf.keras.optimizers.SGD(learning_rate=LR, momentum=0.0)

#Loss and Metrics
#The loss expects one-hot labels and raw logits
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits = True)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')


#Checkpoint
#Only the model is tracked by the checkpoint (not the optimizer)
ckpt = tf.train.Checkpoint(model=model)
ckpt_manager = tf.train.CheckpointManager(ckpt, CHECKPOINT_PATH, max_to_keep = 5) #keeps the 5 most recent checkpoints; the reference implementation uses None

In [23]:
#Just to examine an example of the data:
#prints the raw description bytes and the one-hot label of the first example
#of the first training batch, then stops.

for batch in ds_train:
  print(batch['description'].numpy()[0])
  print(tf.keras.utils.to_categorical(batch['label'][0], num_classes = num_classes))
  break

b'FRX ECO INDUSAFIN ENG  2004-10-13 22:22:02 Fed #39;s McTeer tapped to head Texas A amp;M - UPDATE 1 WASHINGTON (AFX) -- Robert McTeer, president of the Federal Reserve Bank of Dallas, said he would retire from his post if named, as '
[0. 0. 1. 0.]


### Training Loop

In [None]:
#Training Loop
#For every epoch: train on all batches of ds_train, evaluate on all batches of
#ds_test, print the test metrics and save a checkpoint.

def _texts_to_inputs(texts):
  '''Tokenize a batch of raw byte strings into an integer tensor of character IDs.'''
  #errors='replace' keeps one character per undecodable byte instead of
  #crashing; the replacement character is not in the vocabulary, so it is
  #encoded as 0. Pure ASCII input decodes exactly as before.
  sequences = [
      tokenizer.text_to_sequence(text.decode('utf-8', errors='replace'), maxlen=MAXLEN)
      for text in texts
  ]
  return tf.convert_to_tensor(np.array(sequences))


#Global step counter, keeps counting across epochs
step = 0

for epoch in range(EPOCHS):

  #Reset all metrics so that every epoch reports its own numbers; the loss
  #metrics were previously never reset and accumulated over all epochs.
  train_loss.reset_states()
  train_accuracy.reset_states()
  test_loss.reset_states()
  test_accuracy.reset_states()

  #Training loop
  for batch in ds_train:

    #Modify the feature keys for different datasets
    texts = batch['description'].numpy()
    labels = tf.keras.utils.to_categorical(batch['label'], num_classes = num_classes)

    #Tokenization happens in Python, outside the tf.data pipeline
    inputs = _texts_to_inputs(texts)

    #One training step
    train_step(inputs, labels)

    #displaying progress
    if step % DISPLAY_EVERY == 0:
      print(f'Epoch: {epoch+1} \n Step:{step} \n Loss: {train_loss.result()} \n Accuracy: {train_accuracy.result() * 100}')

    step += 1

  #Test Loop

  for batch_test in ds_test:

    #Again, edit the feature keys for different datasets
    texts = batch_test['description'].numpy()
    labels = tf.keras.utils.to_categorical(batch_test['label'], num_classes=num_classes)

    inputs = _texts_to_inputs(texts)

    #1 test step
    test_step(inputs, labels)

  print(f'Epoch: {epoch+1} \n Test Loss: {test_loss.result()} \n Test Accuracy: {test_accuracy.result() * 100}')

  #Saving model
  ckpt_manager.save()