<a href="https://colab.research.google.com/github/sracha4355/Character-Aware-Neural-Language-Model/blob/main/notebooks/character_aware_nlm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install transformers



In [11]:
from transformers import BertTokenizer
import torch
from torch import nn
import pprint


In [86]:
characters, EMBEDDING_DIM = set(), 15
START_OF_WORD, END_OF_WORD = chr(2), chr(3)
max_word_length = 0

In [87]:
with open('wsj_train.txt', 'r') as file:
  text = file.read()
  for char in text:
    characters.add(char)
  words = text.split()
  for word in words:
    max_word_length = max(max_word_length, len(word))
  del words


In [88]:
print(len(characters), max_word_length)

86 54


In [89]:
char_to_idx = {ch: i for i, ch in enumerate(characters)}
idx_to_char = {i: ch for ch, i in char_to_idx.items()}

### add the start-of-word and end-of-word symbols
char_to_idx[ord(START_OF_WORD)] = len(characters)
idx_to_char[len(characters)] = ord(START_OF_WORD)
char_to_idx[ord(END_OF_WORD)] = len(characters) + 1
idx_to_char[len(characters) + 1] = ord(END_OF_WORD)

In [91]:
print(sorted(list(idx_to_char.items())))

[(0, 'i'), (1, 'Z'), (2, '*'), (3, 'D'), (4, '0'), (5, '?'), (6, '}'), (7, 'd'), (8, 'P'), (9, 'c'), (10, '('), (11, '$'), (12, 'j'), (13, '%'), (14, 'k'), (15, 'h'), (16, 'V'), (17, ';'), (18, 'L'), (19, '\\'), (20, 'S'), (21, '4'), (22, ')'), (23, 'n'), (24, 't'), (25, 'O'), (26, '7'), (27, '{'), (28, 'r'), (29, '3'), (30, 'e'), (31, '2'), (32, 'F'), (33, '-'), (34, 'p'), (35, 'u'), (36, 'H'), (37, 'y'), (38, '!'), (39, 'o'), (40, 'R'), (41, 'I'), (42, 'N'), (43, 'b'), (44, 'K'), (45, 'X'), (46, ' '), (47, '9'), (48, '/'), (49, 'J'), (50, "'"), (51, '@'), (52, '`'), (53, 'G'), (54, 'A'), (55, 'C'), (56, 'Y'), (57, 'T'), (58, 'l'), (59, ','), (60, '#'), (61, 'g'), (62, '1'), (63, 'M'), (64, 'q'), (65, 'z'), (66, ':'), (67, 'W'), (68, 'U'), (69, '\n'), (70, 'm'), (71, '&'), (72, 'a'), (73, 'f'), (74, 'E'), (75, 'x'), (76, '='), (77, '.'), (78, 'w'), (79, 's'), (80, 'Q'), (81, 'B'), (82, 'v'), (83, '6'), (84, '8'), (85, '5'), (86, 2), (87, 3)]


In [44]:
def zero_pad_tensor(tensor, rows, num, dim):
  shape = None
  if rows:
    shape = (num, tensor.shape[1])
  else:
    shape = (tensor.shape[0], num)
  zero_tensor = torch.zeros(shape[0], shape[1])
  return torch.cat((tensor, zero_tensor), dim=dim)

In [129]:
class CharEmbeddings(torch.nn.Module):
  def __init__(self, num_characters: int, embedding_dim: int, max_word_length):
    super().__init__()
    self.character_embeddings = torch.nn.Embedding(num_characters, embedding_dim)
    self.max_word_length = max_word_length + 2

  def forward(self, x: torch.tensor):
    ### concat start-of-word, x, and end-of-word tensor and then zero pad on the right
    character_embeddings = torch.transpose(self.character_embeddings(x), 0, 1)

    start_of_word_embedding = torch.transpose(
      self.character_embeddings(torch.tensor([ord(START_OF_WORD)])), 0, 1
    )

    end_of_word_embedding = torch.transpose(
      self.character_embeddings(torch.tensor([ord(END_OF_WORD)])), 0, 1
    )

    char_sequence = torch.cat(
      (start_of_word_embedding, character_embeddings, end_of_word_embedding),
      dim=1
    )

    if self.max_word_length != char_sequence.shape[1]:
      char_sequence = zero_pad_tensor(
        char_sequence, False, self.max_word_length - char_sequence.shape[1], 1
      )

    return char_sequence


In [130]:
def convert_sequence(ce: CharEmbeddings, mapping, characters: str | list):
  if isinstance(characters, str):
    characters = list(characters)
  encoded_str = torch.tensor([char_to_idx[char] for char in characters])
  return ce(encoded_str).unsqueeze(0)

In [131]:
char_embeddings = CharEmbeddings(len(char_to_idx), EMBEDDING_DIM, max_word_length)

In [132]:
print(convert_sequence(char_embeddings, char_to_idx, "33").shape)

torch.Size([1, 15, 56])


In [138]:
# input to the CharCNN, Ck, is tensor of size d x l where l is the sequence of characters
# apply a narrow convolution of width w between each filter and Ck

'''
# the of filters -> width
Mapping : filter numbers -> width
'''
SMALL_FILTER_CONFIG = [(25 * width, width) for width in range(1, 7)]
LARGE_FILTER_CONFIG = [(min(200, 50 * width), width) for width in range(1, 8)]

SMALL_CONFIG = {
    "FILTER_CONFIG": SMALL_FILTER_CONFIG,
    "EMBEDDING_DIM": EMBEDDING_DIM,
    "activation": "tanh"
}
pprint.pprint(SMALL_CONFIG)

LARGE_CONFIG = {
    "FILTER_CONFIG": LARGE_FILTER_CONFIG,
    "EMBEDDING_DIM": EMBEDDING_DIM,
    "activation": "tanh"
}

class CharCNN(nn.Module):
  def __init__(self, config: dict, filter_height):
    super().__init__()
    filter_width_mapping = config["FILTER_CONFIG"]
    self.conv_layers = nn.ModuleList()
    for num_filters, filter_width in filter_width_mapping:
      print(num_filters, filter_width)
      self.conv_layers.append (
          torch.nn.Conv2d (
            in_channels=1,
            out_channels=num_filters,
            kernel_size=(filter_height, filter_width),
            stride=1,
            padding=0
          )
        )


  # passing x to a conv layer will produce a feature map for that layer
  def forward(self, x):
    # do something with the feature maps here?
    feature_maps = []
    print(x.shape)

    for mod in self.conv_layers:
      #print(mod)
      width_of_kernel = mod.kernel_size[1]
      #print('width', width_of_kernel, mod.kernel_size)
      feature_maps.append((width_of_kernel, mod(x)))
      print(feature_maps[-1],)
    return None

# 15
char_cnn = CharCNN(LARGE_CONFIG, EMBEDDING_DIM)
converted_seq = convert_sequence(char_embeddings, char_to_idx, "hello")
char_cnn(converted_seq)


{'EMBEDDING_DIM': 15,
 'FILTER_CONFIG': [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)],
 'activation': 'tanh'}
50 1
100 2
150 3
200 4
200 5
200 6
200 7
torch.Size([1, 15, 56])
(1, tensor([[[ 0.2422,  0.9080,  0.0907,  ...,  0.1138,  0.1138,  0.1138]],

        [[-0.7010,  0.2248,  0.4313,  ..., -0.0892, -0.0892, -0.0892]],

        [[ 0.0568,  0.0489,  0.7417,  ...,  0.0663,  0.0663,  0.0663]],

        ...,

        [[ 0.1642,  0.2980, -1.1362,  ...,  0.1042,  0.1042,  0.1042]],

        [[-0.8339,  0.7108,  0.8210,  ...,  0.1663,  0.1663,  0.1663]],

        [[ 0.0724, -0.4084, -0.0708,  ...,  0.0488,  0.0488,  0.0488]]],
       grad_fn=<SqueezeBackward1>))
(2, tensor([[[-0.0748,  0.7126, -0.3942,  ..., -0.1020, -0.1020, -0.1020]],

        [[ 0.6554, -0.0156, -0.0576,  ...,  0.1101,  0.1101,  0.1101]],

        [[ 0.0046,  0.6839,  0.7434,  ...,  0.0264,  0.0264,  0.0264]],

        ...,

        [[ 0.9789, -1.2988, -0.5887,  ...,  0.1150,  0.1150,  0.1150]],

        [[-