<a href="https://colab.research.google.com/github/sracha4355/Character-Aware-Neural-Language-Model/blob/main/character_aware_nlm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [13]:
from transformers import BertTokenizer
import torch
from torch import nn
import pprint


In [3]:
characters, EMBEDDING_DIM = set(), 15

In [4]:
with open('wsj_train.txt', 'r') as file:
  text = file.read()
  for char in text:
    characters.add(char)
  del char

In [5]:
char_to_idx = {ch: i for i, ch in enumerate(characters)}
idx_to_char = {i: ch for ch, i in char_to_idx.items()}

In [6]:
print(sorted(list(idx_to_char.items())))

[(0, 'M'), (1, 'R'), (2, '&'), (3, 'v'), (4, 'p'), (5, 'r'), (6, '@'), (7, 'J'), (8, '2'), (9, ')'), (10, 'S'), (11, 'j'), (12, 'E'), (13, '0'), (14, '`'), (15, '3'), (16, 'A'), (17, '('), (18, '{'), (19, 'D'), (20, '*'), (21, 'z'), (22, ';'), (23, "'"), (24, '='), (25, 'Q'), (26, '5'), (27, 'V'), (28, 'X'), (29, 'u'), (30, 'k'), (31, 'G'), (32, ','), (33, 't'), (34, '/'), (35, '\n'), (36, 'm'), (37, 'e'), (38, 'I'), (39, 'W'), (40, 'f'), (41, '#'), (42, 'K'), (43, 'w'), (44, 'L'), (45, 'U'), (46, '8'), (47, 'l'), (48, '!'), (49, 'H'), (50, 'T'), (51, '-'), (52, 'x'), (53, 'y'), (54, '%'), (55, '$'), (56, 'a'), (57, 's'), (58, 'F'), (59, '9'), (60, 'B'), (61, '7'), (62, 'b'), (63, ':'), (64, '}'), (65, 'Y'), (66, 'd'), (67, '?'), (68, 'i'), (69, 'n'), (70, 'Z'), (71, 'O'), (72, 'N'), (73, 'o'), (74, '1'), (75, '\\'), (76, 'q'), (77, 'c'), (78, 'g'), (79, '4'), (80, 'P'), (81, '6'), (82, 'C'), (83, 'h'), (84, ' '), (85, '.')]


In [7]:
class CharEmbeddings(torch.nn.Module):
  def __init__(self, num_characters: int, embedding_dim: int):
    super().__init__()
    self.character_embeddings = torch.nn.Embedding(num_characters, embedding_dim)
    # mapping: int -> tensor(1, embedding_dim)

  def __getitem__(self, x):
    return self.forward(x)

  def forward(self, x):
    return self.character_embeddings(x)


In [35]:
def convert_sequence(ce: CharEmbeddings, mapping, characters: str | list):
  if isinstance(characters, str):
    characters = list(characters)
  encoded_str = torch.tensor([char_to_idx[char] for char in characters])
  return ce(encoded_str).unsqueeze(0)


In [8]:
char_embeddings = CharEmbeddings(len(characters), EMBEDDING_DIM)

In [36]:
# input to the CharCNN, Ck, is tensor of size d x l where l is the sequence of characters
# apply a narrow convolution of width w between each filter and Ck

'''
# the of filters -> width
Mapping : filter numbers -> width
'''
SMALL_FILTER_CONFIG = [(25 * width, width) for width in range(1, 7)]
LARGE_FILTER_CONFIG = [(min(200, 50 * width), width) for width in range(1, 8)]

SMALL_CONFIG = {
    "FILTER_CONFIG": SMALL_FILTER_CONFIG,
    "EMBEDDING_DIM": EMBEDDING_DIM,
    "activation": "tanh"
}
pprint.pprint(SMALL_CONFIG)

LARGE_CONFIG = {
    "FILTER_CONFIG": LARGE_FILTER_CONFIG,
    "EMBEDDING_DIM": EMBEDDING_DIM,
    "activation": "tanh"
}

class CharCNN(nn.Module):
  def __init__(self, config: dict, filter_height):
    super().__init__()
    filter_width_mapping = config["FILTER_CONFIG"]
    self.conv_layers = nn.ModuleList()
    for num_filters, filter_width in filter_width_mapping:
      print(num_filters, filter_width)
      self.conv_layers.append (
          torch.nn.Conv2d (
            in_channels=1,
            out_channels=num_filters,
            kernel_size=(filter_height, filter_width),
            stride=1,
            padding=0
          )
        )


  # passing x to a conv layer will produce a feature map for that layer
  def forward(self, x):
    # do something with the feature maps here?
    feature_maps = []
    for mod in self.conv_layers:
      width_of_kernel = mod.kernel_size[1]

      feature_maps.append((width_of_kernel, mod(x)))

      print(feature_maps[-1])
    return None

# 15
char_cnn = CharCNN(LARGE_CONFIG, EMBEDDING_DIM)
char_cnn(convert_sequence(char_embeddings, char_to_idx, "hello"))


{'EMBEDDING_DIM': 15,
 'FILTER_CONFIG': [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)],
 'activation': 'tanh'}
50 1
100 2
150 3
200 4
200 5
200 6
200 7


RuntimeError: Calculated padded input size per channel: (5 x 15). Kernel size: (15 x 1). Kernel size can't be greater than actual input size

50 1
100 2
150 3
200 4
200 5
200 6
1
2
3
4
5
6


[]