In [None]:
!pip install transformers



In [None]:
from transformers import BertTokenizer
import torch
from torch import nn
import pprint


In [None]:
# Set of distinct characters seen in the training corpus (filled in below).
characters = set()

# Size of each character embedding vector.
EMBEDDING_DIM = 15

# ASCII control characters STX / ETX act as the word-boundary sentinels.
START_OF_WORD = chr(2)
END_OF_WORD = chr(3)

# Length of the longest whitespace-delimited token (updated while scanning the corpus).
max_word_length = 0

In [None]:
# Read the training corpus once; the file only needs to stay open for the read.
with open('wsj_train.txt', 'r') as file:
  text = file.read()

# Every distinct character of the corpus (whitespace included) joins the vocabulary.
characters.update(text)

# Longest whitespace-delimited token; never shrinks below the current value.
max_word_length = max([max_word_length] + [len(token) for token in text.split()])

# The raw corpus is no longer needed, so release it.
del text




In [None]:
# Vocabulary size and longest token length, separated by a space.
print(f"{len(characters)} {max_word_length}")

85 29


In [None]:
# The sentinels get the two indices directly after the corpus characters, so
# they must not already be part of the corpus vocabulary.
if START_OF_WORD in characters or END_OF_WORD in characters:
  raise ValueError("corpus already contains the START_OF_WORD / END_OF_WORD sentinel characters")

# Sort the vocabulary so every run assigns the same index to each character:
# the iteration order of a set of strings differs between interpreter sessions.
char_to_idx = {ch: i for i, ch in enumerate(sorted(characters))}
idx_to_char = {i: ch for ch, i in char_to_idx.items()}

### add the start-of-word and end-of-word symbols
# Key the sentinels by the characters themselves (not their code points) so
# that char_to_idx[START_OF_WORD] works like any other character lookup and
# idx_to_char only ever maps to strings.
char_to_idx[START_OF_WORD] = len(characters)
idx_to_char[len(characters)] = START_OF_WORD
char_to_idx[END_OF_WORD] = len(characters) + 1
idx_to_char[len(characters) + 1] = END_OF_WORD

In [None]:
# Show the index -> character table ordered by index.
print(sorted(idx_to_char.items()))

[(0, 'x'), (1, 'z'), (2, '2'), (3, '5'), (4, 's'), (5, ' '), (6, '('), (7, 'd'), (8, 'Y'), (9, '&'), (10, 'B'), (11, 'p'), (12, 'V'), (13, 'N'), (14, 'i'), (15, 'b'), (16, 'c'), (17, 'k'), (18, 'F'), (19, '%'), (20, ','), (21, 'u'), (22, 't'), (23, '`'), (24, 'v'), (25, '8'), (26, '@'), (27, '}'), (28, 'C'), (29, 'h'), (30, 'A'), (31, 'X'), (32, 'R'), (33, 'U'), (34, 'e'), (35, '3'), (36, 'w'), (37, '\\'), (38, '{'), (39, 'E'), (40, 'D'), (41, 'm'), (42, 'K'), (43, 'P'), (44, '$'), (45, 'y'), (46, 'j'), (47, 'o'), (48, 'M'), (49, ':'), (50, 'q'), (51, '9'), (52, 'W'), (53, 'T'), (54, 'l'), (55, '1'), (56, '0'), (57, 'I'), (58, 'Z'), (59, 'H'), (60, 'n'), (61, '6'), (62, '!'), (63, "'"), (64, 'J'), (65, 'O'), (66, '4'), (67, '.'), (68, '*'), (69, ';'), (70, 'a'), (71, '\n'), (72, '/'), (73, '#'), (74, 'S'), (75, 'L'), (76, 'r'), (77, ')'), (78, '-'), (79, '?'), (80, 'g'), (81, 'G'), (82, '7'), (83, 'Q'), (84, 'f'), (85, 2), (86, 3)]


In [None]:
def zero_pad_tensor(tensor, rows, num, dim):
  """Append a block of zeros to a 2-D tensor.

  Args:
    tensor: 2-D tensor to pad.
    rows: if truthy the zero block has shape ``(num, tensor.shape[1])`` (extra
      rows); otherwise ``(tensor.shape[0], num)`` (extra columns).
    num: number of zero rows/columns to append.
    dim: dimension passed to ``torch.cat``; use 0 together with ``rows=True``
      and 1 together with ``rows=False`` so the shapes line up.

  Returns:
    A new tensor with the zeros concatenated after ``tensor`` along ``dim``.
  """
  if rows:
    shape = (num, tensor.shape[1])
  else:
    shape = (tensor.shape[0], num)
  # Create the padding with the input's dtype and device: a default float32 CPU
  # block would silently promote integer inputs and fail for tensors that live
  # on another device.
  zero_tensor = torch.zeros(shape, dtype=tensor.dtype, device=tensor.device)
  return torch.cat((tensor, zero_tensor), dim=dim)

In [None]:
class CharEmbeddings(torch.nn.Module):
  """Embed the characters of one word into a fixed-width ``(embedding_dim, length)`` matrix.

  The word is wrapped in start-of-word / end-of-word symbols and zero padded on
  the right up to ``max_word_length + 2`` columns.

  Args:
    num_characters: size of the embedding table, i.e. the vocabulary size
      including the two boundary symbols.
    embedding_dim: size of every character vector.
    max_word_length: longest word (without boundary symbols) that will be embedded.
    start_index: embedding index of the start-of-word symbol. Defaults to
      ``num_characters - 2``, i.e. the vocabulary layout in which the two
      boundary symbols are appended after all regular characters.
    end_index: embedding index of the end-of-word symbol. Defaults to
      ``num_characters - 1``.

  Raises:
    ValueError: if a boundary index does not fit into the embedding table.
  """

  def __init__(self, num_characters: int, embedding_dim: int, max_word_length,
               start_index=None, end_index=None):
    super().__init__()
    self.character_embeddings = torch.nn.Embedding(num_characters, embedding_dim)
    # +2 leaves room for the start-of-word and end-of-word columns.
    self.max_word_length = max_word_length + 2
    # The boundary symbols need their own rows in the embedding table. Using
    # their code points (2 and 3) as indices would reuse the vectors of
    # whichever ordinary characters happen to sit at indices 2 and 3.
    self.start_index = num_characters - 2 if start_index is None else start_index
    self.end_index = num_characters - 1 if end_index is None else end_index
    for index in (self.start_index, self.end_index):
      if not 0 <= index < num_characters:
        raise ValueError(
          f"boundary index {index} is outside the embedding table of size {num_characters}"
        )

  def forward(self, x: torch.Tensor):
    """Return the padded ``(embedding_dim, max_word_length + 2)`` matrix for one word.

    Args:
      x: 1-D tensor holding the character indices of a single word.

    Raises:
      ValueError: if the word plus its two boundary symbols does not fit.
    """
    ### concat start-of-word, x, and end-of-word tensor and then zero pad on the right
    sequence_length = x.shape[0] + 2
    if sequence_length > self.max_word_length:
      raise ValueError(
        f"word of length {x.shape[0]} exceeds the maximum word length "
        f"{self.max_word_length - 2}"
      )

    start = torch.tensor([self.start_index], dtype=torch.long, device=x.device)
    end = torch.tensor([self.end_index], dtype=torch.long, device=x.device)
    # Cast so an empty word (which arrives as an empty float tensor) still works.
    indices = torch.cat((start, x.to(dtype=torch.long), end))

    # One lookup for the whole sequence, then put characters along the columns.
    char_sequence = torch.transpose(self.character_embeddings(indices), 0, 1)

    if sequence_length < self.max_word_length:
      char_sequence = zero_pad_tensor(
        char_sequence, False, self.max_word_length - sequence_length, 1
      )

    return char_sequence


In [None]:
def convert_sequence(ce: "CharEmbeddings", mapping, characters: "str | list"):
  """Encode a word with ``mapping`` and embed it with ``ce``.

  Args:
    ce: callable (the character-embedding module) applied to the 1-D index tensor.
    mapping: character -> index lookup used to encode the word.
    characters: the word, either as a string or as a list of characters.

  Returns:
    ``ce``'s output with an extra leading dimension of size 1.

  Raises:
    KeyError: if a character is missing from ``mapping``.
  """
  # Use the mapping that was passed in rather than a module-level global.
  indices = [mapping[char] for char in characters]
  # Explicit dtype: an empty word would otherwise produce a float tensor,
  # which cannot be used as embedding indices.
  encoded_str = torch.tensor(indices, dtype=torch.long)
  return ce(encoded_str).unsqueeze(0)

In [None]:
# Embedding table sized for the full vocabulary (corpus characters + boundary symbols).
char_embeddings = CharEmbeddings(
  num_characters=len(char_to_idx),
  embedding_dim=EMBEDDING_DIM,
  max_word_length=max_word_length,
)

In [None]:
# Shape check on a two-character word.
print(convert_sequence(ce=char_embeddings, mapping=char_to_idx, characters="33").shape)

torch.Size([1, 15, 31])


In [None]:
# The input to the CharCNN, C^k, is a d x l matrix, where l is the number of
# characters in the sequence. A narrow convolution of width w is applied
# between each filter and C^k.
#
# Each FILTER_CONFIG entry is a pair (number of filters, filter width).
SMALL_FILTER_CONFIG = [(25 * w, w) for w in (1, 2, 3, 4, 5, 6)]
LARGE_FILTER_CONFIG = [(min(200, 50 * w), w) for w in (1, 2, 3, 4, 5, 6, 7)]

SMALL_CONFIG = dict(
    FILTER_CONFIG=SMALL_FILTER_CONFIG,
    EMBEDDING_DIM=EMBEDDING_DIM,
    activation="tanh",
)
pprint.pprint(SMALL_CONFIG)

LARGE_CONFIG = dict(
    FILTER_CONFIG=LARGE_FILTER_CONFIG,
    EMBEDDING_DIM=EMBEDDING_DIM,
    activation="tanh",
)

class CharCNN(nn.Module):
  """Character-level CNN: narrow convolutions followed by max-over-time pooling.

  One ``Conv2d`` layer is created per ``(num_filters, filter_width)`` pair in
  ``config["FILTER_CONFIG"]``. Every kernel has shape
  ``(filter_height, filter_width)``; with ``filter_height`` equal to the
  character-embedding dimension each kernel covers whole character vectors and
  slides only along the character axis.

  Args:
    config: dict with key ``"FILTER_CONFIG"`` (list of
      ``(num_filters, filter_width)`` pairs) and optional key ``"activation"``
      (``"tanh"`` by default, ``"relu"`` also accepted).
    filter_height: height of every kernel.

  Raises:
    ValueError: if ``config["activation"]`` names an unsupported activation.
  """

  # Activations selectable through config["activation"].
  _ACTIVATIONS = {"tanh": torch.tanh, "relu": torch.relu}

  def __init__(self, config: dict, filter_height):
    super().__init__()
    activation_name = str(config.get("activation", "tanh")).lower()
    if activation_name not in self._ACTIVATIONS:
      raise ValueError(
        f"unsupported activation {activation_name!r}; "
        f"expected one of {sorted(self._ACTIVATIONS)}"
      )
    self.activation = self._ACTIVATIONS[activation_name]

    self.conv_layers = nn.ModuleList([
      torch.nn.Conv2d(
        in_channels=1,
        out_channels=num_filters,
        kernel_size=(filter_height, filter_width),
        stride=1,
        padding=0
      )
      for num_filters, filter_width in config["FILTER_CONFIG"]
    ])

  def forward(self, x):
    """Encode character-embedding matrices into fixed-size feature vectors.

    Args:
      x: ``(d, l)`` for a single word, ``(N, d, l)`` for ``N`` words, or
        ``(N, 1, d, l)`` with an explicit channel dimension, where ``d`` is the
        embedding dimension and ``l`` the (padded) number of characters.

    Returns:
      Tensor of shape ``(N, total_filters)`` (``(total_filters,)`` for 2-D
      input), where ``total_filters`` is the sum of ``num_filters`` over all
      layers: the maximum activation of every filter over all positions.

    Raises:
      ValueError: if ``x`` is not 2-, 3- or 4-dimensional.
    """
    if x.dim() == 2:
      batch = x[None, None]
    elif x.dim() == 3:
      # The leading dimension is the batch; add the single input channel.
      batch = x.unsqueeze(1)
    elif x.dim() == 4:
      batch = x
    else:
      raise ValueError(f"expected a 2-, 3- or 4-dimensional input, got {x.dim()} dimensions")

    filters = [(conv.kernel_size[1], conv(batch)) for conv in self.conv_layers]
    feature_maps = self.get_feature_maps(filters, batch)

    # Max-over-time: keep the strongest response of each filter.
    pooled = [feature_map.amax(dim=(-2, -1)) for feature_map in feature_maps]
    features = torch.cat(pooled, dim=1)
    return features.squeeze(0) if x.dim() == 2 else features

  def get_feature_maps(self, filters, x: torch.Tensor):
    """Apply the activation to every convolution output.

    ``Conv2d`` already computes, for each window of the input, the Frobenius
    inner product between the window and the kernel plus a learnable bias, so
    the feature map of a filter is simply the activation of the layer output.

    Args:
      filters: list of ``(filter_width, conv_output)`` pairs.
      x: the convolution input; unused, kept for signature compatibility.

    Returns:
      List with one activated feature-map tensor per entry of ``filters``.
    """
    return [self.activation(conv_output) for _, conv_output in filters]

  def frobenius_inner_product(self, A, B):
    '''
    Frobenius inner product Tr(A B^T) of two equally shaped matrices, computed
    as the sum of their element-wise product. Returns a 0-dim tensor so the
    result stays usable with torch functions and autograd.
    '''
    return torch.sum(A * B)





            # get all subtensors Ck
            # i to i + w - 1, [0, 10]




# Kernel height = embedding dimension (15), so each filter spans whole character vectors.
char_cnn = CharCNN(config=LARGE_CONFIG, filter_height=EMBEDDING_DIM)
converted_seq = convert_sequence(ce=char_embeddings, mapping=char_to_idx, characters="hello")
char_cnn(converted_seq)


{'EMBEDDING_DIM': 15,
 'FILTER_CONFIG': [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)],
 'activation': 'tanh'}
50 1
100 2
150 3
200 4
200 5
200 6
200 7
torch.Size([50, 1, 31]) 1


AttributeError: module 'torch' has no attribute 'tanhx'