<a href="https://colab.research.google.com/github/sracha4355/Character-Aware-Neural-Language-Model/blob/main/notebooks/character_aware_neural_language_refined.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn.functional as F
from torch import nn
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

### Initalize Spacy's English tokenizer

In [3]:
nlp = English()
tokenizer = nlp.tokenizer

### Tokenize the text and extract all the unique characters

In [4]:
content, train_tokens, characters = "", [], set()
with open("wsj_train.txt", "r") as file:
  content = file.read()
train_tokens = [token.text for token in tokenizer(content)]
for token in train_tokens:
  for char in token:
    characters.add(char)

### Get the max word length because we will zero pad words smaller than this length

In [5]:
max_word_length = -1
for index, token in enumerate(train_tokens):
  if len(token) > max_word_length:
    max_word_length = len(token)
assert max_word_length == 29

### Encoding the text and characters into integers for the model to understand

In [6]:
char_to_int = dict((c, i) for i, c in enumerate(characters))
int_to_char = dict((i, c) for i, c in enumerate(characters))
token_to_int = dict((token, i) for i, token in enumerate(train_tokens))
int_to_token = dict((i, token) for i, token in enumerate(train_tokens))

In [7]:
class Sequence:
  def __init__(self, num_batches, token_count, drop_batch=True):
    self.num_batches = num_batches
    if drop_batch:
      self.token_count = token_count - (token_count % self.num_batches)
      assert self.token_count % self.num_batches == 0
    else:
      self.token_count = token_count
    self.batch_size = self.token_count // self.num_batches
    self.current_token = 0

  def __next__(self):
    if self.current_token == self.token_count:
      raise StopIteration

    interval = [self.current_token, self.current_token + self.batch_size]
    self.current_token += self.batch_size
    return interval

  def __iter__(self):
    return self

In [8]:
def encode_tokens(tokens, int_to_token):
  return [[int_to_token[char] for char in token] for token in tokens]


In [9]:
EMBEDDING_DIM = 16

In [14]:

# responsibility of this class is getting encoded tokens and converting them to char embeddings
class CharEmbeddings(nn.Module):
  def __init__(self, num_chars, embedding_dim, max_token_length, device):
    super(CharEmbeddings, self).__init__()
    self.embedding_dim = embedding_dim
    self.max_token_length = max_token_length
    self.start_of_word_idx = num_chars
    self.end_of_word_idx = num_chars + 1
    self.embeddings = nn.Embedding(num_chars + 2, embedding_dim)
    self.device = device

  # change this to return the transformed characters in batches
  def forward(self, tokens):
    batch_size = len(tokens[0])
    batches = len(tokens)
    embeddings = torch.zeros(batches, batch_size, self.max_token_length + 2, self.embedding_dim)

    for batch in range(batches):
      for index, token_list in enumerate(tokens[batch]):
        character_embeddings = self.embeddings(torch.tensor([self.start_of_word_idx] + token_list + [self.end_of_word_idx], dtype=torch.long, device=self.device))
        character_embeddings = torch.nn.functional.pad(
            character_embeddings,
            (0,0,0, self.max_token_length + 2 - character_embeddings.shape[0])
        )
        embeddings[batch][index] = character_embeddings
    return embeddings

'''ce = CharEmbeddings(len(characters), 16, max_word_length)
encoded_tokens = [encode_tokens(train_tokens[0:2], char_to_int)]
tensor = ce(encoded_tokens)'''

'ce = CharEmbeddings(len(characters), 16, max_word_length)\nencoded_tokens = [encode_tokens(train_tokens[0:2], char_to_int)]\ntensor = ce(encoded_tokens)'

In [19]:
SMALL = [(w, 25 * w) for w in range(1, 7)]
LARGE = [(w, min(200, w * 50)) for w in range(1, 8)]


class CharCNN(nn.Module):
  def __init__(self, embedding_dim, activation, filter_width_mapping):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.activation = activation
    self.conv_layers = nn.ModuleList()
    self.total_num_filters = 0
    # filter_width_mapping is a set of tuples
    # (int, int)
    for width, num_filters in filter_width_mapping:
      self.conv_layers.append(
          nn.Conv2d (
            in_channels=1,
            out_channels =num_filters,
            kernel_size=(width, self.embedding_dim),
            padding=0,
            stride=1,
            bias=True
        )
      )
      self.total_num_filters += num_filters

  def forward(self, tokens):
    max_over_time_poolings = torch.zeros(tokens.shape[0], self.total_num_filters)
    convolution_results = []
    for conv_layer in self.conv_layers:
      x = conv_layer(tokens)
      x = torch.squeeze(x, dim=3)
      x = self.activation(x)
      max_over_time, _ = torch.max(x, dim=2, keepdim=False)
      convolution_results.append(max_over_time)
    x = torch.cat(convolution_results,dim=1)
    return x

In [28]:
class HighwayNetwork(nn.Module):
  def __init__(self, layers, activation, input_size):
    super().__init__()
    self.activation = activation
    self.layers = layers
    # z = t  g(WHy + bH) + (1 − t) y
    self.highway_matrices = nn.ModuleList([
        nn.Linear(input_size, input_size, bias=True)
        for _ in range(layers * 2)
    ])

  def _highway_layers(self, y):
    z = y
    for i in range(self.layers):
      highway_gate, transform_gate = self.highway_matrices[i * 2], self.highway_matrices[i * 2 + 1]
      t = F.sigmoid(transform_gate(z))
      z = t * self.activation(highway_gate(z)) + (1 - t) * z
    return z

  def forward(self, tokens):
    batches = tokens.shape[0]
    for batch in range(batches):
      tensor = tokens[batch]
      for i in range(len(tensor)):
        row_view = tensor[i]
        row_view = torch.unsqueeze(row_view, dim=0)
        row_view = self._highway_layers(row_view)
        tensor[i] = torch.squeeze(row_view, dim=0)
    return tokens

class character_aware_nlm(nn.Module):
  def __init__(
      self,
      Sequencer,
      max_token_length,
      embedding_dim,
      num_characters,
      activation,
      filter_width_mapping,
      batch_size,
      num_highway_layers,
      highway_activation,
      num_rnn_layers,
      rnn_hidden_units,
      vocab_size,
      device
    ):
    super().__init__()
    self.sequencer = Sequencer # get the intervals for the batching the dataset
    self.CharEmbeddingModule = CharEmbeddings(num_characters, embedding_dim, max_token_length, device)
    self.CharCNNModule = CharCNN(embedding_dim, activation, filter_width_mapping).to(device)
    self.HighwayNetworkModule = HighwayNetwork(num_highway_layers, highway_activation, self.CharCNNModule.total_num_filters).to(device)
    self.lstm = nn.LSTM(
        input_size=self.CharCNNModule.total_num_filters,
        num_layers=num_rnn_layers,
        hidden_size=rnn_hidden_units,
        batch_first=True
      ).to(device)
    self.word_prediction_layer = nn.Linear(rnn_hidden_units, vocab_size).to(device)

  def forward(self, tokens):
    batches = len(tokens)
    batch_size = len(tokens[0])
    # get the character embeddings
    character_embeddings = self.CharEmbeddingModule(tokens)

    # feed them to the CNN and store the max over time output of each filter
    cnn_output = torch.zeros(batches, batch_size, self.CharCNNModule.total_num_filters).to(device)
    for batch in range(character_embeddings.shape[0]):
      tensor = torch.unsqueeze(character_embeddings[batch], dim=1).to(device)
      print('in batch loop', batch ,tensor.device)
      cnn_output[batch] = self.CharCNNModule(tensor)

    # pass CNN's out to highway network
    highway_output = self.HighwayNetworkModule(cnn_output)
    return self.lstm_forward(highway_output)

  def lstm_forward(self, input):
    print('in lstm_forward', input.shape)
    initial_hidden_state = torch.zeros(self.lstm.num_layers, input.size(0), self.lstm.hidden_size).to(device)
    initial_memory_cell = torch.zeros(self.lstm.num_layers, input.size(0), self.lstm.hidden_size).to(device)
    out, _ = self.lstm(input, (initial_hidden_state, initial_memory_cell))
    print('out is on device', out.device)
    out = out[:, -1, :]
    print(out.shape)
    predicted_word = self.word_prediction_layer(out)
    return predicted_word

  def predict(self, training_data):
    logits = self.forward(training_data)
    logits = F.softmax(logits, dim=1)
    logits = torch.flatten(logits)
    max_logit, prediction_index = torch.max(logits, dim=0)
    return prediction_index


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size_test = 4
print('number of tokens', len(train_tokens))
print('number of unique tokens', len(token_to_int))
model = character_aware_nlm(
    None,
    max_word_length,
    EMBEDDING_DIM,
    len(characters),
    F.tanh,
    SMALL,
    batch_size_test,
    2,
    F.relu,
    2,
    300,
    len(token_to_int),
    device
)
model.to(device)
encoded_tokens = [encode_tokens(train_tokens, char_to_int)]
predicted_word = model.predict(encoded_tokens)
print(int_to_token[predicted_word.item()])


number of tokens 1078473
number of unique tokens 41586
in batch loop 0 cuda:0


OutOfMemoryError: CUDA out of memory. Tried to allocate 6.03 GiB. GPU 0 has a total capacity of 14.75 GiB of which 2.67 GiB is free. Process 10679 has 12.08 GiB memory in use. Of the allocated memory 8.92 GiB is allocated by PyTorch, and 3.04 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)