In [2]:
import torch
import torch.nn as nn
import random
import torch.nn.functional as F

In [None]:
def readWords(path):
  """Read a newline-separated word list and return its distinct non-empty lines.

  The file is decoded as UTF-8 first; if (and only if) that fails with a
  decoding error it is read again as cp1251.

  Args:
    path: path of the text file to read.

  Returns:
    A set of strings, one per distinct non-empty line of the file.

  Raises:
    OSError: if the file cannot be opened (for example, it does not exist).
    UnicodeDecodeError: if the content is valid in neither encoding.
  """
  try:
    with open(path, 'r', encoding="utf-8") as f:
      text = f.read()
  except UnicodeDecodeError:
    # Fall back only on a decoding failure. A bare `except:` would also swallow
    # KeyboardInterrupt and retry on errors that the second attempt cannot fix.
    with open(path, 'r', encoding="cp1251") as f:
      text = f.read()

  words = set(text.split("\n"))
  # Blank lines (including the one after a trailing newline) are not words.
  words.discard("")
  return words

def getAllWords(listNames):
  """Load several word lists from data/<name>.txt and merge them without duplicates.

  Args:
    listNames: iterable of list names; each is read from "data/" + name + ".txt".

  Returns:
    A list of the distinct words across all files, in sorted order.
  """
  allWords = set()

  for name in listNames:
    allWords.update(readWords("data/" + name + ".txt"))

  # Iteration order of a set of strings can differ between interpreter runs
  # (string hash randomisation); sorting makes everything derived from this list
  # (example word, sample order) repeatable.
  return sorted(allWords)

# Merge the five word lists (read from data/<name>.txt) into one de-duplicated list
# and report how many distinct words it contains.
words_s = getAllWords(["locations", "locations2", "names2", "names3", "words1"])
print(f"small words list size = {len(words_s)}")

In [None]:
# Build the character vocabulary: every distinct character of the word list gets
# an integer id in sorted order, and one extra id (`delim`) is reserved for '.',
# which is used both as the end-of-word target and as left padding.
words_set = words_s

chars = set()
for word in words_set:
  chars.update(word)
# '.' is the reserved delimiter. If it also occurred in the data it would get two
# ids (its sorted position and `delim`): ctoi['.'] is overwritten below, so the
# first id would never be produced and its embedding row / output class would be
# dead weight. Reserve it up front so ctoi and itoc stay exact inverses.
chars.discard('.')
chars = sorted(chars)

print("Number of unique chars = " + str(len(chars)))
print("".join(chars))

ctoi = {ch: idx for idx, ch in enumerate(chars)}
itoc = {idx: ch for idx, ch in enumerate(chars)}
delim = len(chars)   # id of the '.' delimiter
size = delim + 1     # vocabulary size including the delimiter
ctoi['.'] = delim
itoc[delim] = '.'

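Log for different models (p = number of trainable parameters; llh = mean cross-entropy loss on the train / validation split, lower is better):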

model4_tiny (p = 1,375): train_llh = 2.384 valid_llh = 2.384

model4_normal (p = 11,265): train_llh = 2.166 valid_llh = 2.171

model4_large (p = 23,405): train_llh = 2.078 valid_llh = 2.091

model4_largest (p = 134,349): train_llh = 1.930 valid_llh = 1.965

In [6]:
def count_parameters(model):
  """Return the total number of scalar entries in the model's trainable parameters.

  Parameters with requires_grad == False are not counted.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

In [None]:
# Length of every model input: (m4_input_len - 1) context characters plus a trailing '.'.
m4_input_len = 11

def m4_align_word(word):
  """Turn a word prefix into a fixed-length context string.

  Only the last (m4_input_len - 1) characters of `word` are kept; shorter
  prefixes are left-padded with '.', and a single '.' is always appended, so
  the result is m4_input_len characters long.
  """
  context_len = m4_input_len - 1
  tail = word[-context_len:]
  return tail.rjust(context_len, ".") + "."

def m4_split_word_into_tokens(word):
  """Expand a word into (context, next character) training pairs.

  There is one pair per character, whose context is the aligned prefix that
  precedes it, followed by a final pair whose context is the aligned whole word
  and whose target is the '.' delimiter.
  """
  pairs = [(m4_align_word(word[:pos]), ch) for pos, ch in enumerate(word)]
  pairs.append((m4_align_word(word), "."))
  return pairs

def m4_str_to_int(samples):
  """Encode (context string, target character) pairs as (list of ids, id) via ctoi."""
  encoded = []
  for sample in samples:
    context, target = sample[0], sample[1]
    encoded.append(([ctoi[ch] for ch in context], ctoi[target]))
  return encoded

# Sanity check: show the (context -> next char) pairs generated for one word.
for x,y in m4_split_word_into_tokens(words_set[0]):
  print(f"{x} -> {y}")

# Encode every word into integer (context, target) samples. Iterating in sorted
# order makes the sample list independent of the incoming order of words_set.
m4_all_samples = []
for word in sorted(words_set):
  m4_all_samples.extend(m4_str_to_int(m4_split_word_into_tokens(word)))
# Shuffle with a dedicated, seeded generator so the train/validation split (and
# therefore the reported scores) is the same on every run; the global `random`
# state is left untouched. 74 is the seed value the training cell passes to
# torch.manual_seed.
random.Random(74).shuffle(m4_all_samples)

print(f"total of {len(m4_all_samples)} tokens")

# 90% train / 10% validation.
# NOTE(review): the split is taken over shuffled samples, not over words, so
# contexts that come from the same word can land in both sets.
m4_n_split = int(len(m4_all_samples) * 0.9)

m4_x_trn = torch.tensor([sample[0] for sample in m4_all_samples[0:m4_n_split]])
m4_y_trn = torch.tensor([sample[1] for sample in m4_all_samples[0:m4_n_split]])

m4_x_vld = torch.tensor([sample[0] for sample in m4_all_samples[m4_n_split:]])
m4_y_vld = torch.tensor([sample[1] for sample in m4_all_samples[m4_n_split:]])

print(f"train shape x = {m4_x_trn.shape} y = {m4_y_trn.shape}")
print(f"validation shape x = {m4_x_vld.shape} y = {m4_y_vld.shape}")

In [19]:
# Mini-batch size used by the training loop.
m4_batch_size = 32
# Optimizer steps per "epoch": the number of full batches that fit in the training split.
m4_steps_in_epoch = m4_n_split // m4_batch_size

# Width presets: embedding size, output channels of conv layers 1-4, and hidden
# width of the linear layer before the head. Exactly one line should be active.
# In order they appear to be the tiny / normal / large / largest models of the
# results log: assuming a vocabulary of size 37, their parameter counts work out
# to 1,375 / 11,265 / 23,405 / 134,349.
#m4_n_embd, m4_l1_channels, m4_l2_channels, m4_l3_channels, m4_l4_channels, m4_head_channels = 2, 4, 8, 8, 12, 12
#m4_n_embd, m4_l1_channels, m4_l2_channels, m4_l3_channels, m4_l4_channels, m4_head_channels = 12, 16, 24, 24, 32, 64
m4_n_embd, m4_l1_channels, m4_l2_channels, m4_l3_channels, m4_l4_channels, m4_head_channels = 16, 24, 32, 48, 64, 64
#m4_n_embd, m4_l1_channels, m4_l2_channels, m4_l3_channels, m4_l4_channels, m4_head_channels = 40, 64, 96, 128, 128, 192

# Negative slope of the LeakyReLU after the first conv layer; also passed to that
# layer's Kaiming initialisation.
m4_leaky_relu_alpha = 0.05

In [20]:
class Model4(nn.Module):
  """Character-level next-character classifier built from 1-D convolutions.

  Pipeline: embedding -> four Conv1d/BatchNorm/activation stages (kernel size 2,
  stages 2 and 4 with stride 2) -> flatten -> Linear/BatchNorm/Tanh -> linear head.

  The first linear layer expects 2 * m4_l4_channels features, i.e. a sequence of
  length 2 after the last convolution; an 11-token input shrinks as
  11 -> 10 -> 5 -> 4 -> 2.
  """

  def __init__(self):
    super().__init__()

    # Token embedding, one row per vocabulary id.
    self.embedding = nn.Embedding(size, m4_n_embd)
    nn.init.kaiming_normal_(self.embedding.weight, mode='fan_in')

    # Stage 1: kernel 2, stride 1, LeakyReLU (Kaiming init matched to its slope).
    self.l1 = nn.Conv1d(m4_n_embd, m4_l1_channels, kernel_size=2)
    nn.init.kaiming_normal_(
      self.l1.weight,
      a=m4_leaky_relu_alpha,
      mode='fan_in',
      nonlinearity='leaky_relu',
    )
    self.batch1 = nn.BatchNorm1d(m4_l1_channels)
    self.activ1 = nn.LeakyReLU(m4_leaky_relu_alpha)

    # Stage 2: kernel 2, stride 2, Tanh (Xavier init from here on).
    self.l2 = nn.Conv1d(m4_l1_channels, m4_l2_channels, kernel_size=2, stride=2)
    nn.init.xavier_uniform_(self.l2.weight)
    self.batch2 = nn.BatchNorm1d(m4_l2_channels)
    self.activ2 = nn.Tanh()

    # Stage 3: kernel 2, stride 1, Tanh.
    self.l3 = nn.Conv1d(m4_l2_channels, m4_l3_channels, kernel_size=2)
    nn.init.xavier_uniform_(self.l3.weight)
    self.batch3 = nn.BatchNorm1d(m4_l3_channels)
    self.activ3 = nn.Tanh()

    # Stage 4: kernel 2, stride 2, Tanh.
    self.l4 = nn.Conv1d(m4_l3_channels, m4_l4_channels, kernel_size=2, stride=2)
    nn.init.xavier_uniform_(self.l4.weight)
    self.batch4 = nn.BatchNorm1d(m4_l4_channels)
    self.activ4 = nn.Tanh()

    # Fully connected block on the flattened (channels x 2) feature map.
    self.flat = nn.Flatten()
    self.l5 = nn.Linear(2 * m4_l4_channels, m4_head_channels)
    nn.init.xavier_uniform_(self.l5.weight)
    self.batch5 = nn.BatchNorm1d(m4_head_channels)
    self.activ5 = nn.Tanh()

    # Output layer: one logit per vocabulary id (default initialisation).
    self.head = nn.Linear(m4_head_channels, size)

  def forward(self, x):
    """Map a (batch, sequence) tensor of token ids to (batch, size) logits."""
    conv_stages = (
      (self.l1, self.batch1, self.activ1),
      (self.l2, self.batch2, self.activ2),
      (self.l3, self.batch3, self.activ3),
      (self.l4, self.batch4, self.activ4),
    )

    # Conv1d wants (batch, channels, sequence); the embedding yields (batch, sequence, channels).
    h = self.embedding(x).transpose(1, 2)
    for conv, norm, act in conv_stages:
      h = act(norm(conv(h)))

    h = self.activ5(self.batch5(self.l5(self.flat(h))))
    return self.head(h)

In [None]:
# Seed torch before building the model: layer construction and the nn.init calls
# in Model4.__init__ draw from torch's global RNG, so without a seed here the
# initial weights differ on every run. 74 is the seed value the training cell
# passes to torch.manual_seed.
torch.manual_seed(74)
model4 = Model4()
print(f"Number of parameters in model4 = {count_parameters(model4)}")

In [None]:
# model training
# Seed torch's global RNG before training; the loop below draws its mini-batches
# with torch.randint.
torch.manual_seed(74)
# AdamW with an initial learning rate of 0.01 and weight decay of 0.001; the loop
# below overwrites weight_decay after every epoch.
m4_optimizer = torch.optim.AdamW(model4.parameters(), lr = 0.01, weight_decay = 0.001)
# Multiplies the learning rate by 0.5 each time m4_scheduler.step() is called.
m4_scheduler = torch.optim.lr_scheduler.ExponentialLR(m4_optimizer, gamma=0.5)

def _m4_mean_loss(inputs, targets, batch_size):
  """Mean per-sample cross-entropy of model4 over all rows, evaluated in chunks.

  Returns NaN when there are no rows. The caller is responsible for eval mode
  and for disabling gradients.
  """
  n_total = inputs.size(0)
  if n_total == 0:
    return float("nan")

  loss_sum = 0.0
  for start in range(0, n_total, batch_size):
    stop = start + batch_size
    logits = model4(inputs[start:stop])
    # Sum (not mean) per chunk so a shorter final chunk is weighted correctly.
    loss_sum += F.cross_entropy(logits, targets[start:stop], reduction="sum").item()
  return loss_sum / n_total


def m4_getScore():
  """Evaluate model4 on the full training and validation splits.

  Every sample is scored. The previous loop bounds skipped the tail of each
  split (up to a whole batch) and divided by zero for splits of at most one
  batch.

  Returns:
    (train_llh, valid_llh): mean cross-entropy on each split.

  Side effect: leaves model4 in eval mode.
  """
  model4.eval()

  t_batch = 256

  with torch.no_grad():
    valid_llh = _m4_mean_loss(m4_x_vld, m4_y_vld, t_batch)
    train_llh = _m4_mean_loss(m4_x_trn, m4_y_trn, t_batch)

  return train_llh, valid_llh

# Scores of the model before any training step, reported as "epoch -1".
train_llh, valid_llh = m4_getScore()
print(f"epoch {-1} loss = {train_llh}   {valid_llh}")

for epoch in range(10):
  # m4_getScore() switches the model to eval mode, so re-enable training mode each epoch.
  model4.train()

  for step in range(m4_steps_in_epoch):
    m4_optimizer.zero_grad()

    # Mini-batch drawn uniformly at random, with replacement, from the training split.
    ix = torch.randint(0, m4_x_trn.size(0), (m4_batch_size,))
    x, y = m4_x_trn[ix], m4_y_trn[ix]

    outputs = model4(x)
    loss = F.cross_entropy(outputs, y)

    loss.backward()
    m4_optimizer.step()

  # Decay the learning rate once per epoch, then keep weight decay at 10% of the
  # first parameter group's new learning rate. The rate is read once instead of
  # rebuilding a list of all rates for every group.
  m4_scheduler.step()
  m4_current_lr = m4_optimizer.param_groups[0]['lr']
  for param_group in m4_optimizer.param_groups:
    param_group['weight_decay'] = m4_current_lr * 0.1

  train_llh, valid_llh = m4_getScore()

  print(f"epoch {epoch} loss = {train_llh}   {valid_llh}")

In [24]:
# Save only the trained weights (state_dict), not the whole module.
# NOTE(review): the file name is hard-coded to "large"; change it by hand when a
# different width preset is trained, otherwise this file is overwritten.
model_path = 'data/model4_large.pth'
torch.save(model4.state_dict(), model_path)

In [None]:
import matplotlib.pyplot as plt

def _m4_embedding_points_2d(weight):
    """Return the embedding rows as a list of [x, y] points.

    A 2-column embedding is returned unchanged. Wider embeddings are centred and
    projected onto their first two principal axes (PCA via SVD), because
    unpacking the rows into exactly two coordinates raises ValueError for any
    embedding size other than 2.
    """
    points = weight.detach()
    if points.size(1) > 2:
        centered = points - points.mean(dim=0, keepdim=True)
        # Rows of vh are the principal axes, ordered by decreasing singular value.
        _, _, vh = torch.linalg.svd(centered, full_matrices=False)
        points = centered @ vh[:2].T
    return points.tolist()

# One 2-D point per vocabulary id.
l = _m4_embedding_points_2d(model4.embedding.weight)
x_coords, y_coords = zip(*l)

plt.figure(dpi=300)
plt.scatter(x_coords, y_coords, s=400)

# Add text labels to each point
for i, (x, y) in enumerate(l):
    plt.text(x, y, itoc[i], fontsize=12, ha='center', va='center')

plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('2D Points')

plt.show()