In [1]:
import torch
import torch.nn as nn
import random
import torch.nn.functional as F

In [2]:
# -*- coding: windows-1251 -*-

def readWords(path):
  """Read a newline-separated word list and return its distinct non-empty lines.

  The file is decoded as UTF-8 first; only if that fails with a decoding error
  is it re-read as cp1251.

  Args:
    path: Path of the text file to read.

  Returns:
    A set with every distinct line of the file, the empty string excluded.

  Raises:
    OSError: If the file cannot be opened (for example, it does not exist).
    UnicodeDecodeError: If the file is valid in neither UTF-8 nor cp1251.
  """
  try:
    with open(path, 'r', encoding="utf-8") as f:
      text = f.read()
  except UnicodeDecodeError:
    # Fall back to the legacy encoding on a decoding failure only. A bare
    # `except` would also catch KeyboardInterrupt and hide unrelated errors.
    with open(path, 'r', encoding="cp1251") as f:
      text = f.read()

  words = set(text.split("\n"))
  words.discard("")
  return words

def getAllWords(listNames):
  """Load several word lists and merge them into one duplicate-free list.

  Args:
    listNames: Iterable of list names; each name is read from
      "data/<name>.txt" via readWords.

  Returns:
    A list of the distinct words across all files, in sorted order.
  """
  allWords = set()

  for name in listNames:
    allWords.update(readWords("data/" + name + ".txt"))

  # Iterating a set of str yields a hash-dependent order that differs between
  # interpreter runs (string hashing is randomised by default). Sorting makes
  # the returned order, and everything indexed or shuffled from it, stable.
  return sorted(allWords)

# "Small" corpus: the union of these word lists, each read from data/<name>.txt.
words_s = getAllWords(["locations", "locations2", "names2", "names3", "words1"])
print(f"small words list size = {len(words_s)}")
# "Full" corpus: the same lists plus words2_1 and words2_2 (currently disabled).
#words_f = getAllWords(["locations", "locations2", "names2", "names3", "words1", "words2_1", "words2_2"])
#print(f"full words list size = {len(words_f)}")

small words list size = 173228


In [3]:
# The small corpus is the working word list for everything that follows.
words_set = words_s

# Alphabet: every distinct character occurring in any word, in sorted order.
chars = sorted(set().union(*words_set))

print("Number of unique chars = " + str(len(chars)))
print("".join(chars))

# Character <-> integer id lookup tables; ids follow the sorted alphabet.
ctoi = {ch: idx for idx, ch in enumerate(chars)}
itoc = dict(enumerate(chars))

# '.' is the padding / end-of-word marker. It takes the id right after the
# alphabet, so the vocabulary holds len(chars) + 1 symbols in total.
delim = len(chars)
ctoi['.'] = delim
itoc[delim] = '.'
size = delim + 1

Number of unique chars = 36
 '-абвгдежзийклмнопрстуфхцчшщьюяєіїґ


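Log for different models (p = number of trainable parameters; llh = mean cross-entropy per predicted character, i.e. negative log-likelihood, so lower is better):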

cnt2 (p = 1,369): train_llh = 2.586 valid_llh = 2.586

cnt3 (p = 50,653): train_llh = 2.263 valid_llh = 2.276

model1_tiny (p = 1,263): train_llh = 2.386 valid_llh = 2.382

model1_normal (p = 10,617): train_llh = 2.169 valid_llh = 2.170

model1_large (p = 25,165): train_llh = 2.061 valid_llh = 2.073

model2_tiny (p = 899): train_llh = 2.445 valid_llh = 2.447

model2_normal (p = 10,997): train_llh = 2.155 valid_llh = 2.163

model2_large (p = 21,525): train_llh = 2.097 valid_llh = 2.109

model3_tiny (p = 951): train_llh = 2.438 valid_llh = 2.433

model3_normal (p = 10,965): train_llh = 2.191 valid_llh = 2.189

model3_large (p = 29,829): train_llh = 2.112 valid_llh = 2.116

In [4]:
def count_parameters(model):
  """Return the total number of scalar values in the trainable parameters of `model`.

  Parameters whose `requires_grad` flag is False are not counted.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

In [None]:
# Number of characters in every model1 input context.
m1_input_len = 14

def m1_align_word(word):
  """Fit `word` into a fixed-width context of m1_input_len characters.

  At most the last m1_input_len - 4 characters of `word` are kept, shorter
  input is left-padded with '.', and the result is wrapped in '..' on both
  sides.

  Args:
    word: The (possibly empty) prefix to align.

  Returns:
    A string of exactly m1_input_len characters.
  """
  # The four remaining positions are the fixed '..' added on each side.
  body_len = m1_input_len - 4
  if len(word) > body_len:
    word = word[-body_len:]
  return ".." + word.rjust(body_len, ".") + ".."

def m1_split_word_into_tokens(word):
  """Unroll `word` into (context, next character) training pairs.

  For every position the context is the aligned prefix before that position
  and the target is the character at it. A final pair maps the aligned whole
  word to the end marker '.'.

  Returns:
    A list of len(word) + 1 (context string, target character) tuples.
  """
  pairs = [(m1_align_word(word[:pos]), ch) for pos, ch in enumerate(word)]
  pairs.append((m1_align_word(word), "."))
  return pairs

def m1_str_to_int(samples):
  """Encode (context string, target character) pairs as integer ids via `ctoi`.

  Returns:
    A list of (list of context ids, target id) tuples, in the input order.
  """
  encoded = []
  for pair in samples:
    context_ids = [ctoi[ch] for ch in pair[0]]
    encoded.append((context_ids, ctoi[pair[1]]))
  return encoded

# Preview: how the first word unrolls into (context -> next character) pairs.
for x,y in m1_split_word_into_tokens(words_set[0]):
  print(f"{x} -> {y}")

# Encode every word of the corpus into integer (context, target) samples.
m1_all_samples = []
for word in words_set:
  m1_all_samples.extend(m1_str_to_int(m1_split_word_into_tokens(word)))

# Seed Python's RNG before shuffling; otherwise the train/validation split is
# different on every run and the logged scores cannot be reproduced. The split
# is repeatable for a given ordering of words_set. 74 is the same value the
# training cell passes to torch.manual_seed.
random.seed(74)
random.shuffle(m1_all_samples)

print(f"total of {len(m1_all_samples)} tokens")

# The first 90% of the shuffled samples are used for training, the rest for validation.
m1_n_split = int(len(m1_all_samples) * 0.9)

m1_x_trn = torch.tensor([sample[0] for sample in m1_all_samples[0:m1_n_split]])
m1_y_trn = torch.tensor([sample[1] for sample in m1_all_samples[0:m1_n_split]])

m1_x_vld = torch.tensor([sample[0] for sample in m1_all_samples[m1_n_split:]])
m1_y_vld = torch.tensor([sample[1] for sample in m1_all_samples[m1_n_split:]])

print(f"train shape x = {m1_x_trn.shape} y = {m1_y_trn.shape}")
print(f"validation shape x = {m1_x_vld.shape} y = {m1_y_vld.shape}")

In [None]:
# Samples per optimisation step.
m1_batch_size = 32
# Steps per "epoch": the number of batches of this size that fit in the training split.
m1_steps_in_epoch = m1_n_split // m1_batch_size

# Layer-width presets; keep exactly one line active. The tiny / normal / large
# labels correspond to the model1_* parameter counts recorded in the log.
m1_n_embd, m1_l1_channels, m1_l2_channels, m1_l3_channels, m1_head_channels = 2, 4, 8, 12, 12  # tiny
#m1_n_embd, m1_l1_channels, m1_l2_channels, m1_l3_channels, m1_head_channels = 12, 16, 24, 32, 64  # normal
#m1_n_embd, m1_l1_channels, m1_l2_channels, m1_l3_channels, m1_head_channels = 24, 32, 48, 64, 64  # large

# Negative slope of the LeakyReLU after the first convolution (also passed to its Kaiming init).
m1_leaky_relu_alpha = 0.05

In [None]:
class Model1(nn.Module):
  """Next-character classifier built from three 1-D convolutions and a dense head.

  Pipeline: embedding -> Conv1d(kernel 3) -> Conv1d(kernel 3, stride 3) ->
  Conv1d(kernel 2, stride 2) -> flatten -> Linear -> Linear head. Every hidden
  layer is followed by BatchNorm1d and an activation (LeakyReLU after the
  first convolution, Tanh after the others).

  Layer widths come from the module-level m1_* settings and the vocabulary
  size from `size`. The first Linear layer takes 2 * m1_l3_channels inputs,
  i.e. it expects two positions to remain after the third convolution, which
  is what a 14-token context yields (14 -> 12 -> 4 -> 2).
  """

  def __init__(self):
    super().__init__()

    # Token embedding.
    self.embedding = nn.Embedding(num_embeddings=size, embedding_dim=m1_n_embd)
    nn.init.kaiming_normal_(self.embedding.weight, mode='fan_in')

    # Convolution 1: kernel 3, stride 1.
    self.l1 = nn.Conv1d(m1_n_embd, m1_l1_channels, kernel_size=3)
    nn.init.kaiming_normal_(
      self.l1.weight,
      a=m1_leaky_relu_alpha,
      mode='fan_in',
      nonlinearity='leaky_relu',
    )
    self.batch1 = nn.BatchNorm1d(m1_l1_channels)
    self.activ1 = nn.LeakyReLU(negative_slope=m1_leaky_relu_alpha)

    # Convolution 2: non-overlapping windows of 3.
    self.l2 = nn.Conv1d(m1_l1_channels, m1_l2_channels, kernel_size=3, stride=3)
    nn.init.xavier_uniform_(self.l2.weight)
    self.batch2 = nn.BatchNorm1d(m1_l2_channels)
    self.activ2 = nn.Tanh()

    # Convolution 3: non-overlapping windows of 2.
    self.l3 = nn.Conv1d(m1_l2_channels, m1_l3_channels, kernel_size=2, stride=2)
    nn.init.xavier_uniform_(self.l3.weight)
    self.batch3 = nn.BatchNorm1d(m1_l3_channels)
    self.activ3 = nn.Tanh()

    # Dense hidden layer over the flattened convolution output.
    self.flat = nn.Flatten()
    self.l4 = nn.Linear(in_features=2 * m1_l3_channels, out_features=m1_head_channels)
    nn.init.xavier_uniform_(self.l4.weight)
    self.batch4 = nn.BatchNorm1d(m1_head_channels)
    self.activ4 = nn.Tanh()

    # Output layer: one logit per vocabulary symbol (default initialisation).
    self.head = nn.Linear(in_features=m1_head_channels, out_features=size)

  def forward(self, x):
    """Map a batch of token-id contexts to unnormalised next-character logits."""
    # (batch, length, embedding) -> (batch, embedding, length), the layout Conv1d expects.
    h = self.embedding(x).permute(0, 2, 1)

    h = self.l1(h)
    h = self.activ1(self.batch1(h))

    h = self.l2(h)
    h = self.activ2(self.batch2(h))

    h = self.l3(h)
    h = self.activ3(self.batch3(h))

    h = self.l4(self.flat(h))
    h = self.activ4(self.batch4(h))

    return self.head(h)

In [None]:
# Seed before construction: the layer weights are drawn from torch's global RNG
# inside Model1.__init__, so seeding only afterwards (as the training cell does)
# leaves the initial weights different on every run.
torch.manual_seed(74)
model1 = Model1()
print(f"Number of parameters in model1 = {count_parameters(model1)}")

In [None]:
# Training setup for model1: fixed seed for batch sampling, AdamW optimiser,
# and a scheduler that multiplies the learning rate by 0.5 on every step() call.
torch.manual_seed(74)
m1_optimizer = torch.optim.AdamW(
  model1.parameters(),
  lr=0.01,
  weight_decay=0.001,
)
m1_scheduler = torch.optim.lr_scheduler.ExponentialLR(m1_optimizer, gamma=0.5)

def m1_getScore():
  """Score model1 on the complete training and validation splits.

  Puts model1 into eval mode (it is left in eval mode on return) and computes
  the mean cross-entropy per sample with gradient tracking disabled.

  Returns:
    (train_llh, valid_llh): mean cross-entropy over m1_x_trn / m1_y_trn and
    over m1_x_vld / m1_y_vld respectively.

  Raises:
    ZeroDivisionError: If either split is empty.
  """
  model1.eval()

  t_batch = 256

  def mean_loss(xs, ys):
    # Chunks start at offset 0 and run to the end of the data, so the trailing
    # (possibly shorter) chunk is scored too. Iterating from t_batch up to, but
    # excluding, len(xs) would silently drop it. Losses are summed per chunk
    # and divided by the true sample count so that a short chunk is weighted
    # correctly.
    total = 0.0
    with torch.no_grad():
      for start in range(0, xs.size(0), t_batch):
        x = xs[start:start + t_batch]
        y = ys[start:start + t_batch]
        total += F.cross_entropy(model1(x), y, reduction='sum').item()
    return total / xs.size(0)

  valid_llh = mean_loss(m1_x_vld, m1_y_vld)
  train_llh = mean_loss(m1_x_trn, m1_y_trn)

  return train_llh, valid_llh

# Score of the untrained model, reported as "epoch -1".
train_llh, valid_llh = m1_getScore()
print(f"epoch {-1} loss = {train_llh}   {valid_llh}")

for epoch in range(10):
  # m1_getScore leaves the model in eval mode, so switch back every epoch.
  model1.train()

  for step in range(m1_steps_in_epoch):
    m1_optimizer.zero_grad()

    # Mini-batch of indices drawn uniformly, with replacement, from the training split.
    ix = torch.randint(0, m1_x_trn.size(0), (m1_batch_size,))
    x, y = m1_x_trn[ix], m1_y_trn[ix]

    outputs = model1(x)
    loss = F.cross_entropy(outputs, y)

    loss.backward()
    m1_optimizer.step()

  # Decay the learning rate once per epoch, then set every group's weight decay
  # to 10% of the first group's new learning rate.
  m1_scheduler.step()
  for param_group in m1_optimizer.param_groups:
    param_group['weight_decay'] = m1_optimizer.param_groups[0]['lr'] * 0.1

  train_llh, valid_llh = m1_getScore()

  print(f"epoch {epoch} loss = {train_llh}   {valid_llh}")

In [None]:
# Persist model1's weights (state_dict only, not the module object).
# NOTE(review): the m1_* widths assigned in this notebook are the smallest preset
# (its parameter count matches the "model1_tiny" log entry), yet the file is named
# "large" — confirm the name before overwriting an existing checkpoint.
model_path = "data/model1_large.pth"
torch.save(model1.state_dict(), f=model_path)

In [None]:
import matplotlib.pyplot as plt

# Embedding table, one row per token id.
emb = model1.embedding.weight.detach()
if emb.size(1) > 2:
  # Rows wider than two values cannot be unpacked into (x, y) pairs, which made
  # this cell fail for every preset except the 2-dimensional one. Show the
  # projection onto the first two principal components instead.
  centered = emb - emb.mean(dim=0)
  _, _, vh = torch.linalg.svd(centered, full_matrices=False)
  emb = centered @ vh[:2].T

l = emb.tolist()
x_coords, y_coords = zip(*l)

plt.figure(dpi=300)
plt.scatter(x_coords, y_coords, s=400)

# Add text labels to each point
for i, (x, y) in enumerate(l):
    plt.text(x, y, itoc[i], fontsize=12, ha='center', va='center')

plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('2D Points')

plt.show()

In [None]:
# Number of characters in every model2 input context.
m2_input_len = 8

def m2_align_word(word):
  """Fit `word` into a fixed-width context of m2_input_len characters.

  Longer input keeps only its last m2_input_len characters; shorter input is
  left-padded with '.'.

  Args:
    word: The (possibly empty) prefix to align.

  Returns:
    A string of exactly m2_input_len characters.
  """
  if len(word) > m2_input_len:
    return word[-m2_input_len:]
  return word.rjust(m2_input_len, ".")

def m2_split_word_into_tokens(word):
  """Unroll `word` into (context, next character) training pairs.

  Each character is paired with the aligned prefix that precedes it; a final
  pair maps the aligned whole word to the end marker '.'.

  Returns:
    A list of len(word) + 1 (context string, target character) tuples.
  """
  pairs = []
  for pos, ch in enumerate(word):
    pairs.append((m2_align_word(word[:pos]), ch))
  return pairs + [(m2_align_word(word), ".")]

def m2_str_to_int(samples):
  """Encode (context string, target character) pairs as integer ids via `ctoi`.

  Returns:
    A list of (list of context ids, target id) tuples, in the input order.
  """
  def encode(sample):
    return [ctoi[ch] for ch in sample[0]], ctoi[sample[1]]

  return [encode(sample) for sample in samples]

# Preview: how the first word unrolls into (context -> next character) pairs.
for x,y in m2_split_word_into_tokens(words_set[0]):
  print(f"{x} -> {y}")

# Encode every word of the corpus into integer (context, target) samples.
m2_all_samples = []
for word in words_set:
  m2_all_samples.extend(m2_str_to_int(m2_split_word_into_tokens(word)))

# Seed Python's RNG before shuffling; otherwise the train/validation split is
# different on every run and the logged scores cannot be reproduced. The split
# is repeatable for a given ordering of words_set. 74 is the same value the
# training cell passes to torch.manual_seed.
random.seed(74)
random.shuffle(m2_all_samples)

print(f"total of {len(m2_all_samples)} tokens")

# The first 90% of the shuffled samples are used for training, the rest for validation.
m2_n_split = int(len(m2_all_samples) * 0.9)

m2_x_trn = torch.tensor([sample[0] for sample in m2_all_samples[0:m2_n_split]])
m2_y_trn = torch.tensor([sample[1] for sample in m2_all_samples[0:m2_n_split]])

m2_x_vld = torch.tensor([sample[0] for sample in m2_all_samples[m2_n_split:]])
m2_y_vld = torch.tensor([sample[1] for sample in m2_all_samples[m2_n_split:]])

print(f"train shape x = {m2_x_trn.shape} y = {m2_y_trn.shape}")
print(f"validation shape x = {m2_x_vld.shape} y = {m2_y_vld.shape}")

In [None]:
# Samples per optimisation step.
m2_batch_size = 32
# Steps per "epoch": the number of batches of this size that fit in the training split.
m2_steps_in_epoch = m2_n_split // m2_batch_size

# Layer-width presets; keep exactly one line active. The tiny / normal / large
# labels correspond to the model2_* parameter counts recorded in the log.
#m2_n_embd, m2_l1_channels, m2_l2_channels, m2_head_channels = 2, 4, 8, 12  # tiny
#m2_n_embd, m2_l1_channels, m2_l2_channels, m2_head_channels = 24, 32, 40, 48  # normal
m2_n_embd, m2_l1_channels, m2_l2_channels, m2_head_channels = 32, 48, 64, 64  # large

# Negative slope of the LeakyReLU after the first convolution (also passed to its Kaiming init).
m2_leaky_relu_alpha = 0.05

In [None]:
class Model2(nn.Module):
  """Next-character classifier built from two 1-D convolutions and a dense head.

  Pipeline: embedding -> Conv1d(kernel 2, stride 2) -> Conv1d(kernel 2,
  stride 2) -> flatten -> Linear -> Linear head. Every hidden layer is followed
  by BatchNorm1d and an activation (LeakyReLU after the first convolution,
  Tanh after the others).

  Layer widths come from the module-level m2_* settings and the vocabulary
  size from `size`. The first Linear layer takes 2 * m2_l2_channels inputs,
  i.e. it expects two positions to remain after the second convolution, which
  is what an 8-token context yields (8 -> 4 -> 2).
  """

  def __init__(self):
    super().__init__()

    # Token embedding.
    self.embedding = nn.Embedding(num_embeddings=size, embedding_dim=m2_n_embd)
    nn.init.kaiming_normal_(self.embedding.weight, mode='fan_in')

    # Convolution 1: non-overlapping windows of 2.
    self.l1 = nn.Conv1d(m2_n_embd, m2_l1_channels, kernel_size=2, stride=2)
    nn.init.kaiming_normal_(
      self.l1.weight,
      a=m2_leaky_relu_alpha,
      mode='fan_in',
      nonlinearity='leaky_relu',
    )
    self.batch1 = nn.BatchNorm1d(m2_l1_channels)
    self.activ1 = nn.LeakyReLU(negative_slope=m2_leaky_relu_alpha)

    # Convolution 2: non-overlapping windows of 2.
    self.l2 = nn.Conv1d(m2_l1_channels, m2_l2_channels, kernel_size=2, stride=2)
    nn.init.xavier_uniform_(self.l2.weight)
    self.batch2 = nn.BatchNorm1d(m2_l2_channels)
    self.activ2 = nn.Tanh()

    # Dense hidden layer over the flattened convolution output.
    self.flat = nn.Flatten()
    self.l3 = nn.Linear(in_features=2 * m2_l2_channels, out_features=m2_head_channels)
    nn.init.xavier_uniform_(self.l3.weight)
    self.batch3 = nn.BatchNorm1d(m2_head_channels)
    self.activ3 = nn.Tanh()

    # Output layer: one logit per vocabulary symbol (default initialisation).
    self.head = nn.Linear(in_features=m2_head_channels, out_features=size)

  def forward(self, x):
    """Map a batch of token-id contexts to unnormalised next-character logits."""
    # (batch, length, embedding) -> (batch, embedding, length), the layout Conv1d expects.
    h = self.embedding(x).permute(0, 2, 1)

    h = self.l1(h)
    h = self.activ1(self.batch1(h))

    h = self.l2(h)
    h = self.activ2(self.batch2(h))

    h = self.l3(self.flat(h))
    h = self.activ3(self.batch3(h))

    return self.head(h)

In [None]:
# Seed before construction: the layer weights are drawn from torch's global RNG
# inside Model2.__init__, so seeding only afterwards (as the training cell does)
# leaves the initial weights different on every run.
torch.manual_seed(74)
model2 = Model2()
print(f"Number of parameters in model2 = {count_parameters(model2)}")

In [None]:
# Training setup for model2: fixed seed for batch sampling, AdamW optimiser,
# and a scheduler that multiplies the learning rate by 0.5 on every step() call.
torch.manual_seed(74)
m2_optimizer = torch.optim.AdamW(
  model2.parameters(),
  lr=0.01,
  weight_decay=0.001,
)
m2_scheduler = torch.optim.lr_scheduler.ExponentialLR(m2_optimizer, gamma=0.5)

def m2_getScore():
  """Score model2 on the complete training and validation splits.

  Puts model2 into eval mode (it is left in eval mode on return) and computes
  the mean cross-entropy per sample with gradient tracking disabled.

  Returns:
    (train_llh, valid_llh): mean cross-entropy over m2_x_trn / m2_y_trn and
    over m2_x_vld / m2_y_vld respectively.

  Raises:
    ZeroDivisionError: If either split is empty.
  """
  model2.eval()

  t_batch = 256

  def mean_loss(xs, ys):
    # Chunks start at offset 0 and run to the end of the data, so the trailing
    # (possibly shorter) chunk is scored too. Iterating from t_batch up to, but
    # excluding, len(xs) would silently drop it. Losses are summed per chunk
    # and divided by the true sample count so that a short chunk is weighted
    # correctly.
    total = 0.0
    with torch.no_grad():
      for start in range(0, xs.size(0), t_batch):
        x = xs[start:start + t_batch]
        y = ys[start:start + t_batch]
        total += F.cross_entropy(model2(x), y, reduction='sum').item()
    return total / xs.size(0)

  valid_llh = mean_loss(m2_x_vld, m2_y_vld)
  train_llh = mean_loss(m2_x_trn, m2_y_trn)

  return train_llh, valid_llh

# Score of the untrained model, reported as "epoch -1".
train_llh, valid_llh = m2_getScore()
print(f"epoch {-1} loss = {train_llh}   {valid_llh}")

for epoch in range(10):
  # m2_getScore leaves the model in eval mode, so switch back every epoch.
  model2.train()

  for step in range(m2_steps_in_epoch):
    m2_optimizer.zero_grad()

    # Mini-batch of indices drawn uniformly, with replacement, from the training split.
    ix = torch.randint(0, m2_x_trn.size(0), (m2_batch_size,))
    x, y = m2_x_trn[ix], m2_y_trn[ix]

    outputs = model2(x)
    loss = F.cross_entropy(outputs, y)

    loss.backward()
    m2_optimizer.step()

  # Decay the learning rate once per epoch, then set every group's weight decay
  # to 10% of the first group's new learning rate.
  m2_scheduler.step()
  for param_group in m2_optimizer.param_groups:
    param_group['weight_decay'] = m2_optimizer.param_groups[0]['lr'] * 0.1

  train_llh, valid_llh = m2_getScore()

  print(f"epoch {epoch} loss = {train_llh}   {valid_llh}")

In [None]:
# Persist model2's weights (state_dict only, not the module object).
# NOTE(review): the "large" in the file name is hard-coded; which preset was
# actually trained depends on the m2_* widths active when this cell runs.
model_path = "data/model2_large.pth"
torch.save(model2.state_dict(), f=model_path)

In [None]:
import matplotlib.pyplot as plt

# Embedding table, one row per token id.
emb = model2.embedding.weight.detach()
if emb.size(1) > 2:
  # Rows wider than two values cannot be unpacked into (x, y) pairs, which made
  # this cell fail for every preset except the 2-dimensional one (including the
  # one currently active). Show the projection onto the first two principal
  # components instead.
  centered = emb - emb.mean(dim=0)
  _, _, vh = torch.linalg.svd(centered, full_matrices=False)
  emb = centered @ vh[:2].T

l = emb.tolist()
x_coords, y_coords = zip(*l)

plt.figure(dpi=300)
plt.scatter(x_coords, y_coords, s=400)

# Add text labels to each point
for i, (x, y) in enumerate(l):
    plt.text(x, y, itoc[i], fontsize=12, ha='center', va='center')

plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('2D Points')

plt.show()

In [None]:
# Number of characters in every model3 input context.
m3_input_len = 9

def m3_align_word(word):
  """Fit `word` into a fixed-width context of m3_input_len characters.

  Longer input keeps only its last m3_input_len characters; shorter input is
  left-padded with '.'.

  Args:
    word: The (possibly empty) prefix to align.

  Returns:
    A string of exactly m3_input_len characters.
  """
  missing = m3_input_len - len(word)
  if missing > 0:
    return "." * missing + word
  if missing < 0:
    return word[-m3_input_len:]
  return word

def m3_split_word_into_tokens(word):
  """Unroll `word` into (context, next character) training pairs.

  Each character is paired with the aligned prefix that precedes it; a final
  pair maps the aligned whole word to the end marker '.'.

  Returns:
    A list of len(word) + 1 (context string, target character) tuples.
  """
  pairs = [(m3_align_word(word[:pos]), ch) for pos, ch in enumerate(word)]
  pairs.append((m3_align_word(word), "."))
  return pairs

def m3_str_to_int(samples):
  """Encode (context string, target character) pairs as integer ids via `ctoi`.

  Returns:
    A list of (list of context ids, target id) tuples, in the input order.
  """
  encoded = []
  for sample in samples:
    context_ids = [ctoi[ch] for ch in sample[0]]
    target_id = ctoi[sample[1]]
    encoded.append((context_ids, target_id))
  return encoded

# Preview: how the first word unrolls into (context -> next character) pairs.
for x,y in m3_split_word_into_tokens(words_set[0]):
  print(f"{x} -> {y}")

# Encode every word of the corpus into integer (context, target) samples.
m3_all_samples = []
for word in words_set:
  m3_all_samples.extend(m3_str_to_int(m3_split_word_into_tokens(word)))

# Seed Python's RNG before shuffling; otherwise the train/validation split is
# different on every run and the logged scores cannot be reproduced. The split
# is repeatable for a given ordering of words_set. 74 is the same value the
# training cell passes to torch.manual_seed.
random.seed(74)
random.shuffle(m3_all_samples)

print(f"total of {len(m3_all_samples)} tokens")

# The first 90% of the shuffled samples are used for training, the rest for validation.
m3_n_split = int(len(m3_all_samples) * 0.9)

m3_x_trn = torch.tensor([sample[0] for sample in m3_all_samples[0:m3_n_split]])
m3_y_trn = torch.tensor([sample[1] for sample in m3_all_samples[0:m3_n_split]])

m3_x_vld = torch.tensor([sample[0] for sample in m3_all_samples[m3_n_split:]])
m3_y_vld = torch.tensor([sample[1] for sample in m3_all_samples[m3_n_split:]])

print(f"train shape x = {m3_x_trn.shape} y = {m3_y_trn.shape}")
print(f"validation shape x = {m3_x_vld.shape} y = {m3_y_vld.shape}")

In [23]:
# Samples per optimisation step.
m3_batch_size = 32
# Steps per "epoch": the number of batches of this size that fit in the training split.
m3_steps_in_epoch = m3_n_split // m3_batch_size

# Layer-width presets; keep exactly one line active. The tiny / normal / large
# labels correspond to the model3_* parameter counts recorded in the log.
m3_n_embd, m3_l1_channels, m3_head_channels = 2, 8, 12  # tiny
#m3_n_embd, m3_l1_channels, m3_head_channels = 16, 32, 64  # normal
#m3_n_embd, m3_l1_channels, m3_head_channels = 32, 64, 96  # large

# Negative slope of the LeakyReLU after the convolution (also passed to its Kaiming init).
m3_leaky_relu_alpha = 0.05

In [24]:
class Model3(nn.Module):
  """Next-character classifier built from one 1-D convolution and a dense head.

  Pipeline: embedding -> Conv1d(kernel 3, stride 3) -> flatten -> Linear ->
  Linear head. The convolution is followed by BatchNorm1d and LeakyReLU, the
  hidden Linear layer by BatchNorm1d and Tanh.

  Layer widths come from the module-level m3_* settings and the vocabulary
  size from `size`. The hidden Linear layer takes 3 * m3_l1_channels inputs,
  i.e. it expects three positions to remain after the convolution, which is
  what a 9-token context yields (9 -> 3).
  """

  def __init__(self):
    super().__init__()

    # Token embedding.
    self.embedding = nn.Embedding(num_embeddings=size, embedding_dim=m3_n_embd)
    nn.init.kaiming_normal_(self.embedding.weight, mode='fan_in')

    # Convolution: non-overlapping windows of 3.
    self.l1 = nn.Conv1d(m3_n_embd, m3_l1_channels, kernel_size=3, stride=3)
    nn.init.kaiming_normal_(
      self.l1.weight,
      a=m3_leaky_relu_alpha,
      mode='fan_in',
      nonlinearity='leaky_relu',
    )
    self.batch1 = nn.BatchNorm1d(m3_l1_channels)
    self.activ1 = nn.LeakyReLU(negative_slope=m3_leaky_relu_alpha)

    # Dense hidden layer over the flattened convolution output.
    self.flat = nn.Flatten()
    self.l2 = nn.Linear(in_features=3 * m3_l1_channels, out_features=m3_head_channels)
    nn.init.xavier_uniform_(self.l2.weight)
    self.batch2 = nn.BatchNorm1d(m3_head_channels)
    self.activ2 = nn.Tanh()

    # Output layer: one logit per vocabulary symbol (default initialisation).
    self.head = nn.Linear(in_features=m3_head_channels, out_features=size)

  def forward(self, x):
    """Map a batch of token-id contexts to unnormalised next-character logits."""
    # (batch, length, embedding) -> (batch, embedding, length), the layout Conv1d expects.
    h = self.embedding(x).permute(0, 2, 1)

    h = self.l1(h)
    h = self.activ1(self.batch1(h))

    h = self.l2(self.flat(h))
    h = self.activ2(self.batch2(h))

    return self.head(h)

In [None]:
# Seed before construction: the layer weights are drawn from torch's global RNG
# inside Model3.__init__, so seeding only afterwards (as the training cell does)
# leaves the initial weights different on every run.
torch.manual_seed(74)
model3 = Model3()
print(f"Number of parameters in model3 = {count_parameters(model3)}")

In [None]:
# Training setup for model3: fixed seed for batch sampling, AdamW optimiser,
# and a scheduler that multiplies the learning rate by 0.5 on every step() call.
torch.manual_seed(74)
m3_optimizer = torch.optim.AdamW(
  model3.parameters(),
  lr=0.01,
  weight_decay=0.001,
)
m3_scheduler = torch.optim.lr_scheduler.ExponentialLR(m3_optimizer, gamma=0.5)

def m3_getScore():
  """Score model3 on the complete training and validation splits.

  Puts model3 into eval mode (it is left in eval mode on return) and computes
  the mean cross-entropy per sample with gradient tracking disabled.

  Returns:
    (train_llh, valid_llh): mean cross-entropy over m3_x_trn / m3_y_trn and
    over m3_x_vld / m3_y_vld respectively.

  Raises:
    ZeroDivisionError: If either split is empty.
  """
  model3.eval()

  t_batch = 256

  def mean_loss(xs, ys):
    # Chunks start at offset 0 and run to the end of the data, so the trailing
    # (possibly shorter) chunk is scored too. Iterating from t_batch up to, but
    # excluding, len(xs) would silently drop it. Losses are summed per chunk
    # and divided by the true sample count so that a short chunk is weighted
    # correctly.
    total = 0.0
    with torch.no_grad():
      for start in range(0, xs.size(0), t_batch):
        x = xs[start:start + t_batch]
        y = ys[start:start + t_batch]
        total += F.cross_entropy(model3(x), y, reduction='sum').item()
    return total / xs.size(0)

  valid_llh = mean_loss(m3_x_vld, m3_y_vld)
  train_llh = mean_loss(m3_x_trn, m3_y_trn)

  return train_llh, valid_llh

# Score of the untrained model, reported as "epoch -1".
train_llh, valid_llh = m3_getScore()
print(f"epoch {-1} loss = {train_llh}   {valid_llh}")

for epoch in range(10):
  # m3_getScore leaves the model in eval mode, so switch back every epoch.
  model3.train()

  for step in range(m3_steps_in_epoch):
    m3_optimizer.zero_grad()

    # Mini-batch of indices drawn uniformly, with replacement, from the training split.
    ix = torch.randint(0, m3_x_trn.size(0), (m3_batch_size,))
    x, y = m3_x_trn[ix], m3_y_trn[ix]

    outputs = model3(x)
    loss = F.cross_entropy(outputs, y)

    loss.backward()
    m3_optimizer.step()

  # Decay the learning rate once per epoch, then set every group's weight decay
  # to 10% of the first group's new learning rate.
  m3_scheduler.step()
  for param_group in m3_optimizer.param_groups:
    param_group['weight_decay'] = m3_optimizer.param_groups[0]['lr'] * 0.1

  train_llh, valid_llh = m3_getScore()

  print(f"epoch {epoch} loss = {train_llh}   {valid_llh}")

In [22]:
# Persist model3's weights (state_dict only, not the module object).
# NOTE(review): the m3_* widths assigned in this notebook are the smallest preset
# (its parameter count matches the "model3_tiny" log entry), yet the file is named
# "large" — confirm the name before overwriting an existing checkpoint.
model_path = "data/model3_large.pth"
torch.save(model3.state_dict(), f=model_path)

In [None]:
import matplotlib.pyplot as plt

# Embedding table, one row per token id.
emb = model3.embedding.weight.detach()
if emb.size(1) > 2:
  # Rows wider than two values cannot be unpacked into (x, y) pairs, which made
  # this cell fail for every preset except the 2-dimensional one. Show the
  # projection onto the first two principal components instead.
  centered = emb - emb.mean(dim=0)
  _, _, vh = torch.linalg.svd(centered, full_matrices=False)
  emb = centered @ vh[:2].T

l = emb.tolist()
x_coords, y_coords = zip(*l)

plt.figure(dpi=300)
plt.scatter(x_coords, y_coords, s=400)

# Add text labels to each point
for i, (x, y) in enumerate(l):
    plt.text(x, y, itoc[i], fontsize=12, ha='center', va='center')

plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('2D Points')

plt.show()