In [1]:
import torch
import lzma
from itertools import islice
import regex as re
import sys
from torchtext.vocab import build_vocab_from_iterator
from torch import nn
from torch.utils.data import IterableDataset, DataLoader
import itertools

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = "/content/drive/MyDrive/ModelowanieJezyka/"

In [4]:
with open(path + "challenging-america-word-gap-prediction/train/expected.tsv") as file:
  expected = file.readlines()

expected = [x[:-1] for x in expected]

In [5]:
def get_words_from_line(line, count):
  line = line.rstrip()
  line = line.split("\t")
  text = line[-2] + " " + expected[count] + " " + line[-1]
  text = re.sub(r"\\+n", " ", text)
  text = re.sub('[^A-Za-z ]+', '', text)
  for t in text.split():
    yield t


def get_word_lines_from_file(file_name):
  count = 0
  with lzma.open(file_name, encoding='utf8', mode="rt") as fh:
    for line in fh:
       yield get_words_from_line(line, count)
       count += 1


In [6]:
vocab_size = 25_000

vocab = build_vocab_from_iterator(
    get_word_lines_from_file(path + "challenging-america-word-gap-prediction/train/in.tsv.xz"),
    max_tokens = vocab_size,
    specials = ['<unk>'])

In [7]:
from collections import deque
WORDS_LEN = 61
HALF_WORDS_LEN = WORDS_LEN // 2

def look_ahead_iterator(gen):
   words = deque()
   for item in gen:
      if len(words) == WORDS_LEN:
         first_part = tuple(itertools.islice(words, 0, HALF_WORDS_LEN))
         second_part = tuple(itertools.islice(words, HALF_WORDS_LEN+1, None))
         concat = first_part + second_part
         yield (concat, words[HALF_WORDS_LEN])
         words.popleft()
         words.append(item)
      else:
        words.append(item)

class FullLines(IterableDataset):
  def __init__(self, text_file, vocabulary_size):
      self.vocab = vocab
      self.vocab.set_default_index(self.vocab['<unk>'])
      self.vocabulary_size = vocabulary_size
      self.text_file = text_file

  def __iter__(self):
     return look_ahead_iterator(
         (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))

train_dataset = FullLines(path + "challenging-america-word-gap-prediction/train/in.tsv.xz", vocab_size)

In [27]:
embed_size = 300

class FinalNNModel(nn.Module):
  def __init__(self, vocabulary_size, embedding_size, hl_size ,second_hl_size):
      super(FinalNNModel, self).__init__()
      self.general_embeddings = nn.Embedding(vocabulary_size, embedding_size)
      self.embeddings = nn.Embedding(vocabulary_size, embedding_size)
      self.dropout = nn.Dropout(p=0.3)
      self.hl = nn.Linear(embedding_size*7, hl_size)
      self.second_hl = nn.Linear(hl_size, second_hl_size)
      self.output_layer = nn.Linear(second_hl_size, vocabulary_size)
      self.relu = nn.ReLU()

  def forward(self, x):
      emb_0 = self.embeddings(x[HALF_WORDS_LEN-3])
      emb_1 = self.embeddings(x[HALF_WORDS_LEN-2])
      emb_2 = self.embeddings(x[HALF_WORDS_LEN-1])
      emb_3 = self.embeddings(x[HALF_WORDS_LEN])
      emb_4 = self.embeddings(x[HALF_WORDS_LEN+1])
      emb_5 = self.embeddings(x[HALF_WORDS_LEN+2])

      first_ge = torch.mean(self.general_embeddings(torch.stack(x[:HALF_WORDS_LEN-3], dim=0)), dim=0)
      second_ge = torch.mean(self.general_embeddings(torch.stack(x[HALF_WORDS_LEN+3:], dim=0)), dim=0)
      
      general_embeddings = torch.mean(torch.stack([first_ge, second_ge]), dim=0)

      number_of_dimensions = len(emb_1.size())
      x = torch.cat([emb_0, emb_1, emb_2, emb_3, emb_4, emb_5, general_embeddings], dim=number_of_dimensions-1)
      
      x = self.hl(x)
      x = self.relu(x)
      x = self.dropout(x)
      x = self.second_hl(x)
      x = self.relu(x)
      x = self.output_layer(x)
      x = nn.LogSoftmax(dim=number_of_dimensions-1)(x)
      return x

hl_size = 1000
second_hl_size = 500

vocab.set_default_index(vocab['<unk>'])

In [28]:
device = 'cuda'
model = FinalNNModel(vocab_size, embed_size, hl_size, second_hl_size).to(device)
data = DataLoader(train_dataset, batch_size=10_000)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()

model.train()
step = 0
stacked_loss = 0
state = None

for x, y in data:
   x = [x[idx].to(device) for idx in range(len(x))]
   y = y.to(device)

   optimizer.zero_grad()
   y_predicted = model(x)
   loss = criterion(y_predicted, y)
   stacked_loss += loss

   if step % 100 == 0:
      print(step, stacked_loss)
      stacked_loss = 0

   step += 1
   loss.backward()
   optimizer.step()


0 tensor(10.1469, device='cuda:0', grad_fn=<AddBackward0>)
100 tensor(670.1851, device='cuda:0', grad_fn=<AddBackward0>)
200 tensor(600.6952, device='cuda:0', grad_fn=<AddBackward0>)
300 tensor(574.3068, device='cuda:0', grad_fn=<AddBackward0>)
400 tensor(555.0275, device='cuda:0', grad_fn=<AddBackward0>)
500 tensor(542.8818, device='cuda:0', grad_fn=<AddBackward0>)
600 tensor(531.1751, device='cuda:0', grad_fn=<AddBackward0>)
700 tensor(521.6666, device='cuda:0', grad_fn=<AddBackward0>)
800 tensor(515.3813, device='cuda:0', grad_fn=<AddBackward0>)
900 tensor(508.7432, device='cuda:0', grad_fn=<AddBackward0>)
1000 tensor(503.4024, device='cuda:0', grad_fn=<AddBackward0>)
1100 tensor(498.4164, device='cuda:0', grad_fn=<AddBackward0>)
1200 tensor(493.4991, device='cuda:0', grad_fn=<AddBackward0>)
1300 tensor(488.9643, device='cuda:0', grad_fn=<AddBackward0>)
1400 tensor(486.8802, device='cuda:0', grad_fn=<AddBackward0>)
1500 tensor(481.1358, device='cuda:0', grad_fn=<AddBackward0>)
1600 

In [29]:
vocab_unique = set(vocab.get_stoi().keys())

In [34]:
model.eval()

output = []
x = 0
with lzma.open(path + "challenging-america-word-gap-prediction/test-A/in.tsv.xz", encoding='utf8', mode="rt") as file:
    for line in file:
        line = line.split("\t")

        first_part = re.sub(r"\\+n", " ", line[-2])
        first_part = re.sub('[^A-Za-z ]+', '', first_part).split()

        second_part = re.sub(r"\\+n", " ", line[-1])
        second_part = re.sub('[^A-Za-z ]+', '', second_part).split()

        input_words = first_part[-HALF_WORDS_LEN:] + second_part[:HALF_WORDS_LEN]
        input_words = vocab.forward(input_words)
        input_tokens = [torch.tensor(q).to(device) for q in input_words]

        if len(input_words) < HALF_WORDS_LEN*2:
          output.append(":1.0\n")
          continue
        out = torch.exp(model(input_tokens))

        top = torch.topk(out, 100)
        top_indices = top.indices.tolist()
        top_probs = top.values.tolist()
        unk_bonus = 1 - sum(top_probs)
        top_words = vocab.lookup_tokens(top_indices)
        top_zipped = list(zip(top_words, top_probs))

        res = ""
        for w, p in top_zipped:
            if w == "<unk>":
                res += f":{(p + unk_bonus + 0.01):.4f} "
            else:
                if p > 0.0001:
                  res += f"{w}:{p:.4f} "
        
        res = res[:-1]
        res += "\n"
        output.append(res)

with open(path + "challenging-america-word-gap-prediction/test-A/out.tsv", mode="w") as file:
    file.writelines(output)

In [33]:
output = []
x = 0
with lzma.open(path + "challenging-america-word-gap-prediction/dev-0/in.tsv.xz", encoding='utf8', mode="rt") as file:
    for line in file:
        line = line.split("\t")

        first_part = re.sub(r"\\+n", " ", line[-2])
        first_part = re.sub('[^A-Za-z ]+', '', first_part).split()

        second_part = re.sub(r"\\+n", " ", line[-1])
        second_part = re.sub('[^A-Za-z ]+', '', second_part).split()

        input_words = first_part[-HALF_WORDS_LEN:] + second_part[:HALF_WORDS_LEN]
        input_words = vocab.forward(input_words)
        input_tokens = [torch.tensor(q).to(device) for q in input_words]

        if len(input_words) < HALF_WORDS_LEN*2:
          output.append(":1.0\n")
          continue
        out = torch.exp(model(input_tokens))

        top = torch.topk(out, 100)
        top_indices = top.indices.tolist()
        top_probs = top.values.tolist()
        unk_bonus = 1 - sum(top_probs)
        top_words = vocab.lookup_tokens(top_indices)
        top_zipped = list(zip(top_words, top_probs))

        res = ""
        for w, p in top_zipped:
            if w == "<unk>":
                res += f":{(p + unk_bonus + 0.01):.4f} "
            else:
                if p > 0.0001:
                  res += f"{w}:{p:.4f} "
        
        res = res[:-1]
        res += "\n"
        output.append(res)

with open(path + "challenging-america-word-gap-prediction/dev-0/out.tsv", mode="w") as file:
    file.writelines(output)