In [1]:
!pip install -q polyglot
!pip install -q conllu

[?25l[K     |██▋                             | 10 kB 28.8 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 5.1 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 7.2 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 4.0 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 4.5 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 5.3 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 5.3 MB/s eta 0:00:01[K     |████████████████████▊           | 81 kB 6.0 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 3.6 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 3.9 MB/s eta 0:00:01[K     |████████████████████████████▌   | 112 kB 3.9 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 3.9 MB/s eta 0:00:01[K     |████████████████████████████████| 126 kB 3.9 MB/s 
[?25h  Building wheel for polyglot (setup.py) ... [?25l[?25hdone


In [2]:
import os
import random
import time
from collections import Counter

import numpy as np
import torch
from conllu import parse
from polyglot.mapping import Embedding
from torch import nn
from torch import optim
from tqdm import tqdm

In [3]:
# Colab-only setup: mount Google Drive and make the project folder the working
# directory, so that the relative paths used later in this notebook
# ("languages/", "polyglot/", "models/", "results.csv") resolve inside it.
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/DL for NLP project/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/DL for NLP project


In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
class Encoder(nn.Module):
    """Encode one sequence of symbol ids (characters or bytes) into a fixed-size vector.

    The ids are embedded, run through a single-layer bidirectional LSTM, and the
    final hidden states of the two directions are flattened into one vector.

    Args:
        vocab_size: number of distinct symbol ids (size of the embedding table).
        embedding_dim: width of each symbol embedding. Defaults to 100.
        hidden_size: LSTM hidden size per direction. Defaults to 100, so the
            returned vector has ``2 * hidden_size`` = 200 entries.
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_size=100):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.bilstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size,
                              num_layers=1, bidirectional=True)

    def forward(self, chars: torch.Tensor):  # chars = (N_CHARS,)
        """Return the flattened final hidden states, shape ``(2 * hidden_size,)``."""
        embedded = self.embedding(chars)  # (N_CHARS, EMBEDDING_DIM)
        # nn.LSTM expects (seq_len, batch, features) by default: insert a batch
        # axis of size 1. Using -1 keeps this in sync with `embedding_dim`.
        _, (final_hidden, _) = self.bilstm(embedded.view(len(chars), 1, -1))
        # final_hidden = (2, 1, HIDDEN): one final state per direction.
        return final_hidden.reshape(-1)  # (2*HIDDEN,)

In [6]:
class POS_Tagger(nn.Module):
    """BiLSTM tagger over word, character and/or byte representations of one sentence.

    For every token the selected representations are concatenated, Gaussian
    noise is added while training, and a bidirectional LSTM runs over the
    sentence. A linear layer maps each position to 17 POS-tag scores; an
    optional second linear layer maps it to `freq_max` frequency-bin scores.

    Args:
        model_type: container of the flags 'w', 'c', 'b' selecting word,
            character and byte inputs respectively.
        use_polyglot: stored for reference only; the word-embedding width is
            read from `embedding_matrix` itself.
        use_freqbin: when true, `forward` also returns frequency-bin scores.
        embedding_matrix: float tensor (N_WORDS, DIM) used to initialise the
            trainable word embedding.
        c_vocab_size: size of the character vocabulary.
        b_vocab_size: size of the byte vocabulary.
        freq_max: number of output classes of the frequency-bin layer.
        noise: standard deviation of the training-time input noise.
    """

    def __init__(self, model_type, use_polyglot, use_freqbin, embedding_matrix, c_vocab_size, b_vocab_size, freq_max, noise):
        super(POS_Tagger, self).__init__()
        hidden_size = 100  # per direction
        n_tags = 17

        self.model_type = model_type
        self.use_polyglot = use_polyglot
        self.use_freqbin = use_freqbin
        # freeze=False keeps the (possibly pretrained) word vectors trainable.
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        # Sub-modules are not moved here: calling `.to(...)` on the whole tagger
        # moves them together with every other parameter.
        self.characterbased = Encoder(c_vocab_size)
        self.bytebased = Encoder(b_vocab_size)

        # Width of the per-token input vector, derived from the actual modules
        # rather than from hard-coded sizes.
        self.input_size = 0
        if 'w' in model_type:
            self.input_size += self.embedding.embedding_dim
        if 'c' in model_type:
            self.input_size += 2 * self.characterbased.bilstm.hidden_size
        if 'b' in model_type:
            self.input_size += 2 * self.bytebased.bilstm.hidden_size

        self.bilstm = nn.LSTM(input_size=self.input_size, hidden_size=hidden_size, num_layers=1, bidirectional=True)
        self.pos_tagger = nn.Linear(in_features=2 * hidden_size, out_features=n_tags)
        self.freqbin = nn.Linear(in_features=2 * hidden_size, out_features=freq_max)
        self.noise = noise

    def forward(self, tokens: torch.Tensor, char_lists: list, byte_lists: list):
        """Score one sentence.

        Args:
            tokens: (N_TOKENS,) tensor of word indices.
            char_lists: one list of character ids per token.
            byte_lists: one list of byte ids per token.

        Returns:
            ``(pos_tags, freq)`` where `pos_tags` is (N_TOKENS, 17) and `freq`
            is (N_TOKENS, freq_max), or ``None`` when `use_freqbin` is false.
        """
        # Create intermediate tensors on the device the parameters live on,
        # instead of depending on a notebook-level global.
        model_device = self.pos_tagger.weight.device
        n_tokens = len(tokens)
        use_words = 'w' in self.model_type
        use_chars = 'c' in self.model_type
        use_bytes = 'b' in self.model_type

        concatted = torch.zeros((n_tokens, self.input_size), device=model_device)  # (N_TOKENS, INPUT_SIZE)
        if use_words:
            embedded_words = self.embedding(tokens)  # (N_TOKENS, WORD_DIM)
        for i, (char_list, byte_list) in enumerate(zip(char_lists, byte_lists)):
            pieces = []
            if use_words:
                pieces.append(embedded_words[i])
            if use_chars:
                pieces.append(self.characterbased(torch.tensor(char_list, device=model_device)))
            if use_bytes:
                pieces.append(self.bytebased(torch.tensor(byte_list, device=model_device)))
            if pieces:
                concatted[i] = torch.cat(pieces)

        if self.training and self.noise:
            # Zero-mean Gaussian noise with standard deviation `self.noise`.
            concatted = concatted + torch.randn_like(concatted) * self.noise

        # nn.LSTM expects (seq_len, batch, features); the sentence is a batch of 1.
        bilstm_out, _ = self.bilstm(concatted.view(n_tokens, 1, self.input_size))
        features = bilstm_out.view(n_tokens, -1)  # (N_TOKENS, 2*HIDDEN)
        pos_tags = self.pos_tagger(features)
        freq = self.freqbin(features) if self.use_freqbin else None
        return pos_tags, freq

In [7]:
class Main:
    """Experiment driver for one language: load data, build vocabularies, train, evaluate.

    Typical use is ``Main(language, ...)``, then `build_indexes()`, `train()`
    and `test()`, in that order.

    Args:
        language: language code used to locate
            ``languages/<language>/<language>-ud-{train,test}.conllu`` and
            ``polyglot/<language>.polyglot.txt``.
        model_type: flags ('w', 'c', 'b') forwarded to the tagger.
        polyglot: initialise word embeddings from the polyglot vectors (64-d)
            instead of random 128-d vectors.
        freqbin: add the auxiliary frequency-bin loss during training.
    """

    def __init__(self, language, model_type=('w', 'c'), polyglot=False, freqbin=False):
        # CoNLL-U files are UTF-8; do not depend on the platform default encoding.
        with open(f"languages/{language}/{language}-ud-train.conllu", encoding="utf-8") as file:
            self.train_data = parse(file.read())
        # Deterministic shuffle, then keep at most 5000 training sentences.
        random.seed(0)
        self.train_data = random.sample(self.train_data, len(self.train_data))[:5000]
        # The dev split is not loaded. The attribute still exists so that the
        # periodic report in `train` never hits a missing attribute; assign a
        # parsed dev set here to get dev accuracy in that report.
        self.dev_data = None
        with open(f"languages/{language}/{language}-ud-test.conllu", encoding="utf-8") as file:
            self.test_data = parse(file.read())
        self.embeds = Embedding.from_glove(f"polyglot/{language}.polyglot.txt")

        self.n_epochs = 20
        # Larger than n_epochs, so the periodic report in `train` is effectively off.
        self.report_every = 21
        self.learning_rate = 0.1
        self.noise = 0.2

        self.model_type = model_type
        self.polyglot = polyglot
        self.freqbin = freqbin

    def build_indexes(self):
        """Build word/char/byte/label vocabularies, frequency bins and the embedding matrix.

        Sets `w2i`, `c2i`, `b2i`, `l2i`, `freqbin_dict` and `embedding_matrix`
        from `self.train_data`. Index 0 is reserved for unknown symbols in all
        three input vocabularies; 1 and 2 mark word start/end for chars and bytes.
        """
        self.w2i = {"_UNK": 0}
        self.c2i = {"_UNK": 0, "<w>": 1, "</w>": 2}
        self.b2i = {"_UNK": 0, "<w>": 1, "</w>": 2}
        self.l2i = {"ADJ": 0, "ADP": 1, "ADV": 2, "AUX": 3, "CONJ": 4, "DET": 5,
                    "INTJ": 6, "NOUN": 7, "NUM": 8, "PART": 9, "PRON": 10,
                    "PROPN": 11, "PUNCT": 12, "SCONJ": 13, "SYM": 14, "VERB": 15,
                    "X": 16,
                    # UD v2 renamed CONJ to CCONJ; accept both spellings.
                    "CCONJ": 4}
        self.freqbin_dict = {}

        word_counts = Counter()
        for sentence in self.train_data:
            for token in sentence:
                # Multi-word ranges and empty nodes have non-integer ids; skip them.
                if not isinstance(token['id'], int):
                    continue
                word = token['form'].lower()
                word_counts[word] += 1
                if word not in self.w2i:
                    self.w2i[word] = len(self.w2i)
                for character in word:
                    if character not in self.c2i:
                        self.c2i[character] = len(self.c2i)
                    for byte in character.encode("utf-8"):
                        if byte not in self.b2i:
                            self.b2i[byte] = len(self.b2i)
        # Frequency bin = integer part of the natural log of the training count.
        for word, frequency in word_counts.items():
            self.freqbin_dict[word] = int(np.log(frequency))

        if self.polyglot:
            # Every row is overwritten below, so no initial fill is needed.
            self.embedding_matrix = torch.empty((len(self.w2i), 64))
            for word, index in self.w2i.items():
                embedding = self.embeds.get(word.lower())
                if embedding is not None:
                    self.embedding_matrix[index] = torch.FloatTensor(embedding)
                else:
                    # No pretrained vector for this word: random initialisation.
                    self.embedding_matrix[index] = torch.rand(64)
        else:
            self.embedding_matrix = torch.rand((len(self.w2i), 128))

    def tensorize_data(self, sentence):
        """Convert one parsed sentence into model inputs and gold labels.

        Returns:
            ``(tokens, char_lists, byte_lists, pos_gold, freq_gold)``: `tokens`,
            `pos_gold` and `freq_gold` are LongTensors on `device` with one
            entry per kept token; `char_lists` / `byte_lists` hold one list of
            ids per token, wrapped in the ``<w>`` / ``</w>`` markers.
        """
        tokens_list = []
        char_lists = []
        byte_lists = []
        pos_tag_list = []
        freq_list = []
        for token in sentence:
            if not isinstance(token['id'], int):
                continue
            word = token['form'].lower()
            tokens_list.append(self.w2i.get(word, 0))
            char_list = [self.c2i['<w>']]
            byte_list = [self.b2i['<w>']]
            for char in word:
                char_list.append(self.c2i.get(char, 0))
                for byte in char.encode('utf-8'):
                    byte_list.append(self.b2i.get(byte, 0))
            char_list.append(self.c2i['</w>'])
            byte_list.append(self.b2i['</w>'])
            char_lists.append(char_list)
            byte_lists.append(byte_list)
            pos_tag_list.append(self.l2i[token['upos']])
            # Words unseen in training fall into bin 0.
            freq_list.append(self.freqbin_dict.get(word, 0))
        tokens = torch.LongTensor(tokens_list).to(device)
        pos_gold = torch.LongTensor(pos_tag_list).to(device)
        freq_gold = torch.LongTensor(freq_list).to(device)
        return tokens, char_lists, byte_lists, pos_gold, freq_gold

    def eval(self, data):
        """Return token-level POS accuracy of `self.model` on `data` as a Python float.

        Returns 0.0 when `data` contains no tokens.
        """
        self.model.eval()
        n_correct = 0
        n_tokens = 0
        with torch.no_grad():
            for sentence in tqdm(data, desc="Evaluation"):
                tokens, char_lists, byte_lists, golden, _ = self.tensorize_data(sentence)
                if len(tokens) == 0:
                    continue
                pred, _ = self.model(tokens, char_lists, byte_lists)
                pred_label = torch.argmax(pred, dim=1)
                n_correct += int(torch.sum(pred_label == golden))
                n_tokens += len(tokens)
        # A plain float formats cleanly in reports and CSV rows, unlike a
        # 0-dim tensor whose repr includes "tensor(..., device=...)".
        return n_correct / n_tokens if n_tokens else 0.0

    def train(self):
        """Create `self.model` and train it with SGD for `n_epochs` passes over the training data."""
        self.model = POS_Tagger(self.model_type, self.polyglot, self.freqbin, self.embedding_matrix,
                                len(self.c2i), len(self.b2i),
                                max(self.freqbin_dict.values(), default=0) + 1, self.noise
                                ).to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate)

        for epoch in range(self.n_epochs):
            total_loss = 0.0
            self.model.train()
            for sentence in tqdm(self.train_data, desc="Training  "):
                tokens, char_lists, byte_lists, pos_gold, freq_gold = self.tensorize_data(sentence)
                if len(tokens) == 0:
                    continue
                optimizer.zero_grad()
                pos_pred, freq_pred = self.model(tokens, char_lists, byte_lists)
                loss = criterion(pos_pred, pos_gold)
                # Auxiliary task: position t predicts the frequency bin of token
                # t+1. A one-token sentence has no such pair, and a mean over an
                # empty batch would turn the loss into NaN, so skip it there.
                if self.freqbin and len(tokens) > 1:
                    loss = loss + criterion(freq_pred[:-1], freq_gold[1:])
                loss.backward()
                optimizer.step()
                # Accumulate a float, not the tensor, so no autograd graph is
                # kept alive across iterations.
                total_loss += loss.item()

            # Periodic report
            if ((epoch + 1) % self.report_every) == 0:
                train_accuracy = self.eval(self.train_data)
                mean_loss = total_loss / max(len(self.train_data), 1)
                dev_data = getattr(self, "dev_data", None)
                if dev_data is None:
                    dev_report = "n/a"
                else:
                    dev_report = f"{self.eval(dev_data):.4f}"
                print(f"epoch: {epoch}, loss: {mean_loss:.4f}, train acc: {train_accuracy:.4f}, dev acc: {dev_report}")

    def test(self):
        """Evaluate on the test split, print the accuracy and return it."""
        test_accuracy = self.eval(self.test_data)
        print(f"\nTest accuracy: {test_accuracy}")
        return test_accuracy

In [8]:
def model_name(model):
    """Build a short identifier for a model configuration.

    The entries of ``model['model_type']`` are joined with '+'; '_p' is
    appended when ``model['polyglot']`` is truthy and '_f' when
    ``model['freqbin']`` is truthy, e.g. ``'w+c_p_f'``.
    """
    parts = ['+'.join(model['model_type'])]
    for key, suffix in (('polyglot', '_p'), ('freqbin', '_f')):
        if model[key]:
            parts.append(suffix)
    return ''.join(parts)

In [None]:
# Model configurations to train. Each dict is passed to Main(language, **model);
# the commented-out entries are the other configurations of the experiment grid.
models = [
        # {'model_type': ('w',), 'polyglot': False, 'freqbin': False},
        # {'model_type': ('c',), 'polyglot': False, 'freqbin': False},
        # {'model_type': ('c', 'b'), 'polyglot': False, 'freqbin': False},
        # {'model_type': ('w', 'c'), 'polyglot': False, 'freqbin': False},
        # {'model_type': ('w', 'c'), 'polyglot': True, 'freqbin': False},
        {'model_type': ('w', 'c'), 'polyglot': True, 'freqbin': True},
        ]
languages = ['ar', 'bg', 'cs', 'da', 'de', 'en', 'es', 'eu', 'fa', 'fi', 'fr',
             'he', 'hi', 'hr', 'id', 'it', 'nl', 'no', 'pl', 'pt', 'sl', 'sv']

# Make sure the checkpoint directory exists before any training starts, so a
# finished run cannot be lost to a failing torch.save.
os.makedirs("models", exist_ok=True)

# buffering=1 selects line buffering: every result row is flushed as soon as it
# is written, so partial results survive an interrupted session.
with open("results.csv", 'w', 1) as file:
    file.write("Language, Model, Accuracy, Time\n")
    for language in reversed(languages):
        print(f"Working with language {language}")
        for model in models:
            name = model_name(model)
            print(f"\tTraining model {name}")
            pos_tagger = Main(language, **model)
            pos_tagger.build_indexes()
            start = time.time()
            pos_tagger.train()
            end = time.time()
            # float() yields a plain number whether test() returns a Python
            # float or a 0-dim tensor; a tensor repr would add a comma and
            # break the CSV row.
            accuracy = float(pos_tagger.test())
            file.write(f"{language}, {name}, {accuracy}, {end - start}\n")
            torch.save(pos_tagger.model.state_dict(), f"models/{name}_{language}.pt")