In [299]:
import junky
from corpuscula.corpus_utils import syntagrus

from collections import OrderedDict, Counter
from itertools import chain
from tqdm import tqdm
import numpy as np
import math

In [2]:
# NOTE(review): presumably clears leftover tqdm progress bars before the
# loaders below print their own progress — confirm against junky's docs.
junky.clear_tqdm()
# Load the three SynTagRus splits, requesting the LEMMA field for each one.
# Every call returns a pair that is unpacked into two names; presumably
# (tokenized sentences, lemma sentences) — TODO confirm.
train, train_lemmas = junky.get_conllu_fields(syntagrus.train, fields=['LEMMA'])
dev, dev_lemmas = junky.get_conllu_fields(syntagrus.dev, fields=['LEMMA'])
test, test_lemmas = junky.get_conllu_fields(syntagrus.test, fields=['LEMMA'])

Load corpus
Corpus has been loaded: 48814 sentences, 871526 tokens
Load corpus
Corpus has been loaded: 6584 sentences, 118692 tokens
Load corpus
Corpus has been loaded: 6491 sentences, 117523 tokens


In [3]:
# Join all six sequences loaded above (three splits, each with its LEMMA
# counterpart) into one corpus.
# NOTE(review): assumes every operand is a list of sentences so that `+`
# concatenates them — confirm.
full_corpus = train + train_lemmas + dev + dev_lemmas + test + test_lemmas

In [10]:
# Flatten the corpus lazily and tally how often each token occurs.
all_tokens = chain.from_iterable(full_corpus)
word2freq = dict(Counter(all_tokens))

In [17]:
# Re-order the vocabulary from the rarest token to the most frequent one
# (stable sort, so ties keep their current relative order).
freq_pairs = list(word2freq.items())
freq_pairs.sort(key=lambda pair: pair[1])
word2freq = OrderedDict(freq_pairs)

In [41]:
# Keep only tokens seen more than 888 times, in ascending frequency order.
frequent_pairs = [(word, freq) for word, freq in word2freq.items() if freq > 888]
word2freq = OrderedDict(sorted(frequent_pairs, key=lambda pair: pair[1]))

In [42]:
# Peek at the first ten (token, frequency) entries of word2freq.
list(word2freq.items())[:10]

[('государство', 904),
 ('работать', 905),
 ('случай', 906),
 ('всегда', 908),
 ('да', 909),
 ('конечно', 912),
 ('здесь', 932),
 ('потом', 936),
 ('тем', 939),
 ('решение', 940)]

In [40]:
# Build the frequency-sorted vocabulary straight from the corpus:
# count every token, then order the (token, count) pairs by ascending count.
token_counts = Counter(chain.from_iterable(full_corpus))
word2freq = OrderedDict(sorted(token_counts.items(), key=lambda pair: pair[1]))

In [56]:
%%time
word2freq = OrderedDict(
    sorted(
        {k: v for k, v in Counter(
            chain.from_iterable(full_corpus)).items() if v>888}.items(), key=lambda t: t[1]))

CPU times: user 284 ms, sys: 0 ns, total: 284 ms
Wall time: 280 ms


In [335]:
%%time
word2freq = OrderedDict(
    sorted(
        filter(lambda x: x[1] > 0.0, 
                    Counter(chain.from_iterable(full_corpus)).items()), key=lambda t: t[1]))

CPU times: user 404 ms, sys: 0 ns, total: 404 ms
Wall time: 401 ms


In [336]:
# Peek at the first ten (token, frequency) entries of word2freq.
list(word2freq.items())[:10]

[('разгорелось', 1),
 ('подтекста', 1),
 ('раллиста', 1),
 ('Тристрама', 1),
 ('ледостойкой', 1),
 ('Цимлянское', 1),
 ('верующего', 1),
 ('сочинена', 1),
 ('повеселели', 1),
 ('обошлась', 1)]

In [337]:
# Fetch the first key directly from the iterator instead of copying every
# key into a list just to read element 0.
next(iter(word2freq))

'разгорелось'

## Distilling vector vocabulary

In [367]:
def filter_embeddings(pretrained_embs, corpus, min_abs_freq=1, save_name=None,
                   include_emb_info=False, pad_token=None, unk_token=None,
                   extra_tokens=None):

    """Filters pretrained word embeddings' vocabulary, leaving only tokens
    that are present in the specified `corpus` and whose absolute frequency
    there is at least `min_abs_freq`. This allows to significantly reduce
    memory usage and speed up the word embedding process. The drawbacks
    include lower performance on unseen data.

    Args:

    **pretrained_embs**: path to a file with pretrained word vectors in text
    format (not binary), read as UTF-8, where the first line is
    `<vocab_size> <embedding_dimensionality>`.

    **corpus**: a list of lists or tuples with already tokenized sentences.
    Filtered result will not contain any tokens outside of this corpus
    (apart from the custom pad/unk/extra tokens below).

    **min_abs_freq** (`int`): minimum absolute frequency; only tokens the
    frequency of which is equal or greater than this specified value will be
    included in the filtered word embeddings. Default `min_abs_freq=1`,
    meaning all words from the corpus that have corresponding word vectors in
    `pretrained_embs` are preserved.

    **save_name** (`str`): if specified, filtered word embeddings are saved
    (as UTF-8 text) in a file with the specified name.

    **include_emb_info** (`bool`): whether to include `<vocab_size> <emb_dim>`
    as the first line of the filtered embeddings file. Default is `False`,
    embedding info line is skipped. Relevant only if `save_name` is not None.

    For the arguments below note, that typically pretrained embeddings already
    include PAD or UNK tokens. But these arguments are helpful if you need to
    specify your custom pad/unk/extra tokens or make sure they are at the top
    of the vocab (thus, pad_token will have index=0 for convenience).

    **pad_token** (`str`): custom padding token, which is initialized with
    zeros and included at the top of the vocabulary.

    **unk_token** (`str`): custom token for unknown words, which is
    initialized with small random numbers and included at the top of the
    vocabulary.

    **extra_tokens** (`list`): list of any extra custom tokens. For now, they
    are initialized with small random numbers and included at the top of the
    vocabulary. Typically, used for special tokens, e.g. start/end tokens etc.

    Every token is registered at most once: if the pretrained file repeats a
    word, or contains a word equal to one of the custom tokens, the first
    registration wins and later occurrences are ignored, so that indices and
    vectors always stay aligned. Blank or truncated lines in the pretrained
    file are skipped.

    Returns:

    If `save_name` is specified, saves the filtered vocabulary and returns
    `None`. Otherwise, returns a tuple `(word2index, vectors)`: an
    OrderedDict mapping each kept token to its row index, and a list of 1-D
    numpy arrays (one per token, in index order). The list is empty when
    nothing was kept.

    Raises:

    `ValueError` if the first line of `pretrained_embs` is not two integers.
    """

    # Only membership is tested below, so an unordered set is sufficient.
    filter_vocab = {
        token
        for token, freq in Counter(chain.from_iterable(corpus)).items()
        if freq >= min_abs_freq}

    word2index = OrderedDict()
    vectors = []

    def _add(token, row):
        # Register `token` under the next free index together with its
        # (1, embedding_dim) row.
        word2index[token] = len(word2index)
        vectors.append(row)

    def _random_row():
        # Small random initialisation used for UNK and extra tokens.
        return np.random.rand(1, embedding_dim) / math.sqrt(embedding_dim)

    # model in vec or txt format
    # (not binary, first line is <vocab_size> <emb_dim>)
    with open(pretrained_embs, encoding='utf-8') as word2vec_file:
        n_words, embedding_dim = map(int, word2vec_file.readline().split())

        if pad_token:
            # Zero vector for PAD
            _add(pad_token, np.zeros((1, embedding_dim)))

        if unk_token and unk_token not in word2index:
            _add(unk_token, _random_row())

        for extra_token in extra_tokens or ():
            if extra_token not in word2index:
                _add(extra_token, _random_row())

        progress_bar = tqdm(desc='Filtering vectors', total=n_words)
        try:
            for line in word2vec_file:
                progress_bar.update(1)

                current_parts = line.split()
                # Blank or truncated line: no word and/or too few values.
                if len(current_parts) <= embedding_dim:
                    continue

                # A word may itself contain spaces; the vector is always the
                # last `embedding_dim` fields.
                current_word = ' '.join(current_parts[:-embedding_dim])

                # Skip words already registered (duplicates in the file or
                # clashes with custom tokens): overwriting the index while
                # appending another row would misalign the two containers.
                if current_word in filter_vocab \
                        and current_word not in word2index:
                    values = np.array(
                        list(map(float, current_parts[-embedding_dim:])))
                    _add(current_word, np.expand_dims(values, 0))
        finally:
            progress_bar.close()

    # np.concatenate rejects an empty sequence, so guard the no-match case.
    vectors = list(np.concatenate(vectors)) if vectors else []

    if save_name:
        with open(save_name, 'w', encoding='utf-8') as f:
            if include_emb_info:
                print(len(word2index), embedding_dim, file=f)

            for word, vector in tqdm(zip(word2index.keys(), vectors),
                         desc='Saving filtered vectors', total=len(vectors)):
                print(word, ' '.join(str(v) for v in vector),
                      end=' \n', file=f)
    else:
        return word2index, vectors

In [368]:
# Location of the pretrained vectors in text (.vec) format.
FT_VECTORS_PATH = 'ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec'

# Keep only corpus tokens seen at least 7000 times and put the special tokens
# at the top of the vocabulary. To write a file instead of returning the
# result, additionally pass save_name='filtered_vectors_freq1.vec'.
w2x, vects = filter_embeddings(
    pretrained_embs=FT_VECTORS_PATH,
    corpus=full_corpus,
    min_abs_freq=7000,
    include_emb_info=True,
    pad_token='[PAD]',
    unk_token='[UNK]',
    extra_tokens=['[START]', '[END]'],
)

Filtering vectors: 100%|██████████| 1560131/1560131 [00:34<00:00, 45728.89it/s]


In [372]:
# vects

In [324]:
# Five-element float vector of zeros used to try out string formatting.
s = np.zeros(5)

In [290]:
# Render the array `s` as a single string using NumPy's own array formatter.
a = np.array2string(s)

In [325]:
# Space-separated rendering of the vector, one str() per element.
' '.join(map(str, s))

'0.0 0.0 0.0 0.0 0.0'

In [360]:
# filename = 'filtered_vectors.vec'
filename = 'filtered_vectors_freq1.vec'
# filename = 'ft_native_300_ru_wiki_lenta_nltk_wordpunct_tokenize.vec'

# Count lines lazily inside a context manager: the handle is closed on exit
# and the file is never held in memory as a whole.
with open(filename) as vec_file:
    n_lines = sum(1 for _ in vec_file)
n_lines

128692

In [359]:
# Read the first line of the file named by `filename` through a short-lived
# handle that is closed on exit.
# NOTE(review): the original cell called `a.readline()`, but the only visible
# assignment binds `a` to a string, so it presumably relied on a file handle
# from a cell that is no longer shown — confirm this is the intended file.
with open(filename) as vec_file:
    first_line = vec_file.readline()
first_line

'[PAD] 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0

In [165]:
# NOTE(review): the only visible assignment binds `a` to a string
# (`np.array2string(s)`), which has no `readline`; this cell presumably
# depended on a file handle from a cell that is no longer shown, so it will
# not survive a fresh Restart & Run All — confirm or remove.
a.readline()

'[PAD] 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n'