In [1]:
import pickle
import numpy as np
from collections import defaultdict
from tqdm import tqdm_notebook

In [2]:
lang = 'en'
fname = f'D:/datasets/nlp/fasttext/wiki.{lang}/wiki.{lang}.vec'
pkl_fname = f'data/wiki.{lang}.pkl'

## Word Vectors

https://github.com/facebookresearch/fastText

In [None]:

# N = len(open(fname, encoding='utf8').read().split('\n'))

# print(N)

word_vecs = defaultdict()
with open(fname, encoding='utf8') as f:
    for i, line in enumerate(tqdm_notebook(f, desc='reading')):
        if i == 0: continue
        tokens = line.strip().split(' ')
        if len(tokens) != 301: continue
        word = tokens[0]
        wv = np.array([float(x) for x in tokens[1:]])
        word_vecs[word] = wv
        
pickle.dump(word_vecs, open(pkl_fname, 'wb'))

In [None]:
wvecs = pickle.load(open(pkl_fname, 'rb'))

In [None]:
for i, w in enumerate(wvecs):
    print(w)
    if i > 20: break

## WordVector class

In [3]:
class WordVector(object):
    def __init__(self, lang, filename):
        " Create an instance of word vecor class."
        self.lang = lang
        self.word_vectors = pickle.load(open(filename, 'rb'))
        self.vector_size = len(self.word_vectors['</s>'])
        self.mean_vector = None
        self.stddev = None
        
    def __len__(self):
        return len(self.word_vectors)
    
    def __getitem__(self, word):
        return self.get_word_vector(word)
        
    def get_words(self):
        return [w for w in self.word_vectors]
    
    def mean(self):
        if self.mean_vector is None:
            vectors = np.stack([v for _, v in self.word_vectors.items()], axis=0)
            self.mean_vector = np.mean(vectors, axis=0)
        return self.mean_vector
    
    def std(self):
        if self.stddev is None:
            vectors = np.stack([v for _, v in self.word_vectors.items()], axis=0)
            self.stddev = np.std(vectors)
        return self.stddev
    
    def get_word_vector(self, word):
        return self.word_vectors.get(word, self.mean() + np.random.normal(scale=self.std(), size=(self.vector_size,)))
        
    def get_word_vectors(self, words):
        return [self.get_word_vector(w) for w in words]
        
    def most_similar(self, word, n=10):
        v = self.get_word_vector(word)
        V = np.stack([v for _, v in self.word_vectors.items()], axis=0)
        dist = V - v
        idxes = np.argsort(np.sum(dist * dist, axis=1))[:n]
        
        result = []
        for i, w in enumerate(self.word_vectors.keys()):
            if i in idxes:
                result += [w]
        return result

In [4]:
%%time
word_vectors = WordVector(lang, pkl_fname)

Wall time: 15.7 s


In [5]:
len(word_vectors)

2519072

In [6]:
len(word_vectors.get_words())

2519072

In [7]:
word_vectors.vector_size

300

In [14]:
word_vectors.get_word_vector('naser')

array([ 0.041985 , -1.1396   ,  0.034302 ,  0.64125  ,  0.089276 ,
       -0.77393  , -0.013124 , -0.20347  , -0.54934  ,  0.1399   ,
       -0.070892 ,  0.21652  , -0.42647  , -0.072916 , -0.084692 ,
       -0.14336  ,  0.09212  ,  0.088827 , -0.45631  , -0.34052  ,
       -0.071695 ,  0.45763  , -0.17093  ,  0.0082519,  0.10863  ,
       -0.42013  ,  0.089975 ,  0.29635  , -0.29825  ,  0.41596  ,
       -0.15034  , -0.80878  , -0.15346  , -0.35681  ,  0.14044  ,
        0.029812 , -0.023503 , -0.085503 , -0.16474  ,  0.14628  ,
        0.0027051,  0.50969  ,  0.30577  , -0.29629  ,  0.51845  ,
        0.056919 , -0.2038   ,  0.12229  ,  0.079332 , -0.17808  ,
       -0.34848  , -0.30105  , -0.29494  ,  0.12041  ,  0.15768  ,
       -0.45548  ,  0.26775  , -0.39971  , -0.22564  ,  0.19732  ,
        0.25455  ,  0.4003   ,  0.16563  , -0.25421  , -0.24958  ,
       -0.351    , -0.023575 ,  0.094394 , -0.54649  ,  0.017168 ,
        0.29266  ,  0.10806  ,  0.17613  , -0.80156  , -0.4188

In [9]:
word_vectors.mean()

array([-0.10205359, -0.02326221, -0.16352156,  0.15728838, -0.13871301,
       -0.11039955,  0.19033169, -0.09665664, -0.10636277,  0.06610439,
        0.02778889,  0.05542875, -0.19266407, -0.16751885,  0.01827158,
       -0.20545288, -0.04739126,  0.09778675, -0.00279672,  0.36863911,
       -0.10439755,  0.19294405, -0.15567456, -0.0679449 , -0.25715248,
       -0.13536481, -0.00901685, -0.1016655 ,  0.06599224,  0.18713068,
       -0.15736825,  0.22180213, -0.15763848,  0.08059061,  0.03168765,
        0.01327852,  0.00629826, -0.00879637,  0.00241807, -0.10346001,
       -0.01693895,  0.02723596,  0.03406903, -0.00360328,  0.11942819,
        0.03686219,  0.00401023, -0.10689108,  0.05666206,  0.03063519,
        0.04111546, -0.2810808 , -0.07629056,  0.01554295, -0.02203771,
       -0.04047799,  0.24183744,  0.03369634,  0.00629663,  0.22661112,
        0.01703174,  0.06665025,  0.17627028, -0.0695035 , -0.03441882,
       -0.11930528,  0.0149045 ,  0.18129509, -0.04238762, -0.02

In [10]:
word_vectors.std()

0.2928340977562775

In [15]:
word_vectors.most_similar('brother', n=30)

['son',
 'father',
 'brother',
 'brothers',
 'younger',
 'uncle',
 'cousin',
 'grandfather',
 'elder',
 'grandson',
 'nephew',
 'siblings',
 'niece',
 'stepfather',
 'nephews',
 'uncles',
 'stepson',
 'stepbrother',
 'stepbrothers',
 'brotherson',
 'halfbrother',
 'stepgrandfather',
 'brother…',
 '—father',
 'brother—who',
 'nephew,',
 'brothers—one',
 'brother—and',
 'brothering',
 '—brother']

In [17]:
word_vectors.most_similar('book', n=30)

['book',
 'published',
 'books',
 'author',
 'foreword',
 'republication',
 'book—',
 'books—the',
 'book—the',
 'book—and',
 'book—which',
 'books—and',
 'foreworded',
 'book—in',
 'booklore',
 'books,and',
 'books—all',
 'book—but',
 'book—one',
 'booklegger',
 'book—with',
 'book\xa0of\xa0abraham',
 'books—which',
 'book—it',
 'book>',
 'book—to',
 'published—and',
 'book—is',
 'author,and',
 'book,and']