In [44]:
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

In [1]:
%matplotlib inline

 Представляем модель:

In [2]:
import logging
# Emit INFO-level (and higher) log records with a timestamp, level name and message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
import gensim.downloader as api
# Load the 'word2vec-google-news-300' vectors through the gensim downloader.
wv = api.load('word2vec-google-news-300')



Получаем лексику из модели:

In [4]:
# Print only the first ten entries of the loaded vocabulary.
for i, word in enumerate(wv.vocab):
    if i == 10:
        break
    print(word)

</s>
in
for
that
is
on
##
The
with
said


Получаем векторы для слова 'king':

In [5]:
# Index the loaded vectors by word to get the embedding for 'king'.
vec_king = wv['king']

In [6]:
# Show the raw components of the 'king' vector.
print(vec_king)

[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-02 -2.46582031e-02
  1.85546875e-01  4.47265

С незнакомым словом модель не сработает:

In [7]:
# Looking up a word that is missing from the vocabulary raises KeyError;
# catch it and report instead of failing the cell.
try:
    vec_cameroon = wv['cameroon']
except KeyError:
    print("The word 'cameroon' does not appear in this model")

The word 'cameroon' does not appear in this model


Чем менее схожи два слова, тем ниже приписываемое им значение близости (similarity):

In [8]:
# Word pairs that are reused by the metric comparisons further down.
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
# Score each pair with wv.similarity and print a tab-separated row (two decimals).
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


Высчитаем косинусную близость:

In [41]:
# cosine_similarity works on 2-D inputs, so each vector is wrapped in a list;
# the result is a 1x1 matrix.
cosine_similarity([wv['dignity']], [wv['faith']])

array([[0.38274416]], dtype=float32)

In [42]:
# gensim's own similarity for the same word pair, for comparison with the sklearn value.
wv.similarity('dignity', 'faith')

0.38274416

In [43]:
# cosine similarity
for w1, w2 in pairs:
    # cosine_similarity returns a 1x1 matrix; index out the scalar explicitly,
    # because %-formatting an array with ndim > 0 as a float is deprecated in
    # recent NumPy releases.
    print('%r\t%r\t%.2f' % (w1, w2, cosine_similarity([wv[w1]], [wv[w2]])[0, 0]))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


Высчитаем расстояние по разным метрикам:

In [45]:
# Hamming distance is the share of positions whose values differ, so two dense
# float vectors with no exactly equal components give 1.0.
distance.hamming(wv['dignity'], wv['faith'])

1.0

In [46]:
#hamming distance
# NOTE(review): this metric compares components for exact equality, which is why
# every pair prints 1.00 — it does not discriminate between dense embeddings.
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, distance.hamming(wv[w1], wv[w2])))

'car'	'minivan'	1.00
'car'	'bicycle'	1.00
'car'	'airplane'	1.00
'car'	'cereal'	1.00
'car'	'communism'	1.00


In [47]:
# Straight-line (L2) distance between the two word vectors.
distance.euclidean(wv['dignity'], wv['faith'])

3.4608068466186523

In [48]:
#euclidean distance
# Unlike similarity, a larger value here means the words are further apart.
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, distance.euclidean(wv[w1], wv[w2])))

'car'	'minivan'	2.53
'car'	'bicycle'	2.70
'car'	'airplane'	3.08
'car'	'cereal'	3.99
'car'	'communism'	4.58


In [49]:
# NOTE(review): SciPy documents jaccard as a dissimilarity for boolean arrays;
# on these float vectors it returns 1.0, so it is probably not informative here.
distance.jaccard(wv['dignity'], wv['faith'])

1.0

In [50]:
#jaccard distance
# Every pair prints 1.00: the metric does not separate these dense float vectors.
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, distance.jaccard(wv[w1], wv[w2])))

'car'	'minivan'	1.00
'car'	'bicycle'	1.00
'car'	'airplane'	1.00
'car'	'cereal'	1.00
'car'	'communism'	1.00


In [51]:
# No order p is passed, so SciPy's default is used; the value equals the
# Euclidean distance computed for the same pair.
distance.minkowski(wv['dignity'], wv['faith'])

3.4608068466186523

In [52]:
#minkowski distance
# Called with the default order, so the rows match the Euclidean results.
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, distance.minkowski(wv[w1], wv[w2])))

'car'	'minivan'	2.53
'car'	'bicycle'	2.70
'car'	'airplane'	3.08
'car'	'cereal'	3.99
'car'	'communism'	4.58


In [53]:
# Chebyshev distance: the largest absolute difference over all vector components.
distance.chebyshev(wv['dignity'], wv['faith'])

0.7421875

In [54]:
#chebyshev distance
# One row per word pair: the maximum per-component gap between the two vectors.
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, distance.chebyshev(wv[w1], wv[w2])))

'car'	'minivan'	0.47
'car'	'bicycle'	0.46
'car'	'airplane'	0.55
'car'	'cereal'	0.72
'car'	'communism'	0.88


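Высчитаем корреляцию (np.correlate здесь возвращает ненормированное скалярное произведение векторов, а np.corrcoef — коэффициент корреляции Пирсона):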

In [55]:
# With two equal-length 1-D inputs and the default mode, np.correlate yields a
# single value: the sum of element-wise products (not a normalized coefficient).
np.correlate(wv['dignity'], wv['faith'])

array([3.6290274], dtype=float32)

In [56]:
# 2x2 Pearson correlation matrix; the off-diagonal entry is the coefficient
# between the two vectors.
np.corrcoef(wv['dignity'], wv['faith'])

array([[1.        , 0.38274354],
       [0.38274354, 1.        ]])

In [57]:
# correlation (np.correlate in its default mode: one un-normalized sum of products)
for w1, w2 in pairs:
    # np.correlate returns a 1-element array here; index out the scalar, because
    # %-formatting an array with ndim > 0 as a float is deprecated in recent NumPy.
    print('%r\t%r\t%.2f' % (w1, w2, np.correlate(wv[w1], wv[w2])[0]))

'car'	'minivan'	6.37
'car'	'bicycle'	4.18
'car'	'airplane'	3.44
'car'	'cereal'	1.25
'car'	'communism'	0.60


In [58]:
# Print the full 2x2 Pearson correlation matrix for every word pair.
for w1, w2 in pairs:
    print('%r\t%r\t' % (w1, w2))
    print(np.corrcoef(wv[w1], wv[w2]))
    # Blank lines to visually separate consecutive pairs.
    print('\n')

'car'	'minivan'	
[[1.         0.69121251]
 [0.69121251 1.        ]]


'car'	'bicycle'	
[[1.         0.53311424]
 [0.53311424 1.        ]]


'car'	'airplane'	
[[1.         0.41951464]
 [0.41951464 1.        ]]


'car'	'cereal'	
[[1.         0.13292977]
 [0.13292977 1.        ]]


'car'	'communism'	
[[1.         0.05575202]
 [0.05575202 1.        ]]




Найдём 5 слов, наиболее близких к словам 'car' и 'minivan':

In [9]:
# Both 'car' and 'minivan' are passed as positive words, so the neighbours are
# ranked against the two together; topn=5 limits the list to five entries.
print(wv.most_similar(positive=['car', 'minivan'], topn=5))

[('SUV', 0.853219211101532), ('vehicle', 0.8175784349441528), ('pickup_truck', 0.7763689160346985), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.756571888923645)]


Модель может находить лишнее слово в списке:

In [10]:
# doesnt_match returns the word from the list that fits the others least.
print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))

car


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


Тренируем собственную модель:

In [11]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus(object):
    """An iterator that yields sentences (lists of str).

    Each call to ``__iter__`` re-opens the ``lee_background.cor`` file resolved
    through ``datapath`` and reads it from the start, so one instance can be
    iterated any number of times.
    """

    def __iter__(self):
        """Yield one preprocessed token list per line of the corpus file."""
        corpus_path = datapath('lee_background.cor')
        # Open through a context manager so the file handle is closed when
        # iteration ends or the generator is discarded, instead of leaking it.
        with open(corpus_path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)

Обозначим корпус, на котором тренируем:

In [12]:
import gensim.models

# Train a Word2Vec model on the corpus, leaving every hyperparameter at its default.
sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)

Сделаем с нашей моделью то же, что и с демонстрационной:

In [13]:
# Same lookup as before, now against the vectors of the freshly trained model.
vec_king = model.wv['king']

In [14]:
# Print only the first ten vocabulary entries of the trained model.
for i, word in enumerate(model.wv.vocab):
    if i == 10:
        break
    print(word)

hundreds
of
people
have
been
forced
to
their
homes
in


Сохраним созданную модель:

In [22]:
import tempfile

# delete=False keeps the file on disk after the with-block ends, so the saved
# model can be loaded again later; it has to be removed manually afterwards.
with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False) as tmp:
    temporary_filepath = tmp.name
    model.save(temporary_filepath)
    #
    # The model is now safely stored in the filepath.
    # You can copy it to other machines, share it with others, etc.
    #
    # To load a saved model:
    #
    new_model = gensim.models.Word2Vec.load(temporary_filepath)

Очистим модель от редко встречающихся слов, не представляющих для нас интереса:

In [23]:
# min_count=10: words seen fewer than 10 times in the corpus are left out of the vocabulary.
model = gensim.models.Word2Vec(sentences, min_count=10)

Изменим количество измерений модели:

In [24]:
# default value of size=100
# Train 200-dimensional vectors instead.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 names it
# `vector_size`) — confirm against the installed version.
model = gensim.models.Word2Vec(sentences, size=200)

In [25]:
!pip install cython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Следующий параметр ускоряет тренировку модели:

In [26]:
# default value of workers=3 (tutorial says 1...)
# workers sets how many worker threads are used for training.
model = gensim.models.Word2Vec(sentences, workers=4)



Посмотрим, как размер окна (window) влияет на значение loss:

In [None]:
# Re-create the corpus iterator and a default-parameter model before the window experiments.
sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)

С окном 3:

In [68]:
# instantiating and training the Word2Vec model
# sg=1 selects skip-gram, hs=0 disables hierarchical softmax, min_count=1 keeps
# every word, seed fixes the random initialisation, and compute_loss=True
# records the training loss. Context window: 3.
model_window_3 = gensim.models.Word2Vec(
    sentences=sentences,
    min_count=1,
    compute_loss=True,
    hs=0,
    sg=1,
    seed=42,
    window = 3
)

# getting the training loss value
training_loss = model_window_3.get_latest_training_loss()
print(training_loss)

1158084.875


С окном 5:

In [69]:
# instantiating and training the Word2Vec model
# Same skip-gram configuration as the other window experiments; only the
# context window differs. Context window: 5.
model_window_5 = gensim.models.Word2Vec(
    sentences=sentences,
    min_count=1,
    compute_loss=True,
    hs=0,
    sg=1,
    seed=42,
    window = 5
)

# getting the training loss value
training_loss = model_window_5.get_latest_training_loss()
print(training_loss)

1552052.375


С окном 8:

In [70]:
# instantiating and training the Word2Vec model
# Same skip-gram configuration as the other window experiments; only the
# context window differs. Context window: 8.
model_window_8 = gensim.models.Word2Vec(
    sentences=sentences,
    min_count=1,
    compute_loss=True,
    hs=0,
    sg=1,
    seed=42,
    window = 8
)

# getting the training loss value
training_loss = model_window_8.get_latest_training_loss()
print(training_loss)

2102435.25


Оцениваем модель:

In [28]:
# Returns the overall analogy accuracy followed by per-section lists of
# correctly and incorrectly solved analogies.
model.wv.evaluate_word_analogies(datapath('questions-words.txt'))

(0.00684931506849315,
 [{'section': 'capital-common-countries',
   'correct': [],
   'incorrect': [('CANBERRA', 'AUSTRALIA', 'KABUL', 'AFGHANISTAN'),
    ('CANBERRA', 'AUSTRALIA', 'PARIS', 'FRANCE'),
    ('KABUL', 'AFGHANISTAN', 'PARIS', 'FRANCE'),
    ('KABUL', 'AFGHANISTAN', 'CANBERRA', 'AUSTRALIA'),
    ('PARIS', 'FRANCE', 'CANBERRA', 'AUSTRALIA'),
    ('PARIS', 'FRANCE', 'KABUL', 'AFGHANISTAN')]},
  {'section': 'capital-world',
   'correct': [],
   'incorrect': [('CANBERRA', 'AUSTRALIA', 'KABUL', 'AFGHANISTAN'),
    ('KABUL', 'AFGHANISTAN', 'PARIS', 'FRANCE')]},
  {'section': 'currency', 'correct': [], 'incorrect': []},
  {'section': 'city-in-state', 'correct': [], 'incorrect': []},
  {'section': 'family',
   'correct': [],
   'incorrect': [('HE', 'SHE', 'HIS', 'HER'),
    ('HE', 'SHE', 'MAN', 'WOMAN'),
    ('HIS', 'HER', 'MAN', 'WOMAN'),
    ('HIS', 'HER', 'HE', 'SHE'),
    ('MAN', 'WOMAN', 'HE', 'SHE'),
    ('MAN', 'WOMAN', 'HIS', 'HER')]},
  {'section': 'gram1-adjective-to-adver

In [30]:
# Result is a 3-tuple: Pearson (coefficient, p-value), the Spearman result, and
# a third number — per the gensim docs, the share of pairs with unknown words.
model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))

((0.1765858126084932, 0.17711611543644676),
 SpearmanrResult(correlation=0.12030012579531764, pvalue=0.3598926933522787),
 83.0028328611898)

Добавим предложение в модель:

In [31]:
# Reload the model saved earlier and continue training it on extra sentences.
model = gensim.models.Word2Vec.load(temporary_filepath)
more_sentences = [
    ['Advanced', 'users', 'can', 'load', 'a', 'model',
     'and', 'continue', 'training', 'it', 'with', 'more', 'sentences']
]
# update=True extends the existing vocabulary instead of rebuilding it.
model.build_vocab(more_sentences, update=True)
# Read the epoch count from model.epochs: the older model.iter alias is
# deprecated and emits a DeprecationWarning.
model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)

# cleaning up temporary file
import os
os.remove(temporary_filepath)

  import sys


Вычислим loss:

In [32]:
# instantiating and training the Word2Vec model
# compute_loss=True makes the model keep its training loss so that it can be
# read back below; window is left at its default.
model_with_loss = gensim.models.Word2Vec(
    sentences,
    min_count=1,
    compute_loss=True,
    hs=0,
    sg=1,
    seed=42
)

# getting the training loss value
training_loss = model_with_loss.get_latest_training_loss()
print(training_loss)

1369235.625


Проведём сравнение:

In [33]:
import io
import os

import gensim.models.word2vec
import gensim.downloader as api
import smart_open


def head(path, size):
    """Return an in-memory text stream with the first ``size`` characters of ``path``.

    The file is opened via ``smart_open.open`` and closed again before the
    stream is handed back.
    """
    with smart_open.open(path) as source:
        prefix = source.read(size)
    return io.StringIO(prefix)


def generate_input_data():
    """Yield ``LineSentence`` corpora of growing size, each tagged with a ``name``.

    The first corpus is the full ``lee_background.cor`` file (named '25kB');
    the rest are prefixes of the text8 dataset, named after their size.
    """
    lee_corpus = gensim.models.word2vec.LineSentence(datapath('lee_background.cor'))
    lee_corpus.name = '25kB'
    yield lee_corpus

    text8_path = api.load('text8').fn
    # (label, prefix length) for every text8 subset to generate
    text8_subsets = (
        ('1MB', 1024 ** 2),
        ('10MB', 10 * 1024 ** 2),
        ('50MB', 50 * 1024 ** 2),
        ('100MB', 100 * 1024 ** 2),
    )
    for label, prefix_len in text8_subsets:
        subset = gensim.models.word2vec.LineSentence(head(text8_path, prefix_len))
        subset.name = label
        yield subset


# Materialize every corpus up front so the list can be sliced and reused below.
input_data = list(generate_input_data())



In [34]:
# Temporarily reduce logging verbosity
logging.root.level = logging.ERROR

import time
import numpy as np
import pandas as pd

# Benchmark: train Word2Vec for every combination of corpus, sg, hs and
# compute_loss, three runs each, and record the mean/std training time.
train_time_values = []
seed_val = 42
sg_values = [0, 1]
hs_values = [0, 1]

# fast=True restricts the benchmark to the three smallest corpora.
fast = True
if fast:
    input_data_subset = input_data[:3]
else:
    input_data_subset = input_data


for data in input_data_subset:
    for sg_val in sg_values:
        for hs_val in hs_values:
            for loss_flag in [True, False]:
                time_taken_list = []
                for i in range(3):
                    # perf_counter is monotonic and high-resolution, so the
                    # measured duration cannot be skewed by wall-clock changes.
                    start_time = time.perf_counter()
                    w2v_model = gensim.models.Word2Vec(
                        data,
                        compute_loss=loss_flag,
                        sg=sg_val,
                        hs=hs_val,
                        seed=seed_val,
                    )
                    time_taken_list.append(time.perf_counter() - start_time)

                time_taken_list = np.array(time_taken_list)
                time_mean = np.mean(time_taken_list)
                time_std = np.std(time_taken_list)

                model_result = {
                    'train_data': data.name,
                    'compute_loss': loss_flag,
                    'sg': sg_val,
                    'hs': hs_val,
                    'train_time_mean': time_mean,
                    'train_time_std': time_std,
                }
                print("Word2vec model #%i: %s" % (len(train_time_values), model_result))
                train_time_values.append(model_result)

# Collect all results in one table, sorted by configuration.
train_times_table = pd.DataFrame(train_time_values)
train_times_table = train_times_table.sort_values(
    by=['train_data', 'sg', 'hs', 'compute_loss'],
    ascending=[False, False, True, False],
)
print(train_times_table)

Word2vec model #0: {'train_data': '25kB', 'compute_loss': True, 'sg': 0, 'hs': 0, 'train_time_mean': 0.7297822634379069, 'train_time_std': 0.20466097400105315}
Word2vec model #1: {'train_data': '25kB', 'compute_loss': False, 'sg': 0, 'hs': 0, 'train_time_mean': 0.5857268969217936, 'train_time_std': 0.012935529091057898}
Word2vec model #2: {'train_data': '25kB', 'compute_loss': True, 'sg': 0, 'hs': 1, 'train_time_mean': 0.7779197692871094, 'train_time_std': 0.030166712521759488}
Word2vec model #3: {'train_data': '25kB', 'compute_loss': False, 'sg': 0, 'hs': 1, 'train_time_mean': 0.7543358008066813, 'train_time_std': 0.013840382408474199}
Word2vec model #4: {'train_data': '25kB', 'compute_loss': True, 'sg': 1, 'hs': 0, 'train_time_mean': 0.9387257893880209, 'train_time_std': 0.011987804247392424}
Word2vec model #5: {'train_data': '25kB', 'compute_loss': False, 'sg': 1, 'hs': 0, 'train_time_mean': 0.9382279713948568, 'train_time_std': 0.002227148225638777}
Word2vec model #6: {'train_data'

Кэшируем схожие слова в словарь:

In [35]:
# re-enable logging
logging.root.level = logging.INFO

# Precompute the most_similar result for every word in the model's word list.
most_similars_precalc = {word : model.wv.most_similar(word) for word in model.wv.index2word}
# Show only the first three cached entries.
for i, (key, value) in enumerate(most_similars_precalc.items()):
    if i == 3:
        break
    print(key, value)

the [('of', 0.9999169111251831), ('in', 0.9999136328697205), ('on', 0.9999097585678101), ('two', 0.9999056458473206), ('with', 0.9999037981033325), ('australian', 0.9999022483825684), ('its', 0.9999018311500549), ('at', 0.999901533126831), ('from', 0.9999009370803833), ('for', 0.9999004006385803)]
to [('and', 0.9999486207962036), ('will', 0.9999473094940186), ('are', 0.9999469518661499), ('which', 0.9999457597732544), ('from', 0.9999455213546753), ('for', 0.9999455213546753), ('is', 0.999944806098938), ('on', 0.9999433755874634), ('would', 0.9999428987503052), ('but', 0.9999422430992126)]
of [('by', 0.9999492168426514), ('into', 0.9999451637268066), ('after', 0.9999420046806335), ('on', 0.9999416470527649), ('at', 0.9999412298202515), ('today', 0.9999401569366455), ('with', 0.9999391436576843), ('in', 0.9999380707740784), ('and', 0.9999357461929321), ('an', 0.9999356865882874)]


Сравним результаты с кэшированием и без:

In [36]:
import time
# Query words used to time the lookups with and without the cache.
words = ['voted', 'few', 'their', 'around']

Без кэширования:

In [37]:
# Baseline: ask the model for the neighbours of each word and time the whole loop.
start = time.time()
for word in words:
    result = model.wv.most_similar(word)
    print(result)
end = time.time()
# Elapsed seconds for all lookups (printing included).
print(end - start)

[('is', 0.9981661438941956), ('be', 0.9981362819671631), ('are', 0.9981335401535034), ('israelis', 0.9981286525726318), ('to', 0.998114287853241), ('says', 0.9981071949005127), ('call', 0.9981005191802979), ('information', 0.9980980157852173), ('team', 0.9980946779251099), ('trying', 0.9980891942977905)]
[('one', 0.9998093843460083), ('them', 0.9998065829277039), ('an', 0.9998046159744263), ('areas', 0.9998036623001099), ('against', 0.9998013377189636), ('their', 0.9998008012771606), ('pakistan', 0.9997987747192383), ('him', 0.9997983574867249), ('us', 0.99979567527771), ('has', 0.9997936487197876)]
[('up', 0.9999465942382812), ('an', 0.9999445080757141), ('two', 0.999941885471344), ('which', 0.9999412298202515), ('before', 0.9999393820762634), ('out', 0.9999393224716187), ('from', 0.9999388456344604), ('as', 0.9999385476112366), ('with', 0.999938428401947), ('its', 0.9999377727508545)]
[('who', 0.9999216794967651), ('for', 0.9999215602874756), ('also', 0.9999210834503174), ('and', 0.9

С кэшированием:

In [38]:
# Cached variant: serve each word from the precomputed dict when possible.
start = time.time()
for word in words:
    # Test the word that is actually being looked up (not a fixed key), so a
    # word missing from the cache falls through to the model instead of
    # raising KeyError.
    if word in most_similars_precalc:
        result = most_similars_precalc[word]
    else:
        # Cache miss: compute once and remember the answer for later lookups.
        result = model.wv.most_similar(word)
        most_similars_precalc[word] = result
    print(result)

end = time.time()
print(end - start)

[('is', 0.9981661438941956), ('be', 0.9981362819671631), ('are', 0.9981335401535034), ('israelis', 0.9981286525726318), ('to', 0.998114287853241), ('says', 0.9981071949005127), ('call', 0.9981005191802979), ('information', 0.9980980157852173), ('team', 0.9980946779251099), ('trying', 0.9980891942977905)]
[('one', 0.9998093843460083), ('them', 0.9998065829277039), ('an', 0.9998046159744263), ('areas', 0.9998036623001099), ('against', 0.9998013377189636), ('their', 0.9998008012771606), ('pakistan', 0.9997987747192383), ('him', 0.9997983574867249), ('us', 0.99979567527771), ('has', 0.9997936487197876)]
[('up', 0.9999465942382812), ('an', 0.9999445080757141), ('two', 0.999941885471344), ('which', 0.9999412298202515), ('before', 0.9999393820762634), ('out', 0.9999393224716187), ('from', 0.9999388456344604), ('as', 0.9999385476112366), ('with', 0.999938428401947), ('its', 0.9999377727508545)]
[('who', 0.9999216794967651), ('for', 0.9999215602874756), ('also', 0.9999210834503174), ('and', 0.9

Визуализируем окружение слов:

In [39]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    """Project the model's word vectors onto two dimensions with t-SNE.

    Returns a tuple ``(x_vals, y_vals, labels)``: two lists with the embedded
    coordinates and an array of the corresponding words, all in the same order.
    """
    target_dims = 2  # final num dimensions (2D, 3D, etc)

    # Walk the vocabulary once so words and vectors stay aligned.
    words = list(model.wv.vocab)
    word_matrix = np.asarray([model.wv[w] for w in words])
    labels = np.asarray(words)

    # reduce using t-SNE (fixed random_state for repeatable layouts)
    reducer = TSNE(n_components=target_dims, random_state=0)
    embedded = reducer.fit_transform(word_matrix)

    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels


# Compute the 2-D coordinates and labels once; the plotting helpers reuse them.
x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the words as text markers at their 2-D coordinates using plotly.

    With ``plot_in_notebook`` truthy the figure is rendered inline; otherwise
    it is written to 'word-embedding-plot.html'.
    """
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot the 2-D word coordinates and annotate a random subset.

    All points are drawn; up to 25 of them, chosen with a fixed random seed,
    are labelled with their word.
    """
    import matplotlib.pyplot as plt
    import random

    # Fixed seed so the same points get labelled on every run.
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label up to 25 randomly subsampled data points. random.sample raises
    # ValueError when asked for more items than exist, so cap the sample size
    # for vocabularies with fewer than 25 words.
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, min(25, len(indices)))
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Pick the plotting backend: if calling get_ipython() fails (e.g. the name is
# not defined outside IPython), fall back to matplotlib; otherwise use plotly.
try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)

