In [1]:
"""
Usage examples for embeddings trained on Serbo-Croatian Wikipedia

For more information, please refer to:
- Word2Vec tutorial: https://radimrehurek.com/gensim/models/word2vec.html
- FastText: https://radimrehurek.com/gensim/models/fasttext.html
- Doc2Vec: https://radimrehurek.com/gensim/models/doc2vec.html
"""

from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.models.doc2vec import Doc2Vec

In [1]:
"""Download models from http://llod.jerteh.rs/putnich/"""

"""Load models"""
# Locations of the pre-trained models, relative to the notebook's working
# directory. Adjust these if the downloaded files were unpacked elsewhere.
WORD2VEC_PATH = 'embeddings/word2vec/word2vec-sh-wiki.bin'
FASTTEXT_PATH = 'embeddings/fasttext/fasttext-sh-wiki.bin'
DOC2VEC_PATH = 'embeddings/doc2vec/doc2vec-sh-wiki.bin'

# NOTE(review): gensim's `load()` presumably unpickles the file contents, so
# only load model files obtained from a source you trust -- confirm against
# the gensim documentation for the installed version.
word2vec = Word2Vec.load(WORD2VEC_PATH)
fasttext = FastText.load(FASTTEXT_PATH)
doc2vec = Doc2Vec.load(DOC2VEC_PATH)

In [None]:
"""Get word embedding"""
# Look up the trained vector for a single word and print it in full.
misliti_vector = word2vec.wv['misliti']
print(misliti_vector)

[-2.4175782  -3.8338094  -2.483057    0.38566908  0.83836174 -0.562291
 -1.4730322  -1.1030451  -0.36971334 -0.20898299 -1.9952496  -1.0570163
 -1.892794    0.1037849  -0.47431648  2.8931887  -0.4214371   0.25094286
 -0.9301952  -2.5353098  -2.5891504   1.7014676   2.1399944   1.3902944
  3.1831017  -1.7514678   0.5296549  -0.87773275 -2.952725    0.3071484
  3.386068    1.1704432  -0.7444692   0.60305977  1.4440508  -0.6556727
  0.24221562 -1.7195258   0.71801907 -0.1644092   0.14656577  0.7531406
  1.3670108  -0.02269833 -1.3307906   0.867045   -1.9414064  -1.851748
 -0.18317665  1.432486    1.6614187  -4.055946   -3.2940412   2.8124099
 -1.3876822   0.19905931  1.3830159  -4.538821   -1.3650875  -1.5292196
  4.449269    2.2378736  -0.5724806  -2.071407    3.290039    2.6091652
  0.81459886 -1.8403133   0.47773957  0.6252073   1.1328638   2.6836839
 -1.3068022   2.061742    1.112584    1.5156024  -0.15288864  1.5629045
 -1.1689259   2.2272818   0.8024054   0.3959393   1.8815391   1.6

In [None]:
"""Get nearest words"""
# Ten nearest neighbours of the query word, as (word, cosine similarity) pairs.
# The query is passed as a one-element list of positive examples.
nearest_words = word2vec.wv.most_similar(positive=['misliti'], topn=10)
print(nearest_words)

[('pomisliti', 0.8272355198860168), ('reći', 0.7675224542617798), ('sumnjati', 0.7673249244689941), ('kazati', 0.7565544843673706), ('vjerovati', 0.7456005811691284), ('tvrditi', 0.7440354228019714), ('znati', 0.74385666847229), ('veruje', 0.7371051907539368), ('ispostavljati', 0.7142175436019897), ('veruju', 0.7112040519714355)]


In [None]:
# Example sentences used by the text-similarity and Doc2Vec cells.
# Reference sentence ("It is cloudy today").
oblacno = "Danas je oblačno"
# Same three words as the reference, in reversed order.
oblacno_reversed = "Oblačno je danas"
# Related topic (weather), different words ("Tomorrow it is sunny").
suncano = "Sutra je sunčano"
# Unrelated topic ("Congestion in traffic").
guzva = "Gužva u saobraćaju"

In [None]:
# The whole sentence string is used as ONE lookup key; it is not tokenized
# into words here. As the last expression of the cell, the resulting vector
# is displayed.
# NOTE(review): presumably FastText builds this vector from character n-grams
# of the full string (spaces included), since a multi-word phrase is unlikely
# to be a vocabulary entry -- confirm this is the intended sentence embedding.
fasttext.wv[oblacno]

array([-1.8494327 , -0.1647695 ,  1.5611722 ,  0.1954371 , -0.8534858 ,
       -0.17160808, -0.63202274, -0.26581982, -0.20364723,  0.5406822 ,
        0.720155  , -0.2768724 , -1.0120019 , -0.39838263, -0.625616  ,
        0.94517326,  0.38049704,  0.447077  , -0.4925514 , -0.04987473,
        0.7011778 , -0.3815585 ,  0.23962598,  0.49945247, -0.05200323,
        0.01328462,  0.12581816,  1.0299982 , -0.27596927, -0.30877528,
        1.0628593 , -0.20295258, -0.8879725 , -1.0139191 , -0.57103944,
        0.82548946,  1.2264459 ,  0.85974294, -0.89849865,  0.7603431 ,
        0.30399606,  0.48102623, -0.24103768,  0.53744197,  0.72049963,
       -0.6832704 ,  0.44890508,  0.23836358,  0.8988211 ,  0.54030377,
        0.21661802,  1.1675948 ,  0.05190293, -0.135646  , -0.43071792,
        0.26412237,  0.4683544 ,  0.44881862, -0.05927201, -0.02765048,
        0.67922723, -0.16002311, -0.45176962,  1.5057105 ,  0.4425515 ,
        0.7490429 ,  0.18526925,  1.0694392 , -0.18955971, -0.28

In [None]:
"""Similarity between texts"""
# Sanity check: compare the reference sentence with itself. The score should
# be ~1.0 (a value a hair above 1 is presumably floating-point rounding).
reference_vector = fasttext.wv[oblacno]
candidate_matrix = fasttext.wv[[oblacno]]  # list key -> one row per entry
print(fasttext.wv.cosine_similarities(reference_vector, candidate_matrix))

[1.0000001]


In [None]:
# Compare the reference sentence with the same words in reversed order.
reference_vector = fasttext.wv[oblacno]
candidate_matrix = fasttext.wv[[oblacno_reversed]]  # list key -> one row per entry
print(fasttext.wv.cosine_similarities(reference_vector, candidate_matrix))

[0.71361804]


In [None]:
# Compare the reference sentence with a different sentence on a related topic.
reference_vector = fasttext.wv[oblacno]
candidate_matrix = fasttext.wv[[suncano]]  # list key -> one row per entry
print(fasttext.wv.cosine_similarities(reference_vector, candidate_matrix))

[0.70184004]


In [None]:
# Compare the reference sentence with a sentence on an unrelated topic.
reference_vector = fasttext.wv[oblacno]
candidate_matrix = fasttext.wv[[guzva]]  # list key -> one row per entry
print(fasttext.wv.cosine_similarities(reference_vector, candidate_matrix))

[0.40375346]


In [None]:
# Tokenize by whitespace only (no lowercasing or other normalisation is
# applied here), then infer a document vector for the token list.
# NOTE(review): inference is presumably not bit-for-bit repeatable between
# runs, so the displayed numbers may differ on re-execution -- confirm against
# the gensim documentation for the installed version.
suncano_tokens = suncano.split()
doc2vec.infer_vector(suncano_tokens)

array([-0.04681116, -0.07405312, -0.0528174 ,  0.22408938,  0.03707092,
       -0.08288442, -0.08161405,  0.07412355, -0.18808188, -0.03769279,
        0.04493877, -0.08661932, -0.0703866 , -0.00962123,  0.01738931,
       -0.14432669, -0.00806675, -0.18188167, -0.13788167, -0.03798733,
       -0.0745097 , -0.08376487, -0.00754883, -0.06787176, -0.02506606,
       -0.07099699,  0.00029009,  0.09699437, -0.04758047, -0.03383537,
        0.06705782,  0.01446838, -0.09563633, -0.14767486,  0.037944  ,
        0.05095235,  0.03883098, -0.01640064, -0.13655117,  0.01664549,
       -0.01745041, -0.01169401, -0.05200403,  0.01554014,  0.05013828,
        0.07418264, -0.09321432,  0.07131006,  0.04737925,  0.07560886,
       -0.09147214, -0.06166065,  0.06531931,  0.00796848,  0.00552939,
        0.04931952, -0.05720408, -0.0076518 ,  0.01819174,  0.06630155,
        0.02719215, -0.00638933,  0.0778047 , -0.03322877,  0.02124912,
        0.10701287,  0.01200004, -0.07778283,  0.04115468, -0.15

In [None]:
# Nearest neighbours of the query word using the word vectors stored in the
# Doc2Vec model; the arguments are spelled out the same way as in the
# Word2Vec nearest-words cell (one positive example, top 10 results).
doc2vec.wv.most_similar(positive=['misliti'], topn=10)

[('tvrditi', 0.7902759313583374),
 ('pomisliti', 0.7797690033912659),
 ('kazati', 0.7602627873420715),
 ('reći', 0.7469426393508911),
 ('veruje', 0.7338119745254517),
 ('nagađaju', 0.717974841594696),
 ('sumnjati', 0.7161406874656677),
 ('znati', 0.7151237726211548),
 ('pretpostavljati', 0.7151066660881042),
 ('vjerovati', 0.7147721648216248)]