# NLP - Word embeddings with Word2Vec

In [3]:
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from gensim.models import Word2Vec

In [2]:
def load_data(percentage_of_sentences=None):
    """Load the IMDB reviews dataset and tokenize every review.

    Args:
        percentage_of_sentences: Optional percentage, in the range (0, 100],
            of each split to keep. Only the leading part of the train and
            test splits is retained. ``None`` keeps both splits whole.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. The ``X_*`` entries
        are lists holding one ``text_to_word_sequence`` result per review;
        the ``y_*`` entries are the matching second elements returned by
        ``tfds.as_numpy`` (truncated the same way as the reviews).

    Raises:
        AssertionError: If ``percentage_of_sentences`` is given but does not
            satisfy ``0 < percentage_of_sentences <= 100``.
    """
    def keep_leading(sentences, labels):
        # Keep only the first `percentage_of_sentences` percent of one split.
        cutoff = int(percentage_of_sentences / 100 * len(sentences))
        return sentences[:cutoff], labels[:cutoff]

    def tokenize(sentences):
        # Each review is decoded from UTF-8 bytes before being tokenized.
        return [text_to_word_sequence(raw.decode("utf-8")) for raw in sentences]

    splits = tfds.load(name="imdb_reviews", split=["train", "test"], batch_size=-1, as_supervised=True)
    train_data, test_data = splits

    train_sentences, y_train = tfds.as_numpy(train_data)
    test_sentences, y_test = tfds.as_numpy(test_data)

    # Optionally shrink both splits to the requested share of the data.
    if percentage_of_sentences is not None:
        assert 0 < percentage_of_sentences <= 100
        train_sentences, y_train = keep_leading(train_sentences, y_train)
        test_sentences, y_test = keep_leading(test_sentences, y_test)

    return tokenize(train_sentences), y_train, tokenize(test_sentences), y_test

# Keep only the first 10% of the train and test splits; X_* hold the
# tokenized reviews and y_* the matching labels.
X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=10)

## Word2vec

In [4]:
# Train a Word2Vec model on the tokenized training reviews; no hyperparameter
# other than the corpus is passed, so the library defaults apply.
# NOTE(review): no seed is set here, so the vectors and the similarity scores
# shown in the cells below will presumably differ between runs - confirm, and
# consider fixing the seed if reproducible outputs are needed.
word2vec = Word2Vec(sentences=X_train)

In [5]:
# Look up the embedding vector the model learned for the word 'dog'.
word2vec.wv['dog']

array([-0.13364062,  0.15041523, -0.19150633,  0.2187058 , -0.01406818,
       -0.34920904,  0.00141394,  0.5706544 , -0.20943388, -0.22621362,
        0.01463939, -0.34485847, -0.02439873,  0.16945587, -0.03083642,
       -0.2451295 ,  0.18742393, -0.25462937, -0.0092235 , -0.3436122 ,
        0.1884112 ,  0.07658485,  0.29378095, -0.11073026, -0.02666011,
        0.02338655, -0.25449425, -0.0599355 , -0.186066  ,  0.0253413 ,
        0.21239057, -0.01285569,  0.17263268, -0.3984698 , -0.16725846,
        0.20426191,  0.17115471, -0.16269489, -0.24861552, -0.3116361 ,
       -0.14133993, -0.32830966, -0.26248837,  0.2080132 ,  0.29344252,
       -0.02447009, -0.16760898, -0.1096155 ,  0.2716104 ,  0.19329774,
        0.01489836, -0.22473957, -0.25307456, -0.06551018, -0.21846691,
        0.22553138,  0.19902542, -0.05951208, -0.20575923,  0.07214351,
        0.1207689 , -0.00614363,  0.02635141,  0.10765271, -0.25436944,
        0.20722958, -0.03627525,  0.23768559, -0.33733985,  0.32

In [6]:
# Ask for the 10 words the model considers most similar to 'movie';
# each result is a (word, similarity score) pair.
word2vec.wv.most_similar('movie', topn=10)

[('film', 0.971341609954834),
 ('show', 0.8336167931556702),
 ('thing', 0.8297647833824158),
 ('sequel', 0.8017898797988892),
 ('series', 0.7847037315368652),
 ('flick', 0.7821328639984131),
 ('book', 0.7811017036437988),
 ('ending', 0.7718184590339661),
 ('fun', 0.7434617877006531),
 ('watching', 0.7419548630714417)]

In [7]:
# NOTE(review): this repeats the previous cell's query verbatim (same word,
# same topn) and yields the same output; one of the two cells can be dropped.
word2vec.wv.most_similar('movie', topn=10)

[('film', 0.971341609954834),
 ('show', 0.8336167931556702),
 ('thing', 0.8297647833824158),
 ('sequel', 0.8017898797988892),
 ('series', 0.7847037315368652),
 ('flick', 0.7821328639984131),
 ('book', 0.7811017036437988),
 ('ending', 0.7718184590339661),
 ('fun', 0.7434617877006531),
 ('watching', 0.7419548630714417)]

In [8]:
# Same kind of query, but starting from a raw vector instead of a word:
# fetch the embedding of 'cat', then search for the words closest to it.
# The word itself comes back first with a similarity of 1.0.
# NOTE(review): the other neighbours all score ~0.98 and look unrelated,
# presumably because 'cat' is rare in the 10% sample - confirm.
word_embedding = word2vec.wv['cat']
word2vec.wv.similar_by_vector(word_embedding)

[('cat', 1.0),
 ('pursuit', 0.9849877953529358),
 ('hunter', 0.9848315119743347),
 ('mouth', 0.9841893315315247),
 ('beetle', 0.9836503267288208),
 ('brettschneider', 0.983066976070404),
 ('mate', 0.9829167723655701),
 ('anger', 0.981568455696106),
 ("family's", 0.981535792350769),
 ('machine', 0.9804127812385559)]

In [9]:
# Vector arithmetic on embeddings: the element-wise difference between the
# 'good' and 'bad' vectors.
word2vec.wv['good'] - word2vec.wv['bad']

array([ 5.57097793e-02, -7.32366741e-03,  1.65491521e-01,  3.58830810e-01,
        2.87445784e-02, -2.37224340e-01, -3.44025493e-01, -3.41679677e-02,
       -7.13418722e-02, -2.33952403e-01,  2.44847968e-01,  1.41833663e-01,
        4.06437218e-02, -1.61210477e-01, -2.72908151e-01,  2.90893734e-01,
       -3.98859978e-01,  2.44817048e-01,  1.60467088e-01,  2.74570942e-01,
       -6.62795901e-02,  1.70966938e-01, -3.01094055e-02,  1.42659232e-01,
       -3.13529670e-02,  5.92669845e-03, -1.91088408e-01,  3.81416142e-01,
        2.69961655e-02, -5.32444417e-01, -4.46149349e-01,  1.20966621e-01,
        3.61466169e-01,  2.82432914e-01, -1.87411830e-01, -2.33152986e-01,
        3.59142840e-01, -4.08710986e-01, -1.93633646e-01,  5.84474146e-01,
       -2.74925828e-02, -3.95717025e-02, -2.02594340e-01, -1.86217308e-01,
        2.49952257e-01, -9.72493142e-02,  3.90564293e-01, -1.44666433e-01,
       -1.17019594e-01,  1.50686622e-01, -3.76390815e-02,  6.07988238e-02,
       -5.51718831e-01, -

In [10]:
# Analogy by vector arithmetic: good - bad + stupid.
# NOTE(review): the variable names do not match their contents - they look
# like leftovers from a king/queen/man analogy example. Here v_queen holds
# 'good', v_king holds 'bad' and v_man holds 'stupid'.
v_queen = word2vec.wv['good']
v_king = word2vec.wv['bad']
v_man = word2vec.wv['stupid']

# Combined vector whose nearest words are looked up in the next cell.
v_result = v_queen - v_king + v_man

In [11]:
# Words whose embeddings are closest to the combined vector v_result.
# The input words are not excluded from the results: 'good' itself shows up
# in the output.
word2vec.wv.similar_by_vector(v_result)

[('nice', 0.8123105764389038),
 ('spark', 0.8013906478881836),
 ('decent', 0.7989938259124756),
 ('good', 0.7913230657577515),
 ('worthwhile', 0.7684338092803955),
 ('great', 0.763630747795105),
 ('wonderful', 0.7629781365394592),
 ('smart', 0.7593650221824646),
 ('posted', 0.7563799619674683),
 ('poor', 0.7507991790771484)]