In [1]:
!pip install gensim



In [2]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [3]:
# Toy corpus: six short English sentences used as the training text.
# Each entry is one raw (untokenized, mixed-case) sentence string.
sentences = [
    "I am learning natural language processing",
    "Natural language processing is part of machine learning",
    "I am also learning machine learning",
    "I want to learn deep learning",
    "Deep learning uses neural networks",
    "Neural networks are powerful"
]

In [4]:
# Lowercase every sentence, then split it into word tokens.
# The result is a list of token lists, one inner list per sentence.
# NOTE(review): word_tokenize relies on NLTK's Punkt tokenizer data being
# installed — confirm it is available when running on a fresh environment.
tokenized_sentences = [
    word_tokenize(lowered)
    for lowered in map(str.lower, sentences)
]

In [5]:
# Show the token lists to check the result of lowercasing and tokenization.
print(tokenized_sentences)

[['i', 'am', 'learning', 'natural', 'language', 'processing'], ['natural', 'language', 'processing', 'is', 'part', 'of', 'machine', 'learning'], ['i', 'am', 'also', 'learning', 'machine', 'learning'], ['i', 'want', 'to', 'learn', 'deep', 'learning'], ['deep', 'learning', 'uses', 'neural', 'networks'], ['neural', 'networks', 'are', 'powerful']]


In [6]:
# Train a Word2Vec model on the tokenized toy corpus.
#
# Reproducibility: gensim only gives repeatable vectors when the RNG seed is
# fixed AND training runs on a single worker thread (multi-threaded training
# introduces ordering jitter from OS thread scheduling). Across separate
# interpreter launches, PYTHONHASHSEED must also be fixed in the environment.
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,  # embedding dimension: length of each word vector
    window=3,         # context window: max distance between target and context words
    min_count=1,      # keep every word: words seen fewer times than this are ignored
    seed=1,           # explicit RNG seed (same value as gensim's default)
    workers=1,        # single thread so repeated runs produce the same vectors
)

In [8]:
# Similarity score between the learned vectors for "deep" and "learning".
deep_learning_similarity = model.wv.similarity("deep", "learning")
print(deep_learning_similarity)

0.09284417


In [9]:
# Similarity score between the learned vectors for "language" and "natural".
language_natural_similarity = model.wv.similarity("language", "natural")
print(language_natural_similarity)

0.13881183


In [11]:
# The five vocabulary words whose vectors are closest to "deep",
# returned as (word, similarity) pairs; `topn` caps how many are returned.
nearest_to_deep = model.wv.most_similar("deep", topn=5)
print(nearest_to_deep)

[('powerful', 0.31900644302368164), ('is', 0.1747603714466095), ('learn', 0.11928387731313705), ('want', 0.11117910593748093), ('to', 0.10898482799530029)]


In [None]:
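# What happens internally during training:
# - the model slides a context window across the text
# - it learns weights with a neural network by predicting words from their neighbours
# - the learned weights form the embedding matrix (one row per vocabulary word)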

# Word2Vec is:
# a shallow neural network with only one hidden layer

# Limitation of Word2Vec:
# Static embeddings - each word gets the same vector in every context
# e.g. "bank" has a single vector whether it means money (a financial bank) or a river bank