# 1.  Importing required packages

1. https://towardsdatascience.com/word2vec-skip-gram-model-part-1-intuition-78614e4d6e0b

In [4]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# 2. Word2Vec (i.e Hidden layer weight vector)
<img src="images/Continuous-Bag-of-words-CBOW-CB-and-Skip-gram.png" style="width:500px;height:300px;">

<br>
1. <b>CBOW:</b>
<img src="images/word2vec-cbow.png" style="width:300px;height:300px;">
  To calculate hidden layer inputs, we take an average over all these C context word inputs.

<br><br><br>
2. <b>Skip Gram: </b>
<img src="images/skip gram.png" style="width:300px;height:300px;">



In [5]:
from gensim.models import Word2Vec
# define training data
sentences = [['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec'],
             ['this', 'is', 'the', 'second', 'sentence'],
             ['yet', 'another', 'sentence'],
             ['one', 'more', 'sentence'],
             ['and', 'the', 'final', 'sentence']]




There are many parameters on this constructor; a few noteworthy arguments you may wish to configure are:

1. size: (default 100) The number of dimensions of the embedding, e.g. the length of the dense vector to represent each token (word).
2. window: (default 5) The maximum distance between a target word and words around the target word.
3. min_count: (default 5) The minimum count of words to consider when training the model; words with an occurrence less than this count will be ignored.
4. workers: (default 3) The number of threads to use while training.
5. sg: (default 0 or CBOW) The training algorithm, either CBOW (0) or skip gram (1).
6. alpha: Learning rate

In [11]:
# train model
model = Word2Vec(sentences, min_count=1,size=100)
# summarize the loaded model
print(model)


Word2Vec(vocab=14, size=100, alpha=0.025)


In [12]:
# summarize vocabulary
words = list(model.wv.vocab)
print(words)

['this', 'is', 'the', 'first', 'sentence', 'for', 'word2vec', 'second', 'yet', 'another', 'one', 'more', 'and', 'final']


In [13]:
# access vector for one word
print(model['sentence'])

[ 5.96484635e-04  4.02896106e-03  2.50628381e-03  1.13368430e-03
  3.31014791e-03  2.36928416e-03  2.08337931e-03 -1.25981605e-05
 -4.20993241e-03  4.76118876e-03  3.67753417e-03 -3.19461571e-04
  1.22230710e-03 -4.79406165e-03  1.38692011e-03 -2.93617068e-05
  4.72569931e-03  2.90702539e-03  1.47349841e-03  3.98808764e-03
  1.94128056e-03  1.59068673e-03  1.49845739e-03 -2.66237883e-03
 -2.40131398e-03 -4.74941451e-03 -4.06561838e-03  3.73138255e-03
  3.93687282e-03  2.99954298e-03 -1.99720403e-03  9.32016002e-04
 -5.66960429e-04  1.47769402e-03 -3.52450064e-03  2.73211347e-03
 -7.41784577e-04  2.65987171e-03 -9.70018504e-04 -3.63929733e-03
 -2.59269588e-03  2.33549019e-03  1.08449801e-03 -1.54980796e-03
  1.86904171e-03  3.93600576e-03  1.86595472e-03 -2.64461990e-03
  2.66823132e-04  4.03169636e-03 -5.11152437e-04  4.30564582e-03
 -2.34517525e-03 -4.25397139e-03  6.17238693e-04  3.25139030e-03
 -4.74330457e-03 -3.65862716e-03 -1.21435220e-03  4.49695019e-03
 -3.46390298e-03  3.17979

  


In [10]:
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec(vocab=14, size=100, alpha=0.025)


In [17]:
from gensim.models import KeyedVectors
model.wv.save("wordvectors.kv")

# 2 Doc2Vec (i.e Average of wordvec of a sentence)

<img src="images/Doc2Vec.png" style="width:500px;height:200px;">

## 2.1  Different Parameters

1. <b>documents:</b>  Iterable of list of TaggedDocument
2. <b>dm:</b> If dm=0, distributed bag of words (PV-DBOW) is used; if dm=1,‘distributed memory’ (PV-DM) is used.Distributed Memory model preserves the word order in a document whereas Distributed Bag of words just uses the bag of words approach, which doesn't preserve any word order.
3. <b>vector_size:</b> Dimensional feature vectors.
4. <b>min_count:</b> Ignores all words with total frequency lower than this.
5. <b>window:</b> The maximum distance between the current and predicted word within a sentence.
6. <b>alpha:</b> The initial learning rate.

In [19]:
data = ["I love machine learning. Its awesome.",
        "I love coding in python",
        "I love building chatbots",
        "they chat amagingly well"]

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]


In [29]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(tagged_data, dm=1,vector_size=5, window=2, min_count=1, workers=4)

model.save("d2v.model")


In [30]:
model= Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)

# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['1'])

V1_infer [-0.07063764 -0.06617451  0.00726097 -0.00809091  0.06903025]
[('2', 0.4644005298614502), ('3', 0.11856015026569366), ('0', -0.5672831535339355)]
[-0.04921347 -0.0373654   0.07231769 -0.07449533  0.05500709]
