In [3]:
! pip install gensim




In [5]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize

In [11]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
data = [
    "This is the first document",
    "This is the second document",
    "This is the third document",
    "This is the fourth document"
]

In [21]:
# Prepare the documents, and create Tagged Documents
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)]) for i, doc in enumerate(data)]
print(tagged_data)

[TaggedDocument(words=['this', 'is', 'the', 'first', 'document'], tags=['0']), TaggedDocument(words=['this', 'is', 'the', 'second', 'document'], tags=['1']), TaggedDocument(words=['this', 'is', 'the', 'third', 'document'], tags=['2']), TaggedDocument(words=['this', 'is', 'the', 'fourth', 'document'], tags=['3'])]


In [17]:
tokenizer = Tokenizer(num_words=100)  # you can limit vocab size

In [23]:
tagged_data = []
for i, doc in enumerate(data):
  tag_docs = TaggedDocument(words = word_tokenize(doc.lower()), tags=[str(i)])
  tagged_data.append(tag_docs)

print(tagged_data)

[TaggedDocument(words=['this', 'is', 'the', 'first', 'document'], tags=['0']), TaggedDocument(words=['this', 'is', 'the', 'second', 'document'], tags=['1']), TaggedDocument(words=['this', 'is', 'the', 'third', 'document'], tags=['2']), TaggedDocument(words=['this', 'is', 'the', 'fourth', 'document'], tags=['3'])]


In [13]:
# train the doc2vec model
model = Doc2Vec(vector_size=20, min_count=2, epochs=50)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [24]:
# train the doc2vec model
model = Doc2Vec(vector_size = 20, min_count=2, epochs=50)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [14]:
# get the document vectors
document_vectors = [model.infer_vector(word_tokenize(doc.lower())) for doc in data]


In [25]:
# get the document vectors
document_vectors = []
for doc in data:
  vec = model.infer_vector(word_tokenize(doc.lower()))
  document_vectors.append(vec)

print(document_vectors)

[array([ 0.01914601,  0.01581868,  0.01517364,  0.01450728, -0.01254469,
        0.02134449, -0.02158963, -0.00790595, -0.02162696,  0.01948986,
       -0.00524412, -0.02197738,  0.0078729 ,  0.01225551,  0.01517913,
       -0.02130485, -0.00079446, -0.02119508,  0.02412067,  0.01603048],
      dtype=float32), array([ 0.01655311, -0.01528235, -0.00576514,  0.00123265, -0.00248322,
       -0.00954959, -0.00020489,  0.00905933,  0.00537069,  0.02177139,
       -0.01555459,  0.02197477,  0.01155983, -0.00930728,  0.0017278 ,
       -0.00097345,  0.01894876, -0.01965865, -0.02363788,  0.00099795],
      dtype=float32), array([-0.02170233, -0.00900744, -0.00751184, -0.01114856, -0.00073212,
       -0.01919173,  0.01692825,  0.00668934, -0.00966807,  0.01537576,
        0.0131486 ,  0.00748374,  0.01138197,  0.00705871, -0.01292038,
       -0.00396556,  0.01600288,  0.02036013,  0.01436794,  0.01962908],
      dtype=float32), array([ 0.02109862,  0.01905306, -0.00967325, -0.01916632,  0.0088

In [16]:
# print the document vectors
for i, doc in enumerate(data):
  print("document: ", i+1, " : ", doc)
  print("vector: ", document_vectors[i])
  print()

document:  1  :  This is the first document
vector:  [ 0.01914601  0.01581868  0.01517364  0.01450728 -0.01254469  0.02134449
 -0.02158963 -0.00790595 -0.02162696  0.01948986 -0.00524412 -0.02197738
  0.0078729   0.01225551  0.01517913 -0.02130485 -0.00079446 -0.02119508
  0.02412067  0.01603048]

document:  2  :  This is the second document
vector:  [ 0.01655311 -0.01528235 -0.00576514  0.00123265 -0.00248322 -0.00954959
 -0.00020489  0.00905933  0.00537069  0.02177139 -0.01555459  0.02197477
  0.01155983 -0.00930728  0.0017278  -0.00097345  0.01894876 -0.01965865
 -0.02363788  0.00099795]

document:  3  :  This is the third document
vector:  [-0.02170233 -0.00900744 -0.00751184 -0.01114856 -0.00073212 -0.01919173
  0.01692825  0.00668934 -0.00966807  0.01537576  0.0131486   0.00748374
  0.01138197  0.00705871 -0.01292038 -0.00396556  0.01600288  0.02036013
  0.01436794  0.01962908]

document:  4  :  This is the fourth document
vector:  [ 0.02109862  0.01905306 -0.00967325 -0.01916632