# Doc2Vec Model - Paragraph/Document Vectors

## Import Libraries

In [257]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import spatial
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## Define a Custom Class

In [258]:
# Define Custom Class
class MyDoc2Vec:

 def __init__(self, docs):
    # initialize a model
    self.model = Doc2Vec(size=5, window=1, min_count=1, workers=2, alpha=0.025, min_alpha=0.01, dm=0)

    self.num_docs=len(docs)
    # build vocabulary
    self.model.build_vocab(docs)

    # train this model
    self.model.train(docs, total_examples=len(docs), epochs=100)

    self.docsim={}
    for doc_id in range(self.num_docs):
       # Find the document which is most similar to a given document-id
       self.docsim[doc_id] = self.model.docvecs.most_similar(doc_id)

 # Display trained document vectors
 def display_vectors(self):
    print('\n Number of document vectors : {}'.format(len(self.model.docvecs)))

    print('\n The Document Vectors are : ')
    for i in range(len(self.model.docvecs)):
     print('\n Document vector {} : {}'.format(i+1, self.model.docvecs[i]))
     #print(type(self.model.docvecs[i]))
     #print(self.model.docvecs[i].shape)

    # get the trained document vector, and most similar articles
    # (after training, the results should be correct)
    #docvec0 = model.docvecs[0]
    #docvecsyn0 = model.docvecs.doctag_syn0[0]

 # Display self similarities of the document vectors in the Training set
 def display_similarity_of_training_vectors(self):
    # Most similar documents
    print('\n Document Similarity (on Training Set) :')
    for doc_id in range(self.num_docs):
      print('\n Document similarity of document id {}  -> {}'.format(doc_id+1, self.docsim[doc_id]))


 # Find similarity with a Test document 
 def display_similarity_with_test_vector(self, test_data):
    # Infer a vector from a document
    vector = self.model.infer_vector(test_data)
    #print(vector)
    #print(type(vector))
    #print(vector.shape)

    docsim_score={}

    for doc_id in range(self.num_docs):
       docsim_score[doc_id] = (1 - spatial.distance.cosine(self.model.docvecs[doc_id], vector))

    max_idx = np.argmax(list(docsim_score.values()))

    # Most similar documents
    print('\n Document Similarity (on Test Data) :')
    for doc_id in range(self.num_docs):
      print('\n Similarity of the test document with the Training document id {}  -> {}'.format(doc_id+1, docsim_score[doc_id]))

    print('\n Maximum similarity found with the Training document {}'.format(max_idx+1))
        

In [259]:
# Dummy documents
article1 = ['My', 'name', 'is', 'David', 'I', 'like', 'Playing', 'Soccer']
article2 = ['My', 'name', 'is', 'Jenny', 'I', 'love', 'Basketball']
article3 = ['Today', 'is', 'Monday']
article4 = ['I', 'am', 'a', 'data-scientist']
article5 = ['He', 'is', 'a', 'pianist']

# Document Ids - training labels
id1 = '1'
id2 = '2'
id3 = '3'
id4 = '4'
id5 = '5'

# Tagged documents - list of words in each document along with a document id
doc1 = TaggedDocument(article1, [id1])
doc2 = TaggedDocument(article2, [id2])
doc3 = TaggedDocument(article3, [id3])
doc4 = TaggedDocument(article4, [id4])
doc5 = TaggedDocument(article5, [id5])

# A list of Tagged Documents
docs = [doc1, doc2, doc3, doc4, doc5]


In [260]:
model = MyDoc2Vec(docs)

## Training Vectors

In [261]:
model.display_vectors()


 Number of document vectors : 5

 The Document Vectors are : 

 Document vector 1 : [ 0.03132734 -0.10112011 -0.08185777  0.05819842 -0.06228345]

 Document vector 2 : [ 0.10304973 -0.019814   -0.07219619 -0.1020571  -0.01467577]

 Document vector 3 : [-0.01836818 -0.06126294 -0.00878479  0.00441003 -0.0214762 ]

 Document vector 4 : [ 0.05756095 -0.09660447 -0.01305723  0.0507192   0.05599436]

 Document vector 5 : [-0.0576116   0.02537722 -0.06660505 -0.06617403 -0.05086739]


## Self Similarity of the Training Vectors

In [262]:
model.display_similarity_of_training_vectors()


 Document Similarity (on Training Set) :

 Document similarity of document id 1  -> [('3', 0.7333403825759888), ('4', 0.560562014579773), ('2', 0.23523040115833282), ('5', 0.020243817940354347)]

 Document similarity of document id 2  -> [('5', 0.2889161705970764), ('1', 0.23523040115833282), ('4', 0.12510935962200165), ('3', -0.01607811078429222)]

 Document similarity of document id 3  -> [('1', 0.7333403825759888), ('4', 0.4307209253311157), ('5', 0.10521996021270752), ('2', -0.016078103333711624)]

 Document similarity of document id 4  -> [('1', 0.5605620741844177), ('3', 0.4307209253311157), ('2', 0.12510940432548523), ('5', -0.6580833792686462)]

 Document similarity of document id 5  -> [('2', 0.2889161705970764), ('3', 0.10521997511386871), ('1', 0.020243817940354347), ('4', -0.6580833792686462)]


## Test Vectors

In [263]:
test_doc_vector = 'He is a soccer player'
model.display_similarity_with_test_vector(test_doc_vector)


 Document Similarity (on Test Data) :

 Similarity of the test document with the Training document id 1  -> 0.39250800013542175

 Similarity of the test document with the Training document id 2  -> 0.6389683485031128

 Similarity of the test document with the Training document id 3  -> -0.12709517776966095

 Similarity of the test document with the Training document id 4  -> -0.0754372850060463

 Similarity of the test document with the Training document id 5  -> 0.141945481300354

 Maximum similarity found with the Training document 2


In [264]:
test_doc_vector = 'I like to watch movies'
model.display_similarity_with_test_vector(test_doc_vector)


 Document Similarity (on Test Data) :

 Similarity of the test document with the Training document id 1  -> 0.5887623429298401

 Similarity of the test document with the Training document id 2  -> 0.22942928969860077

 Similarity of the test document with the Training document id 3  -> 0.420350044965744

 Similarity of the test document with the Training document id 4  -> -0.03386664390563965

 Similarity of the test document with the Training document id 5  -> 0.09837538748979568

 Maximum similarity found with the Training document 1


In [265]:
#model1 = Doc2Vec(size=5, window=1, min_count=1, workers=2, alpha=0.025, min_alpha=0.01, dm=0)
#model.docvecs.similarity(,)