# Similarity in Paragraph2Vec Text Representation

*Gensim software examples.*

In [1]:
import os
import smart_open
import gensim
from gensim.models import Doc2Vec

In [2]:
doc_collection = ''
file_path = 'gutenberg/'
file_list = list(os.popen('ls '+ file_path).read().split('\n'))
for file in file_list:
    if file:
        with open(os.path.join(file_path,file)) as doc:
            doc_collection += doc.read()+'\n'

#Wrangling the data from list of doc-strings -> list of word-list by sentences
with open('gensim_data/all_gutenberg', 'w') as f:
    f.write(doc_collection)

In [3]:
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [4]:
corpus = list(read_corpus('gensim_data/all_gutenberg'))
paraph2vec = Doc2Vec(corpus, size=300, window=8, min_count=5, workers=4)

## Wrangling sentences from Str to p2v vectors

In [5]:
sentence1 = 'the girl run into the hall'
sentence2 = 'Here Alice run to the hall'

sent1 = sentence1.split()
sent2 = sentence2.split()

vec_sent1_p2v = paraph2vec.infer_vector(sent1)
vec_sent2_p2v = paraph2vec.infer_vector(sent2)

In [6]:
#print the vector of the word 'hall'.
paraph2vec.wv['hall'][:10]

array([-0.01495365, -0.52712882, -0.15033531, -0.03319004,  0.15571935,
       -0.14664985,  0.04288134,  0.40894195,  0.1561143 , -0.37317038], dtype=float32)

In [7]:
#print the Paragraph2vector of the sentence 1
print(len(vec_sent1_p2v))
vec_sent1_p2v[:10]

300


array([-0.01261198, -0.02667871, -0.03645166, -0.00957741,  0.00195654,
       -0.00025346, -0.00115494,  0.03957676, -0.00166694, -0.01274711], dtype=float32)

## Gensim Paragraph2Vec sentence similarity

In [8]:
from gensim.matutils import kullback_leibler, jaccard, hellinger, cossim
cossim(vec_sent1_p2v,vec_sent2_p2v)

#for the next line to work the model must contain all appearing words
#paraph2vec.n_similarity(sentence1,sentence2)

#paraph2vec.wv.n_similarity(sentence1,sentence2)

TypeError: cannot convert dictionary update sequence element #0 to a sequence

## Sklearn Paragraph2Vec-Cosine sentence similarity

### Wrangling Data

In [9]:
sentence1 = 'the girl run into the hall'
sentence2 = 'Here Alice run to the hall'

In [10]:
import numpy as np

def preproc_data(sent1, sent2, model):

    sentence1 = sent1.split()
    sentence2 = sent2.split()
    
    p2v_sent1 = []
    p2v_sent2 = []

    for i,word in enumerate(sentence1):
        try:
            p2v_sent1.append(paraph2vec.wv[word])
        except:
            pass

    for i,word in enumerate(sentence2):
        try:
            p2v_sent2.append(paraph2vec.wv[word])
        except:
            pass

    p2v_sent1 = sum(np.asarray(p2v_sent1))
    p2v_sent2 = sum(np.asarray(p2v_sent2))
    
    A = p2v_sent1.reshape(1,-1)
    B = p2v_sent2.reshape(1,-1)
    
    return A,B

In [11]:
p2v_sent1, p2v_sent2 = preproc_data(sentence1,sentence2,paraph2vec)
print(len(p2v_sent1[0]))
p2v_sent2[0][:10]

300


array([-0.33344087, -3.05628347, -0.12873888,  0.28321588,  0.24815422,
       -0.89516532, -0.85681403,  0.69070792,  0.43571717, -0.16906594], dtype=float32)

### Applying Similarity

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(p2v_sent1,p2v_sent2)[0][0]

0.78142464

In [13]:
#Filtering stopwords
sent1s = 'girl run hall'
sent2s = 'Alice run hall'
p2v_sent1s, p2v_sent2s = preproc_data(sent1s,sent2s,paraph2vec)
cosine_similarity(p2v_sent1s,p2v_sent2s)[0][0]

0.87112945

## Scipy Cosine Similarity

In [14]:
from scipy.spatial.distance import cosine as cosine_scipy

print(cosine_scipy(p2v_sent1,p2v_sent2))
print(cosine_scipy(p2v_sent1s,p2v_sent2s)) #Filtering stopwords

0.218575402618
0.128870538611


## Gensim p2v.n_similarity

In [15]:
paraph2vec.n_similarity(['the','girl','run','into','the','hall'],['here','alice','run','to','the','hall'])

0.69291740833683868

In [17]:
paraph2vec.n_similarity(['girl','run','hall'],['alice','run','hall'])

0.84913617439670053

In [20]:
paraph2vec.n_similarity(['the','boy','eat','red','apple'],
                   ['here','alice','run','to','the','hall'])

0.60044111540961354

## Gensim p2v infer_vector

In [21]:
#Testing initial infer_vector similarity
vec_sent1_p2v = vec_sent1_p2v.reshape(1,-1)
vec_sent2_p2v = vec_sent2_p2v.reshape(1,-1)
cosine_similarity(vec_sent1_p2v,vec_sent2_p2v)[0][0]

0.33136809

In [22]:
#infer_vector similarity filtering stopwords
vec_sent1s_p2v = paraph2vec.infer_vector(sent1s.split()).reshape(1,-1)
vec_sent2s_p2v = paraph2vec.infer_vector(sent2s.split()).reshape(1,-1)
cosine_similarity(vec_sent1s_p2v,vec_sent2s_p2v)[0][0]

0.33047721

## Gensim p2v.similarity

**Warning:** all the words must be converted to lowercase.

In [23]:
sent1 = sentence1.lower().split()
sent2 = sentence2.lower().split()

vec_sent1 = paraph2vec.wv[sent1]
vec_sent2 = paraph2vec.wv[sent2]

#cosine(vec_sent1,vec_sent2)
vec_sent1_ = sum(vec_sent1).reshape(1,-1)
vec_sent2_ = sum(vec_sent2).reshape(1,-1)

cosine_similarity(vec_sent1_,vec_sent2_)[0][0]

0.69291735

In [24]:
def word_vector_cosine_sim(sent1, sent2,p2v):
    for i,word in enumerate(sent1):
        if i == 0:
            sent1_p2v = p2v.wv[word]
        else:
            sent1_p2v+= p2v.wv[word]

    for i,word in enumerate(sent2):
        if i == 0:
            sent2_p2v = p2v.wv[word]
        else:
            sent2_p2v+= p2v.wv[word]

    # get the sentence vector similarity
    return 1-cosine_scipy(sent1_p2v,sent2_p2v)

In [25]:
print(word_vector_cosine_sim(sent1,sent2,paraph2vec))

0.947779667627


Seems like if paragraph2vec had a sparcity problem, due to that word vectors are to slow.
Also the model if you test the *wv* method to many times the numers approximate to 0.999.

## Best Pair Word Overlap Similarity

Lets try a different way to compound a sentence similarity, based on WordNet-Augmented-Word-Overlap similarity idea.

$p = {\sum_{w\in\ sent_1}max(df[w][w']) \over len(sent_1)} \ \ \ \forall\ w' \in\ sent_2$

$q = {\sum_{w'\in\ sent_2}max(df[w][w']) \over len(sent_2)} \ \ \ \forall\ w \in\ sent_1$

$sim = \left\{ \begin{array}{rcl} 
0  & if\ p+q = 0\\
{2 p*q \over (p+q)}  & others\\
\end{array}
\right.$

In [26]:
paraph2vec.similarity('girl','woman')

0.72026728389009687

In [27]:
sentence1 = ['the','girl','run','into','the','hall']
sentence2 = ['Here','Alice','run','to','the','hall']

def harmonic_best_pair_word_sim(string1,string2):
    p=0
    for wi in string1:
        m = 0
        for wc in string2:
            try:
                m = max(m, paraph2vec.similarity(wi,wc))
            except:
                pass
        p += m
    p = p/len(string1)

    q=0
    for wc in string2:
        m = 0
        for wi in string1:
            try:
                m = max(m, paraph2vec.similarity(wi,wc))
            except:
                pass
        q += m
    q = q/len(string2)

    sim = 2*p*q/(p+q or 1)
    return sim

print('Harmonic mean best pair-word similarity, stopword_filtering=no',
      harmonic_best_pair_word_sim(sentence1,sentence2))
print('Harmonic mean best pair-word similarity, stopword_filtering=yes',
      harmonic_best_pair_word_sim(['girl','run','hall'],['Alice','eat','hall']))

Harmonic mean best pair-word similarity, stopword_filtering=no 0.670109130314
Harmonic mean best pair-word similarity, stopword_filtering=yes 0.575509338622


## Textsim Jaccard

In [28]:
import sys
sys.path.append('/home/abelm/')
import textsim
from textsim.tokendists import jaccard_distance
print('Textsim Jaccard', jaccard_distance(sent1,sent2))
print('Textsim Jaccard, stopwords_filter=yes', jaccard_distance('girl run hall','Alice eat hall'))

Textsim Jaccard 0.625
Textsim Jaccard, stopwords_filter=yes 0.8


# Conclusions

Same as Word2Vec this model doesn't works with bow structure, it represent a word as a vector of *size parameter* value length. At the same time this model can infered a vector for a sentence. The experiments shows that with the same corpus and the same sentences the paragraph2vec tends to fail with some words, e.g. 'Alice' or 'Here', this behavior is different to Word2Vec model.

* Gensim Hellinger, Cosine, Jaccard, Kullback-Leibler and the others based on bowvec doesn't work.
* 0.777 input = str, Jaccard, Textsim, stopwords_filter=no
* 0.800 input = str, Jaccard, Textsim, stopwords_filter=yes
* 0.433 input = str, Cosine, Textsim-sklearn, stopwords_filter=no
* 0.333 input = str, Cosine, Textsim-sklearn, stopwords_filter=yes
* 0.773 input = self vec, Cosine, Sklearn
* 0.361 input = Doc2Vec infer_vec, Cosine, Sklearn
* 0.635 input = str list, Harmonic mean, Best word sim of words in both sentences, stopwords_filter=no
* 0.543 input = str list, Harmonic mean, Best word sim of words in both sentences, stopwords_filter=yes


# Recomendations

* Made the same example with Wikipedia dump data, to test the similarity difference according to data.