# Word2Vec Best-Pair Word Overlap

Based on the design of the WordNet-Augmented Word Overlap measure from [Saric2012](#Saric2012).

In [1]:
import gensim
import nltk
import os
import re
from gensim.corpora import MmCorpus
from gensim.models import Word2Vec

In [2]:
# Load the text corpus: a small Gutenberg collection, the same texts as the
# NLTK Gutenberg corpus. Each file becomes one string in doc_collection.
doc_collection = []
file_path = 'data/gutenberg/'
# List the directory with os.listdir rather than shelling out to `ls`: no
# subprocess is spawned, the path is never interpreted by a shell, and the
# listing does not have to be split on newlines. Sorting gives a
# deterministic order; dot-files are skipped, as `ls` does by default.
file_list = sorted(name for name in os.listdir(file_path) if not name.startswith('.'))
for file in file_list:
    full_path = os.path.join(file_path, file)
    # Only regular files can be read; skip any sub-directory.
    if os.path.isfile(full_path):
        with open(full_path) as doc:
            doc_collection.append(doc.read())

In [3]:
# Reshape the corpus from a list of document strings into a flat list of
# sentences, where every sentence is the list of its word tokens.
sentences = [
    list(nltk.word_tokenize(sentence_text))
    for document_text in doc_collection
    for sentence_text in nltk.sent_tokenize(document_text)
]

In [5]:
# Build the Word2Vec model.
# Passing the sentences to the constructor builds the vocabulary and runs the
# initial training in one step, so no separate build_vocab() pass on a
# throw-away model is needed.
w2v = Word2Vec(sentences, min_count=1, workers=4)

# Save the model to disk and load it back as `model`, the name used below.
w2v.save('data/gensim_data/my_model')
model = gensim.models.Word2Vec.load('data/gensim_data/my_model')

# Continue training the loaded model on the same sentences. The corpus size
# comes from the model itself (corpus_count is recorded when the vocabulary is
# built) instead of a hard-coded word total that does not match the corpus;
# train() uses it for learning-rate decay and progress reporting.
# Kept as the last expression so the cell displays train()'s return value.
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)



9702752

### Best Pair Word Overlap

Let's try a different way to compose a sentence similarity, based on the idea behind the WordNet-Augmented Word Overlap similarity.

$p = {\sum_{w \in sent_1} \max_{w' \in sent_2} s(w, w') \over len(sent_1)}$

$q = {\sum_{w' \in sent_2} \max_{w \in sent_1} s(w, w') \over len(sent_2)}$

where $s(w, w')$ is the Word2Vec similarity between the words $w$ and $w'$ (`model.similarity(w, w')` in the code below).

$sim = \left\{ \begin{array}{ll}
0  & if\ p+q = 0\\
{2 p q \over p+q}  & otherwise\\
\end{array}
\right.$

In [7]:
def _mean_best_similarity(source, target, similarity):
    """Average, over the words of `source`, of their best match in `target`.

    For every word in `source`, take the highest `similarity(word, other)`
    over all words `other` in `target`, floored at 0, then divide the sum of
    these maxima by `len(source)`.

    Returns 0 when `source` is empty instead of dividing by zero.
    """
    if not source:
        return 0
    total = 0
    for word in source:
        # The leading 0 is the floor: a word whose similarities are all
        # negative (or an empty `target`) contributes 0 to the sum.
        total += max([0] + [similarity(word, other) for other in target])
    return total / len(source)


def best_pair_overlap(sent_a, sent_b, similarity):
    """Best-pair word overlap between two tokenised sentences.

    Parameters:
        sent_a, sent_b: lists of word tokens.
        similarity: callable taking (word from sent_a, word from sent_b) and
            returning a number.

    Returns:
        (p, q, sim), where p is the mean best-match score of sent_a's words
        against sent_b, q is the same for sent_b's words against sent_a, and
        sim is the harmonic mean 2*p*q/(p+q), or 0 when p + q is 0.
    """
    p = _mean_best_similarity(sent_a, sent_b, similarity)
    # Swap the arguments back so `similarity` is always called as
    # (word from sent_a, word from sent_b), in both directions.
    q = _mean_best_similarity(sent_b, sent_a, lambda w_b, w_a: similarity(w_a, w_b))
    # `or 1` avoids a zero denominator; the numerator is already 0 then.
    return p, q, 2 * p * q / (p + q or 1)


sent1 = ['the','girl','run','into','the','hall']
sent2 = ['Here','Alice','run','to','the','hall']

p, q, sim = best_pair_overlap(sent1, sent2, model.similarity)
print(p,q,sim)

0.783466901991 0.642468558815 0.705996681108
