In [1]:
import sys
import os
import random
# Counter
from collections import Counter
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle, wordvectors_centroid
from somhos.config.paths import *

Default path

In [2]:
# Relative prefix against which the project paths are resolved.
prefix_path = "../../"
# Location of the V9 data, resolved against the prefix above.
data_path = get_relative_path(prefix_path, V9_PATH)
# Sanity check shown as the cell output: does the resolved path exist?
data_path_exists = os.path.exists(data_path)
data_path_exists

True

Path to word2vec embeddings

In [3]:
# Location of the word2vec embeddings file, resolved against `prefix_path`.
word2vec_path = get_relative_path(prefix_path, DEAULT_WORDVECTORS)
# Sanity check shown as the cell output: does the embeddings file exist?
word2vec_path_exists = os.path.exists(word2vec_path)
word2vec_path_exists

True

Load samples

In [4]:
# Resolve where the two pickled document samples are stored.
docs_sample_a_path = get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX)
docs_sample_b_path = get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX)

# Load sample A and sample B from disk.
docs_sample_a = load_pickle(docs_sample_a_path)
docs_sample_b = load_pickle(docs_sample_b_path)

# Report how many documents each sample holds.
samples_size = (len(docs_sample_a), len(docs_sample_b))
print("Samples size: (%d, %d)" % samples_size)

Samples size: (1000, 1000)


Load samples content

In [5]:
# Load the pickled content of the sampled documents; it is later indexed by
# the entries of `docs_sample_a` / `docs_sample_b`.
docs_samples_content_path = get_relative_path(data_path, DOCS_SAMPLES_CONTENT)
samples_content = load_pickle(docs_samples_content_path)

# Report how many documents have content available.
n_docs_with_content = len(samples_content)
print("Docs: %d" % n_docs_with_content)

Docs: 2000


In [6]:
from itertools import product
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance
import gensim

In [7]:
# Load the pre-trained embeddings stored in the binary word2vec format.
wordvectors = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=True,
)

In [8]:
# Positional indices into each sample (0 .. size - 1).
sample_a_indices = range(len(docs_sample_a))
sample_b_indices = range(len(docs_sample_b))

def get_content_tokens(index, samples_content):
    """Return the 'bag-of-words' entry of one document.

    Args:
        index: Key (or position) of the document inside `samples_content`.
        samples_content: Indexable collection whose items are mappings
            containing a 'bag-of-words' entry.

    Returns:
        Whatever is stored under 'bag-of-words' for the selected document.
    """
    document = samples_content[index]
    return document['bag-of-words']

# A centroid depends on a single document only, so compute each one once up
# front instead of once for every (i, j) pair inside the double loop.
centroids_a = [
    wordvectors_centroid(
        wordvectors, get_content_tokens(docs_sample_a[i], samples_content)
    )
    for i in sample_a_indices
]
centroids_b = [
    wordvectors_centroid(
        wordvectors, get_content_tokens(docs_sample_b[j], samples_content)
    )
    for j in sample_b_indices
]

# m_measure_sim[i, j] holds the cosine similarity between the centroid of the
# i-th document of sample A and the centroid of the j-th document of sample B.
m_measure_sim = np.zeros([len(docs_sample_a), len(docs_sample_b)])
# Rows range over sample A and columns over sample B, so the iteration matches
# the matrix shape even when the two samples differ in size.
for i, j in product(sample_a_indices, sample_b_indices):
    # scipy returns a cosine *distance*; turn it into a similarity.
    m_measure_sim[i, j] = 1.0 - cosine_distance(centroids_a[i], centroids_b[j])

print(m_measure_sim)

[[0.86534196 0.86278559 0.79227646 ... 0.86958995 0.79866438 0.81544963]
 [0.87643462 0.89360271 0.83657274 ... 0.90201855 0.80954968 0.86228529]
 [0.84398758 0.84359904 0.83315985 ... 0.85875672 0.79936952 0.86560457]
 ...
 [0.81910398 0.82330639 0.78677141 ... 0.83295354 0.76034818 0.88402544]
 [0.83766927 0.85300346 0.71587968 ... 0.85768615 0.75476113 0.79618026]
 [0.84980335 0.82969595 0.79795486 ... 0.85606399 0.77727514 0.86805362]]


In [9]:
# Persist the centroid-based similarity matrix as a pickle under the data path.
docs_samples_measure_sim_path = get_relative_path(
    data_path, DOCS_SAMPLES_WORD2VEC_SIM
)
save_pickle(m_measure_sim, docs_samples_measure_sim_path)

In [10]:
# Number of leading singular values kept in the reconstruction.
n_singularvalues = 1

# Reduced SVD of the similarity matrix. Note that numpy returns the right
# factor already transposed, so the rows of `V` are the right singular vectors.
U, s, V = np.linalg.svd(m_measure_sim, full_matrices=False)
D = np.diag(s[:n_singularvalues])

# Rebuild the matrix from the leading components only: U_k . (D_k . V_k).
leading_left = U[:, :n_singularvalues]
scaled_right = np.dot(D, V[:n_singularvalues, :])
m_udv_measure_sim = np.dot(leading_left, scaled_right)
print(m_udv_measure_sim)

[[0.86128099 0.87603515 0.79645275 ... 0.87464476 0.78405023 0.83869546]
 [0.8768267  0.89184716 0.81082834 ... 0.89043168 0.79820196 0.85383351]
 [0.86250937 0.87728458 0.79758868 ... 0.8758922  0.78516846 0.83989163]
 ...
 [0.83569109 0.85000688 0.77278899 ... 0.8486578  0.76075496 0.81377661]
 [0.83460778 0.84890502 0.77178723 ... 0.84755769 0.7597688  0.81272171]
 [0.85664297 0.87131768 0.79216384 ... 0.86993478 0.7798281  0.83417907]]


In [11]:
# Persist the SVD-truncated similarity matrix as a pickle under the data path.
docs_samples_measure_sim_udv_path = get_relative_path(
    data_path, DOCS_SAMPLES_WORD2VEC_SIM_UDV
)
save_pickle(m_udv_measure_sim, docs_samples_measure_sim_udv_path)