In [1]:
import sys
import os
import random
# Counter
from collections import Counter
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle, wordvectors_centroid
from somhos.config.paths import *

Default path

In [2]:
# Prefix used to reach the project root from this notebook's directory.
prefix_path = "../../"
# Build the data directory path from prefix_path and V9GAMMA_PATH
# (V9GAMMA_PATH presumably comes from the somhos.config.paths star import — confirm).
data_path = get_relative_path(prefix_path, V9GAMMA_PATH)
# Last expression of the cell: displays whether the resolved path exists.
os.path.exists(data_path)

True

Load samples

In [3]:
# Resolve the pickle locations of both samples first...
docs_sample_a_path = get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX)
docs_sample_b_path = get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX)

# ...then load sample A and sample B from disk.
docs_sample_a = load_pickle(docs_sample_a_path)
docs_sample_b = load_pickle(docs_sample_b_path)

# Report how many entries each sample holds.
sample_sizes = (len(docs_sample_a), len(docs_sample_b))
print("Samples size: (%d, %d)" % sample_sizes)

Samples size: (5000, 5000)


Load samples content

In [4]:
# Path of the pickled content for the sampled documents, relative to data_path.
docs_samples_content_path = get_relative_path(data_path, DOCS_SAMPLES_CONTENT)
# Loaded object is indexed later as samples_content[doc][field].
samples_content = load_pickle(docs_samples_content_path)
print("Docs: %d" % len(samples_content))

Docs: 10000


Path to word2vec embeddings

In [5]:
# Resolve the word2vec embeddings file from the project prefix.
# NOTE(review): DEAULT_WORDVECTORS looks like a misspelling of DEFAULT_WORDVECTORS,
# but the constant is defined outside this notebook, so the name is used as-is.
word2vec_path = get_relative_path(prefix_path, DEAULT_WORDVECTORS)
# Last expression of the cell: displays whether the embeddings file exists.
os.path.exists(word2vec_path)

True

In [6]:
from itertools import product
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance
import gensim

In [7]:
# Load the embeddings from a binary word2vec-format file into gensim KeyedVectors.
wordvectors = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [8]:
sample_a_indices = range(0, len(docs_sample_a))
sample_b_indices = range(0, len(docs_sample_b))

def get_content_for(index, samples_content, field='bag-of-words'):
    """Return the `field` entry stored for document `index` in `samples_content`.

    Args:
        index: key identifying a document inside `samples_content`.
        samples_content: mapping/sequence of per-document records.
        field: name of the record entry to return (default 'bag-of-words').

    Returns:
        Whatever `samples_content[index][field]` holds.

    Raises:
        KeyError / IndexError: if `index` or `field` is missing.
    """
    return samples_content[index][field]

# Centroids: compute each document's centroid exactly once. The pairwise loop
# below visits every document len(A) or len(B) times, so recomputing the
# centroid inside it repeats the same work for every pair.
centroids_a = [
    wordvectors_centroid(wordvectors, get_content_for(docs_sample_a[i], samples_content))
    for i in sample_a_indices
]
centroids_b = [
    wordvectors_centroid(wordvectors, get_content_for(docs_sample_b[j], samples_content))
    for j in sample_b_indices
]

# Rows follow sample A, columns follow sample B.
m_measure_sim = np.zeros([len(docs_sample_a), len(docs_sample_b)])
# Pair A indices with B indices: j addresses docs_sample_b and the matrix
# columns, so it must range over sample B (the previous A x A product was only
# correct while both samples happened to have the same size).
for i, j in product(sample_a_indices, sample_b_indices):
    # Measure: cosine similarity = 1 - cosine distance between the centroids.
    m_measure_sim[i, j] = 1.0 - cosine_distance(centroids_a[i], centroids_b[j])

print(m_measure_sim)

[[0.85457032 0.76439463 0.79155265 ... 0.8088368  0.76935862 0.8377027 ]
 [0.83597968 0.77741533 0.72474716 ... 0.85471003 0.78843886 0.75208273]
 [0.73758383 0.70874797 0.67737473 ... 0.72717386 0.70482656 0.7638605 ]
 ...
 [0.83541201 0.8386872  0.71838643 ... 0.80204947 0.83352037 0.76898583]
 [0.83616432 0.82673341 0.73442807 ... 0.86542173 0.78982102 0.77871454]
 [0.83048901 0.8211685  0.74389879 ... 0.864918   0.81549967 0.81777544]]


In [9]:
# Drop the notebook's reference to the loaded word vectors now that the
# similarity matrix has been built (presumably to release memory — confirm).
del wordvectors

In [10]:
# Destination for the similarity matrix, relative to data_path.
docs_samples_measure_sim_path = get_relative_path(data_path, DOCS_SAMPLES_WORD2VEC_SIM)
# Persist m_measure_sim with the project's pickle helper.
save_pickle(m_measure_sim, docs_samples_measure_sim_path)

In [11]:
# Number of leading singular values kept in the reconstruction.
n_singularvalues = 1

# Reduced SVD of the similarity matrix; the third factor is sliced by rows below.
U, s, V = np.linalg.svd(m_measure_sim, full_matrices=False)

# Diagonal matrix holding only the kept singular values.
D = np.diag(s[:n_singularvalues])

# Rebuild the matrix from the kept components: U_k . (D_k . V_k).
m_udv_measure_sim = U[:, :n_singularvalues].dot(D.dot(V[:n_singularvalues, :]))
print(m_udv_measure_sim)

[[0.82892786 0.77719529 0.76529965 ... 0.81525781 0.77951582 0.80459596]
 [0.82982015 0.77803189 0.76612344 ... 0.81613538 0.78035491 0.80546205]
 [0.76093491 0.71344571 0.70252581 ... 0.74838615 0.7155759  0.73859883]
 ...
 [0.8211906  0.7699409  0.75815629 ... 0.80764814 0.77223977 0.79708581]
 [0.84356335 0.7909174  0.77881172 ... 0.82965195 0.79327889 0.81880185]
 [0.83983896 0.78742544 0.77537322 ... 0.82598897 0.78977651 0.81518678]]


In [12]:
# Destination for the SVD-reconstructed similarity matrix, relative to data_path.
docs_samples_measure_sim_udv_path = get_relative_path(data_path, DOCS_SAMPLES_WORD2VEC_SIM_UDV)
# Persist m_udv_measure_sim with the project's pickle helper.
save_pickle(m_udv_measure_sim, docs_samples_measure_sim_udv_path)