In [1]:
import sys
import os
import random
# Counter
from collections import Counter
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle, wordvectors_centroid
from somhos.config.paths import *

Default path

In [2]:
# Resolve the dataset directory: `prefix_path` is relative to the notebook's
# working directory, and V9_PATH / get_relative_path are not imported by name
# in this notebook (presumably supplied by `from somhos.config.paths import *`).
prefix_path = "../../"
data_path = get_relative_path(prefix_path, V9_PATH)
# Bare last expression: the cell displays whether the resolved path exists.
os.path.exists(data_path)

True

Load samples

In [3]:
# Load sample A
# NOTE(review): load_pickle presumably unpickles the file at the given path;
# only point it at trusted files, since unpickling can execute arbitrary code.
docs_sample_a_path = get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX)
docs_sample_a = load_pickle(docs_sample_a_path)
# Load sample B
docs_sample_b_path = get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX)
docs_sample_b = load_pickle(docs_sample_b_path)

# Sanity check: report how many entries each sample holds.
print("Samples size: (%d, %d)" % (len(docs_sample_a), len(docs_sample_b)))

Samples size: (1000, 1000)


Load samples content

In [4]:
# Load the content of the sampled documents. Later cells index this object by
# the entries of docs_sample_a / docs_sample_b and read its 'bag-of-kps' field.
docs_samples_content_path = get_relative_path(data_path, DOCS_SAMPLES_CONTENT)
samples_content = load_pickle(docs_samples_content_path)
# Sanity check: number of documents available in the loaded content.
print("Docs: %d" % len(samples_content))

Docs: 2000


Path to word2vec embeddings

In [5]:
# Resolve the word2vec embeddings file relative to `prefix_path`.
# NOTE(review): DEAULT_WORDVECTORS is used exactly as spelled here; it is
# presumably defined that way in somhos.config.paths — confirm before renaming.
word2vec_path = get_relative_path(prefix_path, DEAULT_WORDVECTORS)
# Bare last expression: the cell displays whether the embeddings file exists.
os.path.exists(word2vec_path)

True

In [6]:
from itertools import product
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance
import gensim

In [7]:
# Load the embeddings as gensim KeyedVectors; binary=True selects the binary
# variant of the word2vec file format.
wordvectors = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [8]:
# Row indices (sample A) and column indices (sample B) of the similarity matrix.
sample_a_indices = range(0, len(docs_sample_a))
sample_b_indices = range(0, len(docs_sample_b))

def get_content_for(index, samples_content, field='bag-of-kps'):
    """Return the set of distinct words found in one document's field.

    Every entry of ``samples_content[index][field]`` is decoded as UTF-8,
    its hyphens are replaced by spaces, and the result is split on
    whitespace; the words of all entries are merged into a single set.

    Args:
        index: key of the document inside ``samples_content``.
        samples_content: mapping/sequence holding the documents' fields.
        field: name of the field to read (defaults to ``'bag-of-kps'``).

    Returns:
        set of str: the distinct words of the selected field.

    Raises:
        UnicodeDecodeError: if an entry is not valid UTF-8.
    """
    return set(t for k in samples_content[index][field] for t in k.decode('utf-8').replace("-", " ").split())

def _sample_centroids(docs_sample):
    """Return one word-vector centroid per entry of ``docs_sample``, in order."""
    centroids = []
    for position in range(len(docs_sample)):
        doc_id = docs_sample[position]
        try:
            words = get_content_for(doc_id, samples_content)
        except UnicodeDecodeError:
            # Show the keyphrases that could not be decoded, then fail loudly:
            # continuing would leave zero-filled entries in the matrix that is
            # printed below and persisted by the following cell.
            print(samples_content[doc_id]['bag-of-kps'])
            raise
        centroids.append(wordvectors_centroid(wordvectors, words))
    return centroids

# Each document's centroid depends only on that document, so compute it once
# per document instead of once per (i, j) pair.
# NOTE(review): assumes wordvectors_centroid is deterministic for a given word
# set, so the values match a per-pair recomputation — confirm.
centroids_a = _sample_centroids(docs_sample_a)
centroids_b = _sample_centroids(docs_sample_b)

# Rows follow sample A, columns follow sample B.
m_measure_sim = np.zeros([len(docs_sample_a), len(docs_sample_b)])
# Pair every document of A with every document of B. The column index must
# range over sample B; iterating sample A twice is only correct when both
# samples happen to have the same size.
for i, j in product(sample_a_indices, sample_b_indices):
    # Cosine similarity = 1 - cosine distance between the two centroids.
    m_measure_sim[i, j] = 1.0 - cosine_distance(centroids_a[i], centroids_b[j])

print(m_measure_sim)

[[0.73584551 0.74774643 0.70017648 ... 0.71498831 0.59425368 0.66057425]
 [0.7529434  0.79520642 0.61637561 ... 0.74657731 0.58591201 0.66758565]
 [0.67955684 0.70658729 0.69086082 ... 0.7408391  0.62025059 0.75199539]
 ...
 [0.64550431 0.70693401 0.57462293 ... 0.68213468 0.56961568 0.65312854]
 [0.61921492 0.68430302 0.44202702 ... 0.63422805 0.64957497 0.59564065]
 [0.68590205 0.73472731 0.62482526 ... 0.73951819 0.66878865 0.73156019]]


In [9]:
# Persist the pairwise similarity matrix (rows: sample A, columns: sample B)
# under the dataset directory.
docs_samples_measure_sim_path = get_relative_path(data_path, DOCS_SAMPLES_WORD2VEC_SIM_KPS)
save_pickle(m_measure_sim, docs_samples_measure_sim_path)

In [10]:
# Low-rank approximation of the similarity matrix via truncated SVD.
# Number of leading singular values kept in the reconstruction.
n_singularvalues = 1
# np.linalg.svd returns (U, s, Vh): the third value is the already-transposed
# right factor, so `V` below holds right singular vectors as rows.
U, s, V = np.linalg.svd(m_measure_sim, full_matrices=False)
# Diagonal matrix built from the retained singular values only.
D = np.diag(s[:n_singularvalues])
# Reconstruct from the first n_singularvalues columns of U and rows of V.
m_udv_measure_sim = np.dot(U[:,:n_singularvalues], np.dot(D, V[:n_singularvalues,:]))
print(m_udv_measure_sim)

[[0.70249156 0.76503067 0.61917123 ... 0.73742336 0.63819848 0.67459647]
 [0.713463   0.77697885 0.62884139 ... 0.74894037 0.6481658  0.68513226]
 [0.7221335  0.78642124 0.6364835  ... 0.75804201 0.65604276 0.69345846]
 ...
 [0.6631624  0.72220025 0.58450678 ... 0.69613853 0.60246878 0.63682902]
 [0.60632752 0.66030566 0.53441291 ... 0.63647751 0.55083552 0.58225099]
 [0.7176664  0.78155645 0.63254623 ... 0.75335278 0.65198449 0.68916874]]


In [11]:
# Persist the SVD-truncated (low-rank) similarity matrix next to the full one.
docs_samples_measure_sim_udv_path = get_relative_path(data_path, DOCS_SAMPLES_WORD2VEC_SIM_UDV_KPS)
save_pickle(m_udv_measure_sim, docs_samples_measure_sim_udv_path)