In [1]:
import sys
import os
import random
# Counter
from collections import Counter
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle, wordvectors_centroid
from somhos.config.paths import *

Default path

In [2]:
# Resolve the dataset directory relative to this notebook's location.
# `get_relative_path` and `V9GAMMA_PATH` are not imported by name here —
# presumably they arrive through the star import of somhos.config.paths.
prefix_path = "../../"
data_path = get_relative_path(prefix_path, V9GAMMA_PATH)
# Trailing expression: displays whether the resolved directory exists.
os.path.exists(data_path)

True

Load samples

In [3]:
# Resolve the pickle locations of both document samples ...
docs_sample_a_path = get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX)
docs_sample_b_path = get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX)

# ... then unpickle them in order (A first, B second).
docs_sample_a, docs_sample_b = [
    load_pickle(sample_path)
    for sample_path in (docs_sample_a_path, docs_sample_b_path)
]

# Report how many documents each sample holds.
sample_sizes = (len(docs_sample_a), len(docs_sample_b))
print("Samples size: (%d, %d)" % sample_sizes)

Samples size: (5000, 5000)


Load samples content

In [4]:
# Load the pickled content of the sampled documents. Later cells index
# `samples_content` by document id and read its 'bag-of-kps' field.
docs_samples_content_path = get_relative_path(data_path, DOCS_SAMPLES_CONTENT)
samples_content = load_pickle(docs_samples_content_path)
# Number of documents for which content is available.
print("Docs: %d" % len(samples_content))

Docs: 10000


Path to word2vec embeddings

In [5]:
# Location of the word2vec embeddings file, relative to the project prefix.
# NOTE(review): `DEAULT_WORDVECTORS` looks like a misspelling of "DEFAULT";
# it must match the constant's name where it is defined, so it is kept as is.
word2vec_path = get_relative_path(prefix_path, DEAULT_WORDVECTORS)
# Trailing expression: displays whether the embeddings file exists.
os.path.exists(word2vec_path)

True

In [6]:
from itertools import product
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance
import gensim

In [7]:
# Load the word vectors from the binary word2vec-format file at `word2vec_path`.
wordvectors = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [8]:
# Positional indices into each sample: rows of the similarity matrix follow
# sample A, columns follow sample B.
sample_a_indices = range(len(docs_sample_a))
sample_b_indices = range(len(docs_sample_b))

def get_content_for(index, samples_content, field='bag-of-kps'):
    """Collect the distinct words found in one field of one document.

    Each entry of ``samples_content[index][field]`` is decoded as UTF-8,
    hyphens are treated as word separators, and the result is split on
    whitespace.

    Args:
        index: key (or position) of the document inside ``samples_content``.
        samples_content: mapping/sequence of per-document dicts.
        field: name of the per-document entry to read; its entries must
            support ``.decode('utf-8')`` (i.e. byte strings).

    Returns:
        A set of ``str`` tokens (empty when the field has no entries).

    Raises:
        UnicodeDecodeError: if an entry is not valid UTF-8.
    """
    words = set()
    for raw_phrase in samples_content[index][field]:
        phrase = raw_phrase.decode('utf-8')
        # "machine-learning" contributes "machine" and "learning".
        words.update(phrase.replace("-", " ").split())
    return words

# Pairwise similarity matrix: entry [i, j] is the cosine similarity between
# the keyphrase-word centroid of the i-th document of sample A and the
# centroid of the j-th document of sample B.
m_measure_sim = np.zeros([len(docs_sample_a), len(docs_sample_b)])


def _centroids_for(doc_ids):
    """Return one word-vector centroid per document id, in the given order.

    A centroid depends on a single document only, so it is computed once per
    document here instead of once per (i, j) pair inside the pairwise loop.
    Assumes `wordvectors_centroid` is a pure function of its arguments —
    TODO confirm against somhos.methods.useful.

    Raises:
        UnicodeDecodeError: if a document's keyphrases are not valid UTF-8.
            The offending keyphrases are printed first. The error is re-raised
            (rather than silently stopping) so that a partially filled matrix
            is never printed or pickled by the cells that follow.
    """
    centroids = []
    for doc_id in doc_ids:
        try:
            words = get_content_for(doc_id, samples_content)
        except UnicodeDecodeError:
            print(samples_content[doc_id]['bag-of-kps'])
            raise
        centroids.append(wordvectors_centroid(wordvectors, words))
    return centroids


# Centroids
centroids_a = _centroids_for(docs_sample_a[i] for i in sample_a_indices)
centroids_b = _centroids_for(docs_sample_b[j] for j in sample_b_indices)

# Measure: rows iterate over sample A, columns over sample B.
for i, j in product(sample_a_indices, sample_b_indices):
    # scipy's `cosine` is a distance (1 - similarity), so invert it.
    m_measure_sim[i, j] = 1.0 - cosine_distance(centroids_a[i], centroids_b[j])

print(m_measure_sim)

[[0.59941519 0.55470327 0.43061113 ... 0.62401874 0.42920045 0.59120494]
 [0.60925314 0.56669532 0.47241645 ... 0.71085392 0.59293892 0.51076994]
 [0.45249799 0.43012126 0.41202952 ... 0.49928811 0.45999924 0.45372825]
 ...
 [0.71048871 0.64416613 0.48523162 ... 0.65125671 0.6224604  0.60824139]
 [0.6964594  0.63334227 0.43874686 ... 0.72537404 0.55560171 0.62522677]
 [0.69810397 0.6918212  0.53824229 ... 0.71049728 0.62089338 0.65411763]]


In [9]:
# Drop the notebook's reference to the embeddings; the remaining cells shown
# here do not use `wordvectors`.
del wordvectors

In [10]:
# Persist the raw (sample A x sample B) similarity matrix as a pickle
# under the dataset directory.
docs_samples_measure_sim_path = get_relative_path(data_path, DOCS_SAMPLES_WORD2VEC_SIM_KPS)
save_pickle(m_measure_sim, docs_samples_measure_sim_path)

In [11]:
# Low-rank approximation of the similarity matrix: keep only the leading
# `n_singularvalues` singular triplets and multiply the factors back together.
n_singularvalues = 1
_leading = slice(None, n_singularvalues)

# Reduced SVD; the third factor is used row-wise below (rows are selected,
# not columns), matching how numpy returns it.
U, s, V = np.linalg.svd(m_measure_sim, full_matrices=False)
D = np.diag(s[_leading])

# U_k . (D_k . V_k) — same multiplication order as the nested np.dot form.
m_udv_measure_sim = U[:, _leading].dot(D.dot(V[_leading, :]))
print(m_udv_measure_sim)

[[0.56327034 0.50936044 0.42587916 ... 0.57711881 0.47450445 0.52818554]
 [0.64564009 0.58384668 0.48815753 ... 0.66151368 0.54389352 0.60542466]
 [0.51288639 0.46379867 0.3877847  ... 0.52549612 0.43206051 0.48093988]
 ...
 [0.66195384 0.59859907 0.50049208 ... 0.67822852 0.55763639 0.62072227]
 [0.67309706 0.60867578 0.50891728 ... 0.6896457  0.56702354 0.6311714 ]
 [0.69570252 0.6291177  0.52600888 ... 0.71280694 0.58606661 0.65236882]]


In [12]:
# Persist the SVD-reconstructed (low-rank) similarity matrix as a pickle
# under the dataset directory.
docs_samples_measure_sim_udv_path = get_relative_path(data_path, DOCS_SAMPLES_WORD2VEC_SIM_UDV_KPS)
save_pickle(m_udv_measure_sim, docs_samples_measure_sim_udv_path)