In [1]:
import sys
import os
import random
# Counter
from collections import Counter
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle, wordvectors_centroid
from somhos.config.paths import *

Default path

In [2]:
# Prefix (two directory levels up) that the project paths below are resolved against.
prefix_path = "../../"
# Build the dataset directory path; get_relative_path and V9GAMMA_PATH are not
# defined in this notebook -- presumably they come from the star import of
# somhos.config.paths.
data_path = get_relative_path(prefix_path, V9GAMMA_PATH)
# Displayed as the cell output: confirms the dataset directory exists.
os.path.exists(data_path)

True

Load samples

In [3]:
# Load sample A (pickled object stored under the dataset directory)
docs_sample_a_path = get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX)
docs_sample_a = load_pickle(docs_sample_a_path)
# Load sample B (same procedure, different file suffix)
docs_sample_b_path = get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX)
docs_sample_b = load_pickle(docs_sample_b_path)

# Report how many entries each sample holds.
print("Samples size: (%d, %d)" % (len(docs_sample_a), len(docs_sample_b)))

Samples size: (5000, 5000)


Load samples content

In [4]:
# Load the content records of the sampled documents; later cells index this
# object with the entries of docs_sample_a / docs_sample_b.
docs_samples_content_path = get_relative_path(data_path, DOCS_SAMPLES_CONTENT)
samples_content = load_pickle(docs_samples_content_path)
# Report how many documents have content available.
print("Docs: %d" % len(samples_content))

Docs: 10000


In [5]:
from itertools import product
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance
import gensim

Path to word2vec embeddings

In [6]:
# Location of the pre-trained word vectors, resolved against prefix_path.
# NOTE(review): the constant is spelled DEAULT_WORDVECTORS here; presumably that
# is how it is declared in somhos.config.paths -- confirm before renaming.
word2vec_path = get_relative_path(prefix_path, DEAULT_WORDVECTORS)
# Displayed as the cell output: confirms the embeddings file exists.
os.path.exists(word2vec_path)

True

In [7]:
# Load the embeddings from a binary word2vec-format file into a gensim KeyedVectors object.
wordvectors = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

Load the per-word document counts

In [8]:
# Load a mapping that is iterated with .items() below as (term, count) pairs;
# presumably count is the number of sampled documents containing the term -- TODO confirm.
docs_samples_word_doc_count_path = get_relative_path(data_path, DOCS_SAMPLES_WORD_DOC_COUNT)
samples_word_in_docs_count = load_pickle(docs_samples_word_doc_count_path)

Compute the inverse document frequency $\mathrm{idf}(t, D) = \log\frac{N}{\vert \left\{ d\in D : t\in d \right\} \vert}$, where $N$ is the total number of documents in $D$.

In [9]:
# N: total number of documents, taken as the combined size of both samples.
n_docs = len(docs_sample_a) + len(docs_sample_b)
# idf(t) = log(N / count(t)) for every term that has a document count.
t_idf = dict(
    (term, np.log(n_docs / doc_count))
    for term, doc_count in samples_word_in_docs_count.items()
)

In [10]:
# Positional indices into each sample (rows and columns of the similarity matrix).
sample_a_indices = range(len(docs_sample_a))
sample_b_indices = range(len(docs_sample_b))

def get_content_for(index, samples_content, field='kps-normalized'):
    """Return the flat list of word tokens stored for one document.

    Args:
        index: key of the document inside ``samples_content``.
        samples_content: mapping from document key to a record that is
            indexed by ``field``.
        field: record entry to read; it must hold an iterable of UTF-8
            encoded byte strings.

    Returns:
        A list of tokens: every byte string is decoded, hyphens are turned
        into spaces, and the result is split on whitespace. Order and
        duplicates are preserved.
    """
    tokens = []
    for raw_phrase in samples_content[index][field]:
        phrase = raw_phrase.decode('utf-8')
        # Hyphenated compounds are broken into their component words.
        tokens.extend(phrase.replace("-", " ").split())
    return tokens

def tf_doc(terms):
    """Return the relative frequency of each term in ``terms``.

    Args:
        terms: iterable of hashable terms (one entry per occurrence).

    Returns:
        A dict mapping each distinct term to its count divided by the total
        number of occurrences. An empty input yields an empty dict.
    """
    occurrences = Counter(terms)
    total = sum(occurrences.values())
    frequencies = {}
    for term, count in occurrences.items():
        frequencies[term] = count / total
    return frequencies

def _tfidf_centroid(doc_id):
    """Return the TF-IDF-weighted mean word vector of one document.

    Only terms that are present both in the embedding vocabulary and in the
    ``t_idf`` table contribute. Each contributes its word vector scaled by
    ``tf * idf``, and the sum is divided by the total weight.

    Args:
        doc_id: key of the document inside ``samples_content``.

    Returns:
        A float64 vector of length ``wordvectors.vector_size``, or ``None``
        when the total weight is zero (no usable term), in which case the
        centroid is undefined.
    """
    tf = tf_doc(get_content_for(doc_id, samples_content))
    # Use the embedding's own dimensionality instead of a hard-coded 300.
    centroid = np.zeros(wordvectors.vector_size, dtype=np.float64)
    weight_sum = 0.0
    for term, freq in tf.items():
        # NOTE(review): `.vocab` is the pre-4.0 gensim API -- confirm the
        # installed gensim version before upgrading.
        if term in wordvectors.vocab and term in t_idf:
            weight = freq * t_idf[term]
            centroid += wordvectors.get_vector(term) * weight
            weight_sum += weight
    if weight_sum == 0:
        return None
    return centroid / weight_sum


# A centroid depends on a single document only, so compute each one once
# instead of once per (a, b) pair.
centroids_a = [_tfidf_centroid(docs_sample_a[i]) for i in sample_a_indices]
centroids_b = [_tfidf_centroid(docs_sample_b[j]) for j in sample_b_indices]

# Rows index sample A, columns index sample B.
m_measure_sim = np.zeros([len(docs_sample_a), len(docs_sample_b)])
# The column index must range over sample B. Pairing A's range with itself
# only works while both samples happen to have the same size.
for i, j in product(sample_a_indices, sample_b_indices):
    centroid_a = centroids_a[i]
    centroid_b = centroids_b[j]
    if centroid_a is None or centroid_b is None:
        # Similarity is undefined without a centroid; store NaN explicitly
        # rather than obtaining it through a silent 0/0 division.
        m_measure_sim[i, j] = np.nan
    else:
        # Cosine similarity = 1 - cosine distance.
        m_measure_sim[i, j] = 1.0 - cosine_distance(centroid_a, centroid_b)

print(m_measure_sim)

[[0.4308252  0.46092489 0.25156474 ... 0.53535328 0.33720741 0.43388834]
 [0.4332573  0.47743951 0.37647243 ... 0.55057405 0.55530767 0.36347259]
 [0.3466539  0.38661425 0.33447691 ... 0.41947611 0.42869596 0.39422269]
 ...
 [0.41816378 0.4478462  0.348465   ... 0.46241594 0.47159042 0.44217885]
 [0.50347543 0.57928714 0.33142323 ... 0.61184987 0.51755772 0.51304573]
 [0.44150739 0.54036326 0.43194542 ... 0.52334593 0.5163729  0.45019183]]


In [11]:
# Drop the reference to the embeddings, which are not used by the cells below;
# presumably done to let the kernel reclaim their memory.
del wordvectors

In [12]:
# Persist the pairwise similarity matrix under the dataset directory.
docs_samples_measure_sim_path = get_relative_path(data_path, DOCS_SAMPLES_WORD2VEC_TFIDF_KPS)
save_pickle(m_measure_sim, docs_samples_measure_sim_path)

In [13]:
# Number of leading singular values kept in the low-rank approximation.
n_singularvalues = 1
# Reduced SVD of the similarity matrix. NumPy returns the right factor already
# transposed, so rows of V (not columns) pair with the singular values.
U, s, V = np.linalg.svd(m_measure_sim, full_matrices=False)
D = np.diag(s[:n_singularvalues])
# Rebuild the matrix from the leading components only: U_k . (D_k . V_k).
m_udv_measure_sim = U[:, :n_singularvalues].dot(D.dot(V[:n_singularvalues, :]))
print(m_udv_measure_sim)

[[0.38512844 0.41505043 0.31852407 ... 0.44523561 0.39519693 0.40710229]
 [0.44736451 0.48212184 0.36999699 ... 0.51718488 0.45906005 0.47288929]
 [0.3906337  0.42098341 0.32307724 ... 0.45160007 0.40084612 0.41292166]
 ...
 [0.38580905 0.41578391 0.31908697 ... 0.44602243 0.39589533 0.40782173]
 [0.50702332 0.54641575 0.41933837 ... 0.58615467 0.52027854 0.53595199]
 [0.46365498 0.49967797 0.38347017 ... 0.53601781 0.47577641 0.49010923]]


In [14]:
# Persist the low-rank (SVD-reconstructed) similarity matrix under the dataset directory.
docs_samples_measure_sim_udv_path = get_relative_path(data_path, DOCS_SAMPLES_WORD2VEC_TFIDF_UDV_KPS)
save_pickle(m_udv_measure_sim, docs_samples_measure_sim_udv_path)