In [1]:
import sys
import os
import random
# Counter
from collections import Counter
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle, wordvectors_centroid
from somhos.config.paths import *

Default path

In [2]:
# Relative prefix against which project paths are resolved
# (presumably leads from this notebook's directory to the repository root — confirm).
prefix_path = "../../"
# Data directory: V9_PATH resolved against the prefix above.
data_path = get_relative_path(prefix_path, V9_PATH)
# Bare last expression so the cell displays whether the data directory exists.
os.path.exists(data_path)

True

Load samples

In [3]:
# Resolve the pickle location of each sample inside the data directory.
docs_sample_a_path = get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX)
docs_sample_b_path = get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX)

# Load sample A, then sample B.
docs_sample_a = load_pickle(docs_sample_a_path)
docs_sample_b = load_pickle(docs_sample_b_path)

# Report how many entries each sample holds.
print("Samples size: (%d, %d)" % (len(docs_sample_a), len(docs_sample_b)))

Samples size: (1000, 1000)


Load samples content

In [4]:
# Load the pickled content of the sampled documents; later cells index it by
# document identifier and read each entry's 'tokens' field.
docs_samples_content_path = get_relative_path(data_path, DOCS_SAMPLES_CONTENT)
samples_content = load_pickle(docs_samples_content_path)
# Number of entries in the loaded content container.
print("Docs: %d" % len(samples_content))

Docs: 2000


In [5]:
from itertools import product
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance
from scipy.sparse import csr_matrix

In [6]:
# Load the pickled mapping used below to derive IDF weights
# (presumably term -> number of sampled documents containing it — confirm).
docs_samples_word_doc_count_path = get_relative_path(data_path, DOCS_SAMPLES_WORD_DOC_COUNT)
samples_word_in_docs_count = load_pickle(docs_samples_word_doc_count_path)

In [7]:
# Total number of documents across both samples.
n_docs = len(docs_sample_a) + len(docs_sample_b)
# Inverse document frequency per term: log(n_docs / count stored for the term).
t_idf = dict(
    (term, np.log(n_docs / doc_count))
    for term, doc_count in samples_word_in_docs_count.items()
)

def get_content_tokens(index, samples_content):
    """Return the 'tokens' entry of the document stored under ``index``.

    Args:
        index: Key (or position) used to look the document up in
            ``samples_content``.
        samples_content: Indexable container whose items are mappings
            exposing a 'tokens' key.

    Returns:
        Whatever is stored under 'tokens' for that document.
    """
    document = samples_content[index]
    return document['tokens']

In [8]:
# Positional indices into each sample (0 .. len - 1).
sample_a_indices = range(len(docs_sample_a))
sample_b_indices = range(len(docs_sample_b))

def get_content_for(index, samples_content, field='tokens'):
    """Return one field of the document stored under ``index``.

    Args:
        index: Key (or position) used to look the document up in
            ``samples_content``.
        samples_content: Indexable container whose items are mappings.
        field: Name of the entry to read from the document; defaults to
            'tokens'.

    Returns:
        The value stored under ``field`` for that document.
    """
    document = samples_content[index]
    return document[field]

def tf_doc(terms):
    """Compute relative term frequencies for one document.

    Args:
        terms: Iterable of hashable terms (e.g. a token list).

    Returns:
        dict mapping each distinct term to its occurrence count divided by
        the total number of terms. An empty input yields an empty dict.
    """
    occurrences = Counter(terms)
    total = sum(occurrences.values())
    frequencies = {}
    for term, hits in occurrences.items():
        frequencies[term] = hits / total
    return frequencies

# Pairwise TF-IDF cosine similarity between the two samples:
# rows follow docs_sample_a, columns follow docs_sample_b.

def _tf_idf_weights(doc_id):
    """Return {term: tf * idf} for the document stored under ``doc_id``."""
    tf = tf_doc(get_content_for(doc_id, samples_content))
    return {term: freq * t_idf[term] for term, freq in tf.items()}

# The weights of a document do not depend on the document it is compared
# with, so compute them once per document instead of once per pair.
weights_a = {i: _tf_idf_weights(docs_sample_a[i]) for i in sample_a_indices}
weights_b = {j: _tf_idf_weights(docs_sample_b[j]) for j in sample_b_indices}

m_measure_sim = np.zeros([len(docs_sample_a), len(docs_sample_b)])
# The column index must range over sample B. Iterating sample A twice only
# worked while both samples happened to have the same size.
for i, j in product(sample_a_indices, sample_b_indices):
    w_a = weights_a[i]
    w_b = weights_b[j]
    # Shared term axis for both vectors
    t_list = list(set(w_a) | set(w_b))
    # Dense TF-IDF vectors over that axis (0.0 where the term is absent)
    tf_idf_a = np.array([w_a.get(t, 0.0) for t in t_list])
    tf_idf_b = np.array([w_b.get(t, 0.0) for t in t_list])
    # Cosine similarity is undefined when either vector is all zeros (empty
    # document or only zero-IDF terms); keep the initial 0.0 in that case so
    # no NaN reaches the SVD computed on this matrix afterwards.
    if tf_idf_a.any() and tf_idf_b.any():
        m_measure_sim[i, j] = 1.0 - cosine_distance(tf_idf_a, tf_idf_b)

print(m_measure_sim)

[[0.00750679 0.01106243 0.00031596 ... 0.00968403 0.0032995  0.00496262]
 [0.0157523  0.01400749 0.03310009 ... 0.01981849 0.00143613 0.00954822]
 [0.00522653 0.00838618 0.00083333 ... 0.02829378 0.00507733 0.01066712]
 ...
 [0.0074064  0.00825712 0.00714255 ... 0.01280274 0.00424296 0.00496312]
 [0.00849982 0.01319539 0.00128268 ... 0.02281991 0.00201917 0.        ]
 [0.01844931 0.00421175 0.         ... 0.01321121 0.00665555 0.00863887]]


In [9]:
# Persist the pairwise similarity matrix as a pickle inside the data directory.
docs_samples_measure_sim_path = get_relative_path(data_path, DOCS_SAMPLES_TF_IDF)
save_pickle(m_measure_sim, docs_samples_measure_sim_path)

In [10]:
# Low-rank approximation of the similarity matrix: keep only the leading
# singular values/vectors and multiply the truncated factors back together.
n_singularvalues = 1
# With full_matrices=False the reduced factors are returned; the third one
# is already transposed, so its rows are the right singular vectors.
U, s, V = np.linalg.svd(m_measure_sim, full_matrices=False)
D = np.diag(s[:n_singularvalues])
m_udv_measure_sim = U[:, :n_singularvalues].dot(D.dot(V[:n_singularvalues, :]))
print(m_udv_measure_sim)

[[0.01353857 0.02039515 0.0128373  ... 0.02081429 0.00554584 0.00883632]
 [0.01443389 0.02174391 0.01368625 ... 0.02219076 0.00591259 0.00942068]
 [0.01512196 0.02278044 0.01433868 ... 0.0232486  0.00619445 0.00986976]
 ...
 [0.01451197 0.02186152 0.01376028 ... 0.02231079 0.00594457 0.00947163]
 [0.0078894  0.01188498 0.00748075 ... 0.01212922 0.00323176 0.00514924]
 [0.0144247  0.02173006 0.01367753 ... 0.02217663 0.00590883 0.00941468]]


In [11]:
# Persist the truncated-SVD approximation of the similarity matrix as a pickle
# inside the data directory.
docs_samples_measure_sim_udv_path = get_relative_path(data_path, DOCS_SAMPLES_TF_IDF_UDV)
save_pickle(m_udv_measure_sim, docs_samples_measure_sim_udv_path)