In [1]:
import sys
import os
import random
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle
from somhos.config.paths import get_relative_path
from somhos.config.paths import DOCS_SAMPLE_A_SUFFIX, DOCS_SAMPLE_B_SUFFIX
from somhos.config.paths import DOCS_SAMPLES_CONTENT
from somhos.config.paths import DOCS_SAMPLES_WORD_DOC_COUNT, DOCS_SAMPLES_WORD_COUNT
from somhos.config.paths import DOCS_SAMPLES_KPS_DOC_COUNT, DOCS_SAMPLES_KPS_COUNT
from somhos.config.paths import DOCS_SAMPLES_JACCARD_SIM, DOCS_SAMPLES_JACCARD_SIM_UDV
from somhos.config.paths import DOCS_SAMPLES_JACCARD_SIM_KPS, DOCS_SAMPLES_JACCARD_SIM_UDV_KPS
from somhos.config.paths import DOCS_SAMPLES_TF_IDF, KPS_DIRECTORY_INVERSE_SUFFIX

Default path

In [2]:
# Base directory of the dataset; the get_relative_path(...) calls below are
# all resolved against it. The path is relative, so it presumably depends on
# the notebook's working directory -- TODO confirm.
data_path = "../../src/somhos/resources/aminer/v9gamma"

Load samples

In [3]:
# Resolve where the two document samples are stored under data_path.
docs_sample_a_path = get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX)
docs_sample_b_path = get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX)

# Load sample A first, then sample B.
docs_sample_a = load_pickle(docs_sample_a_path)
docs_sample_b = load_pickle(docs_sample_b_path)

# Report how many entries each sample holds.
print("Samples size: (%d, %d)" % (len(docs_sample_a), len(docs_sample_b)))

Samples size: (5000, 5000)


Load samples content

In [4]:
# Load the per-document content used by the similarity computation below.
# It is indexed by the entries of docs_sample_a / docs_sample_b, and each
# entry is read through its 'bag-of-kps' key.
docs_samples_content_path = get_relative_path(data_path, DOCS_SAMPLES_CONTENT)
samples_content = load_pickle(docs_samples_content_path)
# Sanity check: number of documents with content available.
print("Docs: %d" % len(samples_content))

Docs: 10000


In [5]:
from nltk.metrics.distance import jaccard_distance
from itertools import product
import numpy as np

In [6]:
# Positional indices into each sample: sample A drives the matrix rows and
# sample B drives the matrix columns.
sample_a_indices = range(len(docs_sample_a))
sample_b_indices = range(len(docs_sample_b))

def get_kps(index, samples_content):
    """Return the 'bag-of-kps' entry of one document.

    Args:
        index: Key under which the document is stored in ``samples_content``.
        samples_content: Mapping (or sequence) from document key to a
            per-document mapping that contains a ``'bag-of-kps'`` entry.

    Returns:
        The object stored under ``samples_content[index]['bag-of-kps']``,
        returned as-is (no copy is made).

    Raises:
        KeyError / IndexError: If ``index`` is missing from
            ``samples_content`` or the document has no ``'bag-of-kps'`` entry.
    """
    document = samples_content[index]
    return document['bag-of-kps']

# Pairwise Jaccard similarity between the 'bag-of-kps' collections of the
# documents in sample A (rows) and sample B (columns). Any pair in which
# either collection is empty keeps a similarity of 0.
m_jaccard_sim = np.zeros([len(docs_sample_a), len(docs_sample_b)])
for i in sample_a_indices:
    # The row's collection does not depend on j, so look it up once per row
    # instead of once per (i, j) pair.
    set_a = get_kps(docs_sample_a[i], samples_content)
    if not set_a:
        # The whole row is already zero from np.zeros.
        continue
    # Columns must range over sample B's indices. Iterating sample A's
    # indices for j is only correct when both samples have the same length;
    # otherwise it raises IndexError or leaves columns unfilled.
    for j in sample_b_indices:
        set_b = get_kps(docs_sample_b[j], samples_content)
        if set_b:
            # Similarity is the complement of the Jaccard distance.
            m_jaccard_sim[i, j] = 1 - jaccard_distance(set_a, set_b)

print(m_jaccard_sim)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.02631579 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.02439024 0.         0.        ]]


In [7]:
# Persist the similarity matrix computed from the 'bag-of-kps' entries at the
# location named by DOCS_SAMPLES_JACCARD_SIM_KPS (relative to data_path).
# NOTE(review): save_pickle presumably overwrites an existing file -- confirm.
docs_samples_jaccard_sim_path = get_relative_path(data_path, DOCS_SAMPLES_JACCARD_SIM_KPS)
save_pickle(m_jaccard_sim, docs_samples_jaccard_sim_path)

In [8]:
# Number of leading singular values kept in the low-rank reconstruction.
n_singularvalues = 1

# Reduced SVD of the similarity matrix. Note that numpy returns the third
# factor already transposed, so its *rows* are the right singular vectors.
U, s, V = np.linalg.svd(m_jaccard_sim, full_matrices=False)

# Diagonal matrix holding only the retained singular values.
D = np.diag(s[:n_singularvalues])

# Rebuild the matrix from the retained components: U_k . (D . V_k).
left_vectors = U[:, :n_singularvalues]
right_vectors = V[:n_singularvalues, :]
m_udv_jaccard_sim = left_vectors.dot(D.dot(right_vectors))
print(m_udv_jaccard_sim)

[[9.89092281e-04 4.85534230e-04 1.62718086e-03 ... 2.07293709e-03
  3.38306208e-04 1.29827380e-03]
 [2.21332913e-03 1.08649827e-03 3.64120402e-03 ... 4.63868956e-03
  7.57040572e-04 2.90519628e-03]
 [1.75591496e-04 8.61958823e-05 2.88870034e-04 ... 3.68004211e-04
  6.00587977e-05 2.30479848e-04]
 ...
 [2.14650524e-04 1.05369518e-04 3.53127034e-04 ... 4.49864022e-04
  7.34184325e-05 2.81748383e-04]
 [3.15118049e-03 1.54687891e-03 5.18408713e-03 ... 6.60423604e-03
  1.07782048e-03 4.13621169e-03]
 [1.81206654e-03 8.89522999e-04 2.98107673e-03 ... 3.79772444e-03
  6.19793898e-04 2.37850255e-03]]


In [9]:
# Persist the SVD-based low-rank reconstruction of the similarity matrix at
# the location named by DOCS_SAMPLES_JACCARD_SIM_UDV_KPS (relative to data_path).
# NOTE(review): save_pickle presumably overwrites an existing file -- confirm.
docs_samples_jaccard_sim_udv_path = get_relative_path(data_path, DOCS_SAMPLES_JACCARD_SIM_UDV_KPS)
save_pickle(m_udv_jaccard_sim, docs_samples_jaccard_sim_udv_path)