In [1]:
import sys
import os
import random
# Counter
from collections import Counter
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle, wordvectors_centroid
from somhos.config.paths import *

Default path

In [2]:
# Relative prefix that is combined with V9GAMMA_PATH to locate the data directory.
prefix_path = "../../"
data_path = get_relative_path(prefix_path, V9GAMMA_PATH)
# Last expression of the cell: displays whether the resolved directory exists.
os.path.exists(data_path)

True

Load samples

In [3]:
# Sample A: resolve the pickle location under data_path, then load it.
sample_a_file = get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX)
docs_sample_a = load_pickle(sample_a_file)

# Sample B: same procedure with its own suffix.
sample_b_file = get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX)
docs_sample_b = load_pickle(sample_b_file)

# Report how many entries each sample contains.
sample_sizes = (len(docs_sample_a), len(docs_sample_b))
print("Samples size: (%d, %d)" % sample_sizes)

Samples size: (5000, 5000)


Load corpus

In [4]:
# Lookup used below to turn a sample entry into a position in the corpus lists.
doc_directory = load_pickle(get_relative_path(data_path, DOC_DIRECTORY))

# Token representation: bag-of-words corpus followed by its dictionary.
corpus_bag_of_words, dictionary_tokens = (
    load_pickle(get_relative_path(data_path, suffix))
    for suffix in (CORPUS_BAG_OF_WORDS, DICTIONARY_TOKENS)
)

# Keyphrase representation: bag-of-keyphrases corpus followed by its dictionary.
corpus_bag_of_keyphrases, dictionary_keyphrases = (
    load_pickle(get_relative_path(data_path, suffix))
    for suffix in (CORPUS_BAG_OF_KEYPHRASES, DICTIONARY_KEYPHRASES)
)

# Sanity checks: directory size, then the directory entry of each sample's first element.
print(len(doc_directory))
for sample_ids in (docs_sample_a, docs_sample_b):
    print("Test:", doc_directory[sample_ids[0]])

# Corpus length next to dictionary size, for tokens and then keyphrases.
print(len(corpus_bag_of_words), len(dictionary_tokens))
print(len(corpus_bag_of_keyphrases), len(dictionary_keyphrases))

10000
Test: 2111
Test: 483
10000 62388
10000 107954


In [5]:
# Resolve every sample entry to its corpus position once, then reuse the
# positions for both the token and the keyphrase representations.
positions_a = [doc_directory[docid] for docid in docs_sample_a]
positions_b = [doc_directory[docid] for docid in docs_sample_b]

corpus_a_tokens = [corpus_bag_of_words[pos] for pos in positions_a]
corpus_b_tokens = [corpus_bag_of_words[pos] for pos in positions_b]
corpus_a_keyphrases = [corpus_bag_of_keyphrases[pos] for pos in positions_a]
corpus_b_keyphrases = [corpus_bag_of_keyphrases[pos] for pos in positions_b]

# All four per-sample corpora should have the same length as their sample.
print(*(
    len(sub_corpus)
    for sub_corpus in (
        corpus_a_tokens,
        corpus_b_tokens,
        corpus_a_keyphrases,
        corpus_b_keyphrases,
    )
))

5000 5000 5000 5000


In [6]:
import numpy as np
from gensim import models
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile
import logging
# Emit timestamped INFO-level records ("time : level : message"); the gensim
# calls in the following cells report their progress through this logger.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## TFIDF

In [7]:
# Fit one TF-IDF model per representation on the full corpus
# (not just one sample), so IDF weights come from every loaded document.
model_tf_idf_tokens = models.TfidfModel(corpus=corpus_bag_of_words)
model_tf_idf_keyphrases = models.TfidfModel(corpus=corpus_bag_of_keyphrases)

2018-09-14 02:21:37,150 : INFO : collecting document frequencies
2018-09-14 02:21:37,153 : INFO : PROGRESS: processing document #0
2018-09-14 02:21:37,493 : INFO : calculating IDF weights for 10000 documents and 62387 features (988950 matrix non-zeros)
2018-09-14 02:21:37,688 : INFO : collecting document frequencies
2018-09-14 02:21:37,689 : INFO : PROGRESS: processing document #0
2018-09-14 02:21:37,787 : INFO : calculating IDF weights for 10000 documents and 107953 features (232171 matrix non-zeros)


Example

In [8]:
# Corpus position of the first entry of sample A, shared by both lookups below.
example_position = doc_directory[docs_sample_a[0]]

# TF-IDF weights for the token bag-of-words of that document.
doc = corpus_bag_of_words[example_position]
vector1 = model_tf_idf_tokens[doc]

# TF-IDF weights for the keyphrase bag of the same document.
doc = corpus_bag_of_keyphrases[example_position]
vector2 = model_tf_idf_keyphrases[doc]

# Show the first five (id, weight) pairs of each vector.
for example_vector in (vector1, vector2):
    print(example_vector[:5])

[(16, 0.012841377027749524), (36, 0.008727667495260607), (50, 0.028852844980048267), (51, 0.005664177941739807), (80, 0.054138131117306684)]
[(105, 0.06695925194076603), (470, 0.07503314787978065), (507, 0.13897025827840198), (566, 0.06990219084546122), (1274, 0.05774539521913921)]


Index similarities

In [9]:
# Raw term-frequency similarities: sample B is indexed, sample A is the query.

# --- tokens ---
tf_tokens_prefix = get_tmpfile("index-tf-tokens")
num_token_features = len(dictionary_tokens)
index_tf_tokens = Similarity(
    tf_tokens_prefix,
    corpus_b_tokens,
    num_features=num_token_features,
)
# The result is passed straight to save_pickle so no extra reference to the
# full A-by-B matrix stays alive in the notebook namespace.
save_pickle(
    np.array(index_tf_tokens[corpus_a_tokens]),
    get_relative_path(data_path, SIM_TF_TOKENS),
)

# --- keyphrases ---
tf_keyphrases_prefix = get_tmpfile("index-tf-keyphrases")
num_keyphrase_features = len(dictionary_keyphrases)
index_tf_keyphrases = Similarity(
    tf_keyphrases_prefix,
    corpus_b_keyphrases,
    num_features=num_keyphrase_features,
)
save_pickle(
    np.array(index_tf_keyphrases[corpus_a_keyphrases]),
    get_relative_path(data_path, SIM_TF_KEYPHRASES),
)

2018-09-14 02:21:38,147 : INFO : starting similarity index under /tmp/index-tf-tokens
  if np.issubdtype(vec.dtype, np.int):
2018-09-14 02:21:39,789 : INFO : creating sparse index
2018-09-14 02:21:39,790 : INFO : creating sparse matrix from corpus
2018-09-14 02:21:39,791 : INFO : PROGRESS: at document #0/5000
2018-09-14 02:21:41,344 : INFO : created <5000x62388 sparse matrix of type '<class 'numpy.float32'>'
	with 495130 stored elements in Compressed Sparse Row format>
2018-09-14 02:21:41,345 : INFO : creating sparse shard #0
2018-09-14 02:21:41,346 : INFO : saving index shard to /tmp/index-tf-tokens.0
2018-09-14 02:21:41,348 : INFO : saving SparseMatrixSimilarity object under /tmp/index-tf-tokens.0, separately None
2018-09-14 02:21:41,384 : INFO : saved /tmp/index-tf-tokens.0
2018-09-14 02:21:41,385 : INFO : loading SparseMatrixSimilarity object from /tmp/index-tf-tokens.0
2018-09-14 02:21:41,407 : INFO : loaded /tmp/index-tf-tokens.0
2018-09-14 02:21:43,905 : INFO : starting similari

In [10]:
# TF-IDF weighted similarities: sample B is indexed, sample A is the query.
# Both sides go through the same fitted TF-IDF model before comparison.

# --- tokens ---
tfidf_b_tokens = model_tf_idf_tokens[corpus_b_tokens]
index_tf_idf_tokens = Similarity(
    get_tmpfile("index-tf-idf-tokens"),
    tfidf_b_tokens,
    num_features=len(dictionary_tokens),
)
tfidf_a_tokens = model_tf_idf_tokens[corpus_a_tokens]
# The result is passed straight to save_pickle so the full matrix is not kept
# referenced in the notebook namespace.
save_pickle(
    np.array(index_tf_idf_tokens[tfidf_a_tokens]),
    get_relative_path(data_path, SIM_TF_IDF_TOKENS),
)

# --- keyphrases ---
tfidf_b_keyphrases = model_tf_idf_keyphrases[corpus_b_keyphrases]
index_tf_idf_keyphrases = Similarity(
    get_tmpfile("index-tf-idf-keyphrases"),
    tfidf_b_keyphrases,
    num_features=len(dictionary_keyphrases),
)
tfidf_a_keyphrases = model_tf_idf_keyphrases[corpus_a_keyphrases]
save_pickle(
    np.array(index_tf_idf_keyphrases[tfidf_a_keyphrases]),
    get_relative_path(data_path, SIM_TF_IDF_KEYPHRASES),
)

2018-09-14 02:21:46,501 : INFO : starting similarity index under /tmp/index-tf-idf-tokens
  if np.issubdtype(vec.dtype, np.int):
2018-09-14 02:21:51,851 : INFO : creating sparse index
2018-09-14 02:21:51,852 : INFO : creating sparse matrix from corpus
2018-09-14 02:21:51,853 : INFO : PROGRESS: at document #0/5000
2018-09-14 02:21:53,459 : INFO : created <5000x62388 sparse matrix of type '<class 'numpy.float32'>'
	with 495130 stored elements in Compressed Sparse Row format>
2018-09-14 02:21:53,460 : INFO : creating sparse shard #0
2018-09-14 02:21:53,462 : INFO : saving index shard to /tmp/index-tf-idf-tokens.0
2018-09-14 02:21:53,463 : INFO : saving SparseMatrixSimilarity object under /tmp/index-tf-idf-tokens.0, separately None
2018-09-14 02:21:53,497 : INFO : saved /tmp/index-tf-idf-tokens.0
2018-09-14 02:21:53,499 : INFO : loading SparseMatrixSimilarity object from /tmp/index-tf-idf-tokens.0
2018-09-14 02:21:53,527 : INFO : loaded /tmp/index-tf-idf-tokens.0
2018-09-14 02:21:59,556 : 