In [1]:
import sys
import os
import random
# Counter
from collections import Counter
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle, wordvectors_centroid
from somhos.config.paths import *

Default path

In [2]:
prefix_path = "../../"
data_path = get_relative_path(prefix_path, V9GAMMA_PATH)
os.path.exists(data_path)

True

Load samples

In [3]:
# Load sample A
docs_sample_a = load_pickle(get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX))
# Load sample B
docs_sample_b = load_pickle(get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX))

print("Samples size: (%d, %d)" % (len(docs_sample_a), len(docs_sample_b)))

Samples size: (5000, 5000)


Load corpus

In [4]:
doc_directory = load_pickle(get_relative_path(data_path, DOC_DIRECTORY))
# Corpus - tokens
corpus_bag_of_words = load_pickle(get_relative_path(data_path, CORPUS_BAG_OF_WORDS))
dictionary_tokens = load_pickle(get_relative_path(data_path, DICTIONARY_TOKENS))
# Corpus - keyphrases
corpus_bag_of_keyphrases = load_pickle(get_relative_path(data_path, CORPUS_BAG_OF_KEYPHRASES))
dictionary_keyphrases = load_pickle(get_relative_path(data_path, DICTIONARY_KEYPHRASES))

print(len(doc_directory))
print("Test:", doc_directory[docs_sample_a[0]])
print("Test:", doc_directory[docs_sample_b[0]])

# print("Test:", corpus_bag_of_words[doc_directory[docs_sample_a[0]]])
# print("Test:", corpus_bag_of_words[doc_directory[docs_sample_b[0]]])
print(len(corpus_bag_of_words), len(dictionary_tokens))
print(len(corpus_bag_of_keyphrases), len(dictionary_keyphrases))

10000
Test: 2111
Test: 483
10000 62388
10000 107954


In [5]:
corpus_a_tokens = [corpus_bag_of_words[doc_directory[docid]] for docid in docs_sample_a]
corpus_b_tokens = [corpus_bag_of_words[doc_directory[docid]] for docid in docs_sample_b]
corpus_a_keyphrases = [corpus_bag_of_keyphrases[doc_directory[docid]] for docid in docs_sample_a]
corpus_b_keyphrases = [corpus_bag_of_keyphrases[doc_directory[docid]] for docid in docs_sample_b]

print(len(corpus_a_tokens), len(corpus_b_tokens), len(corpus_a_keyphrases), len(corpus_b_keyphrases))

5000 5000 5000 5000


In [6]:
import numpy as np
from gensim import models
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## TFIDF

In [7]:
model_tf_idf_tokens = models.TfidfModel(corpus_bag_of_words)
model_tf_idf_keyphrases = models.TfidfModel(corpus_bag_of_keyphrases)

2018-09-14 12:58:05,460 : INFO : collecting document frequencies
2018-09-14 12:58:05,463 : INFO : PROGRESS: processing document #0
2018-09-14 12:58:05,900 : INFO : calculating IDF weights for 10000 documents and 62387 features (988950 matrix non-zeros)
2018-09-14 12:58:06,093 : INFO : collecting document frequencies
2018-09-14 12:58:06,094 : INFO : PROGRESS: processing document #0
2018-09-14 12:58:06,181 : INFO : calculating IDF weights for 10000 documents and 107953 features (232171 matrix non-zeros)


In [8]:
corpus_tfidf_tokens = model_tf_idf_tokens[corpus_bag_of_words]
corpus_tfidf_keyphrases = model_tf_idf_keyphrases[corpus_bag_of_keyphrases]

## LSI

In [9]:
model_lsi_tokens = models.LsiModel(corpus_tfidf_tokens, id2word=dictionary_tokens)
model_lsi_keyphrases = models.LsiModel(corpus_tfidf_keyphrases, id2word=dictionary_keyphrases)

2018-09-14 12:58:06,542 : INFO : using serial LSI version on this node
2018-09-14 12:58:06,544 : INFO : updating model with new documents
2018-09-14 12:58:13,536 : INFO : preparing a new chunk of documents
2018-09-14 12:58:13,877 : INFO : using 100 extra samples and 2 power iterations
2018-09-14 12:58:13,877 : INFO : 1st phase: constructing (62388, 300) action matrix
2018-09-14 12:58:14,707 : INFO : orthonormalizing (62388, 300) action matrix
2018-09-14 12:58:27,320 : INFO : 2nd phase: running dense svd on (300, 10000) matrix
2018-09-14 12:58:28,788 : INFO : computing the final decomposition
2018-09-14 12:58:28,789 : INFO : keeping 200 factors (discarding 18.533% of energy spectrum)
2018-09-14 12:58:33,201 : INFO : processed documents up to #10000
2018-09-14 12:58:33,220 : INFO : topic #0(11.229): 0.122*"data" + 0.107*"system" + 0.107*"network" + 0.106*"model" + 0.096*"algorithm" + 0.091*"time" + 0.087*"systems" + 0.086*"design" + 0.086*"method" + 0.085*"based"
2018-09-14 12:58:33,224 

In [10]:
lsi_a_tokens = model_lsi_tokens[[corpus_tfidf_tokens[doc_directory[docid]] for docid in docs_sample_a]]
lsi_b_tokens = model_lsi_tokens[[corpus_tfidf_tokens[doc_directory[docid]] for docid in docs_sample_b]]

lsi_a_keyphrases = model_lsi_keyphrases[[corpus_tfidf_keyphrases[doc_directory[docid]] for docid in docs_sample_a]]
lsi_b_keyphrases = model_lsi_keyphrases[[corpus_tfidf_keyphrases[doc_directory[docid]] for docid in docs_sample_b]]

Index similarities

In [11]:
index_lsi_tokens = Similarity(get_tmpfile("index-lsi-tokens"), lsi_b_tokens, num_features=len(dictionary_tokens))
save_pickle(np.array(index_lsi_tokens[lsi_a_tokens]), get_relative_path(data_path, SIM_LSI_TOKENS))

index_lsi_keyphrases = Similarity(get_tmpfile("index-lsi-keyphrases"), lsi_b_keyphrases, num_features=len(dictionary_keyphrases))
save_pickle(np.array(index_lsi_keyphrases[lsi_a_keyphrases]), get_relative_path(data_path, SIM_LSI_KEYPHRASES))

2018-09-14 12:59:11,883 : INFO : starting similarity index under /tmp/index-lsi-tokens
  if np.issubdtype(vec.dtype, np.int):
2018-09-14 12:59:14,403 : INFO : creating sparse index
2018-09-14 12:59:14,404 : INFO : creating sparse matrix from corpus
2018-09-14 12:59:14,407 : INFO : PROGRESS: at document #0/5000
2018-09-14 12:59:17,372 : INFO : created <5000x62388 sparse matrix of type '<class 'numpy.float32'>'
	with 1000000 stored elements in Compressed Sparse Row format>
2018-09-14 12:59:17,373 : INFO : creating sparse shard #0
2018-09-14 12:59:17,374 : INFO : saving index shard to /tmp/index-lsi-tokens.0
2018-09-14 12:59:17,376 : INFO : saving SparseMatrixSimilarity object under /tmp/index-lsi-tokens.0, separately None
2018-09-14 12:59:17,449 : INFO : saved /tmp/index-lsi-tokens.0
2018-09-14 12:59:17,450 : INFO : loading SparseMatrixSimilarity object from /tmp/index-lsi-tokens.0
2018-09-14 12:59:17,498 : INFO : loaded /tmp/index-lsi-tokens.0
2018-09-14 12:59:38,805 : INFO : starting s

In [12]:
print(model_lsi_tokens.num_topics)
model_lsi_tokens.print_topics(num_topics=2, num_words=10)
# model_lsi_tokens[corpus_tfidf_tokens[0]]

2018-09-14 13:00:04,757 : INFO : topic #0(11.229): 0.122*"data" + 0.107*"system" + 0.107*"network" + 0.106*"model" + 0.096*"algorithm" + 0.091*"time" + 0.087*"systems" + 0.086*"design" + 0.086*"method" + 0.085*"based"
2018-09-14 13:00:04,761 : INFO : topic #1(5.390): 0.280*"image" + 0.162*"images" + 0.145*"method" + -0.144*"software" + 0.136*"algorithm" + -0.132*"service" + -0.131*"web" + -0.121*"management" + -0.111*"services" + -0.103*"security"


200


[(0,
  '0.122*"data" + 0.107*"system" + 0.107*"network" + 0.106*"model" + 0.096*"algorithm" + 0.091*"time" + 0.087*"systems" + 0.086*"design" + 0.086*"method" + 0.085*"based"'),
 (1,
  '0.280*"image" + 0.162*"images" + 0.145*"method" + -0.144*"software" + 0.136*"algorithm" + -0.132*"service" + -0.131*"web" + -0.121*"management" + -0.111*"services" + -0.103*"security"')]

In [13]:
print(model_lsi_keyphrases.num_topics)
model_lsi_keyphrases.print_topics(num_topics=2, num_words=10)

2018-09-14 15:07:46,286 : INFO : topic #0(4.343): -0.220*"algorithm" + -0.195*"g" + -0.192*"system" + -0.179*"algorithms" + -0.166*"data" + -0.162*"book" + -0.148*"simulation" + -0.142*"users" + -0.119*"solution" + -0.117*"information"
2018-09-14 15:07:46,292 : INFO : topic #1(3.843): -0.841*"g" + -0.196*"v" + -0.123*"graphs" + -0.116*"x" + -0.113*"graph" + -0.108*"f" + -0.087*"k" + -0.087*"e" + 0.076*"book" + 0.076*"system"


200


[(0,
  '-0.220*"algorithm" + -0.195*"g" + -0.192*"system" + -0.179*"algorithms" + -0.166*"data" + -0.162*"book" + -0.148*"simulation" + -0.142*"users" + -0.119*"solution" + -0.117*"information"'),
 (1,
  '-0.841*"g" + -0.196*"v" + -0.123*"graphs" + -0.116*"x" + -0.113*"graph" + -0.108*"f" + -0.087*"k" + -0.087*"e" + 0.076*"book" + 0.076*"system"')]