In [1]:
import sys
import os
import random
# Counter
from collections import Counter
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
from somhos.methods.useful import save_pickle, load_pickle, wordvectors_centroid
from somhos.config.paths import *

Default path

In [2]:
prefix_path = "../../"
data_path = get_relative_path(prefix_path, V9GAMMA_PATH)
os.path.exists(data_path)

True

Load samples

In [3]:
# Load sample A
docs_sample_a = load_pickle(get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX))
# Load sample B
docs_sample_b = load_pickle(get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX))

print("Samples size: (%d, %d)" % (len(docs_sample_a), len(docs_sample_b)))

Samples size: (5000, 5000)


Load corpus

In [4]:
doc_directory = load_pickle(get_relative_path(data_path, DOC_DIRECTORY))
# Corpus - tokens
corpus_bag_of_words = load_pickle(get_relative_path(data_path, CORPUS_BAG_OF_WORDS))
dictionary_tokens = load_pickle(get_relative_path(data_path, DICTIONARY_TOKENS))
# Corpus - keyphrases
corpus_bag_of_keyphrases = load_pickle(get_relative_path(data_path, CORPUS_BAG_OF_KEYPHRASES))
dictionary_keyphrases = load_pickle(get_relative_path(data_path, DICTIONARY_KEYPHRASES))

print(len(doc_directory))
print("Test:", doc_directory[docs_sample_a[0]])
print("Test:", doc_directory[docs_sample_b[0]])

# print("Test:", corpus_bag_of_words[doc_directory[docs_sample_a[0]]])
# print("Test:", corpus_bag_of_words[doc_directory[docs_sample_b[0]]])
print(len(corpus_bag_of_words), len(dictionary_tokens))
print(len(corpus_bag_of_keyphrases), len(dictionary_keyphrases))

10000
Test: 2111
Test: 483
10000 62388
10000 107954


In [5]:
corpus_a_tokens = [corpus_bag_of_words[doc_directory[docid]] for docid in docs_sample_a]
corpus_b_tokens = [corpus_bag_of_words[doc_directory[docid]] for docid in docs_sample_b]
corpus_a_keyphrases = [corpus_bag_of_keyphrases[doc_directory[docid]] for docid in docs_sample_a]
corpus_b_keyphrases = [corpus_bag_of_keyphrases[doc_directory[docid]] for docid in docs_sample_b]

print(len(corpus_a_tokens), len(corpus_b_tokens), len(corpus_a_keyphrases), len(corpus_b_keyphrases))

5000 5000 5000 5000


In [6]:
import numpy as np
from gensim import models
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## LDA

In [7]:
model_lda_tokens = models.LdaModel(corpus_bag_of_words,
                                   id2word=dictionary_tokens,
                                   num_topics=200,
                                   iterations=2000,
                                   passes=10,
                                   eval_every=1)

2018-09-18 15:55:22,161 : INFO : using symmetric alpha at 0.005
2018-09-18 15:55:22,164 : INFO : using symmetric eta at 0.005
2018-09-18 15:55:22,187 : INFO : using serial LDA version on this node
2018-09-18 15:55:24,535 : INFO : running online (multi-pass) LDA training, 200 topics, 10 passes over the supplied corpus of 10000 documents, updating model once every 2000 documents, evaluating perplexity every 2000 documents, iterating 2000x with a convergence threshold of 0.001000
2018-09-18 15:56:19,080 : INFO : -39.416 per-word bound, 733627782983.2 perplexity estimate based on a held-out corpus of 2000 documents with 337338 words
2018-09-18 15:56:19,081 : INFO : PROGRESS: pass 0, at document #2000/10000
2018-09-18 15:57:10,567 : INFO : merging changes from 2000 documents into a model of 10000 documents
2018-09-18 15:57:12,563 : INFO : topic #128 (0.005): 0.011*"model" + 0.009*"based" + 0.008*"synchronization" + 0.008*"design" + 0.007*"analysis" + 0.007*"these" + 0.006*"has" + 0.006*"new

In [8]:
model_lda_keyphrases = models.LdaModel(corpus_bag_of_keyphrases,
                                       id2word=dictionary_keyphrases,
                                       num_topics=200,
                                       iterations=2000,
                                       passes=10,
                                       eval_every=1)

2018-09-18 16:13:30,857 : INFO : using symmetric alpha at 0.005
2018-09-18 16:13:30,864 : INFO : using symmetric eta at 0.005
2018-09-18 16:13:30,914 : INFO : using serial LDA version on this node
2018-09-18 16:13:35,269 : INFO : running online (multi-pass) LDA training, 200 topics, 10 passes over the supplied corpus of 10000 documents, updating model once every 2000 documents, evaluating perplexity every 2000 documents, iterating 2000x with a convergence threshold of 0.001000
2018-09-18 16:13:46,118 : INFO : -253.344 per-word bound, 18366827114821431145190291361421639148606407903271128290500027126734393442304.0 perplexity estimate based on a held-out corpus of 2000 documents with 67244 words
2018-09-18 16:13:46,119 : INFO : PROGRESS: pass 0, at document #2000/10000
2018-09-18 16:13:53,245 : INFO : merging changes from 2000 documents into a model of 10000 documents
2018-09-18 16:13:57,197 : INFO : topic #176 (0.005): 0.013*"aqua+" + 0.007*"phosphorylation/dephosphorylation-proteins" + 

In [9]:
lda_a_tokens = model_lda_tokens[corpus_a_tokens]
lda_b_tokens = model_lda_tokens[corpus_b_tokens]

lda_a_keyphrases = model_lda_keyphrases[corpus_a_keyphrases]
lda_b_keyphrases = model_lda_keyphrases[corpus_b_keyphrases]

Index similarities

In [10]:
index_lda_tokens = Similarity(get_tmpfile("index-lda-tokens"), lda_b_tokens, num_features=len(dictionary_tokens))
save_pickle(np.array(index_lda_tokens[lda_a_tokens]), get_relative_path(data_path, SIM_LDA_TOKENS))

index_lda_keyphrases = Similarity(get_tmpfile("index-lda-keyphrases"), lda_b_keyphrases, num_features=len(dictionary_keyphrases))
save_pickle(np.array(index_lda_keyphrases[lda_a_keyphrases]), get_relative_path(data_path, SIM_LDA_KEYPHRASES))

2018-09-18 16:22:30,046 : INFO : starting similarity index under /tmp/index-lda-tokens
  if np.issubdtype(vec.dtype, np.int):
2018-09-18 16:22:47,485 : INFO : creating sparse index
2018-09-18 16:22:47,486 : INFO : creating sparse matrix from corpus
2018-09-18 16:22:47,487 : INFO : PROGRESS: at document #0/5000
2018-09-18 16:22:47,882 : INFO : created <5000x62388 sparse matrix of type '<class 'numpy.float32'>'
	with 98069 stored elements in Compressed Sparse Row format>
2018-09-18 16:22:47,884 : INFO : creating sparse shard #0
2018-09-18 16:22:47,885 : INFO : saving index shard to /tmp/index-lda-tokens.0
2018-09-18 16:22:47,886 : INFO : saving SparseMatrixSimilarity object under /tmp/index-lda-tokens.0, separately None
2018-09-18 16:22:47,894 : INFO : saved /tmp/index-lda-tokens.0
2018-09-18 16:22:47,896 : INFO : loading SparseMatrixSimilarity object from /tmp/index-lda-tokens.0
2018-09-18 16:22:47,904 : INFO : loaded /tmp/index-lda-tokens.0
2018-09-18 16:23:06,189 : INFO : starting sim

In [11]:
print(model_lda_tokens.num_topics)
model_lda_tokens.print_topics(num_topics=2, num_words=10)
# model_lda_tokens[corpus_tfidf_tokens[0]]

2018-09-18 16:23:19,435 : INFO : topic #105 (0.005): 0.031*"process" + 0.029*"approach" + 0.027*"requirements" + 0.024*"based" + 0.024*"models" + 0.024*"analysis" + 0.019*"specification" + 0.019*"model" + 0.018*"modeling" + 0.018*"case"
2018-09-18 16:23:19,437 : INFO : topic #19 (0.005): 0.085*"mechanism" + 0.058*"side" + 0.040*"messages" + 0.029*"tcp" + 0.028*"route" + 0.025*"congestion" + 0.020*"mechanisms" + 0.020*"automation" + 0.018*"end" + 0.018*"validation"


200


[(105,
  '0.031*"process" + 0.029*"approach" + 0.027*"requirements" + 0.024*"based" + 0.024*"models" + 0.024*"analysis" + 0.019*"specification" + 0.019*"model" + 0.018*"modeling" + 0.018*"case"'),
 (19,
  '0.085*"mechanism" + 0.058*"side" + 0.040*"messages" + 0.029*"tcp" + 0.028*"route" + 0.025*"congestion" + 0.020*"mechanisms" + 0.020*"automation" + 0.018*"end" + 0.018*"validation"')]

In [12]:
print(model_lda_keyphrases.num_topics)
model_lda_keyphrases.print_topics(num_topics=10, num_words=10)

2018-09-18 16:23:19,519 : INFO : topic #176 (0.005): 0.041*"squares" + 0.028*"forecasting" + 0.027*"molecules" + 0.022*"candidate" + 0.013*"taxonomy" + 0.012*"rsa" + 0.012*"time-series" + 0.011*"electronic" + 0.010*"analogy" + 0.009*"hotspots"
2018-09-18 16:23:19,522 : INFO : topic #31 (0.005): 0.020*"corpus" + 0.020*"expert" + 0.019*"explicit" + 0.019*"attitudes" + 0.016*"gate" + 0.016*"novices" + 0.015*"particularly" + 0.013*"novice" + 0.011*"connected components" + 0.011*"parallel processing"
2018-09-18 16:23:19,524 : INFO : topic #50 (0.005): 0.112*"input" + 0.102*"media" + 0.053*"scenarios" + 0.036*"designers" + 0.030*"website" + 0.026*"audio" + 0.017*"generalized" + 0.017*"weight" + 0.015*"large-scale" + 0.012*"product design"
2018-09-18 16:23:19,528 : INFO : topic #159 (0.005): 0.101*"operation" + 0.073*"throughput" + 0.058*"tcp" + 0.046*"easy" + 0.034*"instructions" + 0.026*"degradation" + 0.017*"installation" + 0.016*"cdma" + 0.013*"interventions" + 0.011*"designing"
2018-09-1

200


[(176,
  '0.041*"squares" + 0.028*"forecasting" + 0.027*"molecules" + 0.022*"candidate" + 0.013*"taxonomy" + 0.012*"rsa" + 0.012*"time-series" + 0.011*"electronic" + 0.010*"analogy" + 0.009*"hotspots"'),
 (31,
  '0.020*"corpus" + 0.020*"expert" + 0.019*"explicit" + 0.019*"attitudes" + 0.016*"gate" + 0.016*"novices" + 0.015*"particularly" + 0.013*"novice" + 0.011*"connected components" + 0.011*"parallel processing"'),
 (50,
  '0.112*"input" + 0.102*"media" + 0.053*"scenarios" + 0.036*"designers" + 0.030*"website" + 0.026*"audio" + 0.017*"generalized" + 0.017*"weight" + 0.015*"large-scale" + 0.012*"product design"'),
 (159,
  '0.101*"operation" + 0.073*"throughput" + 0.058*"tcp" + 0.046*"easy" + 0.034*"instructions" + 0.026*"degradation" + 0.017*"installation" + 0.016*"cdma" + 0.013*"interventions" + 0.011*"designing"'),
 (130,
  '0.062*"consequence" + 0.039*"wavelet transform" + 0.022*"economics" + 0.017*"atm networks" + 0.012*"intuitive" + 0.012*"congestion control" + 0.011*"calcium" +