In [1]:
import sys
import os
import random
# Counter
from collections import Counter
import kleis.resources.dataset as kl
# Package
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
import somhos.methods.useful as mu
from somhos.methods.useful import save_pickle, load_pickle
from somhos.config.paths import *

    - Download from here https://scienceie.github.io/resources.html
    - Use one of the following paths.
        + ./kleis_data/corpus/semeval2017-task10/
        + ~/kleis_data/corpus/semeval2017-task10/
        + /home/snov/environments/artsim/lib/python3.6/site-packages/kleis/kleis_data/corpus/semeval2017-task10/
    - You can use pre-trained models.


Default:  ~/kleis_data/corpus/semeval2017-task10


Default path

In [2]:
# Prefix handed to get_relative_path together with the configured data path.
prefix_path = "../../"
# Build the data path from the prefix and the V9GAMMA_PATH constant.
# NOTE(review): get_relative_path presumably joins its two arguments - confirm in somhos.config.paths.
data_path = get_relative_path(prefix_path, V9GAMMA_PATH)
# Bare expression so the notebook displays whether the resolved path exists.
os.path.exists(data_path)

True

Load module to tag keyphrases

In [3]:
# Load the corpus through the kleis dataset module; the resulting object is
# used further down to label keyphrases in document text (kleis.label_text).
kleis = kl.load_corpus()
# Train using the "simple-posseq" feature method.
# NOTE(review): filter_min_count=10 presumably discards entries seen fewer than 10 times - confirm against the kleis documentation.
kleis.training(features_method="simple-posseq", filter_min_count=10)

Load samples

In [4]:
# Resolve where the two pickled document samples live under data_path.
docs_sample_a_path = get_relative_path(data_path, DOCS_SAMPLE_A_SUFFIX)
docs_sample_b_path = get_relative_path(data_path, DOCS_SAMPLE_B_SUFFIX)

# Read both samples back from disk.
# NOTE(review): load_pickle presumably unpickles the file - only use it on trusted, locally produced files.
docs_sample_a = load_pickle(docs_sample_a_path)
docs_sample_b = load_pickle(docs_sample_b_path)

# Report how many entries each sample holds.
samples_size = (len(docs_sample_a), len(docs_sample_b))
print("Samples size: (%d, %d)" % samples_size)

Samples size: (5000, 5000)


Get documents

In [5]:
# Init the cursor over the indexed documents, built from data_path
ix_data = rq.cur_indexed_docs(data_path)
# Get the default analyzer; it is later called on document content to produce tokens
analizer = rd.get_default_analizer()

Get bag of words and counts

In [6]:
# Accumulators filled by the loop below and read by the following cells:
#   samples_content            -> per-document record, keyed by 'indexdoc'
#   samples_words_count        -> occurrences of every token
#   samples_word_in_docs_count -> number of documents containing each token
#   samples_kps_count          -> occurrences of every normalized keyphrase
#   samples_kps_in_docs_count  -> number of documents containing each keyphrase
samples_content = {}
samples_words_count = Counter()
samples_word_in_docs_count = Counter()
samples_kps_count = Counter()
samples_kps_in_docs_count = Counter()

# One space-separated query string built from the entries of both samples.
sampled_docs_query = " ".join(docs_sample_a + docs_sample_b)

for i, result in enumerate(rq.find_all_indexdoc(ix_data, sampled_docs_query)):
    doc_title = result['title']
    doc_text = result['content']

    # Word level: ordered token texts and their de-duplicated set.
    tokens = [token.text for token in analizer(doc_text)]
    bag_of_words = set(tokens)

    # Keyphrase level: label "<title>. <content>" and normalize each keyphrase text.
    # NOTE(review): the 'content' slice below skips len(title) + 1 characters, which
    # suggests the indexed content already starts with the title; if so, the title
    # is labeled twice here - confirm against the index schema.
    labeled_kps = kleis.label_text(doc_title.strip(". ") + ". " + doc_text)
    kps_normalized = [mu.lower_utf8(kp_text) for _, _, kp_text in labeled_kps]
    bag_of_kps = set(kps_normalized)

    # Update corpus-wide counts (raw occurrences, then per-document presence).
    samples_words_count.update(tokens)
    samples_word_in_docs_count.update(bag_of_words)
    samples_kps_count.update(kps_normalized)
    samples_kps_in_docs_count.update(bag_of_kps)

    # Per-document record; 'content' is the indexed content minus its first
    # len(title) + 1 characters.
    doc_entry = {
        'title': doc_title,
        'text': doc_text,
        'content': doc_text[len(doc_title) + 1:],
        'tokens': tokens,
        'kps-normalized': kps_normalized,
        'bag-of-words': bag_of_words,
        'bag-of-kps': bag_of_kps,
    }
    samples_content[result['indexdoc']] = doc_entry

# Totals bundled with their counters for saving.
total_word_count = sum(samples_words_count.values())
total_kps_count = sum(samples_kps_count.values())
word_counts = {'total': total_word_count, 'count': samples_words_count}
kps_counts = {'total': total_kps_count, 'count': samples_kps_count}

In [7]:
# Save the per-document content dictionary at the path derived from
# data_path and the DOCS_SAMPLES_CONTENT constant.
docs_samples_content_path = get_relative_path(data_path, DOCS_SAMPLES_CONTENT)
save_pickle(samples_content, docs_samples_content_path)

In [8]:
# Summary of the extraction: number of documents, then the ten most frequent
# words and keyphrases, by raw occurrences and by number of documents.
#
# The document count is read from samples_content instead of the loop variable
# `i` left behind by the extraction loop: `i` is never bound when the query
# yields no results (NameError), and relying on it ties this cell to hidden
# loop state. samples_content is keyed by 'indexdoc', so this counts the
# distinct documents that were stored.
print("Docs: %d" % len(samples_content))
print(total_word_count, samples_words_count.most_common(10))
print(samples_word_in_docs_count.most_common(10))
print(total_kps_count, samples_kps_count.most_common(10))
print(samples_kps_in_docs_count.most_common(10))

Docs: 10000
1455421 [('based', 9616), ('data', 7719), ('which', 7254), ('system', 7222), ('model', 6649), ('using', 6515), ('paper', 5579), ('time', 5476), ('our', 4781), ('results', 4779)]
[('based', 4990), ('paper', 4700), ('which', 4516), ('using', 3992), ('results', 3581), ('system', 3123), ('also', 3077), ('such', 3073), ('used', 3043), ('has', 3031)]
289305 [(b'algorithm', 1259), (b'system', 1163), (b'data', 1012), (b'simulation', 975), (b'algorithms', 845), (b'users', 710), (b'book', 691), (b'solution', 672), (b'information', 661), (b'problem', 570)]
[(b'algorithm', 841), (b'system', 831), (b'data', 756), (b'simulation', 732), (b'algorithms', 565), (b'solution', 512), (b'addition', 509), (b'users', 493), (b'problem', 459), (b'information', 458)]


In [9]:
# Save the word totals ({'total': ..., 'count': Counter}) at the path derived
# from DOCS_SAMPLES_WORD_COUNT.
docs_samples_word_count_path = get_relative_path(data_path, DOCS_SAMPLES_WORD_COUNT)
save_pickle(word_counts, docs_samples_word_count_path)

# Save the per-word document frequencies at the path derived from
# DOCS_SAMPLES_WORD_DOC_COUNT.
docs_samples_word_doc_count_path = get_relative_path(data_path, DOCS_SAMPLES_WORD_DOC_COUNT)
save_pickle(samples_word_in_docs_count, docs_samples_word_doc_count_path)

In [10]:
# Save the keyphrase totals ({'total': ..., 'count': Counter}) at the path
# derived from DOCS_SAMPLES_KPS_COUNT.
docs_samples_kps_count_path = get_relative_path(data_path, DOCS_SAMPLES_KPS_COUNT)
save_pickle(kps_counts, docs_samples_kps_count_path)

# Save the per-keyphrase document frequencies at the path derived from
# DOCS_SAMPLES_KPS_DOC_COUNT.
docs_samples_kps_doc_count_path = get_relative_path(data_path, DOCS_SAMPLES_KPS_DOC_COUNT)
save_pickle(samples_kps_in_docs_count, docs_samples_kps_doc_count_path)