In [1]:
import sys
import os
import random
import copy
import time
from pathlib import Path
import hashlib as hl
import pickle
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
import somhos.methods.useful as mu
import kleis.resources.dataset as kl

Default:  ~/kleis_data/corpus/semeval2017-task10


    - Download from here https://scienceie.github.io/resources.html
    - Use one of the following paths.
        + ./kleis_data/corpus/semeval2017-task10/
        + ~/kleis_data/corpus/semeval2017-task10/
        + /home/jupyterlab/.local/lib/python3.5/site-packages/kleis/kleis_data/corpus/semeval2017-task10/
    - You can use pre-trained models.


In [2]:
kleis = kl.load_corpus()
kleis.training(features_method="simple-posseq", filter_min_count=10)

In [3]:
#data_path = "resources/aminer/v1"
data_path = "../../src/somhos/resources/aminer/v9"

In [4]:
# Load document ids in the Same order than the matrices
test_preselected = set(rd.get_sample_ids(data_path, related_docs=True))
test_preselected = test_preselected | set(rd.get_sample_ids(data_path, related_docs=False))
test_dataset = copy.deepcopy(test_preselected)
train_dataset = set()

Sample size: 1003 documents
 - Content from ../../src/somhos/resources/aminer/v9/sample-ids-100-10-0-related-fullcontent.bin
Sample size: 1000 documents
 - Content from ../../src/somhos/resources/aminer/v9/sample-ids-100-10-0-random-fullcontent.bin


In [5]:
fixed_seed = 0
random.seed(fixed_seed)
threshold = 0.1

In [6]:
# Index 
# ix_data = rq.cur_indexed_docs(data_path)
# print(ix_data)

In [7]:
# Read artminer
data = rd.get_filenames(data_path)
filepath = ""
if data:
    filepath = next(data)

dociter = None
if Path(filepath).exists():
    dociter = rd.get_aminer_txt(filepath, merge_text_title=True)

In [8]:
start_time, elapsed_time = time.time(), 0
kps_count = 0
kps_hashes_counts = {}
kps_hashes_keyphrases = {}
kps_hashes_idocs = {}
# idocs_wl_content = {} # idocs with large content

kps_file_segment = 0
tmp_data_path = data_path + "/tmp-pickles"
if not Path(tmp_data_path).exists():
    os.mkdir(tmp_data_path)
kps_tmp_counts = tmp_data_path + "/kps-tmp-counts-%d.pkl"
kps_tmp_keyphrases = tmp_data_path + "/kps-tmp-keyphrases-%d.pkl"
kps_tmp_idocs = tmp_data_path + "/kps-tmp-idocs-%d.pkl"

for i, (idoc, title, content) in enumerate(dociter):
    # Check length of content
    # idocs_wcontent[idoc] = True if content.split() > 50 else False 
    # Sampling test dataset
    if random.random() <= threshold:
        test_dataset.add(idoc)
        continue
    #if i > 10000:
    #    break
    if i % 100000 == 0:
        print("Progress: %d" % i, file=sys.stderr)
        prev_elapsed_time = elapsed_time
        elapsed_time = time.time() - start_time
        print("Total time: %f" % (elapsed_time/60/60), file=sys.stderr)
        print("Elapsed time: %f" % ((elapsed_time - prev_elapsed_time)/60/60), file=sys.stderr)
        print("Hashes: %d\n" % len(kps_hashes_keyphrases), file=sys.stderr)
        # Saving keywords
        mu.save_pickle(kps_hashes_counts, kps_tmp_counts % kps_file_segment)
        mu.save_pickle(kps_hashes_keyphrases, kps_tmp_keyphrases % kps_file_segment)
        mu.save_pickle(kps_hashes_idocs, kps_tmp_idocs % kps_file_segment)
        # increase no.
        kps_file_segment += 1
        # reset vars
        del kps_hashes_counts
        del kps_hashes_keyphrases
        del kps_hashes_idocs
        kps_hashes_counts = {}
        kps_hashes_keyphrases = {}
        kps_hashes_idocs = {}
    # Avoid preselected documents
    if idoc in test_preselected:
        # print("Pre-selected", idoc)
        continue
    # Train dataset
    train_dataset.add(idoc)
    try:
        text = title.strip(". ") + ". " + content
        keyphrases = kleis.label_text(text, post_processing=False)
        for kpid, (kplabel, (kpstart, kpend)), kptext in keyphrases:
            kps_count += 1
            kplower = mu.lower_utf8(kptext)
            kps_hash_16 = mu.hash_16bytes(kplower)
            # count
            kps_hashes_counts.setdefault(kps_hash_16, 0)
            kps_hashes_counts[kps_hash_16] += 1
            # normalized keyphrase
            kps_hashes_keyphrases[kps_hash_16] = kplower
            # id docs
            kps_hashes_idocs.setdefault(kps_hash_16, set())
            kps_hashes_idocs[kps_hash_16].add(idoc)
    except ValueError:
        pass
        # print("\nSkipped: %s\n" % idoc, file=sys.stderr)

In [10]:
# save last hashes
mu.save_pickle(kps_hashes_counts, kps_tmp_counts % kps_file_segment)
mu.save_pickle(kps_hashes_keyphrases, kps_tmp_keyphrases % kps_file_segment)
mu.save_pickle(kps_hashes_idocs, kps_tmp_idocs % kps_file_segment)
# reset vars
kps_hashes_counts = {}
kps_hashes_keyphrases = {}
kps_hashes_idocs = {}
for i in range(0, kps_file_segment + 1):
    kps_hashes_counts.update(mu.load_pickle(kps_tmp_counts % i))
    kps_hashes_keyphrases.update(mu.load_pickle(kps_tmp_keyphrases % i))
    kps_hashes_idocs.update(mu.load_pickle(kps_tmp_idocs % i))

In [11]:
print(" - Tmp files: %d" % (kps_file_segment))
print(" - Test dataset: %d" % len(test_dataset))
print(" - Train dataset: %d" % len(train_dataset))
print(" - Keyphrases: %d" % kps_count)
print(" - Keyphrases hashes-counts: %d" % len(kps_hashes_counts))
print(" - Keyphrases normalized hashes: %d" % len(kps_hashes_keyphrases))
print(" - Keyphrases-docsids: %d" % len(kps_hashes_idocs))

 - Tmp files: 14
 - Test dataset: 159182
 - Train dataset: 1413685
 - Keyphrases: 18788970
 - Keyphrases hashes-counts: 2469116
 - Keyphrases normalized hashes: 2469116
 - Keyphrases-docsids: 2469116


In [12]:
kps_directory = {}
kps_normalized = []
kps_counts = []
kps_docs_counts = []
kps_docs_ids = []
for i, (k, v) in enumerate(sorted(kps_hashes_counts.items(), key=lambda x: x[1], reverse=True)):
    kps_normalized.append(kps_hashes_keyphrases[k])
    kps_counts.append(kps_hashes_counts[k])
    kps_docs_counts.append(len(kps_hashes_idocs[k]))
    kps_docs_ids.append(kps_hashes_idocs[k])
    kps_directory[k] = i

In [13]:
seg_start = 100
seg_end = seg_start + 50
print(kps_normalized[seg_start:seg_end])
print(kps_counts[seg_start:seg_end])
print(kps_docs_counts[seg_start:seg_end])
print(kps_docs_ids[seg_start:seg_end])
print([kps_directory[mu.hash_16bytes(kn)] for kn in kps_normalized[seg_start:seg_end]])

[b'svm', b'online', b'wireless sensor networks', b'service', b'work', b'ways', b'fpga', b'operation', b'customer', b'development', b'literature', b'way', b'agents', b'methodology', b'rules', b'node', b'building', b'task', b'clustering', b'video', b'gas', b'corresponding', b'variables', b'understanding', b'requirements', b'distribution', b'factors', b'use', b'i', b'interest', b'scene', b'quality', b'new', b'mapping', b'c', b'components', b'effectiveness', b'knowledge', b'integration', b'location', b'practice', b'experiments', b'energy', b'noise', b'access', b'protocols', b'experiment', b'functions', b'input', b'ability']
[870, 860, 858, 854, 852, 840, 829, 824, 823, 821, 815, 809, 804, 791, 790, 786, 775, 770, 767, 767, 759, 757, 756, 755, 755, 754, 749, 746, 744, 743, 743, 741, 737, 729, 728, 728, 726, 726, 725, 722, 712, 709, 705, 704, 703, 701, 701, 699, 695, 692]
[504, 678, 593, 719, 785, 799, 501, 752, 453, 770, 774, 791, 475, 679, 583, 570, 665, 673, 578, 574, 393, 732, 576, 697, 

In [14]:
kps_normalized_path = data_path + "/kps-normalized-simpseq10-nopost.pkl"
if not Path(kps_normalized_path).exists():
    with open(kps_normalized_path, "wb") as fout:
        pickle.dump(kps_normalized, fout, pickle.HIGHEST_PROTOCOL)
        
kps_counts_path = data_path + "/kps-counts-simpseq10-nopost.pkl"
if not Path(kps_counts_path).exists():
    with open(kps_counts_path, "wb") as fout:
        pickle.dump(kps_counts, fout, pickle.HIGHEST_PROTOCOL)

kps_docs_counts_path = data_path + "/kps-docs-counts-simpseq10-nopost.pkl"
if not Path(kps_docs_counts_path).exists():
    with open(kps_docs_counts_path, "wb") as fout:
        pickle.dump(kps_docs_counts, fout, pickle.HIGHEST_PROTOCOL)
        
kps_docs_ids_path = data_path + "/kps-docs-ids-simpseq10-nopost.pkl"
if not Path(kps_docs_ids_path).exists():
    with open(kps_docs_ids_path, "wb") as fout:
        pickle.dump(kps_docs_ids, fout, pickle.HIGHEST_PROTOCOL)

kps_directory_path = data_path + "/kps-directory-simpseq10-nopost.pkl"
if not Path(kps_directory_path).exists():
    with open(kps_directory_path, "wb") as fout:
        pickle.dump(kps_directory, fout, pickle.HIGHEST_PROTOCOL)

In [15]:
test_dataset_path = data_path + "/test-dataset.pkl"
if not Path(test_dataset_path).exists():
    with open(test_dataset_path, "wb") as fout:
        pickle.dump(test_dataset, fout, pickle.HIGHEST_PROTOCOL)

train_dataset_path = data_path + "/train-dataset.pkl"
if not Path(train_dataset_path).exists():
    with open(train_dataset_path, "wb") as fout:
        pickle.dump(train_dataset, fout, pickle.HIGHEST_PROTOCOL)