In [1]:
import sys
import os
import random
import copy
import time
from pathlib import Path
import hashlib as hl
import pickle
import somhos.resources.dataset as rd
import somhos.resources.queries as rq
import somhos.methods.useful as mu
import kleis.resources.dataset as kl

Default:  ~/kleis_data/corpus/semeval2017-task10


    - Download from here https://scienceie.github.io/resources.html
    - Use one of the following paths.
        + ./kleis_data/corpus/semeval2017-task10/
        + ~/kleis_data/corpus/semeval2017-task10/
        + /home/jupyterlab/.local/lib/python3.5/site-packages/kleis/kleis_data/corpus/semeval2017-task10/
    - You can use pre-trained models.


In [2]:
# Load the kleis corpus object and run its training step; the resulting
# `kleis` object is used further down to label keyphrases in document text.
kleis = kl.load_corpus()
kleis.training(
    features_method="simple-posseq",
    filter_min_count=10,
)

In [3]:
# Directory holding the AMiner dataset used throughout this notebook: sample
# ids and the acm.*.txt files are read from it, and the temporary pickles and
# train/test splits are written below it.
# Alternative location kept for reference:
#data_path = "resources/aminer/v1"
data_path = "../../src/somhos/resources/aminer/v9beta"

In [4]:
# Load the pre-selected sample document ids (related sample first, then the
# random sample).
# NOTE(review): the ids are stored in sets, so any ordering they had in the
# sample files (e.g. matching the matrices) is not preserved here.
_related_sample_ids = rd.get_sample_ids(data_path, related_docs=True)
_random_sample_ids = rd.get_sample_ids(data_path, related_docs=False)
test_preselected = set(_related_sample_ids) | set(_random_sample_ids)

# The test split starts out as an independent copy of the pre-selected ids;
# the train split starts empty. Both are filled while iterating the corpus.
test_dataset = copy.deepcopy(test_preselected)
train_dataset = set()

Sample size: 1003 documents
 - Content from ../../src/somhos/resources/aminer/v9beta/sample-ids-100-10-0-related-fullcontent.bin
Sample size: 1000 documents
 - Content from ../../src/somhos/resources/aminer/v9beta/sample-ids-100-10-0-random-fullcontent.bin


In [5]:
# A document goes to the test split when its random draw is <= threshold;
# otherwise it goes to the train split.
threshold = 0.1

# Seed the global `random` generator so the train/test split is reproducible.
fixed_seed = 0
random.seed(fixed_seed)

In [6]:
# List the AMiner data files found under data_path, in sorted order.
datafiles = list(rd.get_filenames(data_path))
datafiles.sort()
print(datafiles)

['../../src/somhos/resources/aminer/v9beta/acm.01.txt', '../../src/somhos/resources/aminer/v9beta/acm.02.txt', '../../src/somhos/resources/aminer/v9beta/acm.03.txt']


In [7]:
# Pick one AMiner file and open a document iterator over it.
# Index 1 is the second file in sorted order (acm.02.txt in the listing above).
datafile_index = 1

filepath = ""
# Guard on the actual index: a non-empty list with a single file would
# otherwise raise IndexError on datafiles[1].
if len(datafiles) > datafile_index:
    filepath = datafiles[datafile_index]
    print(filepath)
else:
    print(
        "No data file at index %d (found %d file(s))" % (datafile_index, len(datafiles)),
        file=sys.stderr,
    )

dociter = None
# Path("") is the current directory, so a bare .exists() is True even when no
# file was selected; require a non-empty path that points to a regular file.
if filepath and Path(filepath).is_file():
    dociter = rd.get_aminer_txt(filepath, merge_text_title=True)

../../src/somhos/resources/aminer/v9beta/acm.02.txt


In [8]:
# Walk over every document yielded by `dociter`:
#   1. assign the document id to the test or train split, and
#   2. label its keyphrases with `kleis`, accumulating per-hash counts,
#      normalized keyphrase text and the set of document ids, which are
#      pickled to disk in numbered segments.
if dociter is None:
    # Fail with a clear message instead of "TypeError: 'NoneType' object is
    # not iterable" coming out of enumerate().
    raise RuntimeError(
        "dociter is None: no AMiner file was opened; re-run the cell that creates it"
    )

# Documents whose index is <= resume_after_index only take part in the
# train/test split; keyphrase extraction is skipped for them.
# NOTE(review): 700000 and the initial segment number 58 look like a manual
# resume point from an interrupted run -- confirm before starting a fresh run.
resume_after_index = 700000
# Progress is reported and a segment is written whenever the index is a
# multiple of this value.
checkpoint_every = 100000
kps_file_segment = 58

start_time, elapsed_time = time.time(), 0
kps_count = 0
kps_skipped = 0  # documents dropped because of a ValueError while labelling
kps_hashes_counts = {}
kps_hashes_keyphrases = {}
kps_hashes_idocs = {}
# idocs_wl_content = {} # idocs with large content

tmp_data_path = data_path + "/tmp-pickles"
# exist_ok avoids the check-then-create race; parents are deliberately not
# created so that a wrong data_path still fails loudly.
Path(tmp_data_path).mkdir(exist_ok=True)
kps_tmp_counts = tmp_data_path + "/kps-tmp-counts-%d.pkl"
kps_tmp_keyphrases = tmp_data_path + "/kps-tmp-keyphrases-%d.pkl"
kps_tmp_idocs = tmp_data_path + "/kps-tmp-idocs-%d.pkl"


def _save_kps_segment(segment, counts, keyphrases, idocs):
    """Pickle the three keyphrase tables under the given segment number."""
    mu.save_pickle(counts, kps_tmp_counts % segment)
    mu.save_pickle(keyphrases, kps_tmp_keyphrases % segment)
    mu.save_pickle(idocs, kps_tmp_idocs % segment)


for i, (idoc, title, content) in enumerate(dociter):
    # Check length of content
    # idocs_wcontent[idoc] = True if content.split() > 50 else False
    # Sampling test dataset.
    # The draw is made unconditionally so the random stream, and therefore
    # the assignment of every other document, is unaffected by the
    # pre-selected check.
    draw = random.random()
    if idoc in test_preselected or draw <= threshold:
        # Pre-selected ids are already part of the test split; keeping them
        # out of the train split avoids train/test overlap.
        test_dataset.add(idoc)
    else:
        # Train dataset
        train_dataset.add(idoc)
    #if i > 10000:
    #    break
    if i <= resume_after_index:
        continue
    if i % checkpoint_every == 0:
        print("Progress: %d" % i, file=sys.stderr)
        prev_elapsed_time = elapsed_time
        elapsed_time = time.time() - start_time
        print("Total time: %f" % (elapsed_time/60/60), file=sys.stderr)
        print("Elapsed time: %f" % ((elapsed_time - prev_elapsed_time)/60/60), file=sys.stderr)
        print("Hashes: %d\n" % len(kps_hashes_keyphrases), file=sys.stderr)
        # Save the current segment, then start a fresh one. Rebinding the
        # names releases the previous dictionaries.
        _save_kps_segment(
            kps_file_segment, kps_hashes_counts, kps_hashes_keyphrases, kps_hashes_idocs
        )
        kps_file_segment += 1
        kps_hashes_counts = {}
        kps_hashes_keyphrases = {}
        kps_hashes_idocs = {}

    try:
        text = title.strip(". ") + ". " + content
        keyphrases = kleis.label_text(text, post_processing=False)
        for kpid, (kplabel, (kpstart, kpend)), kptext in keyphrases:
            kps_count += 1
            kplower = mu.lower_utf8(kptext)
            kps_hash_16 = mu.hash_16bytes(kplower)
            # count
            kps_hashes_counts[kps_hash_16] = kps_hashes_counts.get(kps_hash_16, 0) + 1
            # normalized keyphrase
            kps_hashes_keyphrases[kps_hash_16] = kplower
            # id docs
            kps_hashes_idocs.setdefault(kps_hash_16, set()).add(idoc)
    except ValueError:
        # Best effort: the document is dropped, but it is counted so the loss
        # is visible in the summary printed after the loop.
        kps_skipped += 1

# save last hashes
_save_kps_segment(
    kps_file_segment, kps_hashes_counts, kps_hashes_keyphrases, kps_hashes_idocs
)
print("Keyphrases: %d" % kps_count, file=sys.stderr)
print("Skipped documents (ValueError): %d" % kps_skipped, file=sys.stderr)
# Release the last segment's tables; they are already on disk.
del kps_hashes_counts
del kps_hashes_keyphrases
del kps_hashes_idocs

In [9]:
# Persist the test and train id sets next to the dataset. A split file that is
# already on disk is left untouched.
test_dataset_path = data_path + "/test-dataset-acm02.pkl"
train_dataset_path = data_path + "/train-dataset-acm02.pkl"

for _split_path, _split_ids in (
    (test_dataset_path, test_dataset),
    (train_dataset_path, train_dataset),
):
    if Path(_split_path).exists():
        continue
    with open(_split_path, "wb") as fout:
        pickle.dump(_split_ids, fout, pickle.HIGHEST_PROTOCOL)