# Select sample of documents with keyphrases from test dataset

In [1]:
import sys
import os
import random
from somhos.methods.useful import save_pickle, load_pickle
from somhos.config.paths import get_relative_path
from somhos.config.paths import DOCS_SAMPLE_A_SUFFIX, DOCS_SAMPLE_B_SUFFIX, KPS_DIRECTORY_INVERSE_SUFFIX, SAMPLE_PATH

Default path

In [2]:
# Root directory of the AMiner v9 resources. This is a relative path, so it is
# resolved against the working directory the notebook is run from. Every other
# path in this notebook is built from it.
data_path = "../../src/somhos/resources/aminer/v9"

Load test dataset ids to select a sample

In [3]:
# Location of the pickled test dataset under data_path
test_dataset_path = data_path + "/test-dataset.pkl"
# Load the whole dataset into memory; it is iterated over during sampling
test_dataset = load_pickle(test_dataset_path)
# Report the number of entries on stderr (not stdout)
print("Test dataset: ", len(test_dataset), file=sys.stderr)

Test dataset:  243520


Load the inverse directory 'document id -> keyphrase ids' to check whether a document has keyphrases.

In [4]:
# Resolve the inverse-directory file from data_path and its configured suffix
kps_directory_inverse_path = get_relative_path(data_path, KPS_DIRECTORY_INVERSE_SUFFIX)
# Mapping used below for membership tests and to count keyphrases per document
kps_directory_inverse = load_pickle(kps_directory_inverse_path)
# Report the number of entries on stderr (not stdout)
print("Inverse directory: ", len(kps_directory_inverse), file=sys.stderr)

Inverse directory:  1475448


Select two disjoint random samples of 1000 documents each. Only documents with at least 5 keyphrases are eligible.

In [5]:
# Seed for the `random` module, so the selection can be reproduced
seed = 0
# Size of the whole test dataset; used as the denominator of the acceptance
# rate. Documents lacking keyphrases are filtered out later, during selection.
docs_wkps = len(test_dataset)
# Total number of documents to select (two samples of 1000 each)
ndocs_expected = (1000*2)
print("Expected documents: %d" % ndocs_expected)
# Probability with which an eligible document is accepted each time it is visited
threshold = ndocs_expected/docs_wkps
print(seed, docs_wkps, threshold)
# Seed the generator from `seed` so that changing the value above takes effect
random.seed(seed)
# Target sets for the two samples; both start empty
docs_set_a = set()
docs_set_b = set()
def insert_to_set(element, set_1, set_2, limit):
    """Place ``element`` in ``set_1`` if it has room, otherwise in ``set_2``.

    A set only receives the element while it holds fewer than ``limit``
    items and the element is not already a member of the other set, so the
    two sets never end up sharing an element. When neither set qualifies
    the call does nothing. Both sets are modified in place; nothing is
    returned.
    """
    first_has_room = len(set_1) < limit
    if first_has_room and element not in set_2:
        set_1.add(element)
        return
    second_has_room = len(set_2) < limit
    if second_has_room and element not in set_1:
        set_2.add(element)

# Sweep the test dataset repeatedly until both samples are full.
# insert_to_set never places an id in both sets, so the sum of the two sizes
# equals the size of their union without rebuilding the union on every step.
test_dataset_iter = iter(test_dataset)
# Number of eligible entries met during the current sweep
eligible_seen = 0
while len(docs_set_a) + len(docs_set_b) < ndocs_expected:
    try:
        cur_doc = next(test_dataset_iter)
    except StopIteration:
        # End of a sweep. If the whole dataset holds fewer eligible entries
        # than requested, no number of extra sweeps can fill the samples,
        # so fail loudly instead of looping forever.
        if eligible_seen < ndocs_expected:
            raise ValueError(
                "Only %d eligible documents in the test dataset, %d required"
                % (eligible_seen, ndocs_expected))
        # Restart from the beginning and fetch a fresh document; falling
        # through here would process the previous cur_doc a second time.
        test_dataset_iter = iter(test_dataset)
        eligible_seen = 0
        continue
    # Skip documents that have no entry in the inverse directory
    if cur_doc not in kps_directory_inverse:
        continue
    # Skip documents with fewer than 5 keyphrases
    if len(kps_directory_inverse[cur_doc]) < 5:
        continue
    eligible_seen += 1
    # Accept each eligible document with probability `threshold`
    if random.random() > threshold:
        continue
    # Choose the preferred sample with a fair coin; insert_to_set falls back
    # to the other sample when the preferred one is already full
    if random.random() <= 0.5:
        insert_to_set(cur_doc, docs_set_a, docs_set_b, ndocs_expected/2)
    else:
        insert_to_set(cur_doc, docs_set_b, docs_set_a, ndocs_expected/2)
print("Samples size: (%d, %d)" % (len(docs_set_a), len(docs_set_b)))

Expected documents: 2000
0 243520 0.00821287779237845
Samples size: (1000, 1000)


Free memory

In [6]:
# Unbind the large objects that are no longer needed. The iterator is dropped
# as well because it still holds a reference to the test dataset.
del kps_directory_inverse, test_dataset_iter, test_dataset

Save samples

In [7]:
# Sample directory resolved from data_path (presumably where the sample files
# saved below are written -- confirm against the path constants)
docs_sample_path = get_relative_path(data_path, SAMPLE_PATH)
# Create the directory, including missing parents. exist_ok=True makes this a
# no-op when it already exists and avoids the check-then-create race.
os.makedirs(docs_sample_path, exist_ok=True)

In [8]:
def _dump_sample(docs, suffix):
    """Pickle ``docs`` as a list at the path derived from ``suffix``; return that path."""
    target_path = get_relative_path(data_path, suffix)
    # The set is converted to a list before pickling; the element order of
    # that list is whatever order the set iterates in.
    save_pickle(list(docs), target_path)
    return target_path


# Sample A first, then sample B
docs_sample_a_path = _dump_sample(docs_set_a, DOCS_SAMPLE_A_SUFFIX)
docs_sample_b_path = _dump_sample(docs_set_b, DOCS_SAMPLE_B_SUFFIX)