In [1]:
import gc
from collections import defaultdict

In [2]:
from somhos.config.paths import *
from somhos.methods.useful import save_pickle, load_pickle
import somhos.methods.useful as mu

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
prefix_path = '../../'
data_path = get_relative_path(prefix_path, V9GAMMA_PATH)

Load datasets

In [5]:
page_rds = load_pickle(get_relative_path(prefix_path, PAGE_REDIRECTS))
print(len(page_rds))

11523073


In [6]:
page_inverse_dict = load_pickle(get_relative_path(prefix_path, PAGE_INVERSE_DICT))
print(len(page_inverse_dict))

11523073


In [7]:
keyphrases_directory = load_pickle(get_relative_path(data_path, KPS_DIRECTORY_SUFFIX))
print(len(keyphrases_directory))

3922202


In [8]:
keyphrases_normalized = load_pickle(get_relative_path(data_path, KPS_NORMALIZED_SUFFIX))
print(len(keyphrases_normalized))

3922202


Load set of keyphrases (extracted with kleis) intersecting the set of Wikipedia pages and generate distinct representations.

In [9]:
hashes_intersection = load_pickle(get_relative_path(prefix_path, HASHES_INTERSECTION))
print(len(hashes_intersection))

def hash2str(h):
    return keyphrases_normalized[keyphrases_directory[h]].decode('utf-8')

def str2id(s):
    return page_inverse_dict[s]

str_intersection = set(hash2str(h) for h in hashes_intersection) # - manually_ignored 
print(len(str_intersection))

id_intersection = set(str2id(s) for s in str_intersection)
print(len(id_intersection))

218637
218637
218637


Load all posible redirect pages in wikipedia from the ids in the intersection. 

In [10]:
redirects = set(rd for str_id in id_intersection for rd in page_rds[str_id])
print(len(redirects))

150968


Load more resoruces

In [11]:
redirect_pages = load_pickle(get_relative_path(prefix_path, REDIRECT_PAGES))
print(len(redirect_pages))

3394754


Extract sets of wikipedia pages for each redirect and made their intersection with the hashes_intersection.
Filtering redirects with more than 1 page variation. 

In [12]:
keyphrase_variations = list(filter(lambda x: len(x) > 1, [redirect_pages[rd] & id_intersection for rd in redirects]))
print(len(keyphrase_variations))
keyphrase_variations[:20]

41292


[{5008150, 9437190},
 {7340040, 8422844},
 {3670036, 5516775},
 {2888303, 5222676, 6208933, 9399802, 10333801},
 {1373481, 2011089, 3407894},
 {6024825, 6815768},
 {6194474, 11128015},
 {4574384, 4980774, 8096396},
 {3493224, 7864378},
 {5767228, 7679608},
 {60207, 5689252},
 {7584300, 11343008},
 {6815812, 7801572, 9621095, 10000434},
 {786503, 1298591, 3747683},
 {3407947, 11053787},
 {790906, 4289511, 8240332},
 {4718684, 5066569},
 {102, 6698828},
 {1048686, 9740537},
 {1773639,
  3250187,
  5598451,
  6217961,
  6815869,
  8921108,
  8925409,
  8956352,
  10274254,
  11152431}]

In [13]:
%reset_selective -f redirect_pages

In [14]:
gc.collect()
gc.get_stats()

[{'collections': 20237, 'collected': 2176, 'uncollectable': 0},
 {'collections': 1839, 'collected': 894, 'uncollectable': 0},
 {'collections': 23, 'collected': 50, 'uncollectable': 0}]

In [15]:
keyphrases_dir_docid = load_pickle(get_relative_path(data_path, KPS_DOCS_IDS_SUFFIX))
print(len(keyphrases_dir_docid))

3922202


In [16]:
page_dictionary = load_pickle(get_relative_path(prefix_path, PAGE_DICTIONARY))
print(len(page_dictionary))

11523073


In [17]:
def str2docids(s):
    h = mu.hash_16bytes(s.encode('utf-8'))
    return keyphrases_dir_docid[keyphrases_directory[h]]

page_docids = {}
rd_docids = defaultdict(set)
for pgs in keyphrase_variations:
    for p in pgs:
        s = page_dictionary[p]
        page_docids[p] = str2docids(s)
        for r in page_rds[p]:
            rd_docids[r] |= page_docids[p]
print(len(page_docids))
print(len(rd_docids))

125735
52132


In [18]:
sorted(list((page_dictionary[k], v) for k, v in rd_docids.items()), key=lambda x: x[1], reverse=True)[:3]

[('donorschoose', {'index1561736', 'index2374450'}),
 ('wl', {'index1291962', 'index2327088', 'index2378558', 'index2381491'}),
 ('welcomelong',
  {'index2264700', 'index2327088', 'index2378558', 'index2381491'})]

In [19]:
list(page_docids.items())[:3]

[(9437190, {'index1561736'}),
 (5008150, {'index2374450'}),
 (7340040, {'index2327088', 'index2378558', 'index2381491'})]

In [20]:
gc.collect()
gc.get_stats()

[{'collections': 25671, 'collected': 2176, 'uncollectable': 0},
 {'collections': 2332, 'collected': 894, 'uncollectable': 0},
 {'collections': 25, 'collected': 50, 'uncollectable': 0}]

List of keyphrases and the set of documents in which they appears but with only one representation. 

In [21]:
kpvariations_nointersection = []
for kvs in keyphrase_variations:
    docids = set()
    for p in kvs:
        docids = page_docids[p].symmetric_difference(docids)
    symmdiff_docs = list(filter(lambda x: len(x[1]) > 0, [(p, page_docids[p] & docids) for p in kvs]))
    kpvariations_nointersection.append(symmdiff_docs)
# force to more than one variation
kpvariations_nointersection = list(filter(lambda x: len(x) > 1, kpvariations_nointersection))
print(len(kpvariations_nointersection))
list([(page_dictionary[rd], docs) for rd, docs in rds] for rds in kpvariations_nointersection)[10:12]

39985


[[('vlachos', {'index952783'}), ('vlachou', {'index1183044'})],
 [('respiratory sinus arrhythmia', {'index1907846'}),
  ('sinus arrhythmia', {'index1398573'})]]

In [23]:
save_pickle(kpvariations_nointersection, get_relative_path(prefix_path, KEYPHRASEVARIATIONS_DOCS))

In [24]:
doc_groups = defaultdict(set) 
for i, grp in enumerate(kpvariations_nointersection):
    for keyphrase, docset in grp:
        for d in docset:
            doc_groups[d].add(i)
doc_groups = sorted(doc_groups.items(), key=lambda x: len(x[1]), reverse=True)
print(len(doc_groups))

151404
