In [1]:
import pandas as pd
import numpy as np
import re
from dataLoader import *
from utils import *
import argparse
import yaml
import os

In [2]:
def category_correspondence(selected_terms, categories, ename2eid, eid2DocProb, eidDocPair2Prob):
    score = 0
    for cat in categories:
        if cat not in ename2eid: continue
        scores = []
        for term in selected_terms:
            scores += [get_nmi(ename2eid[cat], ename2eid[term], eid2DocProb, eidDocPair2Prob)]
        score += sum(scores)
    return score

In [3]:
def result(config_file):
    with open(config_file, 'r') as ymlfile:
        config = yaml.load(ymlfile, Loader=yaml.FullLoader)
    
    domain_path = config['dataset']['domain_path']
    result_path = os.path.join(config['dataset']['domain_path'], config['dataset']['result_folder'])
    
    eid2ename, ename2eid = loadEidToEntityMap(domain_path + 'intermediate/entity2id.txt')
    eid2DocProb = loadEid2DocFeature(domain_path + 'intermediate/eid2DocProb.txt')
    eidDocPair2Prob = loadEidDocPairFeature(domain_path + 'intermediate/eidDocPair2prob.txt')
    
    gt_file = '../data/groundtruths/category_correspondence/arxivcs_categories.txt'
    cs_cates = []
    with open(gt_file, 'r') as f:
        for line in f:
            cat = line.strip()
            cat = cat.lower()
            cat = re.sub(r'[^\x00-\x7F]+', ' ', cat)
            cat = cat.replace("-", " ")
            cat = "_".join(cat.split())
            cs_cates.append(cat)
    cs_cates = np.array(cs_cates)
    
    selected_terms_rf = pd.read_csv(result_path+'rf.txt', header=None, sep='\n').values[:,0]
    selected_terms_lo = pd.read_csv(result_path+'lo.txt', header=None, sep='\n').values[:,0]
    selected_terms_fl = pd.read_csv(result_path+'fl.txt', header=None, sep='\n').values[:,0]
    selected_terms_kl_rf = pd.read_csv(result_path+'kl_rf.txt', header=None, sep='\n').values[:,0]
    selected_terms_mm = pd.read_csv(result_path+'mm.txt', header=None, sep='\n').values[:,0]
    selected_terms_kl_mm = pd.read_csv(result_path+'kl_mm.txt', header=None, sep='\n').values[:,0]
    
    
    ks = [10, 20, 30, 40, 50, 100, 200, 500]
    ccs = []
    for k in ks:
        cc = []
        cc.append(category_correspondence(selected_terms_rf[:k], cs_cates, ename2eid, eid2DocProb, eidDocPair2Prob))
        cc.append(category_correspondence(selected_terms_lo[:k], cs_cates, ename2eid, eid2DocProb, eidDocPair2Prob))
        cc.append(category_correspondence(selected_terms_fl[:k], cs_cates, ename2eid, eid2DocProb, eidDocPair2Prob))
        cc.append(category_correspondence(selected_terms_kl_rf[:k], cs_cates, ename2eid, eid2DocProb, eidDocPair2Prob))
        cc.append(category_correspondence(selected_terms_mm[:k], cs_cates, ename2eid, eid2DocProb, eidDocPair2Prob))
        cc.append(category_correspondence(selected_terms_kl_mm[:k], cs_cates, ename2eid, eid2DocProb, eidDocPair2Prob))
        ccs.append(", ".join([str(round(c,4)) for c in cc]))
        
        print(ccs[-1])

In [4]:
config_file = "configs/arxivcs_ap.yaml" # candidate keywords: authoprhase extracted keywords
result(config_file)

Loading: ../data/arxiv/cs/all_ap/intermediate/entity2id.txt: 100%|██████████| 93148/93148 [00:00<00:00, 656913.95it/s]
Loading: ../data/arxiv/cs/all_ap/intermediate/eid2DocProb.txt: 100%|██████████| 93148/93148 [00:00<00:00, 697756.00it/s]
Loading: ../data/arxiv/cs/all_ap/intermediate/eidDocPair2prob.txt: 100%|██████████| 13653903/13653903 [00:48<00:00, 281840.07it/s]


1.0651, 1.1001, 1.0722, 1.0651, 2.0981, 2.1212
1.144, 3.2476, 1.1345, 1.1451, 4.2626, 4.2972
2.2134, 4.3965, 3.2682, 3.2875, 4.4321, 4.4169
3.3273, 4.4929, 3.3515, 3.366, 4.6, 4.6018
3.453, 4.6896, 3.4505, 3.4496, 5.6826, 5.6826
4.7812, 8.2382, 4.7626, 4.8761, 8.2709, 8.2858
9.7166, 11.105, 9.5908, 8.7045, 11.109, 12.0209
18.8958, 19.3732, 16.6155, 17.7429, 19.3248, 19.2368


In [5]:
config_file = "configs/arxivcs_sp.yaml" # candidate keywords: springer
result(config_file)

Loading: ../data/arxiv/cs/all_sp/intermediate/entity2id.txt: 100%|██████████| 64160/64160 [00:00<00:00, 816706.86it/s]
Loading: ../data/arxiv/cs/all_sp/intermediate/eid2DocProb.txt: 100%|██████████| 64160/64160 [00:00<00:00, 752042.97it/s]
Loading: ../data/arxiv/cs/all_sp/intermediate/eidDocPair2prob.txt: 100%|██████████| 16978645/16978645 [01:02<00:00, 273442.53it/s]


FileNotFoundError: [Errno 2] No such file or directory: '../data/arxiv/cs/all_sp/result/rf.txt'

In [None]:
config_file = "configs/arxivcs_am.yaml" # candidate keywords: aminer
result(config_file)