In [1]:
import pandas as pd
import numpy as np
import csv
from dataLoader import *
from utils import *
import yaml
import os
import json
from sklearn.metrics.pairwise import cosine_similarity

# Representativeness (Evaluation with groundtruths)

In [2]:
def get_clear_terms(terms):
    clear_terms = []
    for term in terms:
        term = term.lower()
        term = "_".join(term.split())
        clear_terms.append(term)
    return np.array(clear_terms)

def get_combined_embed(vocab, ename2eid, eid2embed, glove_vectors,  dim=300):
    
    term2gv = {}
    for i, term in enumerate(vocab):
        w2v_vec = eid2embed[ename2eid[term]][0] if term in ename2eid else np.zeros((dim,))
        glove_vec = glove_vectors[i]
        term2gv[term] = np.concatenate((w2v_vec[:dim], glove_vec[:dim]), axis=0)

    return term2gv


def cosine(term1_vec, term2_vec):
    sim = cosine_similarity([term1_vec], [term2_vec])[0,0]
    sim = max(0, min(1, sim))
    return sim

def representativeness(ground_truths, selected_terms, term2gv):
    score = 0
    idx = []
    for gt in ground_truths:
        _score = []
        for st in selected_terms:
            _score.append(cosine(term2gv[gt], term2gv[st]))
        score += np.max(_score)
    return score/len(ground_truths)

In [3]:
def result(config_file, gt_file):

    with open(config_file, 'r') as ymlfile:
        config = yaml.load(ymlfile, Loader=yaml.FullLoader)
    
    domain_path = config['dataset']['domain_path']
    result_path = os.path.join(config['dataset']['domain_path'], config['dataset']['result_folder'])
    
    eid2ename, ename2eid = loadEidToEntityMap(domain_path + 'intermediate/entity2id.txt')
    eid2DocProb = loadEid2DocFeature(domain_path + 'intermediate/eid2DocProb.txt')
    eidDocPair2Prob = loadEidDocPairFeature(domain_path + 'intermediate/eidDocPair2prob.txt')
    eid2embed = loadEntityEmbedding(domain_path + 'intermediate/eid2embed.txt', dim=300)[0]
    
    domain_terms = pd.read_csv(domain_path + 'intermediate/entity2freq.txt', sep='\t', header=None, keep_default_na=False, quoting=csv.QUOTE_NONE).values[:,0]
    
    gt_file = "../data/groundtruths/trending_keywords/"+gt_file
    gt = pd.read_csv(gt_file, header=None, keep_default_na=False, quoting=csv.QUOTE_NONE).values[:,0]
    gt = get_clear_terms(gt)
    gt = np.unique(gt)
    domain_terms = get_clear_terms(domain_terms)
    
    vocab = np.unique(list(domain_terms) + list(gt))
    
    glove_vectors = load_embeddings_glove("../data/glove.42B.300d.txt", vocab, phrase_connector='_')
    
    term2gv = get_combined_embed(vocab, ename2eid, eid2embed, glove_vectors,  dim=300)
    
    topn = 50
    
    
    for file in os.listdir(result_path):
        print(file.split('.')[0])
        selected_terms = pd.read_csv(result_path+"/"+file, header=None, keep_default_na=False, quoting=csv.QUOTE_NONE).values[:topn,0]
        rep = representativeness(gt, selected_terms, term2gv)
        print(rep)
    

### AI 2000-2009 

In [4]:
config_file = "configs/ai_2000-2009.yaml"
gt_file = "2000-2009.txt"
result(config_file, gt_file)

Loading: ../data/arxiv/cs/ai_sp/2000_2009/intermediate/entity2id.txt: 100%|██████████| 11415/11415 [00:00<00:00, 656105.41it/s]
Loading: ../data/arxiv/cs/ai_sp/2000_2009/intermediate/eid2DocProb.txt: 100%|██████████| 11415/11415 [00:00<00:00, 735306.01it/s]
Loading: ../data/arxiv/cs/ai_sp/2000_2009/intermediate/eidDocPair2prob.txt: 100%|██████████| 463379/463379 [00:01<00:00, 247317.39it/s]
Loading: ../data/arxiv/cs/ai_sp/2000_2009/intermediate/eid2embed.txt: 100%|██████████| 11415/11415 [00:01<00:00, 10828.54it/s]


rf
0.6275255809384787
fl
0.6172833313959839
lo
0.6813504231006454
kl_rf
0.6282294478983255
mm
0.6908700524249917
kl_mm
0.689810618880265
pr
0.6626577658741785


### AI 2010-2019 

In [5]:
config_file = "configs/ai_2010-2019.yaml"
gt_file = "2010-2019.txt"
result(config_file, gt_file)

Loading: ../data/arxiv/cs/ai_sp/2010_2019/intermediate/entity2id.txt: 100%|██████████| 44003/44003 [00:00<00:00, 748723.98it/s]
Loading: ../data/arxiv/cs/ai_sp/2010_2019/intermediate/eid2DocProb.txt: 100%|██████████| 44003/44003 [00:00<00:00, 754095.92it/s]
Loading: ../data/arxiv/cs/ai_sp/2010_2019/intermediate/eidDocPair2prob.txt: 100%|██████████| 6754824/6754824 [00:25<00:00, 265678.35it/s]
Loading: ../data/arxiv/cs/ai_sp/2010_2019/intermediate/eid2embed.txt: 100%|██████████| 44003/44003 [00:03<00:00, 11005.18it/s]


rf
0.6640819756754246
fl
0.6848269874378363
lo
0.7199556977823204
kl_rf
0.6791675278342045
mm
0.7331737389279672
kl_mm
0.7763239746031955
pr
0.6969613304655836


### AI 2020-2021 

In [6]:
config_file = "configs/ai_2020-2021.yaml"
gt_file = "2020-2021.txt"
result(config_file, gt_file)

Loading: ../data/arxiv/cs/ai_sp/2020_2021/intermediate/entity2id.txt: 100%|██████████| 39402/39402 [00:00<00:00, 761746.57it/s]
Loading: ../data/arxiv/cs/ai_sp/2020_2021/intermediate/eid2DocProb.txt: 100%|██████████| 39402/39402 [00:00<00:00, 736821.77it/s]
Loading: ../data/arxiv/cs/ai_sp/2020_2021/intermediate/eidDocPair2prob.txt: 100%|██████████| 5585268/5585268 [00:19<00:00, 293215.63it/s]
Loading: ../data/arxiv/cs/ai_sp/2020_2021/intermediate/eid2embed.txt: 100%|██████████| 39402/39402 [00:03<00:00, 10890.68it/s]


rf
0.6493187205347215
fl
0.6528150384614049
lo
0.7238289069975233
kl_rf
0.6516720857364029
mm
0.7898594158786493
kl_mm
0.7944380046320856
pr
0.6825875466250563


# Keywords trends evaluation (Google Trends)

In [3]:
# trends score from google trends for all the terms for various time stamps
selected_term_trends = None
with open('../data/arxiv/cs/ai_sp/selected_terms_trends.txt', 'r') as fin:
    selected_term_trends = json.loads(fin.read())

In [4]:
def get_scores_by_year_interval(start_year, end_year, trends):
    indices = []
    trends = np.array(trends)
    for entry in trends:
        year = int(str(entry[0]).split('-')[0])
        if year >= start_year and year <= end_year:
            indices += [True]
        else: indices += [False]
    return trends[indices, 1].astype(float)

def get_time_score_by_method(terms, time_scores, time_index):
    score = []
    for term in terms:
        time_score = np.maximum(time_scores[term],1)
        score_ = time_score/np.sum(time_score)
        score += [score_[time_index]]
    return score

In [6]:
time_scores = dict()
for term in selected_term_trends:
    
    if len(selected_term_trends[term]) == 0:
        score2000s = 0
        score2010s = 0
        score2020s = 0
    else:
        score2000s = get_scores_by_year_interval(2000, 2009, selected_term_trends[term])
        score2010s = get_scores_by_year_interval(2010, 2019, selected_term_trends[term])
        score2020s = get_scores_by_year_interval(2020, 2021, selected_term_trends[term])
        
        score2000s = np.sum(score2000s)/max(np.shape(score2000s)[0], 1)
        score2010s = np.sum(score2010s)/max(np.shape(score2010s)[0], 1)
        score2020s = np.sum(score2020s)/max(np.shape(score2020s)[0], 1)

    time_scores[term] = [score2000s, score2010s, score2020s]

In [7]:
def result(config_file, time_index):
    topn = 50
    with open(config_file, 'r') as ymlfile:
        config = yaml.load(ymlfile, Loader=yaml.FullLoader)
    
    result_path = os.path.join(config['dataset']['domain_path'], config['dataset']['result_folder'])
    scores = []
    for file in os.listdir(result_path):
        print(file.split('.')[0])
        et = pd.read_csv(result_path+"/"+file, header=None, keep_default_na=False, quoting=csv.QUOTE_NONE).values[:topn,0]
        score = get_time_score_by_method(et, time_scores, time_index)
        scores.append(score)
        print(round(np.mean(score), 4))

### 2000-2009 

In [8]:
config_file = "configs/ai_2000-2009.yaml"
result(config_file, 0)

rf
0.3584
fl
0.3591
lo
0.3956
kl_rf
0.3583
mm
0.4104
kl_mm
0.4165
pr
0.3705


### 2010-2019 

In [9]:
config_file = "configs/ai_2010-2019.yaml"
result(config_file, 1)

rf
0.3195
fl
0.3217
lo
0.3326
kl_rf
0.3189
mm
0.3468
kl_mm
0.3523
pr
0.3211


### 2020-2021 

In [10]:
config_file = "configs/ai_2020-2021.yaml"
result(config_file, 2)

rf
0.323
fl
0.3336
lo
0.4043
kl_rf
0.3239
mm
0.5145
kl_mm
0.5215
pr
0.3641
