In [1]:
import pandas as pd
import numpy as np
import csv
from dataLoader import *
from utils import *
import yaml
import os
import json
from sklearn.metrics.pairwise import cosine_similarity

# Representativeness (Evaluation with groundtruths)

In [None]:
def get_clear_terms(terms):
    """Normalise raw terms to lowercase with their words joined by underscores.

    Each term is lowercased and split on whitespace; the resulting tokens are
    joined with a single "_", so leading, trailing and repeated whitespace
    disappears.

    Args:
        terms: iterable of strings.

    Returns:
        np.ndarray containing the normalised terms in input order.
    """
    return np.array(["_".join(raw.lower().split()) for raw in terms])

def get_combined_embed(vocab, ename2eid, eid2embed, glove_vectors, dim=300):
    """Build one concatenated vector per vocabulary term.

    For the term at position ``i`` of ``vocab`` the result is the first ``dim``
    components of its entity embedding followed by the first ``dim``
    components of ``glove_vectors[i]``.

    Args:
        vocab: sequence of terms; its order must match ``glove_vectors``.
        ename2eid: mapping term -> entity id.
        eid2embed: mapping entity id -> indexable whose element ``[0]`` is the
            embedding vector.
        glove_vectors: indexable of vectors, aligned with ``vocab``.
        dim: number of leading components kept from each of the two vectors.

    Returns:
        dict mapping each term to its concatenated np.ndarray.
    """
    combined = {}
    for position, word in enumerate(vocab):
        if word in ename2eid:
            entity_vec = eid2embed[ename2eid[word]][0]
        else:
            # Terms without an entity id get an all-zero entity half.
            entity_vec = np.zeros((dim,))
        glove_part = glove_vectors[position]
        combined[word] = np.concatenate((entity_vec[:dim], glove_part[:dim]), axis=0)
    return combined


def cosine(term1_vec, term2_vec):
    """Return the cosine similarity of two vectors, clamped to the range [0, 1].

    Args:
        term1_vec: 1-D vector of the first term.
        term2_vec: 1-D vector of the second term.

    Returns:
        The similarity itself when it lies strictly between 0 and 1,
        otherwise the bound 0 or 1.
    """
    raw = cosine_similarity([term1_vec], [term2_vec])[0, 0]
    # Cap at 1 first, then floor at 0 (same comparison order as min/max).
    capped = raw if raw < 1 else 1
    return capped if capped > 0 else 0

def representativeness(ground_truths, selected_terms, term2gv):
    """Average, over the ground-truth terms, of the best match among selected terms.

    For every ground-truth term the clamped cosine similarity to each selected
    term is computed and the maximum is kept; the returned score is the mean
    of those maxima.

    Args:
        ground_truths: sequence of ground-truth terms (keys of ``term2gv``).
        selected_terms: sequence of candidate terms (keys of ``term2gv``).
        term2gv: mapping term -> vector.

    Returns:
        Mean best-match similarity.
    """
    total = 0
    for truth in ground_truths:
        similarities = [cosine(term2gv[truth], term2gv[candidate])
                        for candidate in selected_terms]
        total += np.max(similarities)
    return total / len(ground_truths)

In [None]:
def result(config_file, gt_file, topn=50):
    """Print the representativeness score of every result file of one dataset.

    The YAML config supplies ``dataset.domain_path`` and
    ``dataset.result_folder``. Entity ids, entity embeddings and the domain
    term list are read from ``<domain_path>intermediate/``; ground-truth
    keywords are read from ``../data/groundtruths/trending_keywords/<gt_file>``.
    For each regular file in the result folder (visited in sorted name order)
    the function prints the file name up to its first "." followed by the
    representativeness of its first ``topn`` terms against the ground truths.

    NOTE: a later cell of this notebook defines another function that is also
    named ``result`` with a different signature; run the cells in order.

    Args:
        config_file: path to the dataset YAML config.
        gt_file: file name of the ground-truth keyword list.
        topn: number of leading terms taken from each result file.

    Returns:
        None; scores are printed.
    """
    with open(config_file, 'r') as ymlfile:
        # NOTE(review): FullLoader can build more than plain data; if the
        # configs are plain mappings, yaml.safe_load would be sufficient.
        config = yaml.load(ymlfile, Loader=yaml.FullLoader)

    domain_path = config['dataset']['domain_path']
    result_path = os.path.join(domain_path, config['dataset']['result_folder'])

    # Only the name -> id map and the entity embeddings are used below; the
    # document-probability feature files were loaded but never read, so those
    # loads have been dropped.
    _, ename2eid = loadEidToEntityMap(domain_path + 'intermediate/entity2id.txt')
    eid2embed = loadEntityEmbedding(domain_path + 'intermediate/eid2embed.txt', dim=300)[0]

    domain_terms = pd.read_csv(domain_path + 'intermediate/entity2freq.txt', sep='\t', header=None, keep_default_na=False, quoting=csv.QUOTE_NONE).values[:,0]

    gt_path = "../data/groundtruths/trending_keywords/" + gt_file
    gt = pd.read_csv(gt_path, header=None, keep_default_na=False, quoting=csv.QUOTE_NONE).values[:,0]
    gt = np.unique(get_clear_terms(gt))
    domain_terms = get_clear_terms(domain_terms)

    # The vocabulary must contain the ground truths too so that every
    # ground-truth term has a vector.
    vocab = np.unique(list(domain_terms) + list(gt))

    glove_vectors = load_embeddings_glove("../data/glove.42B.300d.txt", vocab, phrase_connector='_')

    term2gv = get_combined_embed(vocab, ename2eid, eid2embed, glove_vectors, dim=300)

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for file in sorted(os.listdir(result_path)):
        file_path = os.path.join(result_path, file)
        if not os.path.isfile(file_path):
            # Skip sub-directories (e.g. notebook checkpoint folders).
            continue
        print(file.split('.')[0])
        selected_terms = pd.read_csv(file_path, header=None, keep_default_na=False, quoting=csv.QUOTE_NONE).values[:topn,0]
        rep = representativeness(gt, selected_terms, term2gv)
        print(rep)
    

### AI 2000-2009 

In [None]:
# Print the representativeness of each result file of the 2000-2009 AI config
# against the 2000-2009 ground-truth keyword list.
config_file = "configs/ai_2000-2009.yaml"
gt_file = "2000-2009.txt"
result(config_file, gt_file)

### AI 2010-2019 

In [None]:
# Print the representativeness of each result file of the 2010-2019 AI config
# against the 2010-2019 ground-truth keyword list.
config_file = "configs/ai_2010-2019.yaml"
gt_file = "2010-2019.txt"
result(config_file, gt_file)

### AI 2020-2021 

In [None]:
# Print the representativeness of each result file of the 2020-2021 AI config
# against the 2020-2021 ground-truth keyword list.
config_file = "configs/ai_2020-2021.yaml"
gt_file = "2020-2021.txt"
result(config_file, gt_file)

# Keywords trends evaluation (Google Trends)

In [None]:
# Google Trends scores of all the terms at various time stamps, parsed from a
# JSON file into a mapping keyed by term.
selected_term_trends = None
with open('../data/arxiv/cs/ai_sp/selected_terms_trends.txt', 'r') as fin:
    selected_term_trends = json.load(fin)

In [None]:
def get_scores_by_year_interval(start_year, end_year, trends):
    """Select the scores of the trend entries whose year lies in a closed interval.

    Args:
        start_year: first year of the interval (inclusive).
        end_year: last year of the interval (inclusive).
        trends: non-empty sequence of rows; column 0 is a date whose text
            starts with the year followed by "-", column 1 is the score.

    Returns:
        1-D float np.ndarray with the column-1 values of the matching rows,
        in their original order (empty when no row matches).
    """
    table = np.array(trends)
    # The year is whatever precedes the first "-" in the date column.
    in_range = [
        start_year <= int(str(row[0]).split('-')[0]) <= end_year
        for row in table
    ]
    return table[in_range, 1].astype(float)

def get_time_score_by_method(terms, time_scores, time_index):
    """Return, for each term, the share of its trend score falling in one period.

    Every per-period score of a term is first floored at 1, then the scores
    are normalised to sum to 1 and the entry at ``time_index`` is kept.

    Args:
        terms: iterable of terms (keys of ``time_scores``).
        time_scores: mapping term -> sequence of per-period scores.
        time_index: index of the period of interest.

    Returns:
        list with one normalised share per term, in input order.
    """
    shares = []
    for name in terms:
        floored = np.maximum(time_scores[name], 1)
        normalised = floored / np.sum(floored)
        shares.append(normalised[time_index])
    return shares

In [None]:
# Mean Google Trends score of every term in each of the three periods
# (2000-2009, 2010-2019, 2020-2021), stored as term -> [s2000s, s2010s, s2020s].
# The dict has to be created here: with the initialisation commented out the
# assignment at the end of the loop raises NameError on a fresh kernel.
time_scores = dict()
for term in selected_term_trends:
    term_trends = selected_term_trends[term]

    if len(term_trends) == 0:
        # No trend data for this term: every period scores 0.
        score2000s = 0
        score2010s = 0
        score2020s = 0
    else:
        score2000s = get_scores_by_year_interval(2000, 2009, term_trends)
        score2010s = get_scores_by_year_interval(2010, 2019, term_trends)
        score2020s = get_scores_by_year_interval(2020, 2021, term_trends)

        # Average per period; max(..., 1) keeps a period without any data
        # point at 0 instead of dividing by zero.
        score2000s = np.sum(score2000s)/max(np.shape(score2000s)[0], 1)
        score2010s = np.sum(score2010s)/max(np.shape(score2010s)[0], 1)
        score2020s = np.sum(score2020s)/max(np.shape(score2020s)[0], 1)

    time_scores[term] = [score2000s, score2010s, score2020s]

In [None]:
def result(config_file, time_index, topn=50):
    """Print the mean Google Trends share of every result file for one period.

    The YAML config supplies ``dataset.domain_path`` and
    ``dataset.result_folder``. For each regular file in the result folder
    (visited in sorted name order) the function prints the file name up to its
    first "." followed by the mean, rounded to 4 decimals, of the normalised
    trend shares of its first ``topn`` terms. Shares are looked up in the
    notebook-level ``time_scores`` mapping.

    NOTE: this reuses the name ``result`` of the representativeness function
    defined in an earlier cell and therefore shadows it; run the cells in order.

    Args:
        config_file: path to the dataset YAML config.
        time_index: index of the period inside each ``time_scores`` entry.
        topn: number of leading terms taken from each result file.

    Returns:
        None; scores are printed.
    """
    with open(config_file, 'r') as ymlfile:
        # NOTE(review): FullLoader can build more than plain data; if the
        # configs are plain mappings, yaml.safe_load would be sufficient.
        config = yaml.load(ymlfile, Loader=yaml.FullLoader)

    result_path = os.path.join(config['dataset']['domain_path'], config['dataset']['result_folder'])

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for file in sorted(os.listdir(result_path)):
        file_path = os.path.join(result_path, file)
        if not os.path.isfile(file_path):
            # Skip sub-directories (e.g. notebook checkpoint folders).
            continue
        print(file.split('.')[0])
        et = pd.read_csv(file_path, header=None, keep_default_na=False, quoting=csv.QUOTE_NONE).values[:topn,0]
        score = get_time_score_by_method(et, time_scores, time_index)
        print(round(np.mean(score), 4))

### 2000-2009 

In [None]:
# Google Trends evaluation of the 2000-2009 results; time index 0 selects the
# 2000-2009 entry of each term's time scores.
config_file = "configs/ai_2000-2009.yaml"
result(config_file, 0)

### 2010-2019 

In [None]:
# Google Trends evaluation of the 2010-2019 results; time index 1 selects the
# 2010-2019 entry of each term's time scores.
config_file = "configs/ai_2010-2019.yaml"
result(config_file, 1)

### 2020-2021 

In [None]:
# Google Trends evaluation of the 2020-2021 results; time index 2 selects the
# 2020-2021 entry of each term's time scores.
config_file = "configs/ai_2020-2021.yaml"
result(config_file, 2)