<a href="https://colab.research.google.com/github/userName/Salience-Prediction/blob/main/evaluate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Various functions to evaluate methods

In [2]:
import os
import nltk
import json
import copy
import scipy.stats
import numpy as np
import sklearn.metrics
from tqdm import tqdm

In [3]:
dataset_select = 0
# 0 for 'InfoPop, 1 for 'CNN_DailyMail'

version = 1
# 1 for ROUGE_1, 2 for ROUGE_2, 3 for ROUGE_L

if dataset_select == 0:
  dataset = 'InfoPop'
else:
  dataset = 'CNN_DailyMail'

data_dir = 'drive/My Drive/store/'
data_path = data_dir + dataset + '/' + 'test.json'

with open(data_path, 'r+') as f:
  data = json.load(f)

use_LexRank = 0
# Set 'use_LexRank' to 1
# computed scores will be standardized to the 0 to 1 range 

In [4]:
# Find number of test documents with number of sentences <= threshold
threshold = 5
count = 0

for doc in data:
  if len(doc['sent_labels']) <= threshold:
    count += 1

print('Number of of test documents = ' + str(len(data)))
print('Number of test documents with number of sentences <= ' + str(threshold) + ' = ' + str(count))

Number of of test documents = 5177
Number of test documents with number of sentences <= 5 = 44


In [5]:
def position_baseline(sentence_set):
  # Considers an ordered list of sentences
  # Generates importance scores based on position
  count = len(sentence_set)
  normalizer = int((count * (count + 1)) / 2)
  attach_scores = copy.deepcopy(sentence_set)

  for index in range(count):
    score = (count - index) / normalizer
    attach_scores[index].append(score)
    # for InfoPop unit[2] stores the predicted score
    # 'unit' being a sentential unit containing the sentence string, importance score

  return(attach_scores)

In [6]:
# To test out implementation of position baseline
test_position = 1
if test_position == 1:
  position_predicted = position_baseline(data[0]['sent_labels'])
  print('Inference outcome for the first sentence:')
  print(position_predicted[10])

Inference outcome for the first sentence:
['For years, hospitals and healthcare providers have blamed the country’s abysmal statistics on mothers for being too old, too fat or too unhealthy to have safe deliveries.', 0.03978408982232242, 0.022311022311022312]


In [7]:
def get_splits(document_predicted):
  global dataset_select, version, use_LexRank
  actual_index = 1
  predicted_index = 2

  if dataset_select == 1:
    actual_index = version
    predicted_index = 4

  actuals = []
  predicted = []
  for item in document_predicted:
    actuals.append(item[actual_index])
    predicted.append(item[predicted_index])
  
  actuals = np.asarray(actuals)
  predicted = np.asarray(predicted)

  if use_LexRank == 1:
    total_computed = np.sum(predicted)
    predicted = predicted / total_computed
  
  return actuals, predicted

def MSE_score(document_predicted):
  # Metric to calculate the MSE error for a single test document
  actuals, predicted = get_splits(document_predicted)
  MSE = sklearn.metrics.mean_squared_error(actuals, predicted)

  return MSE

def MAE_score(document_predicted):
  # Metric to calculate the MAE error for a single test document
  actuals, predicted = get_splits(document_predicted)
  MAE = sklearn.metrics.mean_absolute_error(actuals, predicted)

  return MAE

def check_top(document_predicted):
  # Metric to check if the sentence with the highest predicted score
  # is indeed the highest scored sentence
  actuals, predicted = get_splits(document_predicted)
  actual_first = np.argmax(actuals)
  predicted_first = np.argmax(predicted)

  return int(actual_first == predicted_first)

def create_check_overlap(k = 3):
  def check_overlap(document_predicted):
    # computes overlap between top k actual most important sentences
    # and top k predicted most important sentences
    actuals, predicted = get_splits(document_predicted)
    if k > len(actuals):
     return 1
    
    actuals_consider = np.argsort(actuals * (-1))[0: k]
    predicted_consider = np.argsort(predicted * (-1))[0: k]

    intersection = np.intersect1d(actuals_consider, predicted_consider)
    score = len(intersection) / k
    return score
  
  return check_overlap

def kendall_tau(document_predicted):
  # computes Kendall's tau's

  actuals, predicted = get_splits(document_predicted)
  actuals_rank = np.argsort(actuals * -1)
  predicted_rank = np.argsort(predicted * -1)

  score, _ = scipy.stats.kendalltau(actuals_rank, predicted_rank)
  return score

def spearman_rank(document_predicted):
  # computes Spearman rank order correlation coefficient

  actuals, predicted = get_splits(document_predicted)
  actuals_rank = np.argsort(actuals * -1)
  predicted_rank = np.argsort(predicted * -1)

  score, _ = scipy.stats.spearmanr(actuals_rank, predicted_rank)
  return score

def get_true_relevance(score_list):
  # relevance mapping for Normalized DCG computation
  relevances = []

  for score in score_list:
    relevance = None
    if score > 0.5:
      relevance = 100
    elif score > 0.1:
      relevance = 50
    elif score > 0.05:
      relevance = 10
    elif score > 0.01:
      relevance = 1
    else:
      relevance = 0
    relevances.append(relevance)
  
  return relevances
    
def ndcg_true_scores(document_predicted):
  # computes Normalized DCG score
  # based on true labels

  actuals, predicted = get_splits(document_predicted)
  
  actuals_unit = [actuals]
  predictions_unit = [predicted]

  score = sklearn.metrics.ndcg_score(y_true = actuals_unit, y_score = predictions_unit)
  return score

def ndcg_relevance_scores(document_predicted):
  # computes Normalized DCG score
  # based on a defined scheme of relevance labelling

  actuals, predicted = get_splits(document_predicted)
  predictions_unit = [predicted]
  
  actual_scores = np.ndarray.tolist(actuals)
  relevances = np.asarray([get_true_relevance(actual_scores)])

  score = sklearn.metrics.ndcg_score(y_true = relevances, y_score = predictions_unit)
  return score

In [8]:
# Test for implementation
if test_position == 1:
  print('Running tests on a single test document using the position baseline:')
  print('- Outcome of running check_top(): ' + str(check_top(position_predicted)))
  function = create_check_overlap(7)
  print('- Outcome of running check_overlap(): ' + str(function(position_predicted)))
  print('- Outcome of running MSE_score(): ' + str(MSE_score(position_predicted)))
  print('- Outcome of running MAE_score(): ' + str(MAE_score(position_predicted)))
  print('- Outcome of running kendall_tau(): ' + str(kendall_tau(position_predicted)))
  print('- Outcome of running spearman_rank(): ' + str(spearman_rank(position_predicted)))
  print('- Outcome of running ndcg_true_scores(): ' + str(ndcg_true_scores(position_predicted)))
  print('- Outcome of running ndcg_relevance_scores(): ' + str(ndcg_relevance_scores(position_predicted)))

Running tests on a single test document using the position baseline:
- Outcome of running check_top(): 0
- Outcome of running check_overlap(): 0.0
- Outcome of running MSE_score(): 0.0011858518668143318
- Outcome of running MAE_score(): 0.018615874990825915
- Outcome of running kendall_tau(): 0.05673274094326725
- Outcome of running spearman_rank(): 0.05762658394237341
- Outcome of running ndcg_true_scores(): 0.4017899614712867
- Outcome of running ndcg_relevance_scores(): 0.29373191277903143


In [9]:
def run_position_baseline(test_set):
  # computes scores using the positon baseline
  # on all test set documents
  global dataset_select

  computed = []
  for unit in test_set:
    item = {}
    item['id'] = unit['id']
    if dataset_select == 0:
      item['url'] = unit['url']
    item['sent_labels'] = position_baseline(unit['sent_labels'])
    computed.append(item)
  return computed

In [10]:
# Run position baseline on all samples
run_all_position = 1
if run_all_position == 1:
  position_scores = run_position_baseline(data)

In [11]:
def get_total_score(predicted, scoring_function = check_top):
  scores = []
  
  for unit in predicted:
    score = scoring_function(unit['sent_labels'])
    scores.append(score)
  scores = np.asarray(scores)
  return np.mean(scores)

In [12]:
if run_all_position == 1 and use_LexRank == 0:
  print('Results on the full test set using the position baseline')

  print('- Average highest importance prediction: ', end = '')
  print(get_total_score(position_scores) * 100, end = '')
  print(' %')

  top_K = []

  for k in range(1, 20):
    function = create_check_overlap(k)
    top_K.append(get_total_score(position_scores, scoring_function = function) * 100)

  print('Top k scores for k = 1(1)20:')
  print(top_K)

  print('- Average MSE: ', end = '')
  print(get_total_score(position_scores, scoring_function = MSE_score))
  
  print('- Average MAE: ', end = '')
  print(get_total_score(position_scores, scoring_function = MAE_score))

  print('- Average Kendall\'s Tau: ', end = '')
  print(get_total_score(position_scores, scoring_function = kendall_tau))

  print('- Average Spearman\'s Rank Correlation Coefficient: ', end = '')
  print(get_total_score(position_scores, scoring_function = spearman_rank))

  print('- Average nDCG based on true labels: ', end = '')
  print(get_total_score(position_scores, scoring_function = ndcg_true_scores))

  print('- Average nDCG based on synthetic relevance labels: ', end = '')
  print(get_total_score(position_scores, scoring_function = ndcg_relevance_scores))

Results on the full test set using the position baseline
- Average highest importance prediction: 6.9152018543558045 %
Top k scores for k = 1(1)20:
[6.9152018543558045, 10.80741742321808, 16.45740776511493, 22.21363724164574, 26.94610778443114, 31.366299658747028, 35.434200722978005, 38.866621595518644, 41.707981885690984, 44.44465906895885, 46.98579380827786, 49.43178159809414, 51.67976701683482, 53.93774662656254, 56.11100379885391, 58.29631060459726, 60.47336067902147, 62.455733693902516, 64.48969632890416]
- Average MSE: 0.007858691173335416
- Average MAE: 0.053011265403347406
- Average Kendall's Tau: 0.03339373658300352
- Average Spearman's Rank Correlation Coefficient: 0.0424231412389899
- Average nDCG based on true labels: 0.5803590013865478
- Average nDCG based on synthetic relevance labels: 0.5550813643685166


In [13]:
def get_metrics(name, dataset):
  results = [name]
  results.append(get_total_score(dataset) * 100)

  top_K = []

  for k in range(1, 20):
    function = create_check_overlap(k)
    top_K.append(get_total_score(dataset, scoring_function = function) * 100)

  results.append(top_K)
  results.append(get_total_score(dataset, scoring_function = MSE_score))
  results.append(get_total_score(dataset, scoring_function = MAE_score))
  results.append(get_total_score(dataset, scoring_function = kendall_tau))
  results.append(get_total_score(dataset, scoring_function = spearman_rank))
  results.append(get_total_score(dataset, scoring_function = ndcg_true_scores))
  results.append(get_total_score(dataset, scoring_function = ndcg_relevance_scores))

  return results

In [14]:
if use_LexRank == 0:
  model_name = 'SummaRunner-Reg'
  location = data_dir + model_name + '/' + dataset + '/'
  results_dump = []

  for unit in tqdm(os.listdir(location)):
    if os.path.isdir(location + unit) == False:
      continue

    with open(location + unit + '/' + 'batchpred.json', 'r+') as f:
      data = json.load(f)
      results = get_metrics(unit, data)
      results_dump.append(results)
      
  name = model_name + '_' + dataset
  if dataset_select == 1:
    name += ('_oracle_' + str(version))

  with open(name + '.json', 'w+') as w:
    json.dump(results_dump, w)

100%|██████████| 10/10 [02:51<00:00, 17.19s/it]


In [15]:
if use_LexRank == 1:
  model_name = 'LexRank'
  location = data_dir + model_name

  dump_name = location + '/' + dataset

  with open(dump_name + '.json', 'r+') as f:
    data = json.load(f)
    results = get_metrics(model_name, data)

  name = model_name + '_' + dataset
  if dataset_select == 1:
    name += ('_oracle_' + str(version))

  with open(name + '.json', 'w+') as w:
    json.dump(results, w)

In [16]:
if use_LexRank == 0:
  model_name = 'position'
  results = get_metrics(model_name, position_scores)

  name = model_name + '_' + dataset
  if dataset_select == 1:
    name += ('_oracle_' + str(version))

  with open(name + '.json', 'w+') as w:
    json.dump(results, w)

In [17]:
# ^_^ Thank You