# Compute NYT labels

In [1]:
import pandas as pd
import json
import os
from rouge_score import rouge_scorer
from nltk.tokenize import LineTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import concurrent.futures

In [2]:
# Shared ROUGE scorer (ROUGE-1, ROUGE-2 and ROUGE-L, stemming enabled);
# compute_labels reads it as a module-level global.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Regex matching a "<...>" tag-like span; compute_labels uses it to strip
# HTML tags from documents and summaries before scoring.
re_html = re.compile(r'<[^>]+>')

In [3]:
# Compute labels
def compute_labels(doc, summary, is_sep_n = False):
  """Greedily label each sentence of ``doc`` as selected (1) or not (0).

  Tag-like ``<...>`` spans are stripped from both texts, ``doc`` is split
  into sentences, and the sentences are visited in order. A sentence gets
  label 1 when appending it to the sentences selected so far raises at
  least one of the ROUGE-1, ROUGE-2 or ROUGE-L F-measures against
  ``summary``; otherwise it gets label 0 and is left out of the selection.

  Args:
    doc: Document text; may contain HTML tags.
    summary: Reference summary text; may contain HTML tags.
    is_sep_n: If true, split ``doc`` with NLTK's ``LineTokenizer``;
      otherwise split it with ``sent_tokenize``.

  Returns:
    A list of 0/1 ints, one per sentence produced by the split. The list
    is empty when the split yields no sentences.
  """
  # Strip markup first so tags take no part in splitting or scoring.
  doc = re_html.sub('', doc)
  summary = re_html.sub('', summary)

  if is_sep_n:
    sentences = LineTokenizer().tokenize(doc)
  else:
    sentences = sent_tokenize(doc)

  labels = []
  if not sentences:
    return labels

  metrics = ("rouge1", "rouge2", "rougeL")
  selected = ""
  # Baseline: score of the empty selection against the summary.
  best = scorer.score(selected, summary)
  for sentence in sentences:
    # Each candidate is the current selection plus this sentence and a ".".
    candidate = selected + sentence + "."
    candidate_score = scorer.score(candidate, summary)
    # Keep the sentence if ANY of the three F-measures strictly improves.
    if any(candidate_score[m].fmeasure > best[m].fmeasure for m in metrics):
      best = candidate_score
      selected = candidate
      labels.append(1)
    else:
      labels.append(0)

  return labels

In [4]:
# compute labels for dataframe
def compute_labels_df(df, is_sep_n = False):
  """Compute sentence labels for every row of ``df``.

  Args:
    df: DataFrame with a "docs" column and a "summaries" column.
    is_sep_n: Forwarded to ``compute_labels`` to choose how each document
      is split. Defaults to False, which is what this function always
      used before the parameter existed.

  Returns:
    A list with one label list per row, in row order.
  """
  # Walk the two columns positionally instead of looking each row up by
  # index label: a label lookup returns a Series (not a string) when the
  # index contains duplicate labels.
  return [
    compute_labels(doc, summary, is_sep_n = is_sep_n)
    for doc, summary in zip(df["docs"], df["summaries"])
  ]

In [5]:
# Load the dataset from JSON into a DataFrame (compute_labels_df reads its
# "docs" and "summaries" columns).
df = pd.read_json("./data/nyt_corpus_LDC2008T19_50.json")

In [6]:
# Compute one label list per row and store them in the "labels" column.
lbls = compute_labels_df(df)
df["labels"] = lbls

In [7]:
# Write the labelled DataFrame back to the same path the dataset was
# loaded from, overwriting the input file.
df.to_json("./data/nyt_corpus_LDC2008T19_50.json")

In [8]:
# Reload the file that was just written.
# NOTE(review): presumably a check that the saved JSON round-trips — confirm.
df = pd.read_json("./data/nyt_corpus_LDC2008T19_50.json")