<a href="https://colab.research.google.com/github/userName/Salience-Prediction/blob/main/InfoPop_data_build.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Builds and saves the InfoPop dataset with Train, Validation and Test splits

In [2]:
import json
import math
import os
import random

import nltk
import tqdm

In [3]:
# Getting Resources
# nltk.word_tokenize relies on the Punkt sentence tokenizer models.
# NLTK >= 3.8.2 loads them from the 'punkt_tab' package instead of the pickled
# 'punkt' package, so both are requested: older NLTK releases only report that
# 'punkt_tab' is unknown and carry on, newer ones need it to tokenize at all.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
file_name = 'drive/My Drive/summworth/cleaned_document.json'

# Loading Created Pre-processed data
# The file is only read, so it is opened read-only ('r+' would also demand write
# permission). The encoding is pinned so decoding does not depend on the
# machine's locale.
with open(file_name, 'r', encoding='utf-8') as f:
  data = json.load(f)

In [None]:
# Build the final data with explicit limits
# final_set collects the documents that pass every filter of the loop below
final_set = []
# Sentences with more tokens than this are skipped
sentence_token_count_limit = 40
# Documents keeping more sentences than this are counted as too large and dropped
total_sentence_count_upper = 100
# Documents keeping fewer sentences than this are counted as too small and dropped
total_sentence_count_lower = 2

# Stores per-document score statistics for every document added to final_set
# Format: [Total Number of Sentences, Number of Sentences with a Non-Zero Score, {'id': ..., 'url': ...}]
scores_stats = []

# Counters of discarded documents, reported once the loop below has finished
too_small = 0
too_large = 0
not_a_number_count = 0
not_important = 0

def count_tokens(sentence):
  """Return how many tokens nltk.word_tokenize produces for `sentence`."""
  return len(nltk.word_tokenize(sentence))

def get_score_stats(document):
  """Count the sentences of a document and how many of them have a non-zero score.

  Args:
    document: mapping whose 'sent_labels' entry is a sequence of items holding
      the score at index 1 (any value accepted by float()).

  Returns:
    A list [number of sentences, number of sentences whose score is not 0.0].
  """
  labels = document['sent_labels']
  # A score counts as soon as it differs from 0.0
  non_zero = sum(1 for label in labels if float(label[1]) != 0.0)
  return [len(labels), non_zero]

def normalize_scores(document):
  """Return a copy of `document` whose sentence scores are divided by their sum.

  Args:
    document: mapping whose 'sent_labels' entry is a sequence of items holding
      the sentence at index 0 and its score (any value accepted by float())
      at index 1.

  Returns:
    A new dict with the same keys as `document`, where 'sent_labels' is a list
    of [sentence, normalized score] pairs. Any element after the score is
    dropped. The input document is left unmodified.
  """
  labels = document['sent_labels']
  sum_of_scores = 0
  for sentence in labels:
    sum_of_scores += float(sentence[1])

  normalized_labels = []
  for sentence in labels:
    score = float(sentence[1])
    # A zero total cannot be used as a divisor; keep the raw score in that case
    # instead of failing with ZeroDivisionError.
    new_score = score / sum_of_scores if sum_of_scores != 0 else score
    normalized_labels.append([sentence[0], new_score])

  # Shallow copy: the caller's dict must not have its 'sent_labels' replaced
  normalized_doc = dict(document)
  normalized_doc['sent_labels'] = normalized_labels
  return normalized_doc

# Scores above this value are logged as suspiciously large (the sentence is still kept)
value_threshold = 5
too_large_sentences = 0
too_large_cases = []

# Filter every document sentence by sentence, then keep or discard the document.
for key, value in tqdm.tqdm(data.items()):
  not_a_number_flag = 0
  too_large_value_flag = 0

  units = value['sent_labels']
  # Work on a shallow copy so the loaded `data` is never modified: the cell can
  # then be re-run without reloading the JSON file and gives the same result.
  processed_units = dict(value)
  processed_units['sent_labels'] = []
  count = 0
  for unit in units:
    # Keep only 3-element units with a sentence of at least 2 characters, a flag
    # equal to 1 at index 2 (presumably the grammaticality label - TODO confirm)
    # and no more tokens than the configured limit.
    if len(unit) != 3 or len(unit[0]) < 2 or unit[2] != 1 or count_tokens(unit[0]) > sentence_token_count_limit:
      continue

    score = float(unit[1])

    # Verifying number of sentences with too large scores
    if score > value_threshold:
      too_large_value_flag = 1
      too_large_sentences += 1
      too_large_cases.append([value['id'], unit[0], score])

    # A single NaN score disqualifies the whole document
    if math.isnan(score):
      not_a_number_flag = 1
      break
    count += 1
    processed_units['sent_labels'].append(unit)

  if not_a_number_flag == 1:
    # `count` stops at the NaN sentence, so it says nothing about the real size
    # of the document: report it as a NaN document only and skip the size checks.
    not_a_number_count += 1
    continue
  if count > total_sentence_count_upper:
    too_large += 1
    continue
  if count < total_sentence_count_lower:
    too_small += 1
    continue

  importance_score_stats = get_score_stats(processed_units)
  # Keep the document only when more than two sentences have a non-zero score
  if importance_score_stats[1] > 2:
    final_set.append(normalize_scores(processed_units))
    scores_stats.append(importance_score_stats + [{'id': processed_units['id'], 'url': processed_units['url']}])
  else:
    not_important += 1

# Summary of the documents discarded by the loop above; the two leading empty
# entries produce the two blank lines that precede the report.
pruning_report = [
  '',
  '',
  '• Documents which were:',
  '  - Too Small: ' + str(too_small),
  '  - Too Large: ' + str(too_large),
  '• Documents containing NaN as an importance labels: ' + str(not_a_number_count),
  '',
  'After pruning the above cases:',
  '• Documents containing less than three popular sentences: ' + str(not_important),
]
print('\n'.join(pruning_report))

In [6]:
# Preview the first three entries of the score statistics
print('Viewing the Scores Statistics List:')
scores_stats[:3]

Viewing the Scores Statistics List:


[[72,
  12,
  {'id': 2, 'url': 'http://cnn.com/2009/opinion/10/26/opinion.jonathan.foer'}],
 [95,
  4,
  {'id': 3,
   'url': 'http://cnn.com/2013/02/19/world/africa/south-africa-pistorius-case/index.html'}],
 [10,
  3,
  {'id': 5,
   'url': 'http://cs.thomsonreuters.com/ua/acct_pr/acs/cs_us_en/common/com_proc/workflows-overview.htm'}]]

In [7]:
# Splitting the dataset into Train, Validation and Test Splits

# Shuffle the positions of final_set with a fixed seed
indices = list(range(len(final_set)))
random.seed(42)
random.shuffle(indices)

# 8/10 of the documents go to train, 1/10 to validation, the remainder to test
train_size = int((8 / 10) * len(final_set))
val_size = int((1 / 10) * len(final_set))
test_size = len(final_set) - (train_size + val_size)

for size_label, size in (
  ('Total Number of Datapoints: ', len(final_set)),
  ('Train Size: ', train_size),
  ('Val Size: ', val_size),
  ('Test Size: ', test_size),
):
  print(size_label + str(size))

# Consecutive slices of the shuffled positions define the three splits
val_end = train_size + val_size
train_order = indices[:train_size]
val_order = indices[train_size:val_end]
test_order = indices[val_end:]

train_indices = set(train_order)
val_indices = set(val_order)
test_indices = set(test_order)

# Documents are collected in shuffled order within each split
train = [final_set[position] for position in train_order]
val = [final_set[position] for position in val_order]
test = [final_set[position] for position in test_order]

# Splits created

Total Number of Datapoints: 51770
Train Size: 41416
Val Size: 5177
Test Size: 5177


In [8]:
# Uncomment to View Examples
# train[1]['sent_labels']

In [9]:
# Removing redundant labels for Grammaticality

def drop_label(dump):
  """Rebuild every document keeping only 'id', 'url' and trimmed sentence labels.

  Args:
    dump: iterable of documents, each a mapping with the keys 'id', 'url' and
      'sent_labels' (a sequence of sliceable items).

  Returns:
    A new list of dicts with the keys 'id', 'url' and 'sent_labels', where each
    sentence label is cut down to its first two elements. The input documents
    are not modified.
  """
  return [
    {
      'id': document['id'],
      'url': document['url'],
      # Only the first two elements of every label are carried over
      'sent_labels': [sentence[0: 2] for sentence in document['sent_labels']],
    }
    for document in dump
  ]

# Apply the same clean-up to each of the three splits
train, val, test = [drop_label(split) for split in (train, val, test)]

In [10]:
# Saving Processed Files
location = 'drive/My Drive/store/InfoPop/'

# Create the output folder up front so that a missing directory does not make
# the notebook fail only after all the processing has been done.
os.makedirs(location, exist_ok=True)

# Output file name -> object serialized into it
outputs = {
  'stats.json': scores_stats,
  'train.json': train,
  'val.json': val,
  'test.json': test,
}

for output_name, payload in outputs.items():
  # The files are only written, never read back, so plain 'w' mode is enough
  with open(location + output_name, 'w', encoding='utf-8') as f:
    json.dump(payload, f)

In [11]:
# ^_^ Thank You