<a href="https://colab.research.google.com/github/sayarghoshroy/proc_s2orc/blob/main/create_rel_work.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
import nltk
from nltk.corpus import stopwords
import numpy as np

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models (used by nltk.word_tokenize) and the English stopword list.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Base directory that both file names below are appended to
# (presumably a mounted Google Drive folder -- confirm for your setup).
path = 'drive/My Drive/s2orc/'

# Input JSONL to scan and output JSONL to append results to.
read_file, rel_work_file = 'pdf_parses_0.jsonl', 'out/output_relw_1_0.jsonl'

# Upper bound on the number of input lines read in one run.
MAX_PROC = 10 ** 6

In [None]:
# Read the input JSONL of parsed papers and append one JSON line per paper that has a 'Related Work' section
import re
import ast
import json
import tqdm

# English stopwords held in a set so they can be subtracted from the
# token overlap computed in get_rel_work.
stops = set(stopwords.words('english'))

def normalize(name):
  """Return a canonical form of a section heading for comparison.

  The heading is lower-cased, stripped of leading/trailing whitespace, and
  every internal run of whitespace (spaces, tabs, newlines) is collapsed to
  a single space, so that e.g. 'Related   Work' compares equal to
  'related work'.

  Args:
    name: the raw section heading (a string).

  Returns:
    The normalized heading string.
  """
  # re.sub returns a new string rather than modifying its argument, so the
  # result has to be returned; a raw string keeps '\s' a regex escape.
  return re.sub(r'\s+', ' ', name.lower().strip())

def check_rel_work(body_text):
  """Locate the first 'related work' entry in a paper's body.

  Args:
    body_text: iterable of dicts, each with a 'section' key holding the
      heading that entry belongs to.

  Returns:
    (True, entry) for the first entry whose normalized heading equals
    'related work', or (False, None) when there is no such entry.
  """
  first_match = next(
      (entry for entry in body_text
       if normalize(entry['section']) == 'related work'),
      None)
  if first_match is None:
    return False, None
  return True, first_match

def get_following_sections(body_text):
  """Collect the headings of every entry after the first 'related work' one.

  Args:
    body_text: iterable of dicts, each with a 'section' key.

  Returns:
    A list with the raw 'section' value of each entry that comes after the
    first entry whose normalized heading is 'related work' (later entries
    with that same heading are included). Empty when no entry matches.
  """
  remaining = iter(body_text)

  # Consume entries up to and including the first 'related work' one.
  for entry in remaining:
    if normalize(entry['section']) == 'related work':
      break

  # Whatever the iterator still holds comes after the match; if nothing
  # matched, the iterator is already exhausted and the result is empty.
  return [entry['section'] for entry in remaining]

def score_sections(section_id, count_sections, common_tokens, count_tokens):
  """Score a candidate section as position term plus token-overlap term.

  Args:
    section_id: 1-based position of the candidate among the sections.
    count_sections: total number of candidate sections.
    common_tokens: number of tokens the candidate heading shares with the
      related-work text.
    count_tokens: number of tokens in the candidate heading.

  Returns:
    (count_sections - section_id) / count_sections
    + common_tokens / count_tokens, as a float. A term whose denominator
    is zero contributes 0.0 instead of raising ZeroDivisionError.
  """
  # Earlier sections get a larger position term.
  position_term = ((count_sections - section_id) / count_sections
                   if count_sections else 0.0)
  # A heading that tokenizes to nothing (e.g. an empty title) has no overlap.
  overlap_term = common_tokens / count_tokens if count_tokens else 0.0
  return position_term + overlap_term

def get_rel_work(candidate_sections, blob):
  """Rank later section headings by how likely they belong to related work.

  Each distinct heading (lower-cased and stripped) other than 'related work'
  is scored with score_sections from its 1-based position in
  `candidate_sections` and the number of non-stopword tokens it shares with
  the text of `blob`.

  Args:
    candidate_sections: list of section heading strings.
    blob: dict whose 'text' value is the related-work text.

  Returns:
    A list of [heading, score, position, overlap] lists, sorted by score in
    descending order.
  """
  blob_vocab = set(nltk.word_tokenize(blob['text'].lower()))
  total = len(candidate_sections)
  seen_titles = set()
  scored = []

  for position, title in enumerate(candidate_sections, start = 1):
    key = title.lower().strip()

    # Skip the related-work heading itself and headings already scored.
    if key == 'related work' or key in seen_titles:
      continue
    seen_titles.add(key)

    title_tokens = nltk.word_tokenize(title.lower())
    overlap = len(blob_vocab.intersection(title_tokens).difference(stops))
    score = score_sections(position, total, overlap, len(title_tokens))

    # Format: text, score, index, overlap
    scored.append([key, score, position, overlap])

  scored.sort(key = lambda entry: entry[1], reverse = True)
  return scored

# Running totals for the summary printed after the pass over the input.
valid_lines = 0                  # papers that have a 'related work' section
possible_rel_works_count = []    # number of ranked candidates per such paper
processed = 0                    # input lines actually read

# The input is only read, so open it read-only ('r+' would also demand write
# access). The output handle is opened once for the whole run rather than
# once per record; append mode is kept, so re-running adds to existing output.
# NOTE(review): UTF-8 is assumed for the input JSONL -- confirm.
with open(path + read_file, 'r', encoding = 'utf-8') as f, \
     open(path + rel_work_file, 'a', encoding = 'utf-8') as o:
  for counter in tqdm.tqdm(range(MAX_PROC)):
    line = f.readline()
    if not line:
      # End of file reached before MAX_PROC lines were read.
      break
    processed += 1

    paper_dict = json.loads(line)
    if not paper_dict['body_text']:
      continue

    check, blob = check_rel_work(paper_dict['body_text'])
    if not check:
      continue
    valid_lines += 1

    candidate_sections = get_following_sections(paper_dict['body_text'])
    possible_rel_works = get_rel_work(candidate_sections, blob)
    possible_rel_works_count.append(len(possible_rel_works))

    unit = {
        'paper_id': paper_dict['paper_id'],
        'related_work': blob['text'],
        'possible_rel_works': possible_rel_works,
    }
    # `unit` holds only strings, numbers and lists, so it serializes directly.
    o.write(json.dumps(unit) + '\n')

# np.mean of an empty list warns and yields nan; produce nan without the warning.
if possible_rel_works_count:
  average_rel_works = np.mean(possible_rel_works_count)
else:
  average_rel_works = float('nan')

# `processed` counts lines read, which stays correct even when the loop runs
# to MAX_PROC without hitting end of file.
print('\n\n' + 'Number of entries processed: ' + str(processed))
print('Number of entries with the Related Works sections: ' + str(valid_lines))
print('Average number of Related Work subsections: ' + str(average_rel_works))

 31%|███       | 310736/1000000 [02:24<05:20, 2151.79it/s]



Number of entries processed: 310736
Number of entries with the Related Works sections: 1895
Average number of Related Work subsections: 11.130343007915567





In [None]:
# That's it