<a href="https://colab.research.google.com/github/anon/ILCiteR/blob/main/split_domains.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import ast
import json
import os
import pickle
from collections import defaultdict

import tqdm

In [2]:
# Relative path of the folder that holds the input files read by this notebook.
location = 'drive/My Drive/cite_reco_s2orc/full/'

# Domain keys; each one is combined with `location` to build file names.
domains = [
  'mt',
  'ner',
  'sa',
  'summ',
]

# Per-domain accumulators; a key that is missing starts out as an empty list.
domain_to_paper_ids, domain_to_papers = defaultdict(list), defaultdict(list)

In [3]:
# Create domain-wise paper IDs

for domain in domains:
  # Build a fresh list and assign it once the file has been read, so that
  # re-running this cell neither duplicates IDs nor fails with
  # AttributeError when the entry has already been replaced by a set.
  paper_ids = []
  with open(location + domain + '.jsonl') as f:
    for line in f:
      # Each line is parsed as a Python literal and must expose 'paper_id'.
      meta = ast.literal_eval(line)
      paper_ids.append(meta['paper_id'])
  domain_to_paper_ids[domain] = paper_ids

In [4]:
# Report how many paper IDs were read per domain, then keep them as sets

print('Number of fetched paper IDs:')

for domain in domains:
  fetched_ids = domain_to_paper_ids[domain]
  print(domain, len(fetched_ids))
  # Replace the stored list with a set holding the same IDs.
  domain_to_paper_ids[domain] = set(fetched_ids)

Number of fetched paper IDs:
mt 9522
ner 2260
sa 5798
summ 8445


In [5]:
# Create domain-wise paper parse data

def generator():
  """Scan parses.jsonl and file each paper under every domain listing its ID.

  Fills the module-level domain_to_papers as a side effect. Yields None once
  per line read, so the caller can wrap the generator in tqdm to get a
  running line counter.
  """
  # Start every scan from empty lists; otherwise re-running this cell would
  # append each matching paper a second time.
  for domain in domains:
    domain_to_papers[domain] = []
  with open(location + 'parses.jsonl') as f:
    for line in f:
      # Each line is parsed as a Python literal and must expose 'paper_id'.
      paper = ast.literal_eval(line)
      paper_id = paper['paper_id']
      for domain in domains:
        if paper_id in domain_to_paper_ids[domain]:
          domain_to_papers[domain].append(paper)
      yield

# Drain the generator purely for its side effects and the progress display.
for _ in tqdm.tqdm(generator()):
  pass

25366it [02:12, 191.46it/s]


In [6]:
# View number of fetched paper parses for each domain and save them as JSON

print('Number of fetched paper parses:')

# Create the output folder if needed; open() does not create missing
# directories and would raise FileNotFoundError.
output_dir = location + 'domain_parses/'
os.makedirs(output_dir, exist_ok=True)

for domain in domains:
  papers = domain_to_papers[domain]
  print(domain, len(papers))
  # Plain write mode suffices: the file is written here and not read back.
  with open(output_dir + domain + '.json', 'w') as f:
    json.dump(papers, f)

Number of fetched paper parses:
mt 9522
ner 2260
sa 5798
summ 8445


In [7]:
# That's it: each domain's paper parses are now saved under domain_parses/ as <domain>.json