<a href="https://colab.research.google.com/github/anon/ILCiteR/blob/main/create_maps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
from tqdm import tqdm
import nltk
import string
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

In [2]:
%%capture
# Download NLTK's 'punkt' package into the local nltk_data folder.
# NOTE(review): none of the cells visible in this notebook call nltk afterwards —
# confirm whether this download is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Root folder of the dataset. This is a relative path, so it is resolved against
# the notebook's working directory.
# NOTE(review): looks like a Google Drive mount point — confirm the drive is
# mounted before running the cells below.
location = "drive/My Drive/cite_reco_s2orc/full/"
# Sub-folder holding the input mappings (read by get_mappings_from_file).
mappings_loc = "mappings/"
# Sub-folder receiving the generated maps (written by dump_maps).
maps_loc = "maps/"
# Per-split sub-folders, indexed by split_type: 0 -> Database, 1 -> Eval.
split_locs = ["Database/", "Eval/"]

# Domains to process; each one names a <domain>.json file inside a split folder.
domains = ["ner", "sa", "summ", "mt"]

In [4]:
def get_mappings_from_file(domain, split_type):
  """Load the mappings of one domain/split from its JSON file.

  The file is read from
  ``location + mappings_loc + split_locs[split_type] + domain + '.json'``.

  Args:
    domain: Domain name used as the file stem (e.g. 'ner').
    split_type: Index into ``split_locs``; 0 for Database, 1 for Eval.

  Returns:
    The object decoded from the JSON file.

  Raises:
    IndexError: If ``split_type`` is not a valid index into ``split_locs``.
    OSError: If the file cannot be opened for reading.
    json.JSONDecodeError: If the file does not contain valid JSON.
  """
  in_path = location + mappings_loc + split_locs[split_type] + domain + '.json'

  # Read-only mode: the file is never written here, and 'r+' would needlessly
  # require write permission (failing on read-only files or mounts).
  with open(in_path, 'r', encoding='utf-8') as f:
    mappings = json.load(f)

  return mappings

def dump_maps(domain, split_type, context_to_cites_map):
  """Write a context-to-citations map to its JSON file.

  The file is written to
  ``location + maps_loc + split_locs[split_type] + domain + '.json'``.
  The destination folder is created if it does not exist yet, and an existing
  file at that path is overwritten.

  Args:
    domain: Domain name used as the file stem (e.g. 'ner').
    split_type: Index into ``split_locs``; 0 for Database, 1 for Eval.
    context_to_cites_map: JSON-serializable map to store.

  Returns:
    None.

  Raises:
    IndexError: If ``split_type`` is not a valid index into ``split_locs``.
    OSError: If the folder cannot be created or the file cannot be written.
    TypeError: If the map contains values that json cannot serialize.
  """
  import os  # local import so this block does not depend on the notebook's import cell

  out_path = location + maps_loc + split_locs[split_type] + domain + '.json'

  # Make sure the output folder exists; otherwise open() raises
  # FileNotFoundError on a fresh run.
  out_dir = os.path.dirname(out_path)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

  # Write-only mode is enough: the file is never read back here.
  with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(context_to_cites_map, f)

# Characters trimmed from the ends of a context: every character of
# string.punctuation followed by the space character.
punct_string = string.punctuation + ' '

def clean_context(text):
  """Trim the characters of ``punct_string`` from both ends of ``text``.

  Only leading and trailing characters are removed; the interior of the text
  is left untouched.

  Args:
    text: Raw context string.

  Returns:
    The trimmed string.
  """
  trimmed = text.strip(punct_string)
  return trimmed

In [5]:
# Format of each unit (a list) within mappings, by index:

# [0] ID of the paper from which the context was fetched
# [1] Raw context
# [2] REF ID of the reference
# [3] Reference paper

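# Format of the maps (the same structure is written for both the Database and Eval splits):

# Key: cleaned context
# Value: list of [paper_dict, support] pairs, where support is the number of times that paper was cited with the context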

In [6]:
# Create map from cleaned contexts to cited papers with support counts

def mappings_to_map(mappings):
  """Group citation units by cleaned context and count each cited paper.

  For every unit, ``unit[1]`` is cleaned with ``clean_context`` and used as
  the key, and ``unit[3]`` is taken as the cited paper. Each key maps to a
  list of ``[cited_paper, support]`` pairs, where ``support`` is the number of
  units that paired that paper with that cleaned context. Pairs appear in
  order of first occurrence.

  Args:
    mappings: Iterable of indexable units (see above for the fields used).

  Returns:
    A plain dict from cleaned context to its list of
    ``[cited_paper, support]`` pairs.
  """
  grouped = defaultdict(list)

  for unit in tqdm(mappings):
    key = clean_context(unit[1])
    paper = unit[3]
    entries = grouped[key]

    for entry in entries:
      if entry[0] == paper:
        # Paper already recorded for this context: bump its support.
        entry[1] += 1
        break
    else:
      # No existing entry matched (loop ended without break): first sighting.
      entries.append([paper, 1])

  return dict(grouped)

In [7]:
# Testing map creation
# Smoke test on a single case: domain 'ner' with split_type 1 (the 'Eval/' entry
# of split_locs). Loads the mappings, builds the context -> cited-papers map and
# writes it to disk.
mappings = get_mappings_from_file('ner', 1)
context_to_cites_map = mappings_to_map(mappings)
# NOTE: the loop over all domains and splits further down writes this same file again.
dump_maps('ner', 1, context_to_cites_map)

100%|██████████| 5868/5868 [00:00<00:00, 92284.12it/s]


In [8]:
# Build and save the map for every (domain, split) pair, printing how many
# distinct cleaned contexts (map keys) each one contains.
for domain in domains:
  for split_type in (0, 1):
    print('Domain: {}, Split: {}'.format(domain, split_type), flush = True)

    mappings = get_mappings_from_file(domain, split_type)
    context_to_cites_map = mappings_to_map(mappings)
    dump_maps(domain, split_type, context_to_cites_map)

    # len() of a dict is its number of keys.
    print('Number of Mappings: ' + str(len(context_to_cites_map)))
    print('', flush = True)

Domain: ner, Split: 0


100%|██████████| 33643/33643 [00:00<00:00, 40619.14it/s]


Number of Mappings: 23803

Domain: ner, Split: 1


100%|██████████| 5868/5868 [00:00<00:00, 209146.71it/s]


Number of Mappings: 4221

Domain: sa, Split: 0


100%|██████████| 83354/83354 [00:00<00:00, 100182.65it/s]


Number of Mappings: 56674

Domain: sa, Split: 1


100%|██████████| 5502/5502 [00:00<00:00, 181161.38it/s]


Number of Mappings: 3956

Domain: summ, Split: 0


100%|██████████| 117246/117246 [00:00<00:00, 220004.15it/s]


Number of Mappings: 79345

Domain: summ, Split: 1


100%|██████████| 8404/8404 [00:00<00:00, 115447.25it/s]


Number of Mappings: 5811

Domain: mt, Split: 0


100%|██████████| 161698/161698 [00:02<00:00, 78284.41it/s]


Number of Mappings: 108692

Domain: mt, Split: 1


100%|██████████| 8648/8648 [00:00<00:00, 266883.04it/s]


Number of Mappings: 6139



In [9]:
# That's it