In [1]:
import json
import os
import random
import tqdm
from tqdm import tqdm

import numpy as np
import pandas as pd

## 1. Removing papers that are not "useful"

**Aim.** To reduce the size of the S2ORC dataset by removing datapoints that are not "useful". Examples: 
- papers that do not cite any papers / are not cited by any papers 
- papers lacking a Results / Discussion section 

In [3]:
def load_data(filepath):
    d = {}
    with open(filepath) as f:
        for i, line in enumerate(f):
            d[i] = json.loads(line)
    return d

d = load_data("/home/jessica/data/s2orc/s2orc_shard00.jsonl")
s2orc = pd.DataFrame.from_dict(d).T
display(s2orc.head())

Unnamed: 0,paper_id,body_text
0,25054841,"[{'section': 'Introduction', 'text': 'The bene..."
1,25055864,"[{'section': '', 'text': 'Faculty in academic ..."
2,25056447,"[{'section': 'INTRODUCTION', 'text': 'A Biopha..."
3,25056469,"[{'section': 'C', 'text': 'hlorophyll is the m..."
4,25056663,"[{'section': '', 'text': 'Secretins in the out..."


In [28]:
def check_sections(body_text):
    has_results = 0
    has_discuss = 0
    for p in body_text:
        section_name = p["section"].lower()
        i = int("result" in section_name or "find" in section_name)
        j = int("discuss" in section_name or "conclu" in section_name)
        
        if i and j: continue # skip combined Results and Discussion sections
        
        has_results += i
        has_discuss += j
        
    return(has_results != 0 and has_discuss != 0)

s2orc["has_sections"] = s2orc.body_text.apply(check_sections)

In [37]:
lines_to_drop = []
for j, line in s2orc.iterrows(): 
    if not (line.has_outbound_citations and line.has_inbound_citations and line.has_sections):
        lines_to_drop.append(j)
    
print(len(lines_to_drop))
print(lines_to_drop)

63765
[0, 1, 6, 8, 9, 10, 11, 13, 14, 16, 18, 19, 20, 21, 22, 25, 27, 28, 29, 30, 33, 34, 35, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 80, 81, 86, 87, 89, 91, 92, 95, 98, 99, 104, 107, 108, 110, 112, 113, 120, 125, 126, 131, 132, 133, 136, 140, 141, 143, 147, 148, 149, 150, 153, 155, 160, 161, 162, 163, 164, 165, 166, 168, 170, 171, 172, 173, 174, 175, 177, 178, 179, 180, 183, 184, 186, 187, 189, 193, 198, 199, 202, 205, 206, 210, 211, 212, 213, 214, 215, 216, 217, 229, 230, 231, 233, 234, 235, 236, 237, 239, 244, 246, 248, 249, 251, 252, 253, 254, 255, 257, 258, 259, 261, 262, 263, 264, 265, 267, 268, 269, 270, 272, 273, 275, 276, 277, 279, 281, 282, 283, 284, 285, 287, 289, 290, 291, 292, 294, 295, 298, 299, 301, 302, 304, 305, 306, 307, 309, 310, 311, 316, 319, 321, 322, 323, 324, 326, 328, 329, 331, 332, 333, 334, 335, 336, 337, 338, 344, 348, 349, 351, 352, 353, 354, 356, 357, 35

In [39]:
usefulness = s2orc.has_outbound_citations + s2orc.has_inbound_citations + s2orc.has_sections
usefulness.value_counts()

2    45974
3    36235
1    17791
dtype: int64

In [48]:
trimmed = s2orc[usefulness == 3].loc[:, ["paper_id", "body_text"]]
trimmed

Unnamed: 0,paper_id,body_text
2,18980463,"[{'section': '', 'text': 'In spite of the morp..."
3,18981111,"[{'section': '', 'text': 'The ability to explo..."
4,18981358,"[{'section': 'INTRODUCTION', 'text': 'Spinal c..."
5,18982114,"[{'section': '', 'text': 'The population in th..."
7,18982460,"[{'section': 'Introduction', 'text': 'Benign p..."
...,...,...
99992,11683397,"[{'section': 'INTRODUCTION', 'text': 'Signals ..."
99994,11683525,"[{'section': 'INTRODUCTION', 'text': 'Imaging ..."
99996,11684430,"[{'section': 'Background', 'text': 'Cholera is..."
99997,11685058,"[{'section': 'Introduction', 'text': 'Polycyst..."


In [50]:
len(s2orc) - len(trimmed)

63765

100%|██████████| 85/85 [09:00<00:00,  6.35s/it]


In [4]:
d = load_data("../misc/intent_probabilities_xdg.jsonl")
probs = pd.DataFrame.from_dict(d).T

In [6]:
probs.intent.value_counts()

background    51111
result        11566
method         2323
Name: intent, dtype: int64