# Text and Tables Extraction on ICLR Data

This notebook presents how to use our pipeline to extract text and tables from arXiv papers with available LaTeX source code.

In [1]:
import pickle
from collections import defaultdict
import re
import glob

In [2]:
from pathlib import Path
from axcell.helpers.paper_extractor import PaperExtractor
from axcell.data.paper_collection import PaperCollection
from axcell.models.structure import TableType, TableStructurePredictor, TableTypePredictor
# from axcell.helpers.results_extractor import ResultsExtractor

### Structure of Directories

We cache the artifacts produced by successful execution of the intermediate steps of the extraction pipeline. The `root` argument of `PaperExtractor` is a path under which the following directory structure is created:

```
root
├── sources                       # e-print archives
├── unpacked_sources              # extracted latex sources (generated automatically)
├── htmls                         # converted html files (generated automatically)
└── papers                        # extracted text and tables (generated automatically)
```

In [3]:
pwd

'/home/singh_shruti/workspace/axcell_ws/axcell/notebooks'

### Load the iclr_arxiv map.

We parse papers in the reverse manner. For ICLR papers for which an arXiv ID is present, we extract the tables and then extract the bibitem references from the leaderboard table CSV files.

In [4]:
# Pickled ICLR -> arXiv mapping. Later cells iterate over it as a dict of
# per-paper records and read the "found" and "arxivId" entries of each record.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# a mapping file that comes from a trusted source.
ICLR_ARXIV_MAP_FILE = "/home/singh_shruti/workspace/PaperAcceptancePrediction/shruti/features/iclr_arxiv_map.pkl"

with open(ICLR_ARXIV_MAP_FILE, "rb") as map_file:
    iclr_arxiv_map = pickle.load(map_file)

In [3]:
# ResultsExtractor??

### Run extraction one by one

In [6]:
# Root directory handed to PaperExtractor; relative to the notebook's working directory.
ROOT_PATH = Path('data')
# Directory passed to TableTypePredictor when loading the table-type classifier.
MODELS_PATH = Path('models')

# e-print archives that are fed to the extractor.
SOURCES_PATH = ROOT_PATH / 'sources'
# Extracted text and tables, read back through PaperCollection.
PAPERS_PATH = ROOT_PATH / 'papers'

In [7]:
extract = PaperExtractor(ROOT_PATH)

In [8]:
# MODELS_PATH is already a Path, so Path(...) here only re-wraps it.
models_path = Path(MODELS_PATH)
# Table-type classifier used below to tag tables that have no gold tag yet.
# presumably the .pth file name is resolved relative to models_path — TODO confirm
ttp = TableTypePredictor(models_path, "table-type-classifier.pth")

[PID 35299] Load model table-type-classifier.pth


In [9]:
# Maps each predicted TableType member to the tag string that the extraction
# loop stores in table.gold_tags and later compares against.
table_labels = {TableType.SOTA: 'leaderboard', TableType.ABLATION: 'ablation', TableType.IRRELEVANT: 'irrelevant'}

In [None]:
# Number of successfully processed papers between two checkpoint dumps.
CHECKPOINT_EVERY = 20

# Reference markers embedded in the extracted table CSVs,
# e.g. <ref id='bib-bib36'>36</ref>.
REF_PATTERN = re.compile("<ref id=[0-9a-zA-Z'-]*>[0-9]*</ref>")

status_count = 0          # papers processed without an exception
err_count = 0             # papers whose processing raised an exception
unextracted_papers = []   # (iclr key, arxiv id) pairs with no paper in the collection

# Per paper: {"tcount": [n_tables, n_leaderboard, n_ablation, n_irrelevant]}
leaderboard_table_refs = defaultdict(dict)
# Per paper: {"refs": {"ldb": [...], "abl": [...]}, "count": n_leaderboard_ref_matches}
leaderboard_refs = defaultdict(dict)


def _save_checkpoint():
    """Dump both result dictionaries to pickle files in the working directory."""
    with open("leaderboard_table_refs.pkl", "wb") as some_file:
        pickle.dump(leaderboard_table_refs, some_file)
    with open("leaderboard_refs.pkl", "wb") as some_file:
        pickle.dump(leaderboard_refs, some_file)


def _normalized_tag(table):
    """Return the table's gold tag lower-cased and stripped of surrounding whitespace."""
    return table.gold_tags.lower().strip()


def _collect_table_refs(paper_key, tables, table_dir, wanted_tag):
    """Collect every reference marker found in the CSVs of tables tagged ``wanted_tag``.

    Parameters
    ----------
    paper_key : key of the paper in ``iclr_arxiv_map`` (only used in messages).
    tables : the paper's tables; the table at position ``i`` (0-based) is looked
        up as ``table_{i+1:02d}.csv`` inside ``table_dir``.
    table_dir : ``pathlib.Path`` of the directory holding the paper's table CSVs.
    wanted_tag : normalized tag to select, e.g. ``"leaderboard"`` or ``"ablation"``.

    Returns
    -------
    list of str
        All matches of ``REF_PATTERN`` in file order, duplicates included.
    """
    found_refs = []
    for idx, table in enumerate(tables, start=1):
        # Same normalization as the counting step, so every counted table is scanned.
        if _normalized_tag(table) != wanted_tag:
            continue
        candidate_name = "table_" + "{:02d}".format(idx) + ".csv"
        candidate_path = table_dir / candidate_name
        if not candidate_path.is_file():
            print("For {} cannot find {} table file: {}".format(paper_key, wanted_tag, candidate_name))
            continue
        # The context manager closes the file even if reading fails.
        with open(candidate_path, "r") as table_file:
            for line in table_file:
                found_refs.extend(REF_PATTERN.findall(line))
    return found_refs


# Value of status_count at the last checkpoint; avoids re-dumping on every
# iteration while status_count is stuck (e.g. after a run of failures).
last_checkpoint_at = None

for k, v in iclr_arxiv_map.items():
    if not v["found"]:
        continue

    # Save intermediate results every CHECKPOINT_EVERY successfully processed papers.
    if status_count % CHECKPOINT_EVERY == 0 and status_count != last_checkpoint_at:
        print("Status: ", status_count)
        _save_checkpoint()
        last_checkpoint_at = status_count

    try:
        # arXiv ids look like "1606.00704v3"; the part before the dot names the sub-directory.
        path_split = v["arxivId"].split(".")

        # Extraction
        extract(SOURCES_PATH / path_split[0] / v["arxivId"])

        # NOTE(review): this reloads the collection under PAPERS_PATH for every
        # paper, which presumably gets slower as more papers are extracted.
        pc = PaperCollection.from_files(PAPERS_PATH)
        paper = pc.get_by_id(v["arxivId"])

        if not paper:
            unextracted_papers.append((k, v["arxivId"]))
            continue

        # Add table types if absent (only the first table is inspected, as the
        # predictor tags all tables of a paper at once).
        if len(paper.tables) > 0 and paper.tables[0].gold_tags == "":
            tables_types = ttp.predict(paper, paper.tables)
            for table, table_type in zip(paper.tables, tables_types):
                table.gold_tags = table_labels[table_type]

        # Count the different types of tables.
        table_type_count = {"leaderboard": 0, "ablation": 0, "irrelevant": 0}
        for tt in paper.tables:
            tag = _normalized_tag(tt)
            if tag in table_type_count:
                table_type_count[tag] += 1
            else:
                print("Unknown table type: ", tt.gold_tags)

        n_leaderboard = table_type_count["leaderboard"]
        n_ablation = table_type_count["ablation"]

        # Save table stats: [total, leaderboard, ablation, irrelevant].
        leaderboard_table_refs[k]["tcount"] = [
            len(paper.tables), n_leaderboard, n_ablation, table_type_count["irrelevant"],
        ]

        # Directory holding this paper's table CSVs, derived from PAPERS_PATH so
        # it matches the location PaperCollection reads from.
        table_dir = PAPERS_PATH / path_split[0] / v["arxivId"]

        if n_leaderboard + n_ablation > 0:
            leaderboard_refs[k]["refs"] = {}

        # If sota/leaderboard tables are present, extract references from them.
        if n_leaderboard > 0:
            ldb_refs = _collect_table_refs(k, paper.tables, table_dir, "leaderboard")
            # "count" is the number of matches before de-duplication.
            leaderboard_refs[k]["count"] = len(ldb_refs)
            # sorted() keeps the stored order reproducible across runs.
            leaderboard_refs[k]["refs"]["ldb"] = sorted(set(ldb_refs))

        if n_ablation > 0:
            abl_refs = _collect_table_refs(k, paper.tables, table_dir, "ablation")
            leaderboard_refs[k]["refs"]["abl"] = sorted(set(abl_refs))

        status_count += 1
    except Exception as ex:
        # Best effort: report the failing paper and carry on with the next one.
        print(k, v.get("arxivId"))
        print("Error: ", ex)
        err_count += 1

Status:  0


In [None]:
# Final dump of both result dictionaries, using the same file names as the
# periodic saves made inside the extraction loop.
for out_name, payload in (
    ("leaderboard_table_refs.pkl", leaderboard_table_refs),
    ("leaderboard_refs.pkl", leaderboard_refs),
):
    with open(out_name, "wb") as some_file:
        pickle.dump(payload, some_file)

In [30]:

# status_count = 0
# err_count = 0

# leaderboard_table_refs = defaultdict(dict)
# leaderboard_refs = defaultdict(dict)

# for k, v in iclr_arxiv_map.items():
#     if k == "2017_B1ElR4cgg":
# #     if True:
#         if v["found"]:
            
#             # Keep saving after 10 entries
#             if status_count % 20 == 0:
#                 print("Status: ", status_count)
#                 with open("leaderboard_table_refs.pkl", "wb") as some_file:
#                     pickle.dump(leaderboard_table_refs, some_file)
#                 with open("leaderboard_refs.pkl", "wb") as some_file:
#                     pickle.dump(leaderboard_refs, some_file)
                    
#             try:
# #                 print(k, v["arxivId"])
#                 path_split = v["arxivId"].split(".")
#                 print("What si happening; ",  v["arxivId"], path_split)

#                 # Extraction
#                 extract(SOURCES_PATH / path_split[0] / v["arxivId"])

#                 pc = PaperCollection.from_files(PAPERS_PATH)
#                 paper = pc.get_by_id(v["arxivId"])
                
#                 print("Paper is: ", paper)
#                 #leaderboard_table_refs[k]["tcount"] = len(paper.tables)
                
#                 if len(paper.tables) > 0:
                    
#                     # Add table type if absent
#                     if paper.tables[0].gold_tags == "":
#                         tables_types = ttp.predict(paper, paper.tables)
#                         for table, table_type in zip(paper.tables, tables_types):
#                             table.gold_tags = table_labels[table_type]
# #                         print("check is retained: ", paper.tables[0].gold_tags)
# #                         break
                    
#                     #Count the different types of tables
#                     table_type_count = [0, 0, 0]
#                     for tt in paper.tables:
#                         if tt.gold_tags.lower().strip() == "leaderboard":
#                             table_type_count[0] = table_type_count[0] + 1
#                         elif tt.gold_tags.lower().strip() == "ablation":
#                             table_type_count[1] = table_type_count[1] + 1
#                         elif tt.gold_tags.lower().strip() == "irrelevant":
#                             table_type_count[2] = table_type_count[2] + 1
#                         else:
#                             print("Unknown table type: ", tt.gold_tags)
                    
#                     # Save table stat in leaderboard_table_refs
#                     leaderboard_table_refs[k]["tcount"] = [len(paper.tables), table_type_count[0], table_type_count[1], table_type_count[2]]
#                 else:
#                     leaderboard_table_refs[k]["tcount"] = [0, 0, 0, 0]
                
#                 if (leaderboard_table_refs[k]["tcount"][1]+leaderboard_table_refs[k]["tcount"][2]) > 0:
#                     leaderboard_refs[k]["refs"] = {}
                
#                 # If sota/leaderboard tables present, extract references from them
#                 if leaderboard_table_refs[k]["tcount"][1] > 0:
#                     leaderboard_refs[k]["refs"]["ldb"] = []
#                     leaderboard_refs[k]["count"] = 0
#                     for iterid, table in enumerate(paper.tables):
#                         idx = iterid + 1
#                         if table.gold_tags == "leaderboard":
                            
#                             table_dir_path = "/home/singh_shruti/workspace/axcell_ws/axcell/notebooks/data/papers/" + path_split[0] + "/" + v["arxivId"]
#                             table_files = glob.glob(table_dir_path+"/*.csv")
#                             table_files_stripped = [tfs.rsplit("/", 1)[1] for tfs in table_files]
                            
#                             candidate_name = "table_" + "{:02d}".format(idx) + ".csv"
# #                             print("Check: ", candidate_name)
# #                             print(table_dir_path+"/*.csv")
# #                             print("check: ", table_files)
#                             if candidate_name in table_files_stripped:
#                                 cannot_use_f = open(table_dir_path + "/" + candidate_name, "r")
#                                 for line in cannot_use_f:
#                                     m = re.findall("<ref id=[0-9a-zA-Z'-]*>[0-9]*</ref>", line)
#                                     for iim in m:
#                                         leaderboard_refs[k]["refs"]["ldb"].append(iim)
#                                         leaderboard_refs[k]["count"] += 1
#                             else:
#                                 print("For {} cannot find leaderboard table file: {}".format(k, candidate_name))
                            
#                     leaderboard_refs[k]["refs"]["ldb"] = list(set(leaderboard_refs[k]["refs"]["ldb"]))
                    
#                 if leaderboard_table_refs[k]["tcount"][2] > 0:
#                     leaderboard_refs[k]["refs"]["abl"] = []
#                     for iterid, table in enumerate(paper.tables):
#                         idx = iterid + 1
#                         if table.gold_tags == "ablation":
                            
#                             table_dir_path = "/home/singh_shruti/workspace/axcell_ws/axcell/notebooks/data/papers/" + path_split[0] + "/" + v["arxivId"]
#                             table_files = glob.glob(table_dir_path+"/*.csv")
#                             table_files_stripped = [tfs.rsplit("/", 1)[1] for tfs in table_files]
                            
#                             candidate_name = "table_" + "{:02d}".format(idx) + ".csv"
#                             if candidate_name in table_files_stripped:
#                                 cannot_use_f = open(table_dir_path + "/" + candidate_name, "r")
#                                 for line in cannot_use_f:
#                                     m = re.findall("<ref id=[0-9a-zA-Z'-]*>[0-9]*</ref>", line)
#                                     for iim in m:
#                                         leaderboard_refs[k]["refs"]["abl"].append(iim)
#                             else:
#                                 print("For {} cannot find ablation table file: {}".format(k, candidate_name))
                            
#                     leaderboard_refs[k]["refs"]["abl"] = list(set(leaderboard_refs[k]["refs"]["abl"]))
                            
#                     #table_csv_path = "/home/singh_shruti/workspace/axcell_ws/axcell/axcell/notebooks/data/papers/" + path_split[0] + "/" + v["arxivId"] + "/" + 
                
#                 status_count += 1
#                 break
#             except Exception as ex:
#                 print(k, v["arxivId"])
#                 print("Error: ", ex)
#                 err_count += 1

Status:  0
What si happening;  1606.00704v3 ['1606', '00704v3']
Paper is:  None
2017_B1ElR4cgg 1606.00704v3
Error:  'NoneType' object has no attribute 'tables'


In [11]:
k, v["arxivId"]

('2020_ryxz8CVYDH', '1910.09464v2')

In [36]:
# Matches a file name that starts with an arXiv id: four digits, a dot, more
# digits, an optional "vN" version suffix, and optionally a dot followed by
# anything else. The id (including the version) is captured as the named
# group "arxiv_id".
arxiv_re = re.compile(r"^(?P<arxiv_id>\d{4}\.\d+(v\d+)?)(\..*)?$")
# Rebuild the source path for the paper currently held in v / path_split.
p = SOURCES_PATH / path_split[0] / v["arxivId"]
m = arxiv_re.match(p.name)
# NOTE(review): m is None when the file name does not match, which would make
# the next line raise AttributeError.
arxiv_id = m.group('arxiv_id')
print(arxiv_id)

1606.00704v3


In [37]:
m[0], m[1], m[2], m[3]

('1606.00704v3', '1606.00704v3', 'v3', None)

In [38]:
subpath = p.relative_to(ROOT_PATH / 'sources').parent / arxiv_id
print(subpath)

1606/1606.00704v3


In [39]:
p.relative_to(ROOT_PATH / 'sources')

PosixPath('1606/1606.00704v3')

In [40]:
unpack_path = ROOT_PATH / 'unpacked_sources' / subpath

In [43]:
from axcell.helpers import LatexConverter, Unpack
up = Unpack()
up(p, unpack_path)

In [44]:
html_path = ROOT_PATH / 'htmls' / subpath / 'index.html'

In [46]:
latex = LatexConverter()
html = latex.to_html(unpack_path)

LatexConversionError: LaTeXML was unable to convert source code of this paper

In [None]:
html_path.parent.mkdir(parents=True, exist_ok=True)
html_path.write_text(html, 'utf-8')

In [34]:
extract(SOURCES_PATH / path_split[0] / v["arxivId"])

'processing-error'

In [33]:
v["arxivId"] = "1606.00704v3"

In [48]:
leaderboard_table_refs

defaultdict(dict, {'2017_B1-Hhnslg': {'tcount': [6, 4, 2, 0]}})

In [49]:
leaderboard_refs

defaultdict(dict,
            {'2017_B1-Hhnslg': {'refs': ["<ref id='bib-bib20'>20</ref>",
               "<ref id='bib-bib36'>36</ref>"],
              'count': 70}})

In [51]:
try:
    print(c)
except Exception as ex:
    print("Error: ", ex)

Error:  name 'c' is not defined


In [46]:
glob.glob("/home/singh_shruti/workspace/axcell_ws/axcell/*")

['/home/singh_shruti/workspace/axcell_ws/axcell/axcell/data',
 '/home/singh_shruti/workspace/axcell_ws/axcell/axcell/scripts',
 '/home/singh_shruti/workspace/axcell_ws/axcell/axcell/errors.py',
 '/home/singh_shruti/workspace/axcell_ws/axcell/axcell/pipeline_logger.py',
 '/home/singh_shruti/workspace/axcell_ws/axcell/axcell/__init__.py',
 '/home/singh_shruti/workspace/axcell_ws/axcell/axcell/helpers',
 '/home/singh_shruti/workspace/axcell_ws/axcell/axcell/config.py',
 '/home/singh_shruti/workspace/axcell_ws/axcell/axcell/mocks',
 '/home/singh_shruti/workspace/axcell_ws/axcell/axcell/loggers.py',
 '/home/singh_shruti/workspace/axcell_ws/axcell/axcell/models']

In [None]:
# Leftover note, kept as a comment because a bare path is not valid Python and
# would stop a full re-run of the notebook. Example glob pattern for the table
# CSVs of one extracted paper:
# /home/singh_shruti/workspace/axcell_ws/axcell/axcell/notebooks/data/papers/1703/1703.05175v2/*.csv

## Scratch

In [20]:
for x in tables_types:
    print(x.name)

SOTA
SOTA
SOTA
ABLATION
SOTA
ABLATION


In [10]:
len(paper.tables)

6

In [14]:
dir(tables_types[0])

['__class__', '__doc__', '__module__', 'name', 'value']

In [15]:
tables_types[0].name

'SOTA'

In [16]:
tables_types[0].value

0

In [17]:
tables_types[0].__class__, tables_types[0].__doc__

(<enum 'TableType'>, 'An enumeration.')

In [18]:
paper.tables[0].gold_tags == ""

True

In [21]:
table_labels[TableType.SOTA]

'leaderboard'

In [24]:
idx = 2
"{:02d}".format(idx)

'02'

In [53]:
# Scratch check of the reference-marker regex on one sample CSV row.
t = "ResNet-50,Encoding [<ref id='bib-bib36'>36</ref>],FPN [<ref id='bib-bib20'>20</ref>],37.87"
ref_marker_re = re.compile("<ref id=[0-9a-zA-Z'-]*>[0-9]*</ref>")
# Every marker in the row, in order of appearance.
m = ref_marker_re.findall(t)
print(m)

[]


In [31]:
m[0]

"<ref id='bib-bib36'>36</ref>"

In our case there's a single e-print archive:

In [None]:
#!tree {ROOT_PATH}

In [None]:
extract = PaperExtractor(ROOT_PATH)

To extract text and tables from a single paper just pass the path to the archive:

In [None]:
SOURCES_PATH = ROOT_PATH / 'sources'
extract(SOURCES_PATH / '1903' / '1903.11816v1')

In [None]:
SOURCES_PATH = ROOT_PATH / 'sources'
extract(SOURCES_PATH / '1611' / '1611.02200')

In [None]:
!tree -L 4 {ROOT_PATH}

The subdirectory structure under `sources` directory will be replicated in the other top-level directories.

In [None]:
!tree -L 4 {ROOT_PATH}

The extracted data is stored in the `papers` directory. We can read it using the `PaperCollection` class. `PaperCollection` is a wrapper for a `list` of papers with additional functions added for convenience. Due to the large number of papers, it is recommended to load the dataset in parallel (the default uses a number of processes equal to the number of CPU cores) and store it in a pickle file. Set `jobs=1` to disable multiprocessing.

In [None]:
from axcell.data.paper_collection import PaperCollection

PAPERS_PATH = ROOT_PATH / 'papers'
pc = PaperCollection.from_files(PAPERS_PATH)
# pc.to_pickle('mypapers.pkl')
# pc = PaperCollection.from_pickle('mypapers.pkl')

In [None]:
paper = pc.get_by_id('1903.11816v1')

In [None]:
paper.text.title

In [None]:
paper.tables[0]
# print(paper.tables[2])

In [None]:
# from pprint import pprint
paper.tables[0].__dict__["df"][0][:].iloc[1]
# , paper.tables[0].__dict__["df"][1][:])

In [None]:
paper.tables[0].__dict__["df"][0][:], paper.tables[0].__dict__["df"][1][:]

In [None]:
paper.tables[1]

In [None]:
[i.value for i in paper.tables[1].__dict__["df"][0][:]]

In [None]:
paper.tables[0]

In [None]:
paper.tables[4].__dict__["df"]

As *FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation* (Wu et al., 2019) is present in our **SegmentedTables** dataset, we can use `PaperCollection` to import annotations (table segmentation and results):

In [None]:
from axcell.helpers.datasets import read_tables_annotations

# Release assets of AxCell v1.0; the segmented-tables annotations are fetched from there.
V1_URL = 'https://github.com/paperswithcode/axcell/releases/download/v1.0/'
SEGMENTED_TABLES_URL = V1_URL + 'segmented-tables.json.xz'

segmented_tables = read_tables_annotations(SEGMENTED_TABLES_URL)

# Use the full orient name 'records': the abbreviated 'record' was deprecated
# and is rejected by recent pandas releases, while 'records' works on old and
# new versions alike.
# assumes read_tables_annotations returns a pandas DataFrame — TODO confirm
pc = PaperCollection.from_files(PAPERS_PATH, annotations=segmented_tables.to_dict('records'))

In [None]:
paper = pc.get_by_id('1903.11816')
paper.tables[4]

In [None]:
pc.cells_gold_tags_legend()

In [None]:
paper.tables[4].sota_records

## Parallel Extraction

For a single paper, extraction can take from several seconds to a few minutes (the longest phase, converting LaTeX source into HTML, is timed out after 5 minutes), so to process multiple files we run extraction in parallel.

In [None]:
%%time

from joblib import delayed, Parallel

def extract_single(file):
    """Run the notebook-level ``extract`` callable on a single source file.

    ``extract`` is deliberately read from the global context instead of being
    passed in as an argument, to avoid serializing it for the worker processes.
    """
    return extract(file)


# Every regular file anywhere below SOURCES_PATH, in sorted order.
files = sorted(candidate for candidate in SOURCES_PATH.glob('**/*') if candidate.is_file())

# One extraction task per file; n_jobs=-1 as in joblib's "use all CPUs" setting.
parallel_runner = Parallel(backend='multiprocessing', n_jobs=-1)
statuses = parallel_runner(delayed(extract_single)(file) for file in files)