# Performance Evaluation on PWCLeaderboards dataset

This notebook runs AxCell on the **PWCLeaderboards** dataset.

For the pipeline to work, we need a running Elasticsearch instance. Run `docker-compose up -d` from the `axcell` repository to start a new instance.

In [1]:
from axcell.helpers.datasets import read_tables_annotations
from pathlib import Path

# Base URL of the AxCell v1.0 release assets on GitHub.
V1_URL = 'https://github.com/sotagents/axcell/releases/download/v1.0/'
# Location of the xz-compressed JSON file with the PWCLeaderboards annotations.
PWC_LEADERBOARDS_URL = V1_URL + 'pwc-leaderboards.json.xz'

# Load the annotations from the release URL. The result is iterated with
# `.iterrows()` further down, i.e. it is used as a table with one row per paper.
pwc_leaderboards = read_tables_annotations(PWC_LEADERBOARDS_URL)

In [2]:
# Root directory of the PWCLeaderboards data on local disk; the e-print
# sources are read from its `sources` subdirectory.
# This is the single place to relocate the dataset, e.g. use
# Path('pwc-leaderboards') for a directory next to the notebook.
PWC_LEADERBOARDS_ROOT_PATH = Path.home() / 'data/pwc-leaderboards'
SOURCES_PATH = PWC_LEADERBOARDS_ROOT_PATH / 'sources'

from axcell.helpers.paper_extractor import PaperExtractor
# Extractor bound to the dataset root directory; it is called with one source
# file at a time by `extract_single` in the next cell.
extract = PaperExtractor(PWC_LEADERBOARDS_ROOT_PATH)

In [3]:
%%time

from joblib import delayed, Parallel

def extract_single(file):
    """Run the global ``extract`` on one source file and return its result.

    ``extract`` is looked up in the global context instead of being passed
    in, which avoids serializing it for every task.
    """
    status = extract(file)
    return status

# Every regular file found anywhere below SOURCES_PATH, in sorted order.
files = sorted(candidate for candidate in SOURCES_PATH.glob('**/*') if candidate.is_file())

# One extraction task per file, run with joblib's multiprocessing backend on
# all available cores (n_jobs=-1). `statuses` is ordered like `files`.
extraction_jobs = (delayed(extract_single)(source_file) for source_file in files)
statuses = Parallel(backend='multiprocessing', n_jobs=-1)(extraction_jobs)

CPU times: user 2.02 s, sys: 1.07 s, total: 3.09 s
Wall time: 12min 43s


In [4]:
# Every source file must have been extracted successfully. `statuses` holds
# one entry per file in `files`, and this notebook expects 731 of them.
EXPECTED_SOURCE_FILES = 731
failed_extractions = [
    (str(file), status)
    for file, status in zip(files, statuses)
    if status != "success"
]
# Same pass/fail condition as comparing against ["success"] * 731, but a
# failure now says whether the count is off or which files did not succeed.
assert len(statuses) == EXPECTED_SOURCE_FILES, (
    f"expected {EXPECTED_SOURCE_FILES} extraction statuses, got {len(statuses)}"
)
assert not failed_extractions, (
    f"{len(failed_extractions)} file(s) failed extraction, e.g. {failed_extractions[:10]}"
)

Download and unpack the archive with trained models (table type classifier, table segmentation), taxonomy and abbreviations.

In [5]:
# Release asset holding the trained models, taxonomy and abbreviations.
MODELS_URL = V1_URL + 'models.tar.xz'
# Local file name the archive is downloaded to (relative to the working directory).
MODELS_ARCHIVE = 'models.tar.xz'
# Directory handed to ResultsExtractor below.
# NOTE(review): presumably the archive unpacks into ./models -- confirm.
MODELS_PATH = Path('models')

from fastai.core import download_url
import tarfile

download_url(MODELS_URL, MODELS_ARCHIVE)
# 'r:*' opens the archive for reading with transparent compression detection.
with tarfile.open(MODELS_ARCHIVE, 'r:*') as archive:
    # No target path is given, so members are extracted into the current
    # working directory.
    # NOTE(review): extractall() applies no filtering to member paths here;
    # only use it on archives from a trusted source.
    archive.extractall()

from axcell.helpers.results_extractor import ResultsExtractor
# Extractor built from the unpacked models directory; its `taxonomy` attribute
# is read in the next cell.
extract_results = ResultsExtractor(MODELS_PATH)

[PID 10700] Load model table-structure-classifier.pth


In [6]:
import pandas as pd

# Taxonomy entries of the extractor, as a set for fast membership tests
# against (task, dataset, metric) tuples.
our_taxonomy = set(extract_results.taxonomy.taxonomy)

# Walk paper -> table -> record and keep only the records whose
# (task, dataset, metric) tuple is in the taxonomy, remembering the arXiv id
# of the paper each kept record came from.
matched = [
    (paper.arxiv_id, record)
    for _, paper in pwc_leaderboards.iterrows()
    for table in paper.tables
    for record in table['records']
    if (record['task'], record['dataset'], record['metric']) in our_taxonomy
]

# One row per kept record, each a copy of the record tagged with its paper's id.
gold_records = pd.DataFrame([dict(record, arxiv_id=arxiv_id) for arxiv_id, record in matched])
# Sorted, de-duplicated ids of the papers that contributed at least one record.
papers = sorted({arxiv_id for arxiv_id, _ in matched})

In [7]:
from axcell.data.paper_collection import PaperCollection
# Load the paper collection from the `papers` directory under the dataset root
# (presumably written by the extraction step above -- confirm).
pc = PaperCollection.from_files(PWC_LEADERBOARDS_ROOT_PATH / "papers")
# Narrow the collection to the papers listed in `papers`, in that (sorted) order.
pc = PaperCollection([pc.get_by_id(p) for p in papers])

In [8]:
%%time

from joblib import delayed, Parallel

def process_single(index):
    """Extract results from the paper at position ``index`` of ``pc``.

    A new ``ResultsExtractor`` is constructed from ``MODELS_PATH`` on every
    call; the module-level ``extract_results`` instance is not used here.
    NOTE(review): presumably this gives each worker its own extractor --
    confirm before sharing a single instance across calls.
    """
    extractor = ResultsExtractor(MODELS_PATH)
    paper = pc[index]
    return extractor(paper)

# One task per position in `pc`, run with joblib's multiprocessing backend on
# all available cores (n_jobs=-1). `results` is ordered like `pc`.
extraction_tasks = (delayed(process_single)(position) for position in range(len(pc)))
results = Parallel(backend='multiprocessing', n_jobs=-1)(extraction_tasks)

CPU times: user 21.4 s, sys: 17.8 s, total: 39.2 s
Wall time: 43min 27s


In [9]:
def _with_arxiv_id(records, arxiv_id):
    """Return a copy of ``records`` with its ``arxiv_id`` entry set to ``arxiv_id``."""
    tagged = records.copy()
    tagged['arxiv_id'] = arxiv_id
    return tagged


# Tag each paper's extracted records with that paper's `arxiv_no_version`
# (working on copies, so the objects in `results` stay untouched), then stack
# everything into a single object.
tagged_frames = [
    _with_arxiv_id(records, paper.arxiv_no_version)
    for paper, records in zip(pc, results)
]
predicted_records = pd.concat(tagged_frames)
# Persist the predictions in JSON "records" orientation.
predicted_records.to_json('axcell-predictions-on-pwc-leaderboards.json.xz', orient='records')

In [10]:
from axcell.helpers.evaluate import evaluate
# Score the predictions against the gold records and display every value as a
# percentage with two decimal places.
evaluate(predicted_records, gold_records).style.format('{:.2%}')

Unnamed: 0,Micro Precision,Micro Recall,Micro F1,Macro Precision,Macro Recall,Macro F1
0,39.35%,24.18%,29.95%,24.18%,22.13%,21.34%
1,67.83%,47.35%,55.77%,47.94%,46.50%,43.62%
2,70.79%,57.27%,63.32%,60.78%,62.72%,59.60%
3,70.28%,48.06%,57.08%,53.64%,52.79%,50.02%
4,68.48%,58.09%,62.86%,58.22%,60.53%,56.38%
