# Reference Matcher Generation

In [1]:
from __future__ import absolute_import, division, print_function

from collections import defaultdict
from itertools import islice

from elasticsearch.helpers import scan
from flask import Flask
from six import iteritems

from inspire_dojson.utils import (
    get_record_ref,
    get_recid_from_ref
)
from inspire_utils.dedupers import dedupe_list

from invenio_search import InvenioSearch, current_search_client as es

from inspire_matcher import InspireMatcher, match
from inspire_utils.record import get_value

In [2]:
# Matcher configuration driven by the identifiers carried by a reference:
# arXiv eprint, DOI, ISBN, texkey and report number. All five queries are of
# type ``exact`` and sit in a single algorithm step; each one pairs a ``path``
# inside the reference with a ``search_path`` in the ``records-hep`` index.
# NOTE(review): ``report_numbers`` is searched through a ``.fuzzy`` subfield
# while the other identifiers use ``.raw`` subfields -- confirm this is intended.
# Only ``control_number`` is requested in ``source``.
config_unique_identifiers = {
    'algorithm': [
        {
            'queries': [
                {
                    'path': 'reference.arxiv_eprint',
                    'search_path': 'arxiv_eprints.value.raw',
                    'type': 'exact',
                },
                {
                    'path': 'reference.dois',
                    'search_path': 'dois.value.raw',
                    'type': 'exact',
                },
                {
                    'path': 'reference.isbn',
                    'search_path': 'isbns.value.raw',
                    'type': 'exact',
                },
                {
                    'path': 'reference.texkey',
                    'search_path': 'texkeys.raw',
                    'type': 'exact',
                },
                {
                    'path': 'reference.report_numbers',
                    'search_path': 'report_numbers.value.fuzzy',
                    'type': 'exact',
                },
            ],
        },
    ],
    'doc_type': 'hep',
    'index': 'records-hep',
    'collections': [
        'Literature',
    ],
    'source': [
        'control_number',
    ],
}
"""Configuration for matching all HEP records (including JHEP and JCAP records)
using unique identifiers."""

# Matcher configuration based on journal publication info. It holds four
# ``nested`` queries, listed from most to least specific:
#   1. issue + title + volume + artid
#   2. issue + title + volume + page_start
#   3. title + volume + artid
#   4. title + volume + page_start
# Both ``artid`` and ``page_start`` of the reference are compared against the
# same indexed field, ``publication_info.page_artid``.
# Each query lists ``paths`` (inside the reference) and ``search_paths``
# (inside the index) in the same order -- presumably they are paired
# positionally; verify against the inspire-matcher documentation.
config_default_publication_info = {
    'algorithm': [
        {
            'queries': [
                {
                    'paths': [
                        'reference.publication_info.journal_issue',
                        'reference.publication_info.journal_title',
                        'reference.publication_info.journal_volume',
                        'reference.publication_info.artid',
                    ],
                    'search_paths': [
                        'publication_info.journal_issue',
                        'publication_info.journal_title.raw',
                        'publication_info.journal_volume',
                        'publication_info.page_artid',
                    ],
                    'type': 'nested',
                },
                {
                    'paths': [
                        'reference.publication_info.journal_issue',
                        'reference.publication_info.journal_title',
                        'reference.publication_info.journal_volume',
                        'reference.publication_info.page_start',
                    ],
                    'search_paths': [
                        'publication_info.journal_issue',
                        'publication_info.journal_title.raw',
                        'publication_info.journal_volume',
                        'publication_info.page_artid',
                    ],
                    'type': 'nested',
                },
                {
                    'paths': [
                        'reference.publication_info.journal_title',
                        'reference.publication_info.journal_volume',
                        'reference.publication_info.artid',
                    ],
                    'search_paths': [
                        'publication_info.journal_title.raw',
                        'publication_info.journal_volume',
                        'publication_info.page_artid',
                    ],
                    'type': 'nested',
                },
                {
                    'paths': [
                        'reference.publication_info.journal_title',
                        'reference.publication_info.journal_volume',
                        'reference.publication_info.page_start',
                    ],
                    'search_paths': [
                        'publication_info.journal_title.raw',
                        'publication_info.journal_volume',
                        'publication_info.page_artid',
                    ],
                    'type': 'nested',
                },
            ],
        },
    ],
    'doc_type': 'hep',
    'index': 'records-hep',
    'collections': [
        'Literature',
    ],
    'source': [
        'control_number',
    ],
}
"""Configuration for matching all HEP records using publication_info.
These are separate from the unique queries since these can result in
multiple matches (particularly in the case of errata)."""

# Publication-info matcher configuration selected when the reference's journal
# title is 'JCAP' or 'JHEP'. It holds two ``nested`` queries that both require
# title + volume + year, followed by either ``artid`` or ``page_start``; both of
# the latter are compared against ``publication_info.page_artid``.
# Unlike the default publication-info configuration, ``journal_issue`` is not
# part of either query and ``year`` is.
config_jcap_and_jhep_publication_info = {
    'algorithm': [
        {
            'queries': [
                {
                    'paths': [
                        'reference.publication_info.journal_title',
                        'reference.publication_info.journal_volume',
                        'reference.publication_info.year',
                        'reference.publication_info.artid',
                    ],
                    'search_paths': [
                        'publication_info.journal_title.raw',
                        'publication_info.journal_volume',
                        'publication_info.year',
                        'publication_info.page_artid',
                    ],
                    'type': 'nested',
                },
                {
                    'paths': [
                        'reference.publication_info.journal_title',
                        'reference.publication_info.journal_volume',
                        'reference.publication_info.year',
                        'reference.publication_info.page_start',
                    ],
                    'search_paths': [
                        'publication_info.journal_title.raw',
                        'publication_info.journal_volume',
                        'publication_info.year',
                        'publication_info.page_artid',
                    ],
                    'type': 'nested',
                },
            ],
        },
    ],
    'doc_type': 'hep',
    'index': 'records-hep',
    'collections': [
        'Literature',
    ],
    'source': [
        'control_number',
    ],
}
"""Configuration for matching JCAP and JHEP records using the
publication_info, since we have to look at the year as well for accurate
matching.
These are separate from the unique queries since these can result in
multiple matches (particularly in the case of errata)."""

# Matcher configuration for data records: a single ``exact`` query on the
# reference's DOIs, run against the ``records-data`` index with doc type
# ``data``. Unlike the HEP configurations it has no ``collections`` key.
config_data = {
    'algorithm': [
        {
            'queries': [
                {
                    'path': 'reference.dois',
                    'search_path': 'dois.value.raw',
                    'type': 'exact',
                },
            ],
        },
    ],
    'doc_type': 'data',
    'index': 'records-data',
    'source': [
        'control_number',
    ]
}
"""Configuration for matching data records. Please note that the
index and doc_type are different for data records."""


'Configuration for matching data records. Please note that the\nindex and doc_type are different for data records.'

In [3]:
def match_reference(reference, previous_matched_recid=None):
    """Match a reference using inspire-matcher.

    The configurations are tried in a fixed order -- unique identifiers,
    then publication info (the JCAP/JHEP variant when the reference's
    journal title is one of those two, the default variant otherwise),
    then data records -- and the search stops at the first configuration
    that produces a match.

    Args:
        reference (dict): the metadata of a reference.
        previous_matched_recid (int): the record id of the last matched
            reference from the list of references.

    Returns:
        dict: the reference, carrying a ``matched_record`` key when one of
        the configurations matched.
    """
    if get_value(reference, 'reference.publication_info.journal_title') in ['JCAP', 'JHEP']:
        publication_info_config = config_jcap_and_jhep_publication_info
    else:
        publication_info_config = config_default_publication_info

    ordered_configs = (
        config_unique_identifiers,
        publication_info_config,
        config_data,
    )

    for current_config in ordered_configs:
        candidate = match_reference_with_config(
            reference, current_config, previous_matched_recid)
        # Later configurations are only queried when the earlier ones fail.
        if 'matched_record' in candidate:
            return candidate

    return reference


def match_reference_with_config(reference, config, previous_matched_recid=None):
    """Match a reference using inspire-matcher given the config.

    The reference is annotated in place: when a match is accepted, a
    ``matched_record`` URL is added to it. A match is accepted when the
    query returns exactly one distinct record, or, failing that, when one of
    the returned records is ``previous_matched_recid``.

    Args:
        reference (dict): the metadata of the reference.
        config (dict): the list of inspire-matcher configurations for queries.
        previous_matched_recid (int): the record id of the last matched
            reference from the list of references.

    Returns:
        dict: the matched reference (the same object that was passed in).
    """
    # XXX: avoid this type casting.
    # The year is stringified only for the duration of the query. The
    # original value is remembered so that it can be put back untouched,
    # instead of being re-parsed with ``int`` (which fails on non-numeric
    # years and changes the type of years that were already strings).
    try:
        publication_info = reference['reference']['publication_info']
        original_year = publication_info['year']
    except KeyError:
        publication_info = None
    else:
        publication_info['year'] = str(original_year)

    try:
        matched_records = dedupe_list(list(match(reference, config)))
    finally:
        # Restore the caller's data even if the query raised.
        if publication_info is not None:
            publication_info['year'] = original_year

    matched_recids = [
        matched_record['_source']['control_number']
        for matched_record in matched_records
    ]
    if len(matched_recids) == 1:
        _add_match_to_reference(reference, matched_recids[0], config['index'])
    elif previous_matched_recid in matched_recids:
        # Ambiguous result: prefer the record matched by the previous
        # reference when it is among the candidates.
        _add_match_to_reference(reference, previous_matched_recid, config['index'])

    return reference

def _add_match_to_reference(reference, matched_recid, es_index):
    """Modifies a reference to include its record id."""
    if es_index == 'records-data':
        reference['matched_record'] = 'https://labs.inspirehep.net/api/data/' + str(matched_recid)
    elif es_index == 'records-hep':
        reference['matched_record'] = 'https://labs.inspirehep.net/api/literature/' + str(matched_recid)

In [4]:
# Minimal Flask application whose only purpose here is to host the
# InvenioSearch and InspireMatcher extensions and to provide the application
# context that the matching code runs in.
app = Flask(__name__)
InvenioSearch(app)
InspireMatcher(app)

<inspire_matcher.ext.InspireMatcher at 0x7ffad20b04a8>

In [None]:
%%time

# Maps the record id of a matched (cited) record to the set of control
# numbers of the records whose references matched it.
citations = defaultdict(set)

with app.app_context():
    # Scroll over every document of the ``records-hep`` index that has a
    # ``references`` field, fetching only the listed source fields.
    # NOTE(review): ``core`` and ``citeable`` are fetched but not read in this cell.
    search = scan(
        es,
        doc_type='hep',
        index='records-hep',
        query={
            '_source': [
                'control_number',
                'references',
                'core',
                'citeable',
            ],
            'query': {
                'exists': {
                    'field': 'references',
                },
            },
        },
        scroll='2d',
    )
    
    with open('new-citations-NEWDUMP.tsv', 'w') as f:
#         for hit in islice(search, 100):
        for hit in search:
            record = hit['_source']
            control_number = record['control_number']
            references = record['references']
            
            # The "previous match" hint is reset for every record and only
            # moves forward when a reference is actually matched.
            previous_matched_recid = None
            for ref in references:
                # Record id already stored on the reference, or 0 when absent.
                expected = ref.get('recid') or 0
                matched_ref = match_reference(ref, previous_matched_recid)
                if 'matched_record' in matched_ref:
                    # The record id is the last path segment of the URL.
                    result = int(matched_ref.get('matched_record').split('/')[-1])
                    previous_matched_recid = result
                else:
                    result = 0
                
                # One TSV row per reference: citing record, expected record
                # id, newly matched record id (0 when unmatched), and the
                # repr of the reference.
                f.write('%d\t%d\t%d\t%r\n' % (control_number, expected, result, ref))
            
                if result:
                    citations[result].add(control_number)

print()
# NOTE(review): the commented-out lines below look like a leftover from an
# earlier version of the loop above -- confirm they can be removed.
#                 if matched_recid:
#                     previous_matched_recid = matched_recid
#                     result = previous_matched_recid

In [None]:
# Write one "<cited record id>\t<number of distinct citing records>" line per
# cited record, ordered by record id.
with open('new-citation-counts-NEWDUMP.tsv', 'w') as f:
    f.writelines(
        '%d\t%d\n' % (cited_recid, len(citations[cited_recid]))
        for cited_recid in sorted(citations)
    )