# Reference Matcher Generation

In [21]:
from __future__ import absolute_import, division, print_function

from collections import defaultdict
from itertools import islice

from elasticsearch.helpers import scan
from flask import Flask
from six import iteritems

from invenio_search import InvenioSearch, current_search_client as es

from inspire_matcher import InspireMatcher, match
from inspire_utils.record import get_value

In [22]:
# Default matcher configuration, searched against the ``records-hep`` index.
#
# The single algorithm step carries six queries: four of type ``exact`` on
# identifiers (arXiv eprint, DOI, ISBN, report number) and two of type
# ``nested`` on publication info (journal title + volume, combined with
# either the article ID or the starting page).  ``source`` lists
# ``control_number``, the only field ``match_reference`` reads from a hit.
config = {
    'algorithm': [{
        'queries': [
            # Identifier-based queries.
            {'path': 'reference.arxiv_eprint', 'search_path': 'arxiv_eprints.value.raw', 'type': 'exact'},
            {'path': 'reference.dois', 'search_path': 'dois.value.raw', 'type': 'exact'},
            {'path': 'reference.isbn', 'search_path': 'isbns.value.raw', 'type': 'exact'},
            {'path': 'reference.report_numbers', 'search_path': 'report_numbers.value.fuzzy', 'type': 'exact'},
            # Journal title + volume + article ID.
            {
                'paths': [
                    'reference.publication_info.journal_title',
                    'reference.publication_info.journal_volume',
                    'reference.publication_info.artid',
                ],
                'search_paths': [
                    'publication_info.journal_title.raw',
                    'publication_info.journal_volume',
                    'publication_info.artid',
                ],
                'type': 'nested',
            },
            # Journal title + volume + first page.
            {
                'paths': [
                    'reference.publication_info.journal_title',
                    'reference.publication_info.journal_volume',
                    'reference.publication_info.page_start',
                ],
                'search_paths': [
                    'publication_info.journal_title.raw',
                    'publication_info.journal_volume',
                    'publication_info.page_start',
                ],
                'type': 'nested',
            },
        ],
    }],
    'doc_type': 'hep',
    'index': 'records-hep',
    'source': ['control_number'],
}

In [23]:
# Matcher configuration that ``match_reference`` tries first when the
# reference's journal title is 'JCAP' or 'JHEP'.
#
# It has the same four ``exact`` identifier queries as the default
# configuration; the two ``nested`` publication-info queries additionally
# include ``publication_info.year`` alongside journal title and volume.
config_for_jcap_and_jhep = {
    'algorithm': [{
        'queries': [
            # Identifier-based queries.
            {'path': 'reference.arxiv_eprint', 'search_path': 'arxiv_eprints.value.raw', 'type': 'exact'},
            {'path': 'reference.dois', 'search_path': 'dois.value.raw', 'type': 'exact'},
            {'path': 'reference.isbn', 'search_path': 'isbns.value.raw', 'type': 'exact'},
            {'path': 'reference.report_numbers', 'search_path': 'report_numbers.value.fuzzy', 'type': 'exact'},
            # Journal title + volume + year + article ID.
            {
                'paths': [
                    'reference.publication_info.journal_title',
                    'reference.publication_info.journal_volume',
                    'reference.publication_info.year',
                    'reference.publication_info.artid',
                ],
                'search_paths': [
                    'publication_info.journal_title.raw',
                    'publication_info.journal_volume',
                    'publication_info.year',
                    'publication_info.artid',
                ],
                'type': 'nested',
            },
            # Journal title + volume + year + first page.
            {
                'paths': [
                    'reference.publication_info.journal_title',
                    'reference.publication_info.journal_volume',
                    'reference.publication_info.year',
                    'reference.publication_info.page_start',
                ],
                'search_paths': [
                    'publication_info.journal_title.raw',
                    'publication_info.journal_volume',
                    'publication_info.year',
                    'publication_info.page_start',
                ],
                'type': 'nested',
            },
        ],
    }],
    'doc_type': 'hep',
    'index': 'records-hep',
    'source': ['control_number'],
}

In [24]:
# Matcher configuration searched against the ``records-data`` index; its
# only query is an ``exact`` lookup of the reference's DOIs.
# ``match_reference`` uses it as the last fallback.
config_for_data = {
    'algorithm': [{
        'queries': [
            {'path': 'reference.dois', 'search_path': 'dois.value.raw', 'type': 'exact'},
        ],
    }],
    'doc_type': 'data',
    'index': 'records-data',
    'source': ['control_number'],
}

In [25]:
def _first_hit(reference, matcher_config):
    """Return the first result of ``match(reference, matcher_config)``, or ``None``."""
    return next(match(reference, matcher_config), None)


def _with_stringified_year(reference):
    """Return a copy of ``reference`` whose publication-info year is a string.

    Only the dicts on the path to ``year`` are copied, so the caller's
    ``reference`` is left untouched.
    """
    inner = dict(reference['reference'])
    publication_info = dict(inner['publication_info'])
    publication_info['year'] = str(publication_info['year'])
    inner['publication_info'] = publication_info

    copied = dict(reference)
    copied['reference'] = inner
    return copied


def match_reference(reference):
    """Return the control number of the record cited by ``reference``.

    Resolution order:

    1. If the reference is flagged ``legacy_curated`` and carries a ``recid``,
       that ``recid`` is returned without querying anything.
    2. If the journal title is 'JCAP' or 'JHEP', ``config_for_jcap_and_jhep``
       (which also matches on the year) is tried first.
    3. ``config`` is tried.
    4. ``config_for_data`` is tried.

    Args:
        reference (dict): one element of a record's ``references`` list.
            It is never modified.

    Returns:
        The ``control_number`` of the first hit of the first configuration
        that produces one, or ``None`` when nothing matches.
    """
    if reference.get('legacy_curated') and reference.get('recid'):
        return reference['recid']

    attempts = []

    journal_title = get_value(reference, 'reference.publication_info.journal_title')
    if journal_title in ['JCAP', 'JHEP']:
        journal_reference = reference
        if get_value(reference, 'reference.publication_info.year'):
            # The year is passed to the matcher as a string.  This used to be
            # done in place, which altered the caller's reference (and hence
            # whatever the caller logged afterwards); work on a copy instead.
            # NOTE(review): presumably the matcher/index expects a string
            # year here -- confirm.
            journal_reference = _with_stringified_year(reference)
        attempts.append((journal_reference, config_for_jcap_and_jhep))

    attempts.append((reference, config))
    attempts.append((reference, config_for_data))

    for candidate, matcher_config in attempts:
        hit = _first_hit(candidate, matcher_config)
        if hit is not None:
            return hit['_source']['control_number']

    return None

In [26]:
# Bare Flask application; the scanning cell below enters its application
# context (``app.app_context()``) before talking to Elasticsearch.
app = Flask(__name__)
# Register both extensions on the app.
# NOTE(review): presumably this is what makes ``current_search_client``
# (imported as ``es``) and ``match`` usable inside the app context -- confirm.
InvenioSearch(app)
# Kept as the last bare expression: its repr is the output of this cell.
InspireMatcher(app)

<inspire_matcher.ext.InspireMatcher at 0x7f8838b9fcf8>

In [None]:
%%time

# Optional cap on the number of scanned records, handy for a quick trial run
# (e.g. 2000).  ``None`` processes every record returned by the scan.
MAX_RECORDS = None

# Select only records that have a ``references`` field, and fetch only the
# two fields that are read below.
REFERENCES_QUERY = {
    '_source': [
        'control_number',
        'references',
    ],
    'query': {
        'exists': {
            'field': 'references',
        },
    },
}

# Maps the control number of a cited record to the set of control numbers of
# the records that cite it.
citations = defaultdict(set)

with app.app_context():
    search = scan(
        es,
        doc_type='hep',
        index='records-hep',
        query=REFERENCES_QUERY,
        # NOTE(review): presumably the scroll keep-alive, set generously
        # because matching every reference is slow -- confirm.
        scroll='2d',
    )

    # One TSV line per reference:
    # citing control number, expected recid (0 when absent),
    # matched control number (0 when unmatched), repr of the reference.
    with open('new-citations-overall.tsv', 'w') as f:
        for hit in islice(search, MAX_RECORDS):
            record = hit['_source']
            control_number = record['control_number']
            references = record['references']

            for reference in references:
                # recid exists only for those records which are on INSPIRE.
                # If we can get that, matching gets easier.
                expected = reference.get('recid') or 0
                result = match_reference(reference) or 0

                f.write('%d\t%d\t%d\t%r\n' % (control_number, expected, result, reference))

                if result:
                    citations[result].add(control_number)

print()

In [None]:
# Dump one "<cited control number>\t<number of distinct citing records>" line
# per cited record, ordered by control number.
with open('new-citation-counts.tsv', 'w') as counts_file:
    counts_file.writelines(
        '%d\t%d\n' % (cited, len(citations[cited]))
        for cited in sorted(citations)
    )