Skip to content

Commit

Permalink
MEF: fix deleted agents
Browse files Browse the repository at this point in the history
* Revert to not exclude bf:Families.
* Fixes MEF type creation.
* Adds MEF, entities count monitoring.

Co-Authored-by: Peter Weber <peter.weber@rero.ch>
  • Loading branch information
rerowep committed May 19, 2022
1 parent bc6d748 commit 851a172
Show file tree
Hide file tree
Showing 13 changed files with 265 additions and 49 deletions.
14 changes: 7 additions & 7 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

[pytest]
live_server_scope = module
addopts = --pycodestyle --pydocstyle --doctest-glob="*.rst" --doctest-modules --cov=rero_mef --cov-report=term-missing --ignore=setup.py --ignore=docs/conf.py
addopts = --pycodestyle --pydocstyle --doctest-glob="*.rst" --doctest-modules --cov=rero_mef --cov-report=term-missing --ignore=setup.py --ignore=docs/conf.py --color=yes
testpaths = docs tests rero_mef

# not displaying all the PendingDeprecationWarnings from invenio
Expand Down
14 changes: 13 additions & 1 deletion rero_mef/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,16 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""DOJSON transformations."""
"""Agents."""

from .gnd.api import AgentGndIndexer, AgentGndRecord, AgentGndSearch
from .idref.api import AgentIdrefIndexer, AgentIdrefRecord, AgentIdrefSearch
from .mef.api import AgentMefIndexer, AgentMefRecord, AgentMefSearch
from .rero.api import AgentReroIndexer, AgentReroRecord, AgentReroSearch
from .viaf.api import AgentViafIndexer, AgentViafRecord, AgentViafSearch

# ``__all__`` must contain attribute *names* (strings). Listing the class
# objects themselves makes a star-import of this package raise a TypeError.
__all__ = (
    'AgentGndIndexer', 'AgentGndRecord', 'AgentGndSearch',
    'AgentIdrefIndexer', 'AgentIdrefRecord', 'AgentIdrefSearch',
    'AgentMefIndexer', 'AgentMefRecord', 'AgentMefSearch',
    'AgentReroIndexer', 'AgentReroRecord', 'AgentReroSearch',
    'AgentViafIndexer', 'AgentViafRecord', 'AgentViafSearch',
)
16 changes: 13 additions & 3 deletions rero_mef/agents/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,16 @@ def create_from_viaf(test_md5, enqueue, online, verbose, progress, wait,
fg='green'
)
counts = {}
unexisting_pids = {}
agent_classes = get_entity_classes(without_mef_viaf=False)
for name, agent_class in agent_classes.items():
counts[name] = {}
counts[name]['old'] = agent_class.count()
if missing:
missing_pids = AgentMefRecord.get_all_missing_viaf_pids(
verbose=progress or verbose
)
missing_pids, unexisting_pids = AgentMefRecord. \
get_all_missing_viaf_pids(
verbose=progress or verbose
)
progress_bar = progressbar(
items=missing_pids,
length=len(missing_pids),
Expand Down Expand Up @@ -94,6 +96,14 @@ def create_from_viaf(test_md5, enqueue, online, verbose, progress, wait,
online=online,
verbose=verbose
)

if unexisting_pids:
click.echo(
f'Clean VIAF pids from MEF records: {len(unexisting_pids)}')
for pid, viaf_pid in unexisting_pids.items():
# TODO: clean MEF records with unexisting VIAF pids:
pass

if wait:
from ..cli import wait_empty_tasks
wait_empty_tasks(delay=3, verbose=True)
Expand Down
53 changes: 39 additions & 14 deletions rero_mef/agents/mef/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from .providers import MefProvider
from ...api import ReroIndexer
from ...api_mef import EntityMefRecord
from ...utils import progressbar
from ...utils import mef_get_all_missing_entity_pids, progressbar


class AgentMefSearch(RecordsSearch):
Expand Down Expand Up @@ -86,6 +86,7 @@ def get_all_missing_viaf_pids(cls, verbose=False):
"""
from ..viaf.api import AgentViafRecord
missing_pids = {}
unexisting_pids = {}
if verbose:
click.echo('Get pids from VIAF ...')
progress = progressbar(
Expand All @@ -97,29 +98,53 @@ def get_all_missing_viaf_pids(cls, verbose=False):
missing_pids[pid] = 1
if verbose:
click.echo('Get pids from MEF and calculate missing ...')
query = cls.search().filter('exists', field='viaf_pid')
progress = progressbar(
items=cls.search().filter('match_all').source().scan(),
length=cls.search().filter('match_all').source().count(),
items=query.source(['pid', 'viaf_pid']).scan(),
length=query.count(),
verbose=True
)
for hit in progress:
data = hit.to_dict()
viaf_pid = data.get('viaf_pid')
if viaf_pid:
missing_pids.pop(viaf_pid, None)
return missing_pids
if not missing_pids.pop(hit.viaf_pid, None):
unexisting_pids[hit.pid] = hit.viaf_pid
return [v for v in missing_pids], unexisting_pids

@classmethod
def get_all_missing_agents_pids(cls, agent, verbose=False):
    """Get all missing pids for one agent source.

    Delegates to ``mef_get_all_missing_entity_pids`` using this class as
    the MEF class.

    :param agent: agent name to get the missing pids.
    :param verbose: Verbose.
    :returns: Result of ``mef_get_all_missing_entity_pids`` for ``agent``.
    """
    lookup_args = dict(mef_class=cls, entity=agent, verbose=verbose)
    return mef_get_all_missing_entity_pids(**lookup_args)

def replace_refs(self):
    """Replace $ref with real data.

    For each of the ``rero``, ``gnd`` and ``idref`` entries of the resolved
    record:

    * an entry flagged ``deleted`` is removed and logged at info level;
    * an entry carrying a ``status`` is removed and logged as an error
      together with its ``message`` (presumably an unresolved reference
      -- TODO confirm against the ref resolver);
    * any other entry is recorded in ``sources``, unwrapped from its
      ``metadata`` envelope when it has one, and its ``bf:Agent`` value
      is copied to the top level ``type``.

    :returns: resolved data with ``sources`` set, and ``type`` set when at
        least one source was kept.
    """
    data = super().replace_refs()
    sources = []
    for agent in ['rero', 'gnd', 'idref']:
        agent_data = data.get(agent)
        if not agent_data:
            # Source absent or empty: nothing to resolve.
            continue
        if agent_data.get('deleted'):
            data.pop(agent)
            current_app.logger.info(
                f'MEF replace refs {data.get("pid")} {agent} deleted')
        elif agent_data.get('status'):
            data.pop(agent)
            current_app.logger.error(
                f'MEF replace refs {data.get("pid")} {agent}'
                f' status: {agent_data.get("status")}'
                f' {agent_data.get("message")}')
        else:
            # Each kept source is appended exactly once.
            sources.append(agent)
            metadata = agent_data.get('metadata')
            if metadata:
                data[agent] = metadata
                data['type'] = metadata['bf:Agent']
            else:
                data['type'] = agent_data['bf:Agent']
    data['sources'] = sources
    return data

Expand Down
19 changes: 12 additions & 7 deletions rero_mef/agents/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

"""Tasks used by RERO-MEF."""

import click
from celery import shared_task

from .viaf.api import AgentViafRecord
Expand All @@ -38,13 +39,17 @@ def task_create_mef_from_viaf_agent(pid, dbcommit=True, reindex=True,
:returns: string with pid and actions
"""
viaf_record = AgentViafRecord.get_record_by_pid(pid)
actions = viaf_record.create_mef_and_agents(
dbcommit=dbcommit,
reindex=reindex,
test_md5=test_md5,
online=online,
verbose=verbose
)
action = 'NO VIAF'
if viaf_record:
actions = viaf_record.create_mef_and_agents(
dbcommit=dbcommit,
reindex=reindex,
test_md5=test_md5,
online=online,
verbose=verbose
)
else:
click.secho(f'{action}: {pid}', fg='red')
return actions


Expand Down
4 changes: 2 additions & 2 deletions rero_mef/agents/viaf/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,9 +309,9 @@ def get_missing_agent_pids(cls, agent, verbose=False):
)
for pid in progress:
pids_db[pid] = 1
agent_pid_name = f'{record_class.name}_pid'
if verbose:
click.echo(f'Get pids from VIAF with {agent} ...')
agent_pid_name = f'{agent}_pid'
click.echo(f'Get pids from VIAF with {agent_pid_name} ...')
query = AgentViafSearch() \
.filter('bool', should=[Q('exists', field=agent_pid_name)]) \
.source(['pid', agent_pid_name])
Expand Down
8 changes: 7 additions & 1 deletion rero_mef/concepts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,10 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""RERO."""
"""Concepts."""

from .mef.api import ConceptMefIndexer, ConceptMefRecord, ConceptMefSearch
from .rero.api import ConceptReroIndexer, ConceptReroRecord, ConceptReroSearch

# ``__all__`` must contain attribute *names* (strings). Listing the class
# objects themselves makes a star-import of this package raise a TypeError.
__all__ = (
    'ConceptMefIndexer', 'ConceptMefRecord', 'ConceptMefSearch',
    'ConceptReroIndexer', 'ConceptReroRecord', 'ConceptReroSearch',
)
12 changes: 12 additions & 0 deletions rero_mef/concepts/mef/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from .providers import ConceptMefProvider
from ...api import ReroIndexer
from ...api_mef import EntityMefRecord
from ...utils import mef_get_all_missing_entity_pids


class ConceptMefSearch(RecordsSearch):
Expand Down Expand Up @@ -76,6 +77,17 @@ def update_indexes(cls):
'ERROR flush and refresh: {err}'.format(err=err)
)

@classmethod
def get_all_missing_concepts_pids(cls, agent, verbose=False):
    """Get all missing pids for one concept source.

    Delegates to ``mef_get_all_missing_entity_pids`` using this class as
    the MEF class.

    :param agent: name of the source entity to get the missing pids for.
    :param verbose: Verbose.
    :returns: Result of ``mef_get_all_missing_entity_pids`` for ``agent``.
    """
    missing = mef_get_all_missing_entity_pids(
        mef_class=cls,
        entity=agent,
        verbose=verbose,
    )
    return missing

def replace_refs(self):
"""Replace $ref with real data."""
data = super().replace_refs()
Expand Down
31 changes: 31 additions & 0 deletions rero_mef/concepts/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
#
# RERO MEF
# Copyright (C) 2021 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Utilities."""

from flask import current_app


def get_concepts_endpoints():
    """Get the REST endpoints configured for concepts.

    :returns: dict with the ``RECORDS_REST_ENDPOINTS`` entries whose
        endpoint name is listed in the ``CONCEPTS`` configuration.
    """
    config = current_app.config
    concepts = config.get('CONCEPTS', [])
    endpoints = config.get('RECORDS_REST_ENDPOINTS', {})
    return {
        name: settings
        for name, settings in endpoints.items()
        if name in concepts
    }
13 changes: 3 additions & 10 deletions rero_mef/marctojson/do_gnd_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,9 @@ def __init__(self, marc, logger=None, verbose=False, transform=True):

def _transform(self):
"""Call the transformation functions."""
# test if organisation or person but not family
is_organisation_person = False
is_family = False
for field_075 in self.marc.get_fields('075'):
for subfield_b in field_075.get_subfields('b'):
if subfield_b in ['b', 'f', 'p']:
is_organisation_person = True
if subfield_b == 'piz':
is_family = True
if is_organisation_person and not is_family:
if self.marc.get_fields('100') or \
self.marc.get_fields('110') or \
self.marc.get_fields('111'):
for func in dir(self):
if func.startswith('trans'):
func = getattr(self, func)
Expand Down

0 comments on commit 851a172

Please sign in to comment.