Skip to content

Commit

Permalink
Merge pull request #983 from sul-dlss/jgreben-patch-999s
Browse files Browse the repository at this point in the history
Deduplicate uuids fetched from sql and handle absent uuids in 999
  • Loading branch information
jermnelson committed May 15, 2024
2 parents f02ac4b + c6e1454 commit dca007f
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 24 deletions.
8 changes: 4 additions & 4 deletions libsys_airflow/plugins/data_exports/instance_ids.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import csv
import logging
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
from typing import Union
Expand Down Expand Up @@ -40,6 +40,8 @@ def fetch_record_ids(**kwargs) -> dict:
).execute(context)
)

results[kind] = list(np.unique(results[kind]))

return results


Expand Down Expand Up @@ -83,9 +85,7 @@ def save_ids(**kwargs) -> Union[str, None]:
data_path.parent.mkdir(parents=True, exist_ok=True)

with open(data_path, 'w') as f:
writer = csv.writer(f, lineterminator='\n')
for id in data:
if id:
writer.writerow(id)
f.write(f"{id}\n")

return str(data_path)
16 changes: 14 additions & 2 deletions libsys_airflow/plugins/data_exports/marc/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import pathlib
import pymarc
import re

from libsys_airflow.plugins.folio_client import folio_client
from airflow.operators.python import get_current_context
Expand Down Expand Up @@ -58,16 +59,27 @@ def add_holdings_items(self, marc_file: str, full_dump: bool):

marc_records = []
logger.info(f"Starting MARC processing on {marc_path}")
regex = re.compile(
r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$"
)

with marc_path.open('rb') as fo:
for i, record in enumerate(pymarc.MARCReader(fo)):
try:
subfields_i = record["999"].get_subfields("i")
fields_999 = record.get_fields("999")
for field_999 in fields_999:
subfields_i = [
i for i in field_999.get_subfields("i") if regex.match(i)
]
if not subfields_i:
raise Exception("No uuid in subfields")

new_999s = self.add_holdings_items_fields(subfields_i)
record.add_field(*new_999s)
marc_records.append(record)
if not i % 100:
logger.info(f"{i:,} processed records")
except TypeError as e:
except Exception as e:
logger.warning(e)
continue

Expand Down
41 changes: 40 additions & 1 deletion tests/data_exports/test_marc_transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,44 @@ def mock_get_current_context(mocker):
return context


def test_skip_record_no_999i(
    mocker, tmp_path, mock_folio_client, mock_get_current_context
):
    """A record whose 999 $i holds no valid UUID is skipped, not written back.

    Transformer.add_holdings_items catches the "No uuid in subfields"
    exception internally and logs a warning, so nothing propagates to the
    caller; the observable contract is simply that the invalid record is
    omitted from the rewritten MARC file.
    """
    mocker.patch.object(
        marc_transformer, "get_current_context", mock_get_current_context
    )
    mocker.patch.object(marc_transformer, "SQLExecuteQueryOperator", MockSQLOperator)
    mocker.patch(
        'libsys_airflow.plugins.data_exports.marc.transformer.folio_client',
        return_value=mock_folio_client,
    )

    # Build a record whose only 999 $i value does not match the UUID regex.
    record = pymarc.Record()
    record.add_field(
        pymarc.Field(
            tag='999',
            indicators=['f', 'f'],
            subfields=[
                pymarc.Subfield(code='i', value='not a uuid!'),
            ],
        )
    )
    marc_file = tmp_path / "20240514.mrc"
    with marc_file.open('wb+') as fo:
        marc_writer = pymarc.MARCWriter(fo)
        marc_writer.write(record)

    transformer = marc_transformer.Transformer()
    transformer.add_holdings_items(str(marc_file), full_dump=False)

    # NOTE(review): the previous version wrapped a bare
    # `raise Exception('No uuid in subfields')` in `pytest.raises(...)`,
    # which is a tautology and asserted nothing about the transformer.
    # The real check is that the invalid record was dropped:
    with marc_file.open('rb') as fo:
        mod_marc_records = list(pymarc.MARCReader(fo))

    assert len(mod_marc_records) == 0


def test_add_holdings_items_single_999(
mocker, tmp_path, mock_folio_client, mock_get_current_context
):
Expand All @@ -206,7 +244,8 @@ def test_add_holdings_items_single_999(
tag='999',
indicators=['f', 'f'],
subfields=[
pymarc.Subfield(code='i', value='5face3a3-9804-5034-aa02-1eb5db0c191c')
pymarc.Subfield(code='i', value='not a uuid!'),
pymarc.Subfield(code='i', value='5face3a3-9804-5034-aa02-1eb5db0c191c'),
],
)
)
Expand Down
31 changes: 14 additions & 17 deletions tests/data_exports/test_save_instance_ids.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import csv
import pandas as pd
import pathlib
import pytest
Expand All @@ -23,20 +22,12 @@ def mock_task_instance():
def mock_xcom_pull(**kwargs):
    """Stand-in for ``TaskInstance.xcom_pull`` in save_ids tests.

    Returns the post-deduplication shape now produced by fetch_record_ids:
    flat lists of plain UUID strings (one string per id), rather than the
    old nested single-element lists.

    Note: the copied diff interleaved the deleted and added lines of this
    function without +/- markers, leaving it syntactically invalid; this is
    the reconstructed added (new) version.
    """
    return {
        "updates": [
            '4e66ce0d-4a1d-41dc-8b35-0914df20c7fb',
            'fe2e581f-9767-442a-ae3c-a421ac655fe2',
        ],
        "deletes": [
            '336971cd-2ea1-4ad2-af86-22ae7c0a95ae',
            '4e66ce0d-4a1d-41dc-8b35-0914df20c7fb',
        ],
    }

Expand All @@ -46,14 +37,20 @@ def test_save_ids_to_fs(tmp_path, mock_task_instance):
airflow=tmp_path, task_instance=mock_task_instance, vendor="oclc"
)

file_list = []
for i, path in enumerate(save_path):
file = pathlib.Path(path)
assert file.exists()

with file.open('r') as fo:
id_list = list(row for row in csv.reader(fo))

assert id_list[0][i] == "['4e66ce0d-4a1d-41dc-8b35-0914df20c7fb']"
for row in fo:
file_list.append(row)

assert file_list == [
'4e66ce0d-4a1d-41dc-8b35-0914df20c7fb\n',
'fe2e581f-9767-442a-ae3c-a421ac655fe2\n',
'336971cd-2ea1-4ad2-af86-22ae7c0a95ae\n',
'4e66ce0d-4a1d-41dc-8b35-0914df20c7fb\n',
]


def test_upload_data_export_file_ids_one_column():
Expand Down

0 comments on commit dca007f

Please sign in to comment.