Skip to content

Commit

Permalink
Merge pull request #975 from sul-dlss/969-deletes-for-pod
Browse files Browse the repository at this point in the history
Set leader position 5 to "d" for records in deletes list
  • Loading branch information
jermnelson committed May 9, 2024
2 parents 24b8444 + 009a955 commit cbbd96b
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 3 deletions.
11 changes: 10 additions & 1 deletion libsys_airflow/dags/data_exports/pod_selections.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from libsys_airflow.plugins.data_exports.marc.exports import marc_for_instances
from libsys_airflow.plugins.data_exports.marc.transforms import (
add_holdings_items_to_marc_files,
change_leader_for_deletes,
remove_fields_from_marc_files,
)

Expand Down Expand Up @@ -84,11 +85,19 @@
},
)

# Task that runs change_leader_for_deletes on the MARC file listing produced by
# the "fetch_marc_records_from_folio" task.
transform_leader_fields = PythonOperator(
task_id="transform_folio_modify_leader_fields",
python_callable=change_leader_for_deletes,
op_kwargs={
# Templated XCom pull; the callable receives the rendered value as a string
# and parses it itself with ast.literal_eval.
"marc_file_list": "{{ ti.xcom_pull('fetch_marc_records_from_folio') }}"
},
)

# No-op task that marks the end of MARC processing for this DAG.
finish_processing_marc = EmptyOperator(
task_id="finish_marc",
)


# Task ordering: fetch ids -> save ids -> fetch MARC -> transform records ->
# remove fields -> set leader on deletes -> finish.
fetch_folio_record_ids >> save_ids_to_file >> fetch_marc_records
fetch_marc_records >> transform_marc_record >> transform_marc_fields
# NOTE(review): the next two lines look like the removed and added sides of the
# same diff line (the commit reports 1 deletion for this file) -- confirm that
# only the second one, routing through transform_leader_fields, is in the file.
transform_marc_fields >> finish_processing_marc
transform_marc_fields >> transform_leader_fields >> finish_processing_marc
40 changes: 39 additions & 1 deletion libsys_airflow/plugins/data_exports/marc/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,45 @@ def divide_into_oclc_libraries(**kwargs):
)


def change_leader_for_deletes(marc_file_list: str):
    """
    Flags every record in the listed "deletes" MARC files as deleted.

    :param marc_file_list: string form of a dict literal; its 'deletes' entry
        is an iterable of MARC file paths. Each path is handed to
        leader_for_deletes with full_dump set to False.
    """
    exported_files = ast.literal_eval(marc_file_list)
    deleted_paths = exported_files['deletes']
    for marc_path in deleted_paths:
        leader_for_deletes(marc_path, False)


def leader_for_deletes(marc_file: str, full_dump: bool):
    """
    Records specified as deleted by using d in position 05 in the MARC Leader

    Reads every record of the MARC file into memory, sets Leader/05 to "d" and
    writes the records back to the same path.

    pymarc's MARCReader yields None for a record it cannot parse. Such entries
    are logged and left out of the rewritten file; passing them to the writer
    would raise WriteNeedsRecord after the file had already been truncated and
    every record following the bad one would be lost.

    :param marc_file: path of the binary MARC file that is rewritten in place
    :param full_dump: when True the path is opened as an S3Path instead of a
        local pathlib.Path
    """
    marc_path = pathlib.Path(marc_file)
    if full_dump:
        marc_path = S3Path(marc_file)
        logger.info(f"Changing leader using AWS S3 with path: {marc_path}")

    with marc_path.open('rb') as fo:
        marc_records = list(pymarc.MARCReader(fo))

    logger.info(f"Changing leader for {len(marc_records):,} records")

    writable_records = []
    for i, record in enumerate(marc_records):
        if record is None:
            # Unparseable record: nothing to modify and nothing to write back.
            logger.warning(f"Skipping unreadable MARC record at position {i:,}")
            continue
        # The record is kept for writing even if the leader change fails below,
        # matching the previous best-effort behaviour for parsed records.
        writable_records.append(record)
        try:
            record.leader = pymarc.leader.Leader(record.leader)
            record.leader[5] = "d"  # type: ignore
        except AttributeError as e:
            logger.warning(e)
            continue
        if not i % 100:
            logger.info(f"{i:,} records processed")

    try:
        with marc_path.open("wb") as fo:
            marc_writer = pymarc.MARCWriter(fo)
            for record in writable_records:
                marc_writer.write(record)
    except pymarc.exceptions.WriteNeedsRecord as e:
        logger.warning(e)


def remove_fields_from_marc_files(marc_file_list: str):
marc_list = ast.literal_eval(marc_file_list)
for file in marc_list['updates']:
Expand Down Expand Up @@ -144,5 +183,4 @@ def remove_marc_fields(marc_file: str, full_dump: bool):
for record in marc_records:
marc_writer.write(record)
except pymarc.exceptions.WriteNeedsRecord as e:

logger.warning(e)
31 changes: 30 additions & 1 deletion tests/data_exports/test_marc_transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@

from unittest.mock import MagicMock

from libsys_airflow.plugins.data_exports.marc.transforms import remove_marc_fields
from libsys_airflow.plugins.data_exports.marc.transforms import (
leader_for_deletes,
remove_marc_fields,
)

from libsys_airflow.plugins.data_exports.marc import transformer as marc_transformer

Expand Down Expand Up @@ -363,3 +366,29 @@ def test_remove_marc_fields(tmp_path):

assert "598" not in current_fields
assert "699" not in current_fields


def test_change_leader(tmp_path):
    """leader_for_deletes rewrites the file so Leader/05 of its record is 'd'."""
    marc_file = tmp_path / "20240509.mrc"

    # Minimal record: a single 245 title field and the default leader.
    title_field = pymarc.Field(
        tag='245',
        indicators=[' ', ' '],
        subfields=[pymarc.Subfield(code='a', value='A Short Title')],
    )
    original_record = pymarc.Record()
    original_record.add_field(title_field)
    assert original_record.leader[5] != 'd'

    with marc_file.open("wb+") as marc_handle:
        pymarc.MARCWriter(marc_handle).write(original_record)

    leader_for_deletes(str(marc_file.absolute()), full_dump=False)

    with marc_file.open('rb') as marc_handle:
        modified_marc_record = next(pymarc.MARCReader(marc_handle))

    assert modified_marc_record.leader[5] == 'd'

0 comments on commit cbbd96b

Please sign in to comment.