From 5e15f3ae5bdf202c9c8362001c9413456b017ece Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:45:59 +0000 Subject: [PATCH 1/7] Initial plan From 468fd788e98d056446cb3872f16f2717997c8216 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:51:43 +0000 Subject: [PATCH 2/7] Add RawOrganizationMixin and update history models - Created RawOrganizationMixin in core/models.py with fields for raw organization data - Added RawOrganizationMixin to PublisherHistory, OwnerHistory, CopyrightHolderHistory, SponsorHistory - Updated _add_institution_history and add_* methods to accept and populate raw organization fields - Updated calls in am_to_core.py to pass raw organization data - Created task_replace_institution_by_raw_institution task in journal/tasks.py Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- core/models.py | 58 +++++++++++ journal/models.py | 82 +++++++++++++++- journal/sources/am_to_core.py | 18 ++++ journal/tasks.py | 179 ++++++++++++++++++++++++++++++++++ 4 files changed, 333 insertions(+), 4 deletions(-) diff --git a/core/models.py b/core/models.py index 1ae964eb6..ad42fed7a 100755 --- a/core/models.py +++ b/core/models.py @@ -277,6 +277,64 @@ class Meta: abstract = True +class RawOrganizationMixin(models.Model): + """ + Mixin for storing raw, unstructured organization data. + Intended to replace references to institution.models.Institution. 
+ """ + raw_text = models.TextField( + _("Raw Text"), + null=True, + blank=True, + help_text=_("Free text, unstructured organization data"), + ) + raw_institution_name = models.CharField( + _("Raw Institution Name"), + max_length=510, + null=True, + blank=True, + help_text=_("Raw institution name as provided"), + ) + raw_country_name = models.CharField( + _("Raw Country Name"), + max_length=255, + null=True, + blank=True, + help_text=_("Raw country name as provided"), + ) + raw_country_code = models.CharField( + _("Raw Country Code"), + max_length=3, + null=True, + blank=True, + help_text=_("Raw country code (ISO) as provided"), + ) + raw_state_name = models.CharField( + _("Raw State Name"), + max_length=255, + null=True, + blank=True, + help_text=_("Raw state name as provided"), + ) + raw_state_acron = models.CharField( + _("Raw State Acronym"), + max_length=10, + null=True, + blank=True, + help_text=_("Raw state acronym as provided"), + ) + raw_city_name = models.CharField( + _("Raw City Name"), + max_length=255, + null=True, + blank=True, + help_text=_("Raw city name as provided"), + ) + + class Meta: + abstract = True + + class LanguageFallbackManager(models.Manager): def get_object_in_preferred_language(self, language): mission = self.filter(language=language) diff --git a/journal/models.py b/journal/models.py index 04119d78a..a8474c700 100755 --- a/journal/models.py +++ b/journal/models.py @@ -31,6 +31,7 @@ SocialNetwork, TextWithLang, CharFieldLangMixin, + RawOrganizationMixin, ) from core.utils import date_utils from core.utils.thread_context import get_current_collections, get_current_user @@ -1075,6 +1076,13 @@ def _add_institution_history( initial_date=None, final_date=None, location=None, + raw_text=None, + raw_institution_name=None, + raw_country_name=None, + raw_country_code=None, + raw_state_name=None, + raw_state_acron=None, + raw_city_name=None, ): """Adiciona instituição usando InstitutionHistory genérico.""" if not original_data and not 
organization: @@ -1106,6 +1114,16 @@ def _add_institution_history( ) institution_history.journal = self institution_history.organization = organization + + # Populate RawOrganizationMixin fields + institution_history.raw_text = raw_text + institution_history.raw_institution_name = raw_institution_name + institution_history.raw_country_name = raw_country_name + institution_history.raw_country_code = raw_country_code + institution_history.raw_state_name = raw_state_name + institution_history.raw_state_acron = raw_state_acron + institution_history.raw_city_name = raw_city_name + institution_history.save() return institution_history @@ -1117,6 +1135,13 @@ def add_publisher( initial_date=None, final_date=None, location=None, + raw_text=None, + raw_institution_name=None, + raw_country_name=None, + raw_country_code=None, + raw_state_name=None, + raw_state_acron=None, + raw_city_name=None, ): """Adiciona publisher usando PublisherHistory.""" return self._add_institution_history( @@ -1128,6 +1153,13 @@ def add_publisher( initial_date=initial_date, final_date=final_date, location=location, + raw_text=raw_text, + raw_institution_name=raw_institution_name, + raw_country_name=raw_country_name, + raw_country_code=raw_country_code, + raw_state_name=raw_state_name, + raw_state_acron=raw_state_acron, + raw_city_name=raw_city_name, ) def add_owner( @@ -1138,6 +1170,13 @@ def add_owner( initial_date=None, final_date=None, location=None, + raw_text=None, + raw_institution_name=None, + raw_country_name=None, + raw_country_code=None, + raw_state_name=None, + raw_state_acron=None, + raw_city_name=None, ): """Adiciona owner usando OwnerHistory.""" return self._add_institution_history( @@ -1149,6 +1188,13 @@ def add_owner( initial_date=initial_date, final_date=final_date, location=location, + raw_text=raw_text, + raw_institution_name=raw_institution_name, + raw_country_name=raw_country_name, + raw_country_code=raw_country_code, + raw_state_name=raw_state_name, + 
raw_state_acron=raw_state_acron, + raw_city_name=raw_city_name, ) def add_sponsor( @@ -1159,6 +1205,13 @@ def add_sponsor( initial_date=None, final_date=None, location=None, + raw_text=None, + raw_institution_name=None, + raw_country_name=None, + raw_country_code=None, + raw_state_name=None, + raw_state_acron=None, + raw_city_name=None, ): """Adiciona sponsor usando SponsorHistory.""" return self._add_institution_history( @@ -1170,6 +1223,13 @@ def add_sponsor( initial_date=initial_date, final_date=final_date, location=location, + raw_text=raw_text, + raw_institution_name=raw_institution_name, + raw_country_name=raw_country_name, + raw_country_code=raw_country_code, + raw_state_name=raw_state_name, + raw_state_acron=raw_state_acron, + raw_city_name=raw_city_name, ) def add_copyright_holder( @@ -1180,6 +1240,13 @@ def add_copyright_holder( initial_date=None, final_date=None, location=None, + raw_text=None, + raw_institution_name=None, + raw_country_name=None, + raw_country_code=None, + raw_state_name=None, + raw_state_acron=None, + raw_city_name=None, ): """Adiciona copyright_holder usando CopyrightHolderHistory.""" return self._add_institution_history( @@ -1191,6 +1258,13 @@ def add_copyright_holder( initial_date=initial_date, final_date=final_date, location=location, + raw_text=raw_text, + raw_institution_name=raw_institution_name, + raw_country_name=raw_country_name, + raw_country_code=raw_country_code, + raw_state_name=raw_state_name, + raw_state_acron=raw_state_acron, + raw_city_name=raw_city_name, ) @@ -1287,7 +1361,7 @@ def create_or_update( return obj -class OwnerHistory(Orderable, ClusterableModel, BaseHistoryItem): +class OwnerHistory(Orderable, ClusterableModel, BaseHistoryItem, RawOrganizationMixin): journal = ParentalKey( Journal, on_delete=models.SET_NULL, related_name="owner_history", null=True ) @@ -1321,7 +1395,7 @@ def get_org_level_model(cls): return OrgLevelOwner -class PublisherHistory(Orderable, ClusterableModel, BaseHistoryItem): +class 
PublisherHistory(Orderable, ClusterableModel, BaseHistoryItem, RawOrganizationMixin): journal = ParentalKey( Journal, on_delete=models.SET_NULL, related_name="publisher_history", null=True ) @@ -1355,7 +1429,7 @@ def get_org_level_model(cls): return OrgLevelPublisher -class SponsorHistory(Orderable, ClusterableModel, BaseHistoryItem): +class SponsorHistory(Orderable, ClusterableModel, BaseHistoryItem, RawOrganizationMixin): journal = ParentalKey( Journal, on_delete=models.SET_NULL, null=True, related_name="sponsor_history" ) @@ -1389,7 +1463,7 @@ def get_org_level_model(cls): return OrgLevelSponsor -class CopyrightHolderHistory(Orderable, ClusterableModel, BaseHistoryItem): +class CopyrightHolderHistory(Orderable, ClusterableModel, BaseHistoryItem, RawOrganizationMixin): journal = ParentalKey( Journal, on_delete=models.SET_NULL, diff --git a/journal/sources/am_to_core.py b/journal/sources/am_to_core.py index 009e4f8b6..c362e10ac 100644 --- a/journal/sources/am_to_core.py +++ b/journal/sources/am_to_core.py @@ -232,12 +232,20 @@ def update_panel_institution( user=user, original_data=p, location=location, + raw_institution_name=p, + raw_country_name=extract_value(publisher_country), + raw_state_name=extract_value(publisher_state), + raw_city_name=extract_value(publisher_city), ) # Usa novo método add_owner ao invés de OwnerHistory journal.add_owner( user=user, original_data=p, location=location, + raw_institution_name=p, + raw_country_name=extract_value(publisher_country), + raw_state_name=extract_value(publisher_state), + raw_city_name=extract_value(publisher_city), ) journal.contact_name = p @@ -541,6 +549,7 @@ def get_or_create_sponsor(sponsor, journal, user, location=None): user=user, original_data=s, location=location, + raw_institution_name=s, ) @@ -822,6 +831,7 @@ def get_or_create_copyright_holder(journal, copyright_holder_name, user, locatio user=user, original_data=cp, location=location, + raw_institution_name=cp, ) except Exception as e: exc_type, 
exc_value, exc_traceback = sys.exc_info() @@ -936,12 +946,20 @@ def create_location_and_add_institutions( user=user, original_data=p, location=location, + raw_institution_name=p, + raw_country_name=extract_value(publisher_country), + raw_state_name=extract_value(publisher_state), + raw_city_name=extract_value(publisher_city), ) # Adiciona owner (mesmo dado do publisher por padrão) journal.add_owner( user=user, original_data=p, location=location, + raw_institution_name=p, + raw_country_name=extract_value(publisher_country), + raw_state_name=extract_value(publisher_state), + raw_city_name=extract_value(publisher_city), ) # Adiciona sponsor(s) se fornecido diff --git a/journal/tasks.py b/journal/tasks.py index 56292d550..d907c3b5e 100644 --- a/journal/tasks.py +++ b/journal/tasks.py @@ -366,6 +366,185 @@ def task_export_journals_to_articlemeta( raise +@celery_app.task(bind=True, name="task_replace_institution_by_raw_institution") +def task_replace_institution_by_raw_institution( + self, + username=None, + user_id=None, + collection_acron_list=None, + journal_issns=None, +): + """ + Task to populate RawOrganizationMixin fields from AMJournal records. + + This task extracts institution data from journal.models.AMJournal records and + populates the raw organization fields in PublisherHistory, OwnerHistory, + CopyrightHolderHistory, and SponsorHistory. 
+ + Args: + username: User name for authentication + user_id: User ID for authentication + collection_acron_list: List of collection acronyms to filter journals + journal_issns: List of journal ISSNs to filter journals + + Returns: + Dict with processing statistics + """ + from journal.sources.am_data_extraction import extract_value + + user = _get_user(self.request, username=username, user_id=user_id) + + try: + # Build queryset for AMJournal records + queryset = AMJournal.objects.all() + + # Filter by collection if provided + if collection_acron_list: + from collection.models import Collection + collections = Collection.objects.filter(acron3__in=collection_acron_list) + queryset = queryset.filter(collection__in=collections) + + # Filter by journal ISSN if provided + if journal_issns: + queryset = queryset.filter(pid__in=journal_issns) + + processed_count = 0 + error_count = 0 + + for am_journal in queryset.iterator(): + try: + # Skip if no data + if not am_journal.data: + continue + + # Get the corresponding journal + try: + scielo_journal = SciELOJournal.objects.get( + issn_scielo=am_journal.pid, + collection=am_journal.collection + ) + journal = scielo_journal.journal + except SciELOJournal.DoesNotExist: + logger.warning( + f"SciELOJournal not found for pid={am_journal.pid}, " + f"collection={am_journal.collection}" + ) + continue + + # Extract data from AMJournal + data = am_journal.data + + # Extract publisher/owner data + publisher = extract_value(data.get("publisher_name")) + publisher_country = extract_value(data.get("publisher_country")) + publisher_state = extract_value(data.get("publisher_state")) + publisher_city = extract_value(data.get("publisher_city")) + + # Extract sponsor data + sponsor = extract_value(data.get("sponsors")) + + # Extract copyright holder data + copyright_holder = extract_value(data.get("copyrighter")) + + # Update PublisherHistory and OwnerHistory records + if publisher: + if isinstance(publisher, str): + publisher = [publisher] 
+ + for p in publisher: + if p: + # Update PublisherHistory records + for pub_hist in journal.publisher_history.filter( + institution__institution_name=p + ): + pub_hist.raw_institution_name = p + pub_hist.raw_country_name = publisher_country + pub_hist.raw_state_name = publisher_state + pub_hist.raw_city_name = publisher_city + pub_hist.save() + + # Update OwnerHistory records + for own_hist in journal.owner_history.filter( + institution__institution_name=p + ): + own_hist.raw_institution_name = p + own_hist.raw_country_name = publisher_country + own_hist.raw_state_name = publisher_state + own_hist.raw_city_name = publisher_city + own_hist.save() + + # Update SponsorHistory records + if sponsor: + if isinstance(sponsor, str): + sponsor = [sponsor] + + for s in sponsor: + if s: + for spon_hist in journal.sponsor_history.filter( + institution__institution_name=s + ): + spon_hist.raw_institution_name = s + spon_hist.save() + + # Update CopyrightHolderHistory records + if copyright_holder: + if isinstance(copyright_holder, str): + copyright_holder = [copyright_holder] + + for cp in copyright_holder: + if cp: + for cp_hist in journal.copyright_holder_history.filter( + institution__institution_name=cp + ): + cp_hist.raw_institution_name = cp + cp_hist.save() + + processed_count += 1 + + except Exception as e: + error_count += 1 + exc_type, exc_value, exc_traceback = sys.exc_info() + UnexpectedEvent.create( + exception=e, + exc_traceback=exc_traceback, + detail={ + "task": "task_replace_institution_by_raw_institution", + "am_journal_id": am_journal.id, + "pid": am_journal.pid, + "collection": str(am_journal.collection) if am_journal.collection else None, + }, + ) + logger.error( + f"Error processing AMJournal {am_journal.id}: {e}" + ) + + result = { + "processed_count": processed_count, + "error_count": error_count, + } + + logger.info( + f"task_replace_institution_by_raw_institution completed: {result}" + ) + + return result + + except Exception as e: + exc_type, 
exc_value, exc_traceback = sys.exc_info() + UnexpectedEvent.create( + exception=e, + exc_traceback=exc_traceback, + detail={ + "task": "task_replace_institution_by_raw_institution", + "collection_acron_list": collection_acron_list, + "journal_issns": journal_issns, + "user_id": user_id, + "username": username, + }, + ) + raise + + @celery_app.task(bind=True, name="task_export_journal_to_articlemeta") def task_export_journal_to_articlemeta( self, From 0afad4cbb222a0b0d6d30d606ec9734deab12e52 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:54:30 +0000 Subject: [PATCH 3/7] Add database migration for RawOrganizationMixin fields Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- .../0055_add_raw_organization_fields.py | 321 ++++++++++++++++++ src/legendarium | 1 + 2 files changed, 322 insertions(+) create mode 100644 journal/migrations/0055_add_raw_organization_fields.py create mode 160000 src/legendarium diff --git a/journal/migrations/0055_add_raw_organization_fields.py b/journal/migrations/0055_add_raw_organization_fields.py new file mode 100644 index 000000000..a0439d7f0 --- /dev/null +++ b/journal/migrations/0055_add_raw_organization_fields.py @@ -0,0 +1,321 @@ +# Generated manually for adding RawOrganizationMixin fields + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("journal", "0054_journaltableofcontents"), + ] + + operations = [ + # Add RawOrganizationMixin fields to OwnerHistory + migrations.AddField( + model_name="ownerhistory", + name="raw_text", + field=models.TextField( + blank=True, + help_text="Free text, unstructured organization data", + null=True, + verbose_name="Raw Text", + ), + ), + migrations.AddField( + model_name="ownerhistory", + name="raw_institution_name", + field=models.CharField( + blank=True, + help_text="Raw institution name as provided", + max_length=510, + 
null=True, + verbose_name="Raw Institution Name", + ), + ), + migrations.AddField( + model_name="ownerhistory", + name="raw_country_name", + field=models.CharField( + blank=True, + help_text="Raw country name as provided", + max_length=255, + null=True, + verbose_name="Raw Country Name", + ), + ), + migrations.AddField( + model_name="ownerhistory", + name="raw_country_code", + field=models.CharField( + blank=True, + help_text="Raw country code (ISO) as provided", + max_length=3, + null=True, + verbose_name="Raw Country Code", + ), + ), + migrations.AddField( + model_name="ownerhistory", + name="raw_state_name", + field=models.CharField( + blank=True, + help_text="Raw state name as provided", + max_length=255, + null=True, + verbose_name="Raw State Name", + ), + ), + migrations.AddField( + model_name="ownerhistory", + name="raw_state_acron", + field=models.CharField( + blank=True, + help_text="Raw state acronym as provided", + max_length=10, + null=True, + verbose_name="Raw State Acronym", + ), + ), + migrations.AddField( + model_name="ownerhistory", + name="raw_city_name", + field=models.CharField( + blank=True, + help_text="Raw city name as provided", + max_length=255, + null=True, + verbose_name="Raw City Name", + ), + ), + # Add RawOrganizationMixin fields to PublisherHistory + migrations.AddField( + model_name="publisherhistory", + name="raw_text", + field=models.TextField( + blank=True, + help_text="Free text, unstructured organization data", + null=True, + verbose_name="Raw Text", + ), + ), + migrations.AddField( + model_name="publisherhistory", + name="raw_institution_name", + field=models.CharField( + blank=True, + help_text="Raw institution name as provided", + max_length=510, + null=True, + verbose_name="Raw Institution Name", + ), + ), + migrations.AddField( + model_name="publisherhistory", + name="raw_country_name", + field=models.CharField( + blank=True, + help_text="Raw country name as provided", + max_length=255, + null=True, + verbose_name="Raw 
Country Name", + ), + ), + migrations.AddField( + model_name="publisherhistory", + name="raw_country_code", + field=models.CharField( + blank=True, + help_text="Raw country code (ISO) as provided", + max_length=3, + null=True, + verbose_name="Raw Country Code", + ), + ), + migrations.AddField( + model_name="publisherhistory", + name="raw_state_name", + field=models.CharField( + blank=True, + help_text="Raw state name as provided", + max_length=255, + null=True, + verbose_name="Raw State Name", + ), + ), + migrations.AddField( + model_name="publisherhistory", + name="raw_state_acron", + field=models.CharField( + blank=True, + help_text="Raw state acronym as provided", + max_length=10, + null=True, + verbose_name="Raw State Acronym", + ), + ), + migrations.AddField( + model_name="publisherhistory", + name="raw_city_name", + field=models.CharField( + blank=True, + help_text="Raw city name as provided", + max_length=255, + null=True, + verbose_name="Raw City Name", + ), + ), + # Add RawOrganizationMixin fields to SponsorHistory + migrations.AddField( + model_name="sponsorhistory", + name="raw_text", + field=models.TextField( + blank=True, + help_text="Free text, unstructured organization data", + null=True, + verbose_name="Raw Text", + ), + ), + migrations.AddField( + model_name="sponsorhistory", + name="raw_institution_name", + field=models.CharField( + blank=True, + help_text="Raw institution name as provided", + max_length=510, + null=True, + verbose_name="Raw Institution Name", + ), + ), + migrations.AddField( + model_name="sponsorhistory", + name="raw_country_name", + field=models.CharField( + blank=True, + help_text="Raw country name as provided", + max_length=255, + null=True, + verbose_name="Raw Country Name", + ), + ), + migrations.AddField( + model_name="sponsorhistory", + name="raw_country_code", + field=models.CharField( + blank=True, + help_text="Raw country code (ISO) as provided", + max_length=3, + null=True, + verbose_name="Raw Country Code", + ), + ), 
+ migrations.AddField( + model_name="sponsorhistory", + name="raw_state_name", + field=models.CharField( + blank=True, + help_text="Raw state name as provided", + max_length=255, + null=True, + verbose_name="Raw State Name", + ), + ), + migrations.AddField( + model_name="sponsorhistory", + name="raw_state_acron", + field=models.CharField( + blank=True, + help_text="Raw state acronym as provided", + max_length=10, + null=True, + verbose_name="Raw State Acronym", + ), + ), + migrations.AddField( + model_name="sponsorhistory", + name="raw_city_name", + field=models.CharField( + blank=True, + help_text="Raw city name as provided", + max_length=255, + null=True, + verbose_name="Raw City Name", + ), + ), + # Add RawOrganizationMixin fields to CopyrightHolderHistory + migrations.AddField( + model_name="copyrightholderhistory", + name="raw_text", + field=models.TextField( + blank=True, + help_text="Free text, unstructured organization data", + null=True, + verbose_name="Raw Text", + ), + ), + migrations.AddField( + model_name="copyrightholderhistory", + name="raw_institution_name", + field=models.CharField( + blank=True, + help_text="Raw institution name as provided", + max_length=510, + null=True, + verbose_name="Raw Institution Name", + ), + ), + migrations.AddField( + model_name="copyrightholderhistory", + name="raw_country_name", + field=models.CharField( + blank=True, + help_text="Raw country name as provided", + max_length=255, + null=True, + verbose_name="Raw Country Name", + ), + ), + migrations.AddField( + model_name="copyrightholderhistory", + name="raw_country_code", + field=models.CharField( + blank=True, + help_text="Raw country code (ISO) as provided", + max_length=3, + null=True, + verbose_name="Raw Country Code", + ), + ), + migrations.AddField( + model_name="copyrightholderhistory", + name="raw_state_name", + field=models.CharField( + blank=True, + help_text="Raw state name as provided", + max_length=255, + null=True, + verbose_name="Raw State Name", + ), 
+ ), + migrations.AddField( + model_name="copyrightholderhistory", + name="raw_state_acron", + field=models.CharField( + blank=True, + help_text="Raw state acronym as provided", + max_length=10, + null=True, + verbose_name="Raw State Acronym", + ), + ), + migrations.AddField( + model_name="copyrightholderhistory", + name="raw_city_name", + field=models.CharField( + blank=True, + help_text="Raw city name as provided", + max_length=255, + null=True, + verbose_name="Raw City Name", + ), + ), + ] diff --git a/src/legendarium b/src/legendarium new file mode 160000 index 000000000..d0df8f3e7 --- /dev/null +++ b/src/legendarium @@ -0,0 +1 @@ +Subproject commit d0df8f3e778b3a719ba08425ad561cb9fa1755b7 From 200fc1e542342ea4b338d22aa5bb2e4a8175d38a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:54:42 +0000 Subject: [PATCH 4/7] Remove accidentally added src/legendarium submodule --- .gitignore | 1 + src/legendarium | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 160000 src/legendarium diff --git a/.gitignore b/.gitignore index 56d58668b..adb51ae38 100755 --- a/.gitignore +++ b/.gitignore @@ -286,3 +286,4 @@ core/media/ .ipython/ .env !.envs/.local/ +src/ diff --git a/src/legendarium b/src/legendarium deleted file mode 160000 index d0df8f3e7..000000000 --- a/src/legendarium +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d0df8f3e778b3a719ba08425ad561cb9fa1755b7 From 403eea52e15fe0a4b14ce9c9edf92ac427f7ef1c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:55:54 +0000 Subject: [PATCH 5/7] Add tests for RawOrganizationMixin functionality - Added test cases for add_publisher, add_owner, add_sponsor, add_copyright_holder with raw organization fields - Added backward compatibility test to ensure existing code still works Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- 
journal/tests.py | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/journal/tests.py b/journal/tests.py index a6a658efa..7b61e676e 100755 --- a/journal/tests.py +++ b/journal/tests.py @@ -285,3 +285,89 @@ def test_fetch_and_process_journal_logos_with_valid_collection(self, mock_group) # The task should complete without raising an exception # and should call group with the task signatures self.assertTrue(mock_group.called) + + +class RawOrganizationMixinTestCase(TestCase): + """Test cases for RawOrganizationMixin functionality""" + + def setUp(self): + """Set up test fixtures""" + self.user = User.objects.create_user(username="testuser") + self.collection = Collection.objects.create( + name="Test Collection", + acron3="TST", + ) + self.journal = Journal.objects.create( + title="Test Journal", + ) + + def test_add_publisher_with_raw_organization_fields(self): + """Test that add_publisher accepts and saves raw organization fields""" + publisher_history = self.journal.add_publisher( + user=self.user, + original_data="Test Publisher", + raw_institution_name="Test Publisher Inc.", + raw_country_name="Brazil", + raw_country_code="BR", + raw_state_name="São Paulo", + raw_state_acron="SP", + raw_city_name="São Paulo", + ) + + self.assertIsNotNone(publisher_history) + self.assertEqual(publisher_history.raw_institution_name, "Test Publisher Inc.") + self.assertEqual(publisher_history.raw_country_name, "Brazil") + self.assertEqual(publisher_history.raw_country_code, "BR") + self.assertEqual(publisher_history.raw_state_name, "São Paulo") + self.assertEqual(publisher_history.raw_state_acron, "SP") + self.assertEqual(publisher_history.raw_city_name, "São Paulo") + + def test_add_owner_with_raw_organization_fields(self): + """Test that add_owner accepts and saves raw organization fields""" + owner_history = self.journal.add_owner( + user=self.user, + original_data="Test Owner", + raw_institution_name="Test Owner Institution", + 
raw_country_name="Argentina", + ) + + self.assertIsNotNone(owner_history) + self.assertEqual(owner_history.raw_institution_name, "Test Owner Institution") + self.assertEqual(owner_history.raw_country_name, "Argentina") + + def test_add_sponsor_with_raw_organization_fields(self): + """Test that add_sponsor accepts and saves raw organization fields""" + sponsor_history = self.journal.add_sponsor( + user=self.user, + original_data="Test Sponsor", + raw_institution_name="Test Sponsor Foundation", + ) + + self.assertIsNotNone(sponsor_history) + self.assertEqual(sponsor_history.raw_institution_name, "Test Sponsor Foundation") + + def test_add_copyright_holder_with_raw_organization_fields(self): + """Test that add_copyright_holder accepts and saves raw organization fields""" + copyright_history = self.journal.add_copyright_holder( + user=self.user, + original_data="Test Copyright Holder", + raw_institution_name="Test Copyright Holder Corp", + raw_text="Full copyright text", + ) + + self.assertIsNotNone(copyright_history) + self.assertEqual(copyright_history.raw_institution_name, "Test Copyright Holder Corp") + self.assertEqual(copyright_history.raw_text, "Full copyright text") + + def test_backward_compatibility_without_raw_fields(self): + """Test that existing code without raw fields still works""" + # This tests backward compatibility + publisher_history = self.journal.add_publisher( + user=self.user, + original_data="Legacy Publisher", + ) + + self.assertIsNotNone(publisher_history) + # Raw fields should be None if not provided + self.assertIsNone(publisher_history.raw_institution_name) + self.assertIsNone(publisher_history.raw_country_name) From 4ca57279442a6a960dfbd77b327e9dd48ea1efa7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:56:54 +0000 Subject: [PATCH 6/7] Optimize task_replace_institution_by_raw_institution with bulk_update - Replace N+1 query pattern with bulk_update for better 
performance - Update PublisherHistory, OwnerHistory, SponsorHistory, CopyrightHolderHistory records in batches Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- journal/tasks.py | 54 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/journal/tasks.py b/journal/tasks.py index d907c3b5e..e8d03e38b 100644 --- a/journal/tasks.py +++ b/journal/tasks.py @@ -454,24 +454,40 @@ def task_replace_institution_by_raw_institution( for p in publisher: if p: # Update PublisherHistory records - for pub_hist in journal.publisher_history.filter( + pub_hist_list = list(journal.publisher_history.filter( institution__institution_name=p - ): + )) + for pub_hist in pub_hist_list: pub_hist.raw_institution_name = p pub_hist.raw_country_name = publisher_country pub_hist.raw_state_name = publisher_state pub_hist.raw_city_name = publisher_city - pub_hist.save() + + if pub_hist_list: + from journal.models import PublisherHistory + PublisherHistory.objects.bulk_update( + pub_hist_list, + ['raw_institution_name', 'raw_country_name', + 'raw_state_name', 'raw_city_name'] + ) # Update OwnerHistory records - for own_hist in journal.owner_history.filter( + own_hist_list = list(journal.owner_history.filter( institution__institution_name=p - ): + )) + for own_hist in own_hist_list: own_hist.raw_institution_name = p own_hist.raw_country_name = publisher_country own_hist.raw_state_name = publisher_state own_hist.raw_city_name = publisher_city - own_hist.save() + + if own_hist_list: + from journal.models import OwnerHistory + OwnerHistory.objects.bulk_update( + own_hist_list, + ['raw_institution_name', 'raw_country_name', + 'raw_state_name', 'raw_city_name'] + ) # Update SponsorHistory records if sponsor: @@ -480,11 +496,18 @@ def task_replace_institution_by_raw_institution( for s in sponsor: if s: - for spon_hist in journal.sponsor_history.filter( + spon_hist_list = list(journal.sponsor_history.filter( 
institution__institution_name=s - ): + )) + for spon_hist in spon_hist_list: spon_hist.raw_institution_name = s - spon_hist.save() + + if spon_hist_list: + from journal.models import SponsorHistory + SponsorHistory.objects.bulk_update( + spon_hist_list, + ['raw_institution_name'] + ) # Update CopyrightHolderHistory records if copyright_holder: @@ -493,11 +516,18 @@ def task_replace_institution_by_raw_institution( for cp in copyright_holder: if cp: - for cp_hist in journal.copyright_holder_history.filter( + cp_hist_list = list(journal.copyright_holder_history.filter( institution__institution_name=cp - ): + )) + for cp_hist in cp_hist_list: cp_hist.raw_institution_name = cp - cp_hist.save() + + if cp_hist_list: + from journal.models import CopyrightHolderHistory + CopyrightHolderHistory.objects.bulk_update( + cp_hist_list, + ['raw_institution_name'] + ) processed_count += 1 From 9fdf0a7479336556fe0911b7379e08ad7113a113 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 16:58:03 +0000 Subject: [PATCH 7/7] Further optimize task with __in queries to reduce DB hits - Use institution__institution_name__in for batch filtering - Reduces multiple queries to single query per history type - Collects all names first before filtering Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com> --- journal/tasks.py | 137 +++++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 65 deletions(-) diff --git a/journal/tasks.py b/journal/tasks.py index e8d03e38b..02b3643c2 100644 --- a/journal/tasks.py +++ b/journal/tasks.py @@ -451,83 +451,90 @@ def task_replace_institution_by_raw_institution( if isinstance(publisher, str): publisher = [publisher] - for p in publisher: - if p: - # Update PublisherHistory records - pub_hist_list = list(journal.publisher_history.filter( - institution__institution_name=p - )) - for pub_hist in pub_hist_list: - 
pub_hist.raw_institution_name = p - pub_hist.raw_country_name = publisher_country - pub_hist.raw_state_name = publisher_state - pub_hist.raw_city_name = publisher_city - - if pub_hist_list: - from journal.models import PublisherHistory - PublisherHistory.objects.bulk_update( - pub_hist_list, - ['raw_institution_name', 'raw_country_name', - 'raw_state_name', 'raw_city_name'] - ) - - # Update OwnerHistory records - own_hist_list = list(journal.owner_history.filter( - institution__institution_name=p - )) - for own_hist in own_hist_list: - own_hist.raw_institution_name = p - own_hist.raw_country_name = publisher_country - own_hist.raw_state_name = publisher_state - own_hist.raw_city_name = publisher_city - - if own_hist_list: - from journal.models import OwnerHistory - OwnerHistory.objects.bulk_update( - own_hist_list, - ['raw_institution_name', 'raw_country_name', - 'raw_state_name', 'raw_city_name'] - ) + # Filter non-empty publisher names + publisher_names = [p for p in publisher if p] + + if publisher_names: + # Update PublisherHistory records - use __in for single query + pub_hist_list = list(journal.publisher_history.filter( + institution__institution_name__in=publisher_names + )) + # Copy each record's institution name and the publisher location values into its raw fields + for pub_hist in pub_hist_list: + pub_hist.raw_institution_name = pub_hist.institution.institution_name + pub_hist.raw_country_name = publisher_country + pub_hist.raw_state_name = publisher_state + pub_hist.raw_city_name = publisher_city + + if pub_hist_list: + from journal.models import PublisherHistory + PublisherHistory.objects.bulk_update( + pub_hist_list, + ['raw_institution_name', 'raw_country_name', + 'raw_state_name', 'raw_city_name'] + ) + + # Update OwnerHistory records - use __in for single query + own_hist_list = list(journal.owner_history.filter( + institution__institution_name__in=publisher_names + )) + for own_hist in own_hist_list: + own_hist.raw_institution_name = own_hist.institution.institution_name + 
own_hist.raw_country_name = publisher_country + own_hist.raw_state_name = publisher_state + own_hist.raw_city_name = publisher_city + + if own_hist_list: + from journal.models import OwnerHistory + OwnerHistory.objects.bulk_update( + own_hist_list, + ['raw_institution_name', 'raw_country_name', + 'raw_state_name', 'raw_city_name'] + ) # Update SponsorHistory records if sponsor: if isinstance(sponsor, str): sponsor = [sponsor] - for s in sponsor: - if s: - spon_hist_list = list(journal.sponsor_history.filter( - institution__institution_name=s - )) - for spon_hist in spon_hist_list: - spon_hist.raw_institution_name = s - - if spon_hist_list: - from journal.models import SponsorHistory - SponsorHistory.objects.bulk_update( - spon_hist_list, - ['raw_institution_name'] - ) + # Filter non-empty sponsor names + sponsor_names = [s for s in sponsor if s] + + if sponsor_names: + spon_hist_list = list(journal.sponsor_history.filter( + institution__institution_name__in=sponsor_names + )) + for spon_hist in spon_hist_list: + spon_hist.raw_institution_name = spon_hist.institution.institution_name + + if spon_hist_list: + from journal.models import SponsorHistory + SponsorHistory.objects.bulk_update( + spon_hist_list, + ['raw_institution_name'] + ) # Update CopyrightHolderHistory records if copyright_holder: if isinstance(copyright_holder, str): copyright_holder = [copyright_holder] - for cp in copyright_holder: - if cp: - cp_hist_list = list(journal.copyright_holder_history.filter( - institution__institution_name=cp - )) - for cp_hist in cp_hist_list: - cp_hist.raw_institution_name = cp - - if cp_hist_list: - from journal.models import CopyrightHolderHistory - CopyrightHolderHistory.objects.bulk_update( - cp_hist_list, - ['raw_institution_name'] - ) + # Filter non-empty copyright holder names + cp_names = [cp for cp in copyright_holder if cp] + + if cp_names: + cp_hist_list = list(journal.copyright_holder_history.filter( + institution__institution_name__in=cp_names + )) + for 
cp_hist in cp_hist_list: + cp_hist.raw_institution_name = cp_hist.institution.institution_name + + if cp_hist_list: + from journal.models import CopyrightHolderHistory + CopyrightHolderHistory.objects.bulk_update( + cp_hist_list, + ['raw_institution_name'] + ) processed_count += 1