Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 21 additions & 4 deletions journal/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1092,10 +1092,27 @@ def _add_institution_history(
# Populate RawOrganizationMixin fields
institution_history.raw_text = original_data
institution_history.raw_institution_name = raw_institution_name
institution_history.raw_country_name = raw_country_name
institution_history.raw_country_code = raw_country_code
institution_history.raw_state_name = raw_state_name
institution_history.raw_state_acron = raw_state_acron

if raw_country_code and raw_country_name:
institution_history.raw_country_name = raw_country_name
institution_history.raw_country_code = raw_country_code
elif raw_country_name or raw_country_code:
raw_country = raw_country_name or raw_country_code
if raw_country.upper() == raw_country and len(raw_country) == 2:
institution_history.raw_country_code = raw_country
Comment on lines +1100 to +1102
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The country-code heuristic treats only 2-letter uppercase strings as a code (`len(...) == 2`), but this project also stores ISO alpha-3 codes (`raw_country_code` has `max_length=3`, and code elsewhere uses `acron3` values such as `BRA`/`ARG`). As a result, values like `BRA` (or lowercase `br`/`bra`) will be misfiled into `raw_country_name`. Consider detecting 2–3 letter alphabetic codes case-insensitively and storing them uppercased.

Suggested change
raw_country = raw_country_name or raw_country_code
if raw_country.upper() == raw_country and len(raw_country) == 2:
institution_history.raw_country_code = raw_country
raw_country = (raw_country_name or raw_country_code).strip()
if raw_country and raw_country.isalpha() and 2 <= len(raw_country) <= 3:
institution_history.raw_country_code = raw_country.upper()

Copilot uses AI. Check for mistakes.
else:
institution_history.raw_country_name = raw_country

if raw_state_acron and raw_state_name:
institution_history.raw_state_name = raw_state_name
institution_history.raw_state_acron = raw_state_acron
elif raw_state_name or raw_state_acron:
raw_state = raw_state_name or raw_state_acron
if raw_state.upper() == raw_state and len(raw_state) == 2:
institution_history.raw_state_acron = raw_state
else:
institution_history.raw_state_name = raw_state
Comment on lines +1100 to +1114
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The value assigned by `raw_country = raw_country_name or raw_country_code` can be a non-string (e.g., `extract_value(...)` returns a list when there are multiple values). Calling `raw_country.upper()` will then raise `AttributeError` and break imports/tasks that call `add_publisher`/`add_owner` with only `raw_country_name`. Normalize `raw_country` to a string (e.g., pick the first element or join the elements) and `strip()` it before applying the case/length heuristics.

Suggested change
raw_country = raw_country_name or raw_country_code
if raw_country.upper() == raw_country and len(raw_country) == 2:
institution_history.raw_country_code = raw_country
else:
institution_history.raw_country_name = raw_country
if raw_state_acron and raw_state_name:
institution_history.raw_state_name = raw_state_name
institution_history.raw_state_acron = raw_state_acron
elif raw_state_name or raw_state_acron:
raw_state = raw_state_name or raw_state_acron
if raw_state.upper() == raw_state and len(raw_state) == 2:
institution_history.raw_state_acron = raw_state
else:
institution_history.raw_state_name = raw_state
raw_country_value = raw_country_name or raw_country_code
# Normalize raw_country to a string before applying heuristics
if isinstance(raw_country_value, (list, tuple)):
raw_country_normalized = ", ".join(str(item) for item in raw_country_value if item is not None)
else:
raw_country_normalized = str(raw_country_value)
raw_country_normalized = raw_country_normalized.strip()
if raw_country_normalized:
if (
raw_country_normalized.upper() == raw_country_normalized
and len(raw_country_normalized) == 2
):
institution_history.raw_country_code = raw_country_normalized
else:
institution_history.raw_country_name = raw_country_normalized
if raw_state_acron and raw_state_name:
institution_history.raw_state_name = raw_state_name
institution_history.raw_state_acron = raw_state_acron
elif raw_state_name or raw_state_acron:
raw_state_value = raw_state_name or raw_state_acron
# Normalize raw_state to a string before applying heuristics
if isinstance(raw_state_value, (list, tuple)):
raw_state_normalized = ", ".join(str(item) for item in raw_state_value if item is not None)
else:
raw_state_normalized = str(raw_state_value)
raw_state_normalized = raw_state_normalized.strip()
if raw_state_normalized:
if (
raw_state_normalized.upper() == raw_state_normalized
and len(raw_state_normalized) == 2
):
institution_history.raw_state_acron = raw_state_normalized
else:
institution_history.raw_state_name = raw_state_normalized

Copilot uses AI. Check for mistakes.
Comment on lines +1101 to +1114
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same heuristic issue exists for state: if only one value is provided and it is a lowercase acronym (e.g. `sp`) or a non-string (a list from `extract_value`), the current check `raw_state.upper() == raw_state and len(raw_state) == 2` will either misclassify it as a name or raise at runtime. Normalize the value to a string and strip it first, and consider treating 2-letter alphabetic values case-insensitively as acronyms (storing them uppercased).

Suggested change
if raw_country.upper() == raw_country and len(raw_country) == 2:
institution_history.raw_country_code = raw_country
else:
institution_history.raw_country_name = raw_country
if raw_state_acron and raw_state_name:
institution_history.raw_state_name = raw_state_name
institution_history.raw_state_acron = raw_state_acron
elif raw_state_name or raw_state_acron:
raw_state = raw_state_name or raw_state_acron
if raw_state.upper() == raw_state and len(raw_state) == 2:
institution_history.raw_state_acron = raw_state
else:
institution_history.raw_state_name = raw_state
raw_country_str = str(raw_country).strip()
if len(raw_country_str) == 2 and raw_country_str.isalpha():
institution_history.raw_country_code = raw_country_str.upper()
else:
institution_history.raw_country_name = raw_country_str
if raw_state_acron and raw_state_name:
institution_history.raw_state_name = raw_state_name
institution_history.raw_state_acron = raw_state_acron
elif raw_state_name or raw_state_acron:
raw_state = raw_state_name or raw_state_acron
raw_state_str = str(raw_state).strip()
if len(raw_state_str) == 2 and raw_state_str.isalpha():
institution_history.raw_state_acron = raw_state_str.upper()
else:
institution_history.raw_state_name = raw_state_str

Copilot uses AI. Check for mistakes.

Comment on lines +1101 to +1115
Copy link

Copilot AI Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change adds new branching behavior to interpret a single `raw_country_*` / `raw_state_*` input (code vs. name), but the tests cover only the case where both name and code are provided. Add regression tests for code-only inputs (2- and 3-letter, mixed-case and lowercase) and for list inputs returned by `extract_value`, so that imports/tasks don't crash and values land in the intended fields.

Suggested change
if raw_country.upper() == raw_country and len(raw_country) == 2:
institution_history.raw_country_code = raw_country
else:
institution_history.raw_country_name = raw_country
if raw_state_acron and raw_state_name:
institution_history.raw_state_name = raw_state_name
institution_history.raw_state_acron = raw_state_acron
elif raw_state_name or raw_state_acron:
raw_state = raw_state_name or raw_state_acron
if raw_state.upper() == raw_state and len(raw_state) == 2:
institution_history.raw_state_acron = raw_state
else:
institution_history.raw_state_name = raw_state
# Handle list/tuple inputs (e.g., from extract_value) by taking the first element.
if isinstance(raw_country, (list, tuple)):
raw_country = raw_country[0] if raw_country else None
if raw_country is not None:
# Normalize to string for classification.
raw_country_str = str(raw_country).strip()
if raw_country_str and raw_country_str.isalpha() and len(raw_country_str) in (2, 3):
# Treat 2- or 3-letter alphabetic values as country codes, normalizing to upper case.
institution_history.raw_country_code = raw_country_str.upper()
else:
institution_history.raw_country_name = raw_country_str
if raw_state_acron and raw_state_name:
institution_history.raw_state_name = raw_state_name
institution_history.raw_state_acron = raw_state_acron
elif raw_state_name or raw_state_acron:
raw_state = raw_state_name or raw_state_acron
# Handle list/tuple inputs (e.g., from extract_value) by taking the first element.
if isinstance(raw_state, (list, tuple)):
raw_state = raw_state[0] if raw_state else None
if raw_state is not None:
# Normalize to string for classification.
raw_state_str = str(raw_state).strip()
if raw_state_str and raw_state_str.isalpha() and len(raw_state_str) == 2:
# Treat 2-letter alphabetic values as state acronyms, normalizing to upper case.
institution_history.raw_state_acron = raw_state_str.upper()
else:
institution_history.raw_state_name = raw_state_str

Copilot uses AI. Check for mistakes.
institution_history.raw_city_name = raw_city_name

institution_history.save()
Expand Down
Loading