diff --git a/VERSION b/VERSION index 26ca594..dc1e644 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.5.1 +1.6.0 diff --git a/article/migrations/0002_alter_article_unique_together_article_files_and_more.py b/article/migrations/0002_alter_article_unique_together_article_files_and_more.py new file mode 100644 index 0000000..cee055c --- /dev/null +++ b/article/migrations/0002_alter_article_unique_together_article_files_and_more.py @@ -0,0 +1,42 @@ +# Generated by Django 5.0.7 on 2025-04-01 01:09 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("article", "0001_initial"), + ("collection", "0001_initial"), + ] + + operations = [ + migrations.AddField( + model_name="article", + name="files", + field=models.JSONField( + blank=True, default=dict, null=True, verbose_name="Files" + ), + ), + migrations.AddField( + model_name="article", + name="pid_generic", + field=models.CharField( + blank=True, + db_index=True, + max_length=50, + null=True, + verbose_name="PID Generic", + ), + ), + migrations.RemoveField( + model_name="article", + name="pdfs", + ), + migrations.AlterUniqueTogether( + name="article", + unique_together={ + ("collection", "scielo_issn", "pid_v2", "pid_v3", "pid_generic") + }, + ), + + ] diff --git a/article/models.py b/article/models.py index 8d1e34b..5eaf11e 100644 --- a/article/models.py +++ b/article/models.py @@ -39,8 +39,16 @@ class Article(CommonControlField): db_index=True, ) - pdfs = models.JSONField( - verbose_name=_('Format with Language'), + pid_generic = models.CharField( + verbose_name=_('PID Generic'), + max_length=50, + blank=True, + null=True, + db_index=True, + ) + + files = models.JSONField( + verbose_name=_('Files'), null=True, blank=True, default=dict, @@ -92,9 +100,10 @@ def metadata(cls, collection=None): yield { 'collection': a.collection.acron3, 'default_lang': a.default_lang, - 'pdfs': a.pdfs, + 'files': a.files, 'pid_v2': a.pid_v2, 'pid_v3': a.pid_v3, + 'pid_generic': 
a.pid_generic, 'processing_date': a.processing_date, 'publication_date': a.publication_date, 'publication_year': a.publication_year, @@ -110,4 +119,5 @@ class Meta: 'scielo_issn', 'pid_v2', 'pid_v3', + 'pid_generic', ) diff --git a/article/tasks.py b/article/tasks.py index 55a5be1..0626348 100644 --- a/article/tasks.py +++ b/article/tasks.py @@ -6,7 +6,7 @@ from collection.models import Collection from config import celery_app -from core.utils import date_utils +from core.utils import date_utils, standardizer from core.utils.utils import _get_user from journal.models import Journal @@ -54,7 +54,7 @@ def task_load_article_from_article_meta(self, from_date=None, until_date=None, d article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=jou.scielo_issn, pid_v2=obj.get('code')) if created or force_update: - article.pdfs = obj.get('pdfs') or {} + article.files = obj.get('files') or {} article.processing_date = obj.get('processing_date') or '' article.publication_date = obj.get('publication_date') or '' article.publication_year = obj.get('publication_year') or '' @@ -119,12 +119,12 @@ def task_load_article_from_opac(self, collection='scl', from_date=None, until_da return True -@celery_app.task(bind=True, name=_('Load preprint data from Preprints Server'), timelimit=-1) +@celery_app.task(bind=True, name=_('Load preprint data from SciELO Preprints'), timelimit=-1) def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None): user = _get_user(self.request, username=username, user_id=user_id) from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - logging.info(f'Loading preprints from Preprints Server. From: {from_date}, Until: {until_date}') + logging.info(f'Loading preprints from SciELO Preprints. 
From: {from_date}, Until: {until_date}') col_obj = Collection.objects.get(acron3='preprints') if not col_obj: @@ -134,12 +134,11 @@ def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None for record in utils.fetch_preprint_oai_pmh(from_date, until_date): data = utils.extract_preprint_data(record) - if not data.get('pid_v2'): - logging.error(f'PIDv2 not found in record: {record}') + if not data.get('pid_generic'): + logging.error(f'Preprint ID not found in record: {record}') continue - # Currently, we are using the record.header.identifier as the PIDv2 - article, created = models.Article.objects.get_or_create(collection=col_obj, pid_v2=data['pid_v2']) + article, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=data['pid_generic']) if created or force_update: article.text_langs = data.get('text_langs') article.default_lang = data.get('default_language') @@ -151,3 +150,39 @@ def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None article.save() logging.debug(f'Article {"created" if created else "updated"}: {article}') + + +@celery_app.task(bind=True, name=_('Load dataset metadata from Dataverse'), timelimit=-1) +def task_load_dataset_metadata_from_dataverse(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None): + user = _get_user(self.request, username=username, user_id=user_id) + + from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) + logging.info(f'Loading dataset metadata from SciELO Data. 
From: {from_date}, Until: {until_date}') + + col_obj = Collection.objects.get(acron3='data') + if not col_obj: + logging.error(f'Collection not found: data') + return False + + for record in utils.fetch_dataverse_metadata(from_date, until_date): + dataset_doi = record.get('dataset_doi') + if not dataset_doi: + logging.error(f'Dataset DOI not found in record: {record}') + continue + + dataset, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=dataset_doi) + if created or force_update: + dataset.publication_date = record.get('dataset_published') + + file_persistent_id = record.get('file_persistent_id') + file_id = record.get('file_id') + file_name = record.get('file_name') + file_url = record.get('file_url') + + if file_id: + dataset.files[file_id] = {'name': file_name, 'url': file_url, 'file_persistent_id': file_persistent_id} + + dataset.save() + logging.debug(f'Dataset {"created" if created else "updated"}: {dataset}') + + return True diff --git a/article/utils.py b/article/utils.py index 9a03bdd..0814bbb 100644 --- a/article/utils.py +++ b/article/utils.py @@ -1,4 +1,3 @@ -import langcodes import logging import requests import os @@ -6,19 +5,26 @@ from sickle import Sickle from time import sleep +from core.utils import standardizer + ARTICLEMETA_ENDPOINT = os.environ.get('ARTICLEMETA_COLLECT_URL', 'http://articlemeta.scielo.org/api/v1/article/counter_dict') -ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_COLLECT_MAX_RETRIES', 5)) -ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_COLLECT_URL_SLEEP_TIME', 30)) +ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_MAX_RETRIES', 5)) +ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_SLEEP_TIME', 30)) OPAC_ENDPOINT = os.environ.get('OPAC_ENDPOINT', 'https://www.scielo.br/api/v1/counter_dict') -OPAC_MAX_RETRIES = 
int(os.environ.get('OPAC_MAX_RETRIES', 5)) +OPAC_SLEEP_TIME = int(os.environ.get('OPAC_SLEEP_TIME', 30)) OAI_PMH_PREPRINT_ENDPOINT = os.environ.get('OAI_PMH_PREPRINT_ENDPOINT', 'https://preprints.scielo.org/index.php/scielo/oai') OAI_METADATA_PREFIX = os.environ.get('OAI_METADATA_PREFIX', 'oai_dc') OAI_PMH_MAX_RETRIES = int(os.environ.get('OAI_PMH_MAX_RETRIES', 5)) +DATAVERSE_ENDPOINT = os.environ.get('DATAVERSE_ENDPOINT', 'https://data.scielo.org/api') +DATAVERSE_ROOT_COLLECTION = os.environ.get('DATAVERSE_ROOT_COLLECTION', 'scielodata') +DATAVERSE_MAX_RETRIES = int(os.environ.get('DATAVERSE_MAX_RETRIES', 5)) +DATAVERSE_SLEEP_TIME = int(os.environ.get('DATAVERSE_SLEEP_TIME', 30)) + def fetch_article_meta_dict(from_date, until_date, offset=0, limit=1000, collection=None, issn=None): for t in range(1, ARTICLEMETA_MAX_RETRIES + 1): @@ -91,14 +97,14 @@ def fetch_preprint_oai_pmh(from_date, until_date): def extract_preprint_data(record): - pid_v2 = _extract_preprint_compatible_identifer(record.header.identifier) - text_langs = [_standardize_langcode(l) for l in record.metadata.get('language', [])] + pid_generic = _extract_preprint_compatible_identifer(record.header.identifier) + text_langs = [standardizer.standardize_language_code(l) for l in record.metadata.get('language', [])] publication_date = record.metadata.get('date', [''])[0] default_language = text_langs[0] if text_langs else '' publication_year = _extract_preprint_publication_year_from_date(publication_date) data = { - 'pid_v2': pid_v2, + 'pid_generic': pid_generic, 'text_langs': text_langs, 'publication_date': publication_date, 'default_language': default_language, @@ -124,12 +130,75 @@ def _extract_preprint_publication_year_from_date(date_str): return '' -def _standardize_langcode(language): - if langcodes.tag_is_valid(language): - return langcodes.standardize_tag(language) +def fetch_dataverse_metadata(from_date=None, until_date=None): + def get_subdataverses(): + url = 
f"{DATAVERSE_ENDPOINT}/dataverses/{DATAVERSE_ROOT_COLLECTION}/contents" + try: + response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME) + response.raise_for_status() + return response.json().get("data", []) + except requests.exceptions.RequestException as e: + logging.error(f"Error fetching subdataverses: {e}") + return [] - logging.warning(f'Tentando padronizar {language}') - inferred_lang, score = langcodes.best_match(language, langcodes.LANGUAGE_ALPHA3.keys()) - - if score >= 0.75: - return langcodes.standardize_tag(inferred_lang) + def get_datasets(subdataverse_id): + url = f"{DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents" + try: + response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME) + response.raise_for_status() + return response.json().get("data", []) + except requests.exceptions.RequestException as e: + logging.error(f"Error fetching datasets for subdataverse {subdataverse_id}: {e}") + return [] + + def get_files(dataset_id): + url = f"{DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files" + try: + response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME) + response.raise_for_status() + return response.json().get("data", []) + except requests.exceptions.RequestException as e: + logging.error(f"Error fetching files for dataset {dataset_id}: {e}") + return [] + + subdataverses = get_subdataverses() + + for subdataverse in subdataverses: + if subdataverse["type"] != "dataverse": + continue + + subdataverse_id = subdataverse["id"] + subdataverse_title = subdataverse["title"] + datasets = get_datasets(subdataverse_id) + + for dataset in datasets: + if dataset["type"] != "dataset": + continue + + dataset_id = dataset["id"] + doi = standardizer.standardize_doi(dataset.get("persistentUrl")) + if not doi: + logging.warning(f"Dataset {dataset_id} does not have a DOI.") + continue + + publication_date = dataset.get("publicationDate", None) + + if publication_date: + if (from_date and publication_date < from_date) or (until_date and 
publication_date > until_date): + continue + + files = get_files(dataset_id) + + for file in files: + file_persistent_id = file["dataFile"].get("persistentId", None) + file_persistent_id_stz = standardizer.standardize_pid_generic(file_persistent_id) if file_persistent_id else None + + yield { + "title": subdataverse_title, + "dataset_doi": doi, + "dataset_published": publication_date, + "file_id": file["dataFile"]["id"], + "file_name": file["label"], + "file_url": f"{DATAVERSE_ENDPOINT}/access/datafile/{file['dataFile']['id']}", + "file_persistent_id": file_persistent_id_stz, + } diff --git a/article/wagtail_hooks.py b/article/wagtail_hooks.py index 70efcb7..777aa02 100644 --- a/article/wagtail_hooks.py +++ b/article/wagtail_hooks.py @@ -20,12 +20,9 @@ class ArticleSnippetViewSet(SnippetViewSet): "scielo_issn", "pid_v2", "pid_v3", - "pdfs", - "default_lang", - "text_langs", - "processing_date", + "pid_generic", + "files", "publication_date", - "publication_year", ) list_filter = ( "collection", @@ -36,6 +33,7 @@ class ArticleSnippetViewSet(SnippetViewSet): "scielo_issn", "pid_v2", "pid_v3", + "pid_generic", ) register_snippet(ArticleSnippetViewSet) diff --git a/core/tests_standardizer.py b/core/tests_standardizer.py index 48fdff1..a12f88a 100644 --- a/core/tests_standardizer.py +++ b/core/tests_standardizer.py @@ -64,3 +64,76 @@ def test_standardize_name(self): for i, item in enumerate(result): with self.subTest(i): self.assertEqual({"name": expected[i]}, item) + + +class StandardizerStandardizeLanguageCode(TestCase): + def test_standardize_language_code_en_us_is_valid(self): + language_code = 'en-US' + standardized = standardizer.standardize_language_code(language_code) + self.assertEqual(standardized, 'en') + + def test_standardize_language_code_esp_is_valid(self): + language_code = 'esp' + standardized = standardizer.standardize_language_code(language_code) + self.assertEqual(standardized, 'es') + + def test_standardize_language_code_pt_br_is_valid(self): + 
language_code = 'pt-BR' + standardized = standardizer.standardize_language_code(language_code) + self.assertEqual(standardized, 'pt') + + def test_standardize_language_code_es_is_valid(self): + language_code = 'spa' + standardized = standardizer.standardize_language_code(language_code) + self.assertEqual(standardized, 'es') + + def test_standardize_language_code_en_gb_is_valid(self): + language_code = 'en-GB' + standardized = standardizer.standardize_language_code(language_code) + self.assertEqual(standardized, 'en') + + +class StandardizerStandardizePIDV3(TestCase): + def test_standardize_pid_v3_is_valid(self): + pid_v3 = 'jGJccQ7bFdbz6wy3nfXGVdv' + standardized = standardizer.standardize_pid_v3(pid_v3) + self.assertEqual(standardized, 'jGJccQ7bFdbz6wy3nfXGVdv') + + +class StandardizerStandardizePIDV2(TestCase): + def test_standardize_pid_v2_is_valid(self): + pid_v2 = 'S0102-67202020000100001' + standardized = standardizer.standardize_pid_v2(pid_v2) + self.assertEqual(standardized, 'S0102-67202020000100001') + + +class StandardizerStandardizeDOI(TestCase): + def test_standardize_doi_is_valid(self): + doi = '10.1590/S0102-67202020000100001' + standardized = standardizer.standardize_doi(doi) + self.assertEqual(standardized, '10.1590/S0102-67202020000100001') + + def test_standardize_doi_is_valid_with_doi_prefix(self): + doi = 'doi:10.1590/S0102-67202020000100001' + standardized = standardizer.standardize_doi(doi) + self.assertEqual(standardized, '10.1590/S0102-67202020000100001') + + def test_standardize_doi_is_valid_with_http_prefix(self): + doi = 'http://doi.org/10.1590/S0102-67202020000100001' + standardized = standardizer.standardize_doi(doi) + self.assertEqual(standardized, '10.1590/S0102-67202020000100001') + + def test_standardize_doi_is_valid_with_https_prefix(self): + doi = 'https://doi.org/10.1590/S0102-67202020000100001' + standardized = standardizer.standardize_doi(doi) + self.assertEqual(standardized, '10.1590/S0102-67202020000100001') + + def 
test_standardize_doi_is_valid_with_doi_prefix_and_http_prefix(self): + doi = 'doi:http://doi.org/10.1590/S0102-67202020000100001' + standardized = standardizer.standardize_doi(doi) + self.assertEqual(standardized, '10.1590/S0102-67202020000100001') + + def test_standardize_doi_is_valid_with_doi_prefix_and_https_prefix(self): + doi = 'doi:https://doi.org/10.1590/S0102-67202020000100001' + standardized = standardizer.standardize_doi(doi) + self.assertEqual(standardized, '10.1590/S0102-67202020000100001') diff --git a/core/utils/standardizer.py b/core/utils/standardizer.py index a8f0691..dda02e9 100644 --- a/core/utils/standardizer.py +++ b/core/utils/standardizer.py @@ -1,4 +1,5 @@ -import logging +import langcodes +import re ITEMS_SEP_FOR_LOCATION = [";", ", ", "|", "/"] @@ -89,3 +90,131 @@ def standardize_name(original): if not row: continue yield {"name": row} + + +def standardize_language_code(language_code: str, threshold=0.75): + """ + Standardizes a media language using langcodes library. + + Parameters: + media_language (str): The media language to be standardized. + threshold (float): The minimum score for a language to be considered valid. Default is 0.75. + + Returns: + str: The standardized media language or None if the input is not a valid language tag. + """ + if not language_code: + return 'un' + + if langcodes.tag_is_valid(language_code): + return langcodes.standardize_tag(language_code).split('-')[0] + + # Handle special cases + if language_code.lower() == 'esp': + return 'es' + + inferred_lang, score = langcodes.best_match(language_code, langcodes.LANGUAGE_ALPHA3.keys()) + + if score >= threshold: + return langcodes.standardize_tag(inferred_lang).split('-')[0] + + # Handle unknown languages + return 'un' + + +def standardize_pid_v2(pid_v2): + """ + Standardizes a PID v2. + + Parameters: + pid_v2 (str): The PID v2 to be standardized. + + Returns: + str: The standardized PID v2 or an empty string if the input is not a valid PID v2. 
+ """ + if not pid_v2 or not pid_v2.lower().startswith('s') or len(pid_v2) < 23: + return '' + + if len(pid_v2) == 23: + return pid_v2[0].upper() + pid_v2[1:] + + if len(pid_v2) > 23: + return pid_v2[0].upper() + pid_v2[1:23] + + if len(pid_v2) < 23: + return '' + + +def standardize_pid_v3(pid_v3): + """ + Standardizes a PID v3 using langcodes library." + + Parameters: + pid_v3 (str): The PID v3 to be standardized. + + Returns: + str: The standardized PID v3 or an empty string if the input is not a valid PID v3. + """ + + if not pid_v3: + return '' + + if len(pid_v3) == 23: + return pid_v3 + + if len(pid_v3) > 23: + return pid_v3[:23] + + if len(pid_v3) < 23: + return '' + + +def standardize_doi(text): + """" + Standardizes a DOI. + + Parameters: + text (str): The DOI to be standardized. + + Returns: + str: The standardized DOI + """ + PATTERNS_DOI = [re.compile(pd) for pd in [ + r'10.\d{4,9}/[-._;()/:A-Z0-9]+$', + r'10.1002/[^\s]+$', + r'10.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$', + r'10.1207/[\w\d]+\&\d+_\d+$', + r'10.\d{4,9}/[-._;()/:a-zA-Z0-9]*'] + ] + matched_doi = False + + for pattern_doi in PATTERNS_DOI: + matched_doi = pattern_doi.search(text) + if matched_doi: + break + + if not matched_doi: + return + + return matched_doi.group().upper() + + +def standardize_pid_generic(pid_generic): + """ + Standardizes a PID." + + Parameters: + pid_generic (str): The PID to be standardized. + + Returns: + str: The standardized PID or an empty string if the input is not a valid PID. 
+ """ + + if not pid_generic: + return '' + + pid_generic_based_on_doi = standardize_doi(pid_generic) + if pid_generic_based_on_doi: + return pid_generic_based_on_doi + + return pid_generic.strip().upper() diff --git a/metrics/migrations/0006_alter_itemaccess_content_type.py b/metrics/migrations/0006_alter_itemaccess_content_type.py new file mode 100644 index 0000000..0e81287 --- /dev/null +++ b/metrics/migrations/0006_alter_itemaccess_content_type.py @@ -0,0 +1,17 @@ +# Generated by Django 5.0.7 on 2025-03-31 21:07 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("metrics", "0005_alter_itemaccess_unique_together_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="itemaccess", + name="content_type", + field=models.CharField(max_length=32, verbose_name="Content Type"), + ), + ] diff --git a/metrics/models.py b/metrics/models.py index 1dfe3fa..40179a7 100644 --- a/metrics/models.py +++ b/metrics/models.py @@ -36,7 +36,7 @@ class Item(models.Model): def __str__(self): return '|'.join([ - self.collection.acron2, + self.collection.acron3, self.journal.acronym, self.article.pid_v2, ]) @@ -175,7 +175,7 @@ class ItemAccess(models.Model): content_type = models.CharField( verbose_name=_("Content Type"), - max_length=16, + max_length=32, null=False, blank=False, ) @@ -187,7 +187,7 @@ class ItemAccess(models.Model): def __str__(self): return '|'.join([ - self.item.collection.acron2, + self.item.collection.acron3, self.item.journal.acronym, self.item.article.pid_v2, self.user_session.user_session(), diff --git a/metrics/tasks.py b/metrics/tasks.py index 7bca425..322e6e5 100644 --- a/metrics/tasks.py +++ b/metrics/tasks.py @@ -13,6 +13,7 @@ from config import celery_app from article.models import Article +from core.utils import standardizer from journal.models import Journal from log_manager import choices from log_manager_config.models import ( @@ -25,9 +26,6 @@ from tracker import choices as 
tracker_choices from .utils import ( - standardize_media_language, - standardize_pid_v2, - standardize_pid_v3, is_valid_item_access_data, translator_class_name_to_obj, ) @@ -150,13 +148,14 @@ def _process_line(line, utm, log_file): except Exception as e: _log_discarded_line(log_file, line, tracker_choices.LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION, str(e)) return False - + item_access_data = { 'collection': log_file.collection, 'scielo_issn': translated_url.get('scielo_issn'), - 'pid_v2': standardize_pid_v2(translated_url.get('pid_v2')), - 'pid_v3': standardize_pid_v3(translated_url.get('pid_v3')), - 'media_language': standardize_media_language(translated_url.get('media_language')), + 'pid_v2': standardizer.standardize_pid_v2(translated_url.get('pid_v2')), + 'pid_v3': standardizer.standardize_pid_v3(translated_url.get('pid_v3')), + 'pid_generic': standardizer.standardize_pid_generic(translated_url.get('pid_generic')), + 'media_language': standardizer.standardize_language_code(translated_url.get('media_language')), 'media_format': translated_url.get('media_format'), 'content_type': translated_url.get('content_type'), } @@ -179,6 +178,7 @@ def _register_item_access(item_access_data, line, log_file): scielo_issn = item_access_data.get('scielo_issn') pid_v2 = item_access_data.get('pid_v2') pid_v3 = item_access_data.get('pid_v3') + pid_generic = item_access_data.get('pid_generic') media_format = item_access_data.get('media_format') media_language = item_access_data.get('media_language') content_type = item_access_data.get('content_type') @@ -190,7 +190,7 @@ def _register_item_access(item_access_data, line, log_file): country_code = line.get('country_code') ip_address = line.get('ip_address') - art_obj = _fetch_article(collection, pid_v2, pid_v3, log_file, line) + art_obj = _fetch_article(collection, pid_v2, pid_v3, pid_generic, log_file, line) if not art_obj: return @@ -215,8 +215,10 @@ def _register_item_access(item_access_data, line, log_file): ita.save() -def 
_fetch_article(collection, pid_v2, pid_v3, log_file, line): +def _fetch_article(collection, pid_v2, pid_v3, pid_generic, log_file, line): try: + if pid_generic: + return Article.objects.get(Q(collection=collection) & Q(pid_generic=pid_generic)) return Article.objects.get(Q(collection=collection) & (Q(pid_v2=pid_v2) | Q(pid_v3=pid_v3))) except Article.DoesNotExist: _log_discarded_line( diff --git a/metrics/tests/test_utils.py b/metrics/tests/test_utils.py index 07d3583..cff448e 100644 --- a/metrics/tests/test_utils.py +++ b/metrics/tests/test_utils.py @@ -7,52 +7,13 @@ CONTENT_TYPE_UNDEFINED, CONTENT_TYPE_FULL_TEXT, CONTENT_TYPE_ABSTRACT, + DEFAULT_SCIELO_ISSN, ) -from metrics.utils import ( - standardize_media_language, - standardize_pid_v2, - standardize_pid_v3, - is_valid_item_access_data, -) +from metrics.utils import is_valid_item_access_data class TestUtils(unittest.TestCase): - def test_standardize_media_language_en_us_is_valid(self): - media_language = 'en-US' - standardized = standardize_media_language(media_language) - self.assertEqual(standardized, 'en') - - def test_standardize_media_language_esp_is_valid(self): - media_language = 'esp' - standardized = standardize_media_language(media_language) - self.assertEqual(standardized, 'es') - - def test_standardize_media_language_pt_br_is_valid(self): - media_language = 'pt-BR' - standardized = standardize_media_language(media_language) - self.assertEqual(standardized, 'pt') - - def test_standardize_media_language_es_is_valid(self): - media_language = 'spa' - standardized = standardize_media_language(media_language) - self.assertEqual(standardized, 'es') - - def test_standardize_media_language_en_gb_is_valid(self): - media_language = 'en-GB' - standardized = standardize_media_language(media_language) - self.assertEqual(standardized, 'en') - - def test_standardize_pid_v3_is_valid(self): - pid_v3 = 'jGJccQ7bFdbz6wy3nfXGVdv' - standardized = standardize_pid_v3(pid_v3) - self.assertEqual(standardized, 
'jGJccQ7bFdbz6wy3nfXGVdv') - - def test_standardize_pid_v2_is_valid(self): - pid_v2 = 'S0102-67202020000100001' - standardized = standardize_pid_v2(pid_v2) - self.assertEqual(standardized, 'S0102-67202020000100001') - def test_is_valid_item_access_data_valid(self): data = { 'scielo_issn': '1234-5678', @@ -122,3 +83,14 @@ def test_is_valid_item_access_data_content_type_abstract(self): 'content_type': CONTENT_TYPE_ABSTRACT } self.assertTrue(is_valid_item_access_data(data)) + + def test_is_valid_item_acess_data_dataverse(self): + data = { + 'scielo_issn': DEFAULT_SCIELO_ISSN, + 'pid_v2': None, + 'pid_v3': None, + 'pid_generic': 'DOI:10.48331/SCIELODATA.JLMAIY', + 'media_format': MEDIA_FORMAT_HTML, + 'content_type': CONTENT_TYPE_ABSTRACT, + } + self.assertTrue(is_valid_item_access_data(data)) \ No newline at end of file diff --git a/metrics/utils.py b/metrics/utils.py index 2820516..e170e07 100644 --- a/metrics/utils.py +++ b/metrics/utils.py @@ -1,6 +1,5 @@ import csv import io -import langcodes import tarfile from scielo_usage_counter.values import ( @@ -84,83 +83,6 @@ def load_tar_gz(file_path, delimiter='\t'): ) -def standardize_media_language(media_language: str, threshold=0.75): - """ - Standardizes a media language using langcodes library. - - Parameters: - media_language (str): The media language to be standardized. - threshold (float): The minimum score for a language to be considered valid. Default is 0.75. - - Returns: - str: The standardized media language or None if the input is not a valid language tag. 
- """ - if not media_language: - return 'un' - - if langcodes.tag_is_valid(media_language): - return langcodes.standardize_tag(media_language).split('-')[0] - - # Handle special cases - if media_language.lower() == 'esp': - return 'es' - - inferred_lang, score = langcodes.best_match(media_language, langcodes.LANGUAGE_ALPHA3.keys()) - - if score >= threshold: - return langcodes.standardize_tag(inferred_lang).split('-')[0] - - # Handle unknown languages - return 'un' - - -def standardize_pid_v2(pid_v2): - """ - Standardizes a PID v2. - - Parameters: - pid_v2 (str): The PID v2 to be standardized. - - Returns: - str: The standardized PID v2 or an empty string if the input is not a valid PID v2. - """ - if not pid_v2 or not pid_v2.lower().startswith('s') or len(pid_v2) < 23: - return '' - - if len(pid_v2) == 23: - return pid_v2[0].upper() + pid_v2[1:] - - if len(pid_v2) > 23: - return pid_v2[0].upper() + pid_v2[1:23] - - if len(pid_v2) < 23: - return '' - - -def standardize_pid_v3(pid_v3): - """ - Standardizes a PID v3 using langcodes library." - - Parameters: - pid_v3 (str): The PID v3 to be standardized. - - Returns: - str: The standardized PID v3 or an empty string if the input is not a valid PID v3. - """ - - if not pid_v3: - return '' - - if len(pid_v3) == 23: - return pid_v3 - - if len(pid_v3) > 23: - return pid_v3[:23] - - if len(pid_v3) < 23: - return '' - - def is_valid_item_access_data(data): """ Validates the item access data based on the provided parameters. 
@@ -184,12 +106,13 @@ def is_valid_item_access_data(data): content_type = data.get('content_type') pid_v2 = data.get('pid_v2') pid_v3 = data.get('pid_v3') + pid_generic = data.get('pid_generic') if not all([ scielo_issn, media_format and media_format != MEDIA_FORMAT_UNDEFINED, content_type and content_type != CONTENT_TYPE_UNDEFINED, - pid_v2 or pid_v3 + pid_v2 or pid_v3 or pid_generic, ]): return False return True diff --git a/requirements/base.txt b/requirements/base.txt index 9c3fce3..95b68bf 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -67,7 +67,7 @@ reverse-geocode==1.6 # https://pypi.org/project/reverse-geocode/ # SciELO Usage COUNTER device-detector==0.10 # https://github.com/thinkwelltwd/device_detector --e git+https://github.com/scieloorg/scielo_usage_counter@1.2.3#egg=scielo_usage_counter +-e git+https://github.com/scieloorg/scielo_usage_counter@1.3.0#egg=scielo_usage_counter # langcodes # ------------------------------------------------------------------------------