2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
1.5.1
1.6.0
42 changes: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
# Generated by Django 5.0.7 on 2025-04-01 01:09

from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
("article", "0001_initial"),
("collection", "0001_initial"),
]

operations = [
migrations.AddField(
model_name="article",
name="files",
field=models.JSONField(
blank=True, default=dict, null=True, verbose_name="Files"
),
),
migrations.AddField(
model_name="article",
name="pid_generic",
field=models.CharField(
blank=True,
db_index=True,
max_length=50,
null=True,
verbose_name="PID Generic",
),
),
migrations.RemoveField(
model_name="article",
name="pdfs",
),
migrations.AlterUniqueTogether(
name="article",
unique_together={
("collection", "scielo_issn", "pid_v2", "pid_v3", "pid_generic")
},
),

]
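Note that pdfs is removed in the same migration that adds files, so any existing PDF metadata is dropped rather than migrated. If carrying the data across were wanted, a RunPython operation could be placed between the AddField for files and the RemoveField for pdfs. A minimal sketch, assuming the legacy pdfs values should simply move into files as-is:

def copy_pdfs_to_files(apps, schema_editor):
    # Copy the legacy pdfs JSON into the new files field before pdfs is dropped.
    Article = apps.get_model("article", "Article")
    for article in Article.objects.exclude(pdfs=None):
        article.files = article.pdfs
        article.save(update_fields=["files"])

# Placed in operations between AddField("files") and RemoveField("pdfs"):
# migrations.RunPython(copy_pdfs_to_files, reverse_code=migrations.RunPython.noop),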
16 changes: 13 additions & 3 deletions article/models.py
@@ -39,8 +39,16 @@ class Article(CommonControlField):
db_index=True,
)

pdfs = models.JSONField(
verbose_name=_('Format with Language'),
pid_generic = models.CharField(
verbose_name=_('PID Generic'),
max_length=50,
blank=True,
null=True,
db_index=True,
)

files = models.JSONField(
verbose_name=_('Files'),
null=True,
blank=True,
default=dict,
@@ -92,9 +100,10 @@ def metadata(cls, collection=None):
yield {
'collection': a.collection.acron3,
'default_lang': a.default_lang,
'pdfs': a.pdfs,
'files': a.files,
'pid_v2': a.pid_v2,
'pid_v3': a.pid_v3,
'pid_generic': a.pid_generic,
'processing_date': a.processing_date,
'publication_date': a.publication_date,
'publication_year': a.publication_year,
@@ -110,4 +119,5 @@ class Meta:
'scielo_issn',
'pid_v2',
'pid_v3',
'pid_generic',
)
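For orientation, a hypothetical example of what files ends up holding for a Dataverse dataset, mirroring the keys that task_load_dataset_metadata_from_dataverse in article/tasks.py stores (the id, name, and DOI values are invented):

article.files = {
    "123456": {  # Dataverse file id; JSON serialization stores keys as strings
        "name": "metadata.csv",
        "url": "https://data.scielo.org/api/access/datafile/123456",
        "file_persistent_id": "doi:10.48331/scielodata.EXAMPLE/ABC123",
    },
}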
51 changes: 43 additions & 8 deletions article/tasks.py
@@ -6,7 +6,7 @@

from collection.models import Collection
from config import celery_app
from core.utils import date_utils
from core.utils import date_utils, standardizer
from core.utils.utils import _get_user

from journal.models import Journal
@@ -54,7 +54,7 @@ def task_load_article_from_article_meta(self, from_date=None, until_date=None, d

article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=jou.scielo_issn, pid_v2=obj.get('code'))
if created or force_update:
article.pdfs = obj.get('pdfs') or {}
article.files = obj.get('files') or {}
article.processing_date = obj.get('processing_date') or ''
article.publication_date = obj.get('publication_date') or ''
article.publication_year = obj.get('publication_year') or ''
@@ -119,12 +119,12 @@ def task_load_article_from_opac(self, collection='scl', from_date=None, until_da
return True


@celery_app.task(bind=True, name=_('Load preprint data from Preprints Server'), timelimit=-1)
@celery_app.task(bind=True, name=_('Load preprint data from SciELO Preprints'), timelimit=-1)
def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None):
user = _get_user(self.request, username=username, user_id=user_id)

from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
logging.info(f'Loading preprints from Preprints Server. From: {from_date}, Until: {until_date}')
logging.info(f'Loading preprints from SciELO Preprints. From: {from_date}, Until: {until_date}')

col_obj = Collection.objects.get(acron3='preprints')
if not col_obj:
@@ -134,12 +134,11 @@ def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None
for record in utils.fetch_preprint_oai_pmh(from_date, until_date):
data = utils.extract_preprint_data(record)

if not data.get('pid_v2'):
logging.error(f'PIDv2 not found in record: {record}')
if not data.get('pid_generic'):
logging.error(f'Preprint ID not found in record: {record}')
continue

# Currently, we are using the record.header.identifier as the PIDv2
article, created = models.Article.objects.get_or_create(collection=col_obj, pid_v2=data['pid_v2'])
article, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=data['pid_generic'])
if created or force_update:
article.text_langs = data.get('text_langs')
article.default_lang = data.get('default_language')
@@ -151,3 +150,39 @@ def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None

article.save()
logging.debug(f'Article {"created" if created else "updated"}: {article}')


@celery_app.task(bind=True, name=_('Load dataset metadata from Dataverse'), timelimit=-1)
def task_load_dataset_metadata_from_dataverse(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None):
user = _get_user(self.request, username=username, user_id=user_id)

from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
logging.info(f'Loading dataset metadata from SciELO Data. From: {from_date}, Until: {until_date}')

col_obj = Collection.objects.get(acron3='data')
if not col_obj:
logging.error(f'Collection not found: data')
return False

for record in utils.fetch_dataverse_metadata(from_date, until_date):
dataset_doi = record.get('dataset_doi')
if not dataset_doi:
logging.error(f'Dataset DOI not found in record: {record}')
continue

dataset, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=dataset_doi)
if created or force_update:
dataset.publication_date = record.get('dataset_published')

file_persistent_id = record.get('file_persistent_id')
file_id = record.get('file_id')
file_name = record.get('file_name')
file_url = record.get('file_url')

if file_id:
dataset.files[file_id] = {'name': file_name, 'url': file_url, 'file_persistent_id': file_persistent_id}

dataset.save()
logging.debug(f'Dataset {"created" if created else "updated"}: {dataset}')

return True
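Like the other loaders in this module, the new task would normally be queued through Celery. An illustrative call, where the date window and username are example values:

task_load_dataset_metadata_from_dataverse.apply_async(
    kwargs={
        "from_date": "2025-01-01",
        "until_date": "2025-04-01",
        "force_update": True,
        "username": "admin",  # example user
    }
)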
101 changes: 85 additions & 16 deletions article/utils.py
@@ -1,24 +1,30 @@
import langcodes
import logging
import requests
import os

from sickle import Sickle
from time import sleep

from core.utils import standardizer


ARTICLEMETA_ENDPOINT = os.environ.get('ARTICLEMETA_COLLECT_URL', 'http://articlemeta.scielo.org/api/v1/article/counter_dict')
ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_COLLECT_MAX_RETRIES', 5))
ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_COLLECT_URL_SLEEP_TIME', 30))
ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_MAX_RETRIES', 5))
ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_SLEEP_TIME', 30))

OPAC_ENDPOINT = os.environ.get('OPAC_ENDPOINT', 'https://www.scielo.br/api/v1/counter_dict')
OPAC_MAX_RETRIES = int(os.environ.get('OPAC_COLLECT_MAX_RETRIES', 5))
OPAC_SLEEP_TIME = int(os.environ.get('OPAC_COLLECT_URL_SLEEP_TIME', 30))
OPAC_MAX_RETRIES = int(os.environ.get('OPAC_MAX_RETRIES', 5))
OPAC_SLEEP_TIME = int(os.environ.get('OPAC_SLEEP_TIME', 30))

OAI_PMH_PREPRINT_ENDPOINT = os.environ.get('OAI_PMH_PREPRINT_ENDPOINT', 'https://preprints.scielo.org/index.php/scielo/oai')
OAI_METADATA_PREFIX = os.environ.get('OAI_METADATA_PREFIX', 'oai_dc')
OAI_PMH_MAX_RETRIES = int(os.environ.get('OAI_PMH_MAX_RETRIES', 5))

DATAVERSE_ENDPOINT = os.environ.get('DATAVERSE_ENDPOINT', 'https://data.scielo.org/api')
DATAVERSE_ROOT_COLLECTION = os.environ.get('DATAVERSE_ROOT_COLLECTION', 'scielodata')
DATAVERSE_MAX_RETRIES = int(os.environ.get('DATAVERSE_MAX_RETRIES', 5))
DATAVERSE_SLEEP_TIME = int(os.environ.get('DATAVERSE_SLEEP_TIME', 30))


def fetch_article_meta_dict(from_date, until_date, offset=0, limit=1000, collection=None, issn=None):
for t in range(1, ARTICLEMETA_MAX_RETRIES + 1):
@@ -91,14 +97,14 @@ def fetch_preprint_oai_pmh(from_date, until_date):


def extract_preprint_data(record):
pid_v2 = _extract_preprint_compatible_identifer(record.header.identifier)
text_langs = [_standardize_langcode(l) for l in record.metadata.get('language', [])]
pid_generic = _extract_preprint_compatible_identifer(record.header.identifier)
text_langs = [standardizer.standardize_language_code(l) for l in record.metadata.get('language', [])]
publication_date = record.metadata.get('date', [''])[0]
default_language = text_langs[0] if text_langs else ''
publication_year = _extract_preprint_publication_year_from_date(publication_date)

data = {
'pid_v2': pid_v2,
'pid_generic': pid_generic,
'text_langs': text_langs,
'publication_date': publication_date,
'default_language': default_language,
@@ -124,12 +130,75 @@ def _extract_preprint_publication_year_from_date(date_str):
return ''


def _standardize_langcode(language):
if langcodes.tag_is_valid(language):
return langcodes.standardize_tag(language)
def fetch_dataverse_metadata(from_date=None, until_date=None):
def get_subdataverses():
url = f"{DATAVERSE_ENDPOINT}/dataverses/{DATAVERSE_ROOT_COLLECTION}/contents"
try:
response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
response.raise_for_status()
return response.json().get("data", [])
except requests.exceptions.RequestException as e:
logging.error(f"Error fetching subdataverses: {e}")
return []

logging.warning(f'Trying to standardize {language}')
inferred_lang, score = langcodes.best_match(language, langcodes.LANGUAGE_ALPHA3.keys())

if score >= 0.75:
return langcodes.standardize_tag(inferred_lang)
def get_datasets(subdataverse_id):
url = f"{DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents"
try:
response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
response.raise_for_status()
return response.json().get("data", [])
except requests.exceptions.RequestException as e:
logging.error(f"Error fetching datasets for subdataverse {subdataverse_id}: {e}")
return []

def get_files(dataset_id):
url = f"{DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files"
try:
response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
response.raise_for_status()
return response.json().get("data", [])
except requests.exceptions.RequestException as e:
logging.error(f"Error fetching files for dataset {dataset_id}: {e}")
return []

subdataverses = get_subdataverses()

for subdataverse in subdataverses:
if subdataverse["type"] != "dataverse":
continue

subdataverse_id = subdataverse["id"]
subdataverse_title = subdataverse["title"]
datasets = get_datasets(subdataverse_id)

for dataset in datasets:
if dataset["type"] != "dataset":
continue

dataset_id = dataset["id"]
doi = standardizer.standardize_doi(dataset.get("persistentUrl"))
if not doi:
logging.warning(f"Dataset {dataset_id} does not have a DOI.")
continue

publication_date = dataset.get("publicationDate", None)

if publication_date:
if (from_date and publication_date < from_date) or (until_date and publication_date > until_date):
continue

files = get_files(dataset_id)

for file in files:
file_persistent_id = file["dataFile"].get("persistentId", None)
file_persistent_id_stz = standardizer.standardize_pid_generic(file_persistent_id) if file_persistent_id else None

yield {
"title": subdataverse_title,
"dataset_doi": doi,
"dataset_published": publication_date,
"file_id": file["dataFile"]["id"],
"file_name": file["label"],
"file_url": f"{DATAVERSE_ENDPOINT}/access/datafile/{file['dataFile']['id']}",
"file_persistent_id": file_persistent_id_stz,
}
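A minimal sketch of consuming this generator directly, using the field names yielded above (the date strings are illustrative):

for record in fetch_dataverse_metadata(from_date="2025-01-01", until_date="2025-04-01"):
    print(record["dataset_doi"], record["file_name"], record["file_url"])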
8 changes: 3 additions & 5 deletions article/wagtail_hooks.py
@@ -20,12 +20,9 @@ class ArticleSnippetViewSet(SnippetViewSet):
"scielo_issn",
"pid_v2",
"pid_v3",
"pdfs",
"default_lang",
"text_langs",
"processing_date",
"pid_generic",
"files",
"publication_date",
"publication_year",
)
list_filter = (
"collection",
@@ -36,6 +33,7 @@ class ArticleSnippetViewSet(SnippetViewSet):
"scielo_issn",
"pid_v2",
"pid_v3",
"pid_generic",
)

register_snippet(ArticleSnippetViewSet)