From d951de1cb8cf9e9041702a528519891d1072ea1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johnny=20Marie=CC=81thoz?= Date: Wed, 4 May 2022 16:38:45 +0200 Subject: [PATCH] statistics: enable file download statistics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Due to invenio issues, the file download statistics are not enabled. See: https://github.com/inveniosoftware/invenio-stats/issues/116 for more details. Co-Authored-by: Johnny Mariéthoz --- scripts/setup | 4 + sonar/config.py | 126 +++++++++++++++--- sonar/ext.py | 5 +- sonar/signals.py | 34 +++++ .../documents/test_documents_files_rest.py | 25 +++- tests/conftest.py | 17 +++ 6 files changed, 192 insertions(+), 19 deletions(-) create mode 100644 sonar/signals.py diff --git a/scripts/setup b/scripts/setup index f0babe00..9542dbb8 100755 --- a/scripts/setup +++ b/scripts/setup @@ -91,6 +91,10 @@ invenio index destroy --force --yes-i-know invenio utils es-init --force # To take templates into account invenio index queue init purge +section "Initialize invenio queues." 
+invenio queues delete +invenio queues declare + # Create admin role to restrict access section "Create roles for users" "info" invenio roles create superuser diff --git a/sonar/config.py b/sonar/config.py index ace20aa1..ac522bf5 100644 --- a/sonar/config.py +++ b/sonar/config.py @@ -31,7 +31,9 @@ from celery.schedules import crontab from invenio_oauthclient.contrib import orcid from invenio_records_rest.facets import range_filter +from invenio_stats.aggregations import StatAggregator from invenio_stats.processors import EventsIndexer +from invenio_stats.queries import ESTermsQuery from sonar.modules.collections.config import \ Configuration as CollectionConfiguration @@ -168,6 +170,15 @@ def _(x): 'schedule': timedelta(minutes=30), 'args': [('record-view', 'file-download')], }, + # Stats Agg events + 'stats-aggregate-events': { + 'task': 'invenio_stats.tasks.aggregate_events', + 'schedule': timedelta(minutes=35), + 'args': [( + 'record-view-agg', 'record-view-agg', + 'file-download-agg', 'file-download-agg', + )], + }, # Documents stats 'documents-stats': { 'task': ('sonar.modules.stats.tasks.collect_stats'), @@ -866,16 +877,16 @@ def _(x): # Stats # ===== + STATS_EVENTS = { 'file-download': { - 'signal': - 'invenio_files_rest.signals.file_downloaded', - 'templates': - 'invenio_stats.contrib.file_download', - 'event_builders': - ['invenio_stats.contrib.event_builders.file_download_event_builder'], - 'cls': - EventsIndexer, + 'signal': 'sonar.signals.file_downloaded', + 'templates': 'invenio_stats.contrib.file_download', + 'event_builders': [ + 'invenio_stats.contrib.event_builders' + '.file_download_event_builder' + ], + 'cls': EventsIndexer, 'params': { 'preprocessors': [ 'invenio_stats.processors:flag_robots', @@ -894,14 +905,13 @@ def _(x): } }, 'record-view': { - 'signal': - 'invenio_records_ui.signals.record_viewed', - 'templates': - 'invenio_stats.contrib.record_view', - 'event_builders': - ['invenio_stats.contrib.event_builders.record_view_event_builder'], 
- 'cls': - EventsIndexer, + 'signal': 'invenio_records_ui.signals.record_viewed', + 'templates': 'invenio_stats.contrib.record_view', + 'event_builders': [ + 'invenio_stats.contrib.event_builders' + '.record_view_event_builder' + ], + 'cls': EventsIndexer, 'params': { 'preprocessors': [ 'invenio_stats.processors:flag_robots', @@ -917,6 +927,88 @@ def _(x): # Create one index per year which will store file download events 'suffix': '%Y', - }, + } + } +} + +STATS_AGGREGATIONS = { + 'file-download-agg': dict( + templates='invenio_stats.contrib.aggregations.aggr_file_download', + cls=StatAggregator, + params=dict( + event='file-download', + field='unique_id', + interval='day', + index_interval='month', + copy_fields=dict( + unique_id='unique_id', + unique_session_id='unique_session_id', + file_key='file_key', + bucket_id='bucket_id', + file_id='file_id', + ), + metric_fields={ + 'unique_count': ( + 'cardinality', 'unique_session_id', + {'precision_threshold': 1000}, + ), + 'volume': ('sum', 'size', {}), + }, + ) + ), + 'record-view-agg': dict( + templates='invenio_stats.contrib.aggregations.aggr_record_view', + cls=StatAggregator, + params=dict( + event='record-view', + field='unique_id', + interval='day', + index_interval='month', + copy_fields=dict( + # record_id='record_id', + unique_id='unique_id', + unique_session_id='unique_session_id', + pid_type='pid_type', + pid_value='pid_value', + ), + metric_fields={ + 'unique_count': ( + 'cardinality', 'unique_session_id', + {'precision_threshold': 1000}, + ) + }, + ) + ) +} + +STATS_QUERIES = { + 'file-download': { + 'cls': ESTermsQuery, + 'params': dict( + index='stats-file-download', + required_filters={ + 'bucket_id': 'bucket_id', + }, + copy_fields={ + 'bucket_id': 'bucket_id' + }, + aggregated_fields=['file_key'], + metric_fields=dict( + count=("sum", "count", {}), + unique_count=("sum", "unique_count", {}), + ) + ) }, + "record-view": dict( + cls=ESTermsQuery, + params=dict( + index="stats-record-view", + 
copy_fields=dict(pid_type="pid_type", pid_value="pid_value"), + required_filters=dict(pid_value="pid_value", pid_type="pid_type"), + metric_fields=dict( + count=("sum", "count", {}), + unique_count=("sum", "unique_count", {}), + ) + ) + ) } diff --git a/sonar/ext.py b/sonar/ext.py index 0d069f2a..578bf172 100644 --- a/sonar/ext.py +++ b/sonar/ext.py @@ -26,7 +26,8 @@ from flask_security import user_registered from flask_wiki import Wiki from flask_wiki.markdown_ext import BootstrapExtension -from invenio_files_rest.signals import file_deleted, file_uploaded +from invenio_files_rest.signals import file_deleted, file_downloaded, \ + file_uploaded from invenio_indexer.signals import before_record_index from werkzeug.datastructures import MIMEAccept @@ -43,6 +44,7 @@ RecordResource as ProjectRecordResource from sonar.resources.projects.service import \ RecordService as ProjectRecordService +from sonar.signals import file_download_proxy from . import config_sonar from .route_converters import OrganisationCodeConverter @@ -100,6 +102,7 @@ def init_app(self, app): # Connect to signal sent when a file is uploaded or deleted file_uploaded.connect(file_uploaded_listener, weak=False) + file_downloaded.connect(file_download_proxy, weak=False) file_deleted.connect(file_deleted_listener, weak=False) # Add user's full name before record index diff --git a/sonar/signals.py b/sonar/signals.py new file mode 100644 index 00000000..606a5958 --- /dev/null +++ b/sonar/signals.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2021 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""Signals for SONAR.""" + +from blinker import Namespace +from flask import current_app + +_signals = Namespace() + +file_downloaded = _signals.signal('file-downloaded') +"""File downloaded signal.""" + +def file_download_proxy(obj): + """This proxy adds a sender to the original signal. + + TODO: this is a workaround that can be removed once invenio-stats has + fixed some issues. + """ + file_downloaded.send(current_app._get_current_object(), obj=obj) diff --git a/tests/api/documents/test_documents_files_rest.py b/tests/api/documents/test_documents_files_rest.py index 0ecbf442..59308395 100644 --- a/tests/api/documents/test_documents_files_rest.py +++ b/tests/api/documents/test_documents_files_rest.py @@ -18,7 +18,9 @@ """Test REST endpoint for documents.""" +from elasticsearch_dsl import Search from flask import url_for +from invenio_stats.tasks import process_events def test_get_content(app, client, document_with_file): @@ -28,7 +30,6 @@ def test_get_content(app, client, document_with_file): file_name = 'test1.pdf' fulltext_file_name = 'test1-pdf.txt' thumbnail_file_name = 'test1-pdf.jpg' - # get the pdf file url_file_content = url_for('invenio_records_files.doc_object_api', pid_value=document_with_file.get('pid'), key=file_name) res = client.get(url_file_content) @@ -133,3 +134,25 @@ def test_put_delete(app, client, document, pdf_file): # TODO: is it the right approach? Do we need to remove files and # the bucket? 
assert len(content) == 2 + + +def test_stats(app, client, document_with_file, es, event_queues): + """Test get existing stats for file downloads.""" + app.config.update(SONAR_APP_DISABLE_PERMISSION_CHECKS=True) + + file_name = 'test1.pdf' + # get the pdf file + url_file_content = url_for('invenio_records_files.doc_object_api', pid_value=document_with_file.get('pid'), key=file_name) + res = client.get(url_file_content) + assert res.status_code == 200 + assert res.content_type == 'application/octet-stream' + assert res.content_length > 0 + + # process the task + process_events(['file-download']) + + es.indices.refresh(index='events-stats-file-download') + search = Search(using=es).index('events-stats-file-download') + + # should have at least one stats in the index + assert search.execute().hits.total.value > 0 diff --git a/tests/conftest.py b/tests/conftest.py index b373df9a..31952e0e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,6 +28,7 @@ from invenio_access.models import ActionUsers, Role from invenio_accounts.ext import hash_password from invenio_files_rest.models import Location +from invenio_queues.proxies import current_queues from utils import MockArkServer from sonar.modules.collections.api import Record as CollectionRecord @@ -39,6 +40,17 @@ from sonar.proxies import sonar +@pytest.fixture() +def event_queues(app): + """Delete and declare test queues.""" + current_queues.delete() + try: + current_queues.declare() + yield + finally: + current_queues.delete() + + @pytest.fixture(scope='module') def embargo_date(): """Embargo date in one year from now.""" @@ -125,6 +137,11 @@ def app_config(app_config): app_config['SONAR_APP_ARK_NAAN'] = '99999' app_config['SONAR_APP_ARK_SCHEME'] = 'ark:' app_config['SONAR_APP_ARK_SHOULDER'] = 'ffk3' + + # Celery + app_config['CELERY_BROKER_URL'] = 'memory://' + app_config['CELERY_TASK_ALWAYS_EAGER'] = True + app_config['CELERY_TASK_EAGER_PROPAGATES'] = True return app_config