Skip to content

Commit

Permalink
statistics: enable file download statistics
Browse files Browse the repository at this point in the history
Due to invenio issues, the file download statistics are not enabled.

See: inveniosoftware/invenio-stats#116 for
more details.

Co-Authored-by: Johnny Mariéthoz <Johnny.Mariethoz@rero.ch>
  • Loading branch information
jma committed May 24, 2022
1 parent ce78f91 commit d951de1
Show file tree
Hide file tree
Showing 6 changed files with 192 additions and 19 deletions.
4 changes: 4 additions & 0 deletions scripts/setup
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ invenio index destroy --force --yes-i-know
invenio utils es-init --force # To take templates into account
invenio index queue init purge

section "Initialize invenio queues."
invenio queues delete
invenio queues declare

# Create admin role to restrict access
section "Create roles for users" "info"
invenio roles create superuser
Expand Down
126 changes: 109 additions & 17 deletions sonar/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
from celery.schedules import crontab
from invenio_oauthclient.contrib import orcid
from invenio_records_rest.facets import range_filter
from invenio_stats.aggregations import StatAggregator
from invenio_stats.processors import EventsIndexer
from invenio_stats.queries import ESTermsQuery

from sonar.modules.collections.config import \
Configuration as CollectionConfiguration
Expand Down Expand Up @@ -168,6 +170,15 @@ def _(x):
'schedule': timedelta(minutes=30),
'args': [('record-view', 'file-download')],
},
# Stats Agg events
'stats-aggregate-events': {
'task': 'invenio_stats.tasks.aggregate_events',
'schedule': timedelta(minutes=35),
'args': [(
'record-view-agg', 'record-view-agg',
'file-download-agg', 'file-download-agg',
)],
},
# Documents stats
'documents-stats': {
'task': ('sonar.modules.stats.tasks.collect_stats'),
Expand Down Expand Up @@ -866,16 +877,16 @@ def _(x):

# Stats
# =====

STATS_EVENTS = {
'file-download': {
'signal':
'invenio_files_rest.signals.file_downloaded',
'templates':
'invenio_stats.contrib.file_download',
'event_builders':
['invenio_stats.contrib.event_builders.file_download_event_builder'],
'cls':
EventsIndexer,
'signal': 'sonar.signals.file_downloaded',
'templates': 'invenio_stats.contrib.file_download',
'event_builders': [
'invenio_stats.contrib.event_builders'
'.file_download_event_builder'
],
'cls': EventsIndexer,
'params': {
'preprocessors': [
'invenio_stats.processors:flag_robots',
Expand All @@ -894,14 +905,13 @@ def _(x):
}
},
'record-view': {
'signal':
'invenio_records_ui.signals.record_viewed',
'templates':
'invenio_stats.contrib.record_view',
'event_builders':
['invenio_stats.contrib.event_builders.record_view_event_builder'],
'cls':
EventsIndexer,
'signal': 'invenio_records_ui.signals.record_viewed',
'templates': 'invenio_stats.contrib.record_view',
'event_builders': [
'invenio_stats.contrib.event_builders'
'.record_view_event_builder'
],
'cls': EventsIndexer,
'params': {
'preprocessors': [
'invenio_stats.processors:flag_robots',
Expand All @@ -917,6 +927,88 @@ def _(x):
# Create one index per year which will store file download events
'suffix':
'%Y',
},
}
}
}

# Aggregations computed from the raw stats events. Each entry names the
# index templates to register, the aggregator class to use and the parameters
# handed to that class.
STATS_AGGREGATIONS = {
    'file-download-agg': {
        'templates': 'invenio_stats.contrib.aggregations.aggr_file_download',
        'cls': StatAggregator,
        'params': {
            'event': 'file-download',
            'field': 'unique_id',
            'interval': 'day',
            'index_interval': 'month',
            'copy_fields': {
                'unique_id': 'unique_id',
                'unique_session_id': 'unique_session_id',
                'file_key': 'file_key',
                'bucket_id': 'bucket_id',
                'file_id': 'file_id',
            },
            'metric_fields': {
                'unique_count': (
                    'cardinality',
                    'unique_session_id',
                    {'precision_threshold': 1000},
                ),
                'volume': ('sum', 'size', {}),
            },
        },
    },
    'record-view-agg': {
        'templates': 'invenio_stats.contrib.aggregations.aggr_record_view',
        'cls': StatAggregator,
        'params': {
            'event': 'record-view',
            'field': 'unique_id',
            'interval': 'day',
            'index_interval': 'month',
            'copy_fields': {
                # 'record_id': 'record_id',
                'unique_id': 'unique_id',
                'unique_session_id': 'unique_session_id',
                'pid_type': 'pid_type',
                'pid_value': 'pid_value',
            },
            'metric_fields': {
                'unique_count': (
                    'cardinality',
                    'unique_session_id',
                    {'precision_threshold': 1000},
                ),
            },
        },
    },
}

# Queries exposed over the aggregated statistics. Both entries use
# ESTermsQuery and report the summed ``count`` and ``unique_count`` metrics.
STATS_QUERIES = {
    'file-download': {
        'cls': ESTermsQuery,
        'params': {
            'index': 'stats-file-download',
            'required_filters': {'bucket_id': 'bucket_id'},
            'copy_fields': {'bucket_id': 'bucket_id'},
            'aggregated_fields': ['file_key'],
            'metric_fields': {
                'count': ('sum', 'count', {}),
                'unique_count': ('sum', 'unique_count', {}),
            },
        },
    },
    'record-view': {
        'cls': ESTermsQuery,
        'params': {
            'index': 'stats-record-view',
            'copy_fields': {
                'pid_type': 'pid_type',
                'pid_value': 'pid_value',
            },
            'required_filters': {
                'pid_value': 'pid_value',
                'pid_type': 'pid_type',
            },
            'metric_fields': {
                'count': ('sum', 'count', {}),
                'unique_count': ('sum', 'unique_count', {}),
            },
        },
    },
}
5 changes: 4 additions & 1 deletion sonar/ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
from flask_security import user_registered
from flask_wiki import Wiki
from flask_wiki.markdown_ext import BootstrapExtension
from invenio_files_rest.signals import file_deleted, file_uploaded
from invenio_files_rest.signals import file_deleted, file_downloaded, \
file_uploaded
from invenio_indexer.signals import before_record_index
from werkzeug.datastructures import MIMEAccept

Expand All @@ -43,6 +44,7 @@
RecordResource as ProjectRecordResource
from sonar.resources.projects.service import \
RecordService as ProjectRecordService
from sonar.signals import file_download_proxy

from . import config_sonar
from .route_converters import OrganisationCodeConverter
Expand Down Expand Up @@ -100,6 +102,7 @@ def init_app(self, app):

# Connect to signal sent when a file is uploaded or deleted
file_uploaded.connect(file_uploaded_listener, weak=False)
file_downloaded.connect(file_download_proxy, weak=False)
file_deleted.connect(file_deleted_listener, weak=False)

# Add user's full name before record index
Expand Down
34 changes: 34 additions & 0 deletions sonar/signals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
#
# Swiss Open Access Repository
# Copyright (C) 2021 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Signals for SONAR."""

from blinker import Namespace
from flask import current_app

# Private blinker namespace holding the signals declared by SONAR.
_signals = Namespace()

#: File downloaded signal.
file_downloaded = _signals.signal('file-downloaded')

def file_download_proxy(obj):
    """Re-send a file download on SONAR's own signal, adding a sender.

    The ``file_downloaded`` signal declared in this module is sent with the
    current Flask application object as sender and ``obj`` as payload.

    TODO: this is a workaround that can be removed once invenio-stats has
    fixed some issues.

    :param obj: object forwarded unchanged as the ``obj`` keyword argument.
    """
    # Resolve the proxy so receivers get the real application object.
    sender = current_app._get_current_object()
    file_downloaded.send(sender, obj=obj)
25 changes: 24 additions & 1 deletion tests/api/documents/test_documents_files_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
"""Test REST endpoint for documents."""


from elasticsearch_dsl import Search
from flask import url_for
from invenio_stats.tasks import process_events


def test_get_content(app, client, document_with_file):
Expand All @@ -28,7 +30,6 @@ def test_get_content(app, client, document_with_file):
file_name = 'test1.pdf'
fulltext_file_name = 'test1-pdf.txt'
thumbnail_file_name = 'test1-pdf.jpg'

# get the pdf file
url_file_content = url_for('invenio_records_files.doc_object_api', pid_value=document_with_file.get('pid'), key=file_name)
res = client.get(url_file_content)
Expand Down Expand Up @@ -133,3 +134,25 @@ def test_put_delete(app, client, document, pdf_file):
# TODO: is it the right approach? Do we need to remove files and
# the bucket?
assert len(content) == 2


def test_stats(app, client, document_with_file, es, event_queues):
    """Check that downloading a file yields a file-download stats event."""
    app.config.update(SONAR_APP_DISABLE_PERMISSION_CHECKS=True)

    events_index = 'events-stats-file-download'

    # Download the PDF attached to the document.
    download_url = url_for(
        'invenio_records_files.doc_object_api',
        pid_value=document_with_file.get('pid'),
        key='test1.pdf',
    )
    response = client.get(download_url)
    assert response.status_code == 200
    assert response.content_type == 'application/octet-stream'
    assert response.content_length > 0

    # Run the event processing task for the queued download events.
    process_events(['file-download'])

    # Refresh so the freshly indexed events are visible to the search.
    es.indices.refresh(index=events_index)
    results = Search(using=es).index(events_index).execute()

    # At least one event must have been indexed.
    assert results.hits.total.value > 0
17 changes: 17 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from invenio_access.models import ActionUsers, Role
from invenio_accounts.ext import hash_password
from invenio_files_rest.models import Location
from invenio_queues.proxies import current_queues
from utils import MockArkServer

from sonar.modules.collections.api import Record as CollectionRecord
Expand All @@ -39,6 +40,17 @@
from sonar.proxies import sonar


@pytest.fixture()
def event_queues(app):
    """Provide freshly declared queues for a test and remove them afterwards.

    Queues are deleted first, then declared again before the test body runs.
    They are deleted once more on teardown, even if declaring them or the
    test itself fails.
    """
    queues = current_queues
    queues.delete()
    try:
        queues.declare()
        yield
    finally:
        queues.delete()


@pytest.fixture(scope='module')
def embargo_date():
"""Embargo date in one year from now."""
Expand Down Expand Up @@ -125,6 +137,11 @@ def app_config(app_config):
app_config['SONAR_APP_ARK_NAAN'] = '99999'
app_config['SONAR_APP_ARK_SCHEME'] = 'ark:'
app_config['SONAR_APP_ARK_SHOULDER'] = 'ffk3'

# Celery
app_config['CELERY_BROKER_URL'] = 'memory://'
app_config['CELERY_TASK_ALWAYS_EAGER'] = True
app_config['CELERY_TASK_EAGER_PROPAGATES'] = True
return app_config


Expand Down

0 comments on commit d951de1

Please sign in to comment.