Skip to content

Commit

Permalink
stats: collect and display stats for organisations
Browse files Browse the repository at this point in the history
* Collects and displays stats for organisations documents.
* Schedules the stats collection to run every morning.
* Improves `format_date` filter to use the right timezone.
* Closes #562.

Co-Authored-by: Sébastien Délèze <sebastien.deleze@rero.ch>
  • Loading branch information
Sébastien Délèze committed Aug 26, 2021
1 parent 0d45c81 commit cb7feff
Show file tree
Hide file tree
Showing 26 changed files with 1,207 additions and 9 deletions.
16 changes: 13 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@
'projects_hepvs = sonar.dedicated.hepvs.projects.jsonschemas',
'collections = sonar.modules.collections.jsonschemas',
'subdivisions = sonar.modules.subdivisions.jsonschemas',
'stats = sonar.modules.stats.jsonschemas',
'common = sonar.common.jsonschemas'
],
'invenio_search.mappings': [
Expand All @@ -125,7 +126,8 @@
'deposits = sonar.modules.deposits.mappings',
'projects = sonar.resources.projects.mappings',
'collections = sonar.modules.collections.mappings',
'subdivisions = sonar.modules.subdivisions.mappings'
'subdivisions = sonar.modules.subdivisions.mappings',
'stats = sonar.modules.stats.mappings'
],
'invenio_search.templates': [
'base-record = sonar.es_templates:list_es_templates'
Expand All @@ -142,7 +144,9 @@
'collections_id = \
sonar.modules.collections.api:pid_minter',
'subdivisions_id = \
sonar.modules.subdivisions.api:pid_minter'
sonar.modules.subdivisions.api:pid_minter',
'stats_id = \
sonar.modules.stats.api:pid_minter'
],
'invenio_pidstore.fetchers': [
'document_id = \
Expand All @@ -157,6 +161,8 @@
sonar.modules.collections.api:pid_fetcher',
'subdivisions_id = \
sonar.modules.subdivisions.api:pid_fetcher',
'stats_id = \
sonar.modules.stats.api:pid_fetcher'
],
"invenio_records.jsonresolver": [
"organisation = sonar.modules.organisations.jsonresolvers",
Expand All @@ -167,7 +173,11 @@
"subdivisions = sonar.modules.subdivisions.jsonresolvers"
],
'invenio_celery.tasks' : [
'documents = sonar.modules.documents.tasks'
'documents = sonar.modules.documents.tasks',
'stats = sonar.modules.stats.tasks',
],
'invenio_admin.views': [
'stats = sonar.modules.stats.admin:stats_adminview'
],
'babel.extractors': [
'json = sonar.modules.babel_extractors:extract_json'
Expand Down
11 changes: 11 additions & 0 deletions sonar/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import os
from datetime import timedelta

from celery.schedules import crontab
from invenio_oauthclient.contrib import orcid
from invenio_records_rest.facets import range_filter
from invenio_stats.processors import EventsIndexer
Expand All @@ -44,6 +45,7 @@
from sonar.modules.permissions import record_permission_factory, \
wiki_edit_permission
from sonar.modules.query import and_term_filter, missing_field_filter
from sonar.modules.stats.config import Configuration as StatConfiguration
from sonar.modules.subdivisions.config import \
Configuration as SubdivisionConfiguration
from sonar.modules.users.api import UserRecord, UserSearch
Expand Down Expand Up @@ -165,6 +167,11 @@ def _(x):
'task': 'invenio_stats.tasks.process_events',
'schedule': timedelta(minutes=30),
'args': [('record-view', 'file-download')],
},
# Documents stats
'documents-stats': {
'task': ('sonar.modules.stats.tasks.collect_stats'),
'schedule': crontab(minute=0, hour=1), # Every day at 01:00 UTC,
}
}
CELERY_BROKER_HEARTBEAT = 0
Expand Down Expand Up @@ -503,6 +510,10 @@ def _(x):
RECORDS_REST_ENDPOINTS['subd'] = SubdivisionConfiguration.rest_endpoint
"""REST endpoints."""

# Register the stats REST endpoint configuration under the 'stat' key.
RECORDS_REST_ENDPOINTS['stat'] = StatConfiguration.rest_endpoint
"""REST endpoints for statistics."""

DEFAULT_AGGREGATION_SIZE = 50
"""Default size for aggregations."""

Expand Down
18 changes: 18 additions & 0 deletions sonar/modules/organisations/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,24 @@ class Meta:
index = 'organisations'
doc_types = []

def get_shared_or_dedicated_list(self):
    """Get the list of dedicated or shared organisations.

    Only the ``pid``, ``name``, ``isShared`` and ``isDedicated`` fields are
    loaded for each hit.

    :returns: Hits for every organisation flagged as dedicated or shared.
    """
    query = self.filter('bool',
                        should=[
                            {'term': {'isDedicated': True}},
                            {'term': {'isShared': True}},
                        ])

    # Without an explicit size, Elasticsearch only returns the first 10
    # hits, which would silently drop organisations from the result.
    # Request as many hits as there are matching organisations.
    total = query.count()
    fields = ['pid', 'name', 'isShared', 'isDedicated']

    return query.source(fields)[0:total].execute().hits


class OrganisationRecord(SonarRecord):
"""Organisation record class."""
Expand Down
18 changes: 18 additions & 0 deletions sonar/modules/stats/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
#
# Swiss Open Access Repository
# Copyright (C) 2021 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Stats module: collect and display statistics for organisations."""
76 changes: 76 additions & 0 deletions sonar/modules/stats/admin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
#
# Swiss Open Access Repository
# Copyright (C) 2021 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Stats admin views."""

from flask import abort, redirect, request, url_for
from flask_admin.base import BaseView, expose

from sonar.modules.stats.api import Record, RecordSearch


class DocumentsStats(BaseView):
    """Admin views for listing, collecting and showing documents stats."""

    @expose('/')
    def index(self):
        """List stats records, most recently created first.

        At most 100 records are passed to the template.

        :returns: Rendered template
        """
        search = RecordSearch().sort('-_created')[0:100]
        response = search.execute().to_dict()
        records = response['hits']['hits']
        return self.render('sonar/stats/index.html', records=records)

    @expose('/collect')
    def collect(self):
        """Collect statistics, optionally saving them.

        The ``save`` query string argument decides whether the collected
        stats are saved; any non-empty value enables it.

        :returns: Rendered template or redirection to detail view.
        """
        save = bool(request.args.get('save'))
        record = Record.collect(save)

        if save:
            # Saved record: send the user to its detail page.
            detail_url = url_for('documentsstats.detail', pid=record['pid'])
            return redirect(detail_url)

        # Unsaved record: render it directly, flagged as live.
        return self.render('sonar/stats/detail.html',
                           record=record,
                           live=True)

    @expose('/<pid>')
    def detail(self, pid):
        """Show a single stats record.

        Aborts with a 404 error when no record matches the PID.

        :param string pid: PID
        :returns: Rendered template
        """
        record = Record.get_record_by_pid(pid)
        if not record:
            abort(404)

        return self.render('sonar/stats/detail.html', record=record)


# Admin view definition: the view class and the keyword arguments given
# alongside it (display name 'Stats').
stats_adminview = dict(
    view_class=DocumentsStats,
    kwargs=dict(name='Stats'),
)

__all__ = ('stats_adminview',)
139 changes: 139 additions & 0 deletions sonar/modules/stats/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
#
# Swiss Open Access Repository
# Copyright (C) 2021 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Stats record API."""

from functools import partial

from invenio_db import db

from sonar.modules.documents.api import DocumentSearch
from sonar.modules.organisations.api import OrganisationSearch

from ..api import SonarIndexer, SonarRecord, SonarSearch
from ..fetchers import id_fetcher
from ..providers import Provider
from .config import Configuration
from .minters import id_minter

# PID provider for stats records, using the configured PID type.
RecordProvider = type(
    'RecordProvider',
    (Provider, ),
    {'pid_type': Configuration.pid_type},
)
# Minter and fetcher bound to the stats provider.
pid_minter = partial(id_minter, provider=RecordProvider)
pid_fetcher = partial(id_fetcher, provider=RecordProvider)


class Record(SonarRecord):
    """Stats record."""

    minter = pid_minter
    fetcher = pid_fetcher
    provider = RecordProvider
    schema = Configuration.schema

    @classmethod
    def collect(cls, save=True):
        """Collect documents statistics for shared/dedicated organisations.

        For each organisation, the PIDs of its documents are gathered and
        the documents having at least one full-text file are counted.

        :param bool save: Whether the collected stats are committed to the
            database and indexed.
        :returns: Stats record object
        :rtype: Record
        """

        def has_fulltext_file(document):
            """Tell whether the document has at least one full-text file.

            A full-text file is a file of type ``file`` whose mimetype is
            ``application/pdf``.

            :param dict document: Document dictionary
            :returns: True if document has a full-text file
            :rtype: bool
            """
            return any(
                file.get('mimetype') == 'application/pdf'
                and file.get('type') == 'file'
                for file in document.get('_files', []))

        stats = []
        organisations = OrganisationSearch().get_shared_or_dedicated_list()

        for organisation in organisations:
            hits = cls.get_documents(organisation['pid'])
            pids = []
            fulltext = 0

            for hit in hits:
                document = hit.to_dict()
                pids.append(document['pid'])

                if has_fulltext_file(document):
                    fulltext += 1

            name = organisation['name']
            # An organisation not flagged as dedicated is reported as shared.
            if organisation.to_dict().get('isDedicated'):
                organisation_type = 'dedicated'
            else:
                organisation_type = 'shared'

            stats.append({
                'organisation': name,
                'type': organisation_type,
                'full_text': fulltext,
                'pids': pids,
            })

        record = cls.create({'values': stats})

        if not save:
            return record

        record.commit()
        db.session.commit()
        record.reindex()

        return record

    @classmethod
    def get_documents(cls, organisation_pid):
        """Get documents for organisation.

        Only the ``pid`` and ``_files`` fields are requested for each
        document.

        :param str organisation_pid: Organisation PID.
        :returns: Result of scanning the documents search for the
            organisation.
        :rtype: generator
        """
        search = DocumentSearch().filter('term',
                                         organisation__pid=organisation_pid)
        search = search.source(['pid', '_files'])

        return search.scan()


class RecordSearch(SonarSearch):
    """Search class for stats records."""

    class Meta:
        """Restrict the search to the index set in the stats configuration."""

        index = Configuration.index
        doc_types = []


class RecordIndexer(SonarIndexer):
    """Indexer for stats records in Elasticsearch."""

    # Record class handled by this indexer.
    record_cls = Record
Loading

0 comments on commit cb7feff

Please sign in to comment.