From 6a451ec7c7f843341b47fa8a16583dace3f285ba Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Thu, 20 Dec 2018 09:25:26 -0500 Subject: [PATCH] Reapply search upgrade to master. This is a squashed commit of all previous changes. --- .travis.yml | 2 +- conftest.py | 6 + docs/custom_installs/elasticsearch.rst | 108 ----- docs/development/search.rst | 110 ++++++ docs/index.rst | 1 + docs/install.rst | 6 +- docs/settings.rst | 78 ++++ .../commands/provision_elasticsearch.py | 33 -- .../commands/reindex_elasticsearch.py | 58 --- .../static-src/core/js/doc-embed/search.js | 69 ++-- readthedocs/projects/admin.py | 2 + readthedocs/projects/managers.py | 7 + .../projects/migrations/0032_add_htmlfile.py | 25 ++ readthedocs/projects/models.py | 50 ++- readthedocs/projects/signals.py | 4 + readthedocs/projects/tasks.py | 79 ++-- readthedocs/projects/views/public.py | 50 +-- readthedocs/restapi/urls.py | 38 +- readthedocs/restapi/utils.py | 115 ------ readthedocs/restapi/views/search_views.py | 151 ------- readthedocs/search/__init__.py | 1 + readthedocs/search/api.py | 58 +++ readthedocs/search/apps.py | 10 + readthedocs/search/documents.py | 125 ++++++ readthedocs/search/faceted_search.py | 44 +++ readthedocs/search/filters.py | 20 + readthedocs/search/indexes.py | 374 ------------------ readthedocs/search/lib.py | 250 ------------ readthedocs/search/management/__init__.py | 0 .../search/management/commands/__init__.py | 0 .../commands/reindex_elasticsearch.py | 123 ++++++ readthedocs/search/mixins.py | 68 ++++ readthedocs/search/pagination.py | 7 + readthedocs/search/parse_json.py | 2 +- readthedocs/search/serializers.py | 23 ++ readthedocs/search/signals.py | 30 ++ readthedocs/search/tasks.py | 115 ++++++ readthedocs/search/tests/conftest.py | 80 ++-- readthedocs/search/tests/data/docs/story.json | 3 +- .../search/tests/data/docs/wiping.json | 2 +- .../search/tests/data/kuma/docker.json | 2 +- .../search/tests/data/kuma/documentation.json | 2 +- .../tests/data/pipeline/installation.json | 2 +- .../search/tests/data/pipeline/signals.json | 2 +- readthedocs/search/tests/dummy_data.py | 31 +- readthedocs/search/tests/test_api.py | 135 +++++++ .../search/tests/test_faceted_search.py | 50 +++ readthedocs/search/tests/test_views.py | 102 +++-- readthedocs/search/tests/utils.py | 7 +- readthedocs/search/utils.py | 23 ++ readthedocs/search/views.py | 38 +- readthedocs/settings/base.py | 49 ++- readthedocs/settings/dev.py | 3 + readthedocs/settings/test.py | 11 + .../search/elastic_project_search.html | 12 +- .../templates/search/elastic_search.html | 51 +-- readthedocs/urls.py | 4 +- requirements/pip.txt | 5 +- scripts/travis/install_elasticsearch.sh | 4 +- 59 files changed, 1437 insertions(+), 1423 deletions(-) delete mode 100644 docs/custom_installs/elasticsearch.rst create mode 100644 docs/development/search.rst delete mode 100644 readthedocs/core/management/commands/provision_elasticsearch.py delete mode 100644 readthedocs/core/management/commands/reindex_elasticsearch.py create mode 100644 readthedocs/projects/managers.py create mode 100644 readthedocs/projects/migrations/0032_add_htmlfile.py delete mode 100644 readthedocs/restapi/views/search_views.py create mode 100644 readthedocs/search/api.py create mode 100644 readthedocs/search/apps.py create mode 100644 readthedocs/search/documents.py create mode 100644 readthedocs/search/faceted_search.py create mode 100644 readthedocs/search/filters.py delete mode 100644 readthedocs/search/indexes.py delete mode 100644 readthedocs/search/lib.py create mode 
100644 readthedocs/search/management/__init__.py create mode 100644 readthedocs/search/management/commands/__init__.py create mode 100644 readthedocs/search/management/commands/reindex_elasticsearch.py create mode 100644 readthedocs/search/mixins.py create mode 100644 readthedocs/search/pagination.py create mode 100644 readthedocs/search/serializers.py create mode 100644 readthedocs/search/tasks.py create mode 100644 readthedocs/search/tests/test_api.py create mode 100644 readthedocs/search/tests/test_faceted_search.py diff --git a/.travis.yml b/.travis.yml index 6743bf289bb..8feec0811dd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,7 @@ python: - 2.7 - 3.6 env: - - ES_VERSION=1.3.9 ES_DOWNLOAD_URL=https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz + - ES_VERSION=6.2.4 ES_DOWNLOAD_URL=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz matrix: include: - python: 3.6 diff --git a/conftest.py b/conftest.py index 862d9f43c8e..3eb3f7e14ed 100644 --- a/conftest.py +++ b/conftest.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- import pytest +from django.conf import settings +from rest_framework.test import APIClient try: # TODO: this file is read/executed even when called from ``readthedocsinc``, @@ -44,3 +46,7 @@ def pytest_configure(config): @pytest.fixture(autouse=True) def settings_modification(settings): settings.CELERY_ALWAYS_EAGER = True + +@pytest.fixture +def api_client(): + return APIClient() diff --git a/docs/custom_installs/elasticsearch.rst b/docs/custom_installs/elasticsearch.rst deleted file mode 100644 index 3612374594a..00000000000 --- a/docs/custom_installs/elasticsearch.rst +++ /dev/null @@ -1,108 +0,0 @@ -========================================== -Enabling Elasticsearch on the local server -========================================== - -Read the Docs has been using Elasticsearch for indexing and searching. To enable this on your local installation, you need to install elasticsearch and run the Elastic server locally. - -Installation has been mainly divided into following steps. - -Installing Java ---------------- - -Elasticsearch requires Java 8 or later. Use `Oracle official documentation `_. -or opensource distribution like `OpenJDK `_. - -After installing java, verify the installation by,:: - - $ java -version - -The result should be something like this:: - - openjdk version "1.8.0_151" - OpenJDK Runtime Environment (build 1.8.0_151-8u151-b12-0ubuntu0.16.04.2-b12) - OpenJDK 64-Bit Server VM (build 25.151-b12, mixed mode) - - -Downloading and installing Elasticsearch ----------------------------------------- - -Elasticsearch can be downloaded directly from elastic.co. For Ubuntu, it's best to use the deb (Debian) package which will install everything you need to run Elasticsearch. - -RTD currently uses elasticsearch 1.x which can be easily downloaded and installed from `elastic.co -`_. - -Install the downloaded package by following command:: - - $ sudo apt install .{path-to-downloaded-file}/elasticsearch-1.3.8.deb - -Custom setup ------------- - -You need the icu plugin:: - - $ elasticsearch/bin/plugin -install elasticsearch/elasticsearch-analysis-icu/2.3.0 - -Running Elasticsearch from command line ---------------------------------------- - -Elasticsearch is not started automatically after installation. How to start and stop Elasticsearch depends on whether your system uses SysV init or systemd (used by newer distributions). 
You can tell which is being used by running this command:: - - $ ps -p 1 - -**Running Elasticsearch with SysV init** - -Use the ``update-rc.d command`` to configure Elasticsearch to start automatically when the system boots up:: - - $ sudo update-rc.d elasticsearch defaults 95 10 - -Elasticsearch can be started and stopped using the service command:: - - $ sudo -i service elasticsearch start - $ sudo -i service elasticsearch stop - -If Elasticsearch fails to start for any reason, it will print the reason for failure to STDOUT. Log files can be found in /var/log/elasticsearch/. - -**Running Elasticsearch with systemd** - -To configure Elasticsearch to start automatically when the system boots up, run the following commands:: - - $ sudo /bin/systemctl daemon-reload - $ sudo /bin/systemctl enable elasticsearch.service - -Elasticsearch can be started and stopped as follows:: - - $ sudo systemctl start elasticsearch.service - $ sudo systemctl stop elasticsearch.service - -To verify run:: - - $ curl http://localhost:9200 - - -You should get something like:: - - { - status: 200, - name: "Amina Synge", - version: { - number: "1.3.8", - build_hash: "475733ee0837fba18c00c3ee76cd49a08755550c", - build_timestamp: "2015-02-11T14:45:42Z", - build_snapshot: false, - lucene_version: "4.9" - }, - tagline: "You Know, for Search" - } - -Index the data available at RTD database ----------------------------------------- - -You need to create the indexes:: - - $ python manage.py provision_elasticsearch - -In order to search through the RTD database, you need to index it into the elasticsearch index:: - - $ python manage.py reindex_elasticsearch - -You are ready to go! diff --git a/docs/development/search.rst b/docs/development/search.rst new file mode 100644 index 00000000000..223f65366b2 --- /dev/null +++ b/docs/development/search.rst @@ -0,0 +1,110 @@ +Search +====== + +Read The Docs uses Elasticsearch_ instead of the built in Sphinx search for providing better search +results. Documents are indexed in the Elasticsearch index and the search is made through the API. +All the Search Code is open source and lives in the `GitHub Repository`_. +Currently we are using the `Elasticsearch 6.3`_ version. + +Local Development Configuration +------------------------------- + +Installing and running Elasticsearch +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +You need to install and run Elasticsearch_ version 6.3 on your local development machine. +You can get the installation instructions +`here `_. +Otherwise, you can also start an Elasticsearch Docker container by running the following command:: + + docker run -p 9200:9200 -p 9300:9300 \ + -e "discovery.type=single-node" \ + docker.elastic.co/elasticsearch/elasticsearch:6.3.2 + +Indexing into Elasticsearch +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +For using search, you need to index data to the Elasticsearch Index. Run ``reindex_elasticsearch`` +management command:: + + ./manage.py reindex_elasticsearch + +For performance optimization, we implemented our own version of management command rather than +the built in management command provided by the `django-elasticsearch-dsl`_ package. + +Auto Indexing +^^^^^^^^^^^^^ +By default, Auto Indexing is turned off in development mode. To turn it on, change the +``ELASTICSEARCH_DSL_AUTOSYNC`` settings to `True` in the `readthedocs/settings/dev.py` file. +After that, whenever a documentation successfully builds, or project gets added, +the search index will update automatically. 
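+ +For example, turning auto indexing on is a one-line change in ``readthedocs/settings/dev.py`` (a minimal sketch; the rest of the dev settings stay unchanged):: + + ELASTICSEARCH_DSL_AUTOSYNC = True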
+ + Architecture +------------ +The search architecture is divided into two parts. +One part is responsible for **indexing** the documents and projects and +the other part is responsible for querying the index to show the proper results to users. +We use the `django-elasticsearch-dsl`_ package mostly to keep the search working. +`django-elasticsearch-dsl`_ is a wrapper around `elasticsearch-dsl`_ for easy configuration +with Django. + +Indexing +^^^^^^^^ +All the Sphinx documents are indexed into Elasticsearch after the build is successful. +Currently, we do not index MkDocs documents to Elasticsearch, but +`any kind of help is welcome `_. + +How we index documentation +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +After any build finishes successfully, `HTMLFile` objects are created for each of the +``HTML`` files and the old version's `HTMLFile` objects are deleted. By default, the +`django-elasticsearch-dsl`_ package listens to the `post_create`/`post_delete` signals +to index/delete documents, but this has performance drawbacks, as it sends an HTTP request whenever +any `HTMLFile` object is created or deleted. To optimize performance, `bulk_post_create` +and `bulk_post_delete` Signals_ are dispatched with a list of `HTMLFile` objects so it is possible +to bulk index documents in Elasticsearch (the `bulk_post_create` signal is dispatched for created +objects and `bulk_post_delete` for deleted objects). Both of the signals are dispatched +with the list of `HTMLFile` instances in the `instance_list` parameter. + +We listen to the `bulk_post_create` and `bulk_post_delete` signals in our `Search` application +and index/delete the documentation content from the `HTMLFile` instances. + + +How we index projects +~~~~~~~~~~~~~~~~~~~~~ +We also index project information in our search index so that the user can search for projects +from the main site. `django-elasticsearch-dsl`_ listens to the `post_create` and `post_delete` signals of +the `Project` model and indexes/deletes projects in Elasticsearch accordingly. + + +Elasticsearch Document +~~~~~~~~~~~~~~~~~~~~~~ + +`elasticsearch-dsl`_ provides a model-like wrapper for the `Elasticsearch document`_. +As required by `django-elasticsearch-dsl`_, it is stored in the +`readthedocs/search/documents.py` file. + + **ProjectDocument:** It is used for indexing projects. The signal listener of + `django-elasticsearch-dsl`_ listens to the `post_save` signal of the `Project` model and + then indexes/deletes it in Elasticsearch. + + **PageDocument**: It is used for indexing documentation of projects. By default, the auto + indexing is turned off by `ignore_signals = settings.ES_PAGE_IGNORE_SIGNALS`. + `settings.ES_PAGE_IGNORE_SIGNALS` is `False` both in development and production. + As mentioned above, our `Search` app listens to the `bulk_post_create` and `bulk_post_delete` + signals and indexes/deletes documentation in Elasticsearch. The signal listeners are in + the `readthedocs/search/signals.py` file. Both of the signals are dispatched + after a successful documentation build. + + The fields and ES datatypes are specified in the `PageDocument`. The indexable data is taken + from the `processed_json` property of `HTMLFile`. This property provides a Python dictionary with + document data like `title`, `headers`, `content`, etc. + + +.. _Elasticsearch: https://www.elastic.co/products/elasticsearch +.. _Elasticsearch 6.3: https://www.elastic.co/guide/en/elasticsearch/reference/6.3/index.html +.. _GitHub Repository: https://github.com/rtfd/readthedocs.org/tree/master/readthedocs/search +.. _Elasticsearch document: https://www.elastic.co/guide/en/elasticsearch/guide/current/document.html
+.. _django-elasticsearch-dsl: https://github.com/sabricot/django-elasticsearch-dsl +.. _elasticsearch-dsl: http://elasticsearch-dsl.readthedocs.io/en/latest/ +.. _Signals: https://docs.djangoproject.com/en/2.1/topics/signals/ diff --git a/docs/index.rst b/docs/index.rst index 7f3d2307e17..5515d948a6f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -114,6 +114,7 @@ to help you create fantastic documentation for your project. changelog install + development/search architecture tests docs diff --git a/docs/install.rst b/docs/install.rst index d7234bf9283..d8f7180d728 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -19,8 +19,8 @@ Additionally Read the Docs depends on: * `Redis`_ * `Elasticsearch`_ (only if you want full support for searching inside the site) - * Ubuntu users could install this package by following :doc:`/custom_installs/elasticsearch`. - + * Follow the :doc:`/development/search` documentation for more instructions. + .. note:: If you plan to import Python 2 projects to your RTD, @@ -56,8 +56,6 @@ you need these libraries. .. tab:: CentOS/RHEL 7 - Install:: - sudo yum install python-devel python-pip libxml2-devel libxslt-devel .. tab:: Other OS diff --git a/docs/settings.rst b/docs/settings.rst index ef6d4d2797d..eb1d53c584a 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -100,3 +100,81 @@ ALLOW_ADMIN Default: :djangosetting:`ALLOW_ADMIN` Whether to include `django.contrib.admin` in the URL's. + + +ELASTICSEARCH_DSL +----------------- + +Default: + +.. code-block:: python + + { + 'default': { + 'hosts': '127.0.0.1:9200' + }, + } + +Settings for the Elasticsearch connection. +These settings are then passed to `elasticsearch-dsl-py.connections.configure`_. + + +ES_INDEXES +---------- + +Default: + +.. code-block:: python + + { + 'project': { + 'name': 'project_index', + 'settings': {'number_of_shards': 5, + 'number_of_replicas': 0 + } + }, + 'page': { + 'name': 'page_index', + 'settings': { + 'number_of_shards': 5, + 'number_of_replicas': 0, + } + }, + } + +Defines the Elasticsearch name and settings of each index separately. +The key is the type of index, like ``project`` or ``page``, and the value is another +dictionary containing ``name`` and ``settings``. Here ``name`` is the index name +and ``settings`` is used for configuring that particular index. + + +ES_TASK_CHUNK_SIZE +------------------ + +Default: :djangosetting:`ES_TASK_CHUNK_SIZE` + +The maximum number of objects sent to each Elasticsearch indexing Celery task. +This is used while running the ``reindex_elasticsearch`` management command. + + +ES_PAGE_IGNORE_SIGNALS +---------------------- + +Default: ``False`` + +This setting is used to determine whether to index each page separately into Elasticsearch. +If the setting is ``True``, each ``HTML`` page will not be indexed separately but will be +indexed by bulk indexing. + + +ELASTICSEARCH_DSL_AUTOSYNC +-------------------------- + +Default: ``True`` + +This setting is used for automatically indexing objects to Elasticsearch. +It is ``False`` by default in development so it is possible to create +projects and build documentation without having Elasticsearch. + + +.. 
_elasticsearch-dsl-py.connections.configure: https://elasticsearch-dsl.readthedocs.io/en/stable/configuration.html#multiple-clusters \ No newline at end of file diff --git a/readthedocs/core/management/commands/provision_elasticsearch.py b/readthedocs/core/management/commands/provision_elasticsearch.py deleted file mode 100644 index 9f29fa37a9e..00000000000 --- a/readthedocs/core/management/commands/provision_elasticsearch.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Provision Elastic Search""" - -from __future__ import absolute_import -import logging - -from django.core.management.base import BaseCommand - -from readthedocs.search.indexes import Index, PageIndex, ProjectIndex, SectionIndex - -log = logging.getLogger(__name__) - - -class Command(BaseCommand): - - help = __doc__ - - def handle(self, *args, **options): - """Provision new ES instance""" - index = Index() - index_name = index.timestamped_index() - - log.info("Creating indexes..") - index.create_index(index_name) - index.update_aliases(index_name) - - log.info("Updating mappings..") - proj = ProjectIndex() - proj.put_mapping() - page = PageIndex() - page.put_mapping() - sec = SectionIndex() - sec.put_mapping() - log.info("Done!") diff --git a/readthedocs/core/management/commands/reindex_elasticsearch.py b/readthedocs/core/management/commands/reindex_elasticsearch.py deleted file mode 100644 index 7a5f25a065a..00000000000 --- a/readthedocs/core/management/commands/reindex_elasticsearch.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Reindex Elastic Search indexes""" - -from __future__ import absolute_import -import logging -from optparse import make_option - -from django.core.management.base import BaseCommand -from django.core.management.base import CommandError -from django.conf import settings - -from readthedocs.builds.constants import LATEST -from readthedocs.builds.models import Version -from readthedocs.projects.tasks import update_search - -log = logging.getLogger(__name__) - - -class Command(BaseCommand): - - help = __doc__ - - def add_arguments(self, parser): - parser.add_argument( - '-p', - dest='project', - default='', - help='Project to index' - ) - - def handle(self, *args, **options): - """Build/index all versions or a single project's version""" - project = options['project'] - - queryset = Version.objects.all() - - if project: - queryset = queryset.filter(project__slug=project) - if not queryset.exists(): - raise CommandError( - 'No project with slug: {slug}'.format(slug=project)) - log.info("Building all versions for %s", project) - elif getattr(settings, 'INDEX_ONLY_LATEST', True): - queryset = queryset.filter(slug=LATEST) - - for version in queryset: - log.info("Reindexing %s", version) - try: - commit = version.project.vcs_repo(version.slug).commit - except: # noqa - # An exception can be thrown here in production, but it's not - # documented what the exception here is - commit = None - - try: - update_search(version.pk, commit, - delete_non_commit_files=False) - except Exception as e: - log.exception('Reindex failed for %s, %s', version, e) diff --git a/readthedocs/core/static-src/core/js/doc-embed/search.js b/readthedocs/core/static-src/core/js/doc-embed/search.js index 8ed0971b306..9b21d17e9b2 100644 --- a/readthedocs/core/static-src/core/js/doc-embed/search.js +++ b/readthedocs/core/static-src/core/js/doc-embed/search.js @@ -24,43 +24,43 @@ function attach_elastic_search_query(data) { search_url.href = api_host; search_url.pathname = '/api/v2/docsearch/'; search_url.search = '?q=' + $.urlencode(query) + '&project=' + 
project + - '&version=' + version + '&language=' + language; + '&version=' + version + '&language=' + language; search_def - .then(function (results) { - var hits = results.hits || {}; - var hit_list = hits.hits || []; + .then(function (data) { + var hit_list = data.results || []; + var total_count = data.count || 0; if (hit_list.length) { - for (var n in hit_list) { - var hit = hit_list[n]; - var fields = hit.fields || {}; + for (var i = 0; i < hit_list.length; i += 1) { + var doc = hit_list[i]; + var highlight = doc.highlight; var list_item = $('
  • '); - var item_url = document.createElement('a'); - var highlight = hit.highlight; - - item_url.href += fields.link + - DOCUMENTATION_OPTIONS.FILE_SUFFIX; - item_url.search = '?highlight=' + $.urlencode(query); - - // Result list elements - list_item.append( - $('') - .attr('href', item_url) - .html(fields.title) - ); - // fields.project is returned as an array - if (fields.project.indexOf(project) === -1) { - list_item.append( - $('') - .text(" (from project " + fields.project + ")") - ); + + // Creating the result from elements + var link = doc.link + DOCUMENTATION_OPTIONS.FILE_SUFFIX + + '?highlight=' + $.urlencode(query); + + var item = $('', {'href': link}); + item.html(doc.title); + list_item.append(item); + + // If the document is from subproject, add extra information + if (doc.project !== project) { + var text = " (from project " + doc.project + ")"; + var extra = $('', {'text': text}); + + list_item.append(extra); } - if (highlight.content.length) { - var content = $('
    ') - .html(xss(highlight.content[0])); - content.find('em').addClass('highlighted'); - list_item.append(content); + + // Show highlighted texts + if (highlight.content) { + var content_text = xss(highlight.content[0]); + var contents = $('
    '); + + contents.html(content_text); + contents.find('em').addClass('highlighted'); + list_item.append(contents); } Search.output.append(list_item); @@ -74,7 +74,7 @@ function attach_elastic_search_query(data) { } else { Search.status.text( - _('Search finished, found %s page(s) matching the search query.').replace('%s', hit_list.length) + _('Search finished, found %s page(s) matching the search query.').replace('%s', total_count) ); } }) @@ -96,11 +96,10 @@ function attach_elastic_search_query(data) { withCredentials: true, }, complete: function (resp, status_code) { - if (typeof (resp.responseJSON) === 'undefined' || - typeof (resp.responseJSON.results) === 'undefined') { + if (status_code !== 'success' || resp.responseJSON.count === 0) { return search_def.reject(); } - return search_def.resolve(resp.responseJSON.results); + return search_def.resolve(resp.responseJSON); } }) .fail(function (resp, status_code, error) { diff --git a/readthedocs/projects/admin.py b/readthedocs/projects/admin.py index d55954fdbcf..661bc16b949 100644 --- a/readthedocs/projects/admin.py +++ b/readthedocs/projects/admin.py @@ -25,6 +25,7 @@ EmailHook, EnvironmentVariable, Feature, + HTMLFile, ImportedFile, Project, ProjectRelationship, @@ -229,3 +230,4 @@ class EnvironmentVariableAdmin(admin.ModelAdmin): admin.site.register(Feature, FeatureAdmin) admin.site.register(EmailHook) admin.site.register(WebHook) +admin.site.register(HTMLFile, ImportedFileAdmin) diff --git a/readthedocs/projects/managers.py b/readthedocs/projects/managers.py new file mode 100644 index 00000000000..c8b524702b9 --- /dev/null +++ b/readthedocs/projects/managers.py @@ -0,0 +1,7 @@ +from django.db import models + + +class HTMLFileManager(models.Manager): + + def get_queryset(self): + return super(HTMLFileManager, self).get_queryset().filter(name__endswith='.html') diff --git a/readthedocs/projects/migrations/0032_add_htmlfile.py b/readthedocs/projects/migrations/0032_add_htmlfile.py new file mode 100644 index 00000000000..79e5948fbd2 --- /dev/null +++ b/readthedocs/projects/migrations/0032_add_htmlfile.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.16 on 2018-11-06 18:09 +from __future__ import unicode_literals + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('projects', '0031_add_modified_date_importedfile'), + ] + + operations = [ + migrations.CreateModel( + name='HTMLFile', + fields=[ + ], + options={ + 'proxy': True, + 'indexes': [], + }, + bases=('projects.importedfile',), + ), + ] diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index cb189941791..d3c06048fd4 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -7,13 +7,15 @@ import fnmatch import logging import os -from builtins import object # pylint: disable=redefined-builtin +from builtins import object # pylint: disable=redefined-builtin from django.conf import settings from django.contrib.auth.models import User from django.urls import NoReverseMatch, reverse from django.db import models +from django.utils import timezone from django.utils.encoding import python_2_unicode_compatible +from django.utils.functional import cached_property from django.utils.translation import ugettext_lazy as _ from django_extensions.db.models import TimeStampedModel from future.backports.urllib.parse import urlparse # noqa @@ -25,6 +27,7 @@ from readthedocs.core.utils import broadcast, slugify from readthedocs.projects import constants from 
readthedocs.projects.exceptions import ProjectConfigurationError +from readthedocs.projects.managers import HTMLFileManager from readthedocs.projects.querysets import ( ChildRelatedProjectQuerySet, FeatureQuerySet, ProjectQuerySet, RelatedProjectQuerySet) @@ -32,6 +35,7 @@ from readthedocs.projects.validators import validate_domain_name, validate_repository_url from readthedocs.projects.version_handling import determine_stable_version from readthedocs.restapi.client import api +from readthedocs.search.parse_json import process_file from readthedocs.vcs_support.backends import backend_cls from readthedocs.vcs_support.utils import Lock, NonBlockingLock @@ -906,6 +910,50 @@ def __str__(self): return '%s: %s' % (self.name, self.project) +class HTMLFile(ImportedFile): + + """ + Imported HTML file Proxy model. + + This tracks only the HTML files for indexing to search. + """ + + class Meta(object): + proxy = True + + objects = HTMLFileManager() + + @cached_property + def json_file_path(self): + basename = os.path.splitext(self.path)[0] + file_path = basename + '.fjson' + + full_json_path = self.project.get_production_media_path(type_='json', + version_slug=self.version.slug, + include_file=False) + + file_path = os.path.join(full_json_path, file_path) + return file_path + + def get_processed_json(self): + file_path = self.json_file_path + try: + return process_file(file_path) + except Exception: + log.warning('Unhandled exception during search processing file: %s' % file_path) + return { + 'headers': [], + 'content': '', + 'path': file_path, + 'title': '', + 'sections': [] + } + + @cached_property + def processed_json(self): + return self.get_processed_json() + + class Notification(models.Model): project = models.ForeignKey(Project, related_name='%(class)s_notifications') diff --git a/readthedocs/projects/signals.py b/readthedocs/projects/signals.py index 6ef49f9e67c..0c94d464b82 100644 --- a/readthedocs/projects/signals.py +++ b/readthedocs/projects/signals.py @@ -14,3 +14,7 @@ project_import = django.dispatch.Signal(providing_args=["project"]) files_changed = django.dispatch.Signal(providing_args=["project", "files"]) + +bulk_post_create = django.dispatch.Signal(providing_args=["instance_list"]) + +bulk_post_delete = django.dispatch.Signal(providing_args=["instance_list"]) diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index be3e56e3a18..73f2408e433 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -14,6 +14,7 @@ ) import datetime +import fnmatch import hashlib import json import logging @@ -65,20 +66,20 @@ from readthedocs.doc_builder.python_environments import Conda, Virtualenv from readthedocs.projects.models import APIProject from readthedocs.restapi.client import api as api_v2 -from readthedocs.restapi.utils import index_search_request -from readthedocs.search.parse_json import process_all_json_files from readthedocs.vcs_support import utils as vcs_support_utils from readthedocs.worker import app - from .constants import LOG_TEMPLATE from .exceptions import RepositoryError from .models import Domain, ImportedFile, Project +from .models import HTMLFile from .signals import ( after_build, after_vcs, before_build, before_vcs, files_changed, + bulk_post_create, + bulk_post_delete ) log = logging.getLogger(__name__) @@ -993,42 +994,6 @@ def move_files(version_pk, hostname, html=False, localmedia=False, Syncer.copy(from_path, to_path, host=hostname) -@app.task(queue='web') -def update_search(version_pk, commit, 
delete_non_commit_files=True): - """ - Task to update search indexes. - - :param version_pk: Version id to update - :param commit: Commit that updated index - :param delete_non_commit_files: Delete files not in commit from index - """ - version = Version.objects.get(pk=version_pk) - - if 'sphinx' in version.project.documentation_type: - page_list = process_all_json_files(version, build_dir=False) - else: - log.debug( - 'Unknown documentation type: %s', - version.project.documentation_type - ) - return - - log_msg = ' '.join([page['path'] for page in page_list]) - log.info("(Search Index) Sending Data: %s [%s]", version.project.slug, - log_msg) - index_search_request( - version=version, - page_list=page_list, - commit=commit, - project_scale=0, - page_scale=0, - # Don't index sections to speed up indexing. - # They aren't currently exposed anywhere. - section=False, - delete=delete_non_commit_files, - ) - - @app.task(queue='web') def symlink_project(project_pk): project = Project.objects.get(pk=project_pk) @@ -1137,20 +1102,27 @@ def _manage_imported_files(version, path, commit): :param commit: Commit that updated path """ changed_files = set() + created_html_files = [] for root, __, filenames in os.walk(path): for filename in filenames: + if fnmatch.fnmatch(filename, '*.html'): + model_class = HTMLFile + else: + model_class = ImportedFile + dirpath = os.path.join(root.replace(path, '').lstrip('/'), filename.lstrip('/')) full_path = os.path.join(root, filename) md5 = hashlib.md5(open(full_path, 'rb').read()).hexdigest() try: - obj, __ = ImportedFile.objects.get_or_create( + # pylint: disable=unpacking-non-sequence + obj, __ = model_class.objects.get_or_create( project=version.project, version=version, path=dirpath, name=filename, ) - except ImportedFile.MultipleObjectsReturned: + except model_class.MultipleObjectsReturned: log.warning('Error creating ImportedFile') continue if obj.md5 != md5: @@ -1159,10 +1131,28 @@ def _manage_imported_files(version, path, commit): if obj.commit != commit: obj.commit = commit obj.save() + + if model_class == HTMLFile: + # the `obj` is HTMLFile, so add it to the list + created_html_files.append(obj) + + # Send bulk_post_create signal for bulk indexing to Elasticsearch + bulk_post_create.send(sender=HTMLFile, instance_list=created_html_files) + + # Delete the HTMLFile first from previous commit and + # send bulk_post_delete signal for bulk removing from Elasticsearch + delete_queryset = (HTMLFile.objects.filter(project=version.project, version=version) + .exclude(commit=commit)) + # Keep the objects into memory to send it to signal + instance_list = list(delete_queryset) + # Safely delete from database + delete_queryset.delete() + # Always pass the list of instance, not queryset. + bulk_post_delete.send(sender=HTMLFile, instance_list=instance_list) + # Delete ImportedFiles from previous versions - ImportedFile.objects.filter(project=version.project, - version=version - ).exclude(commit=commit).delete() + (ImportedFile.objects.filter(project=version.project, version=version) + .exclude(commit=commit).delete()) changed_files = [ resolve_path( version.project, filename=file, version_slug=version.slug, @@ -1353,7 +1343,6 @@ def sync_callback(_, version_pk, commit, *args, **kwargs): The first argument is the result from previous tasks, which we discard. 
""" fileify(version_pk, commit=commit) - update_search(version_pk, commit=commit) @app.task() diff --git a/readthedocs/projects/views/public.py b/readthedocs/projects/views/public.py index 90f5ef978b1..dfc4eecf202 100644 --- a/readthedocs/projects/views/public.py +++ b/readthedocs/projects/views/public.py @@ -27,7 +27,7 @@ from readthedocs.builds.models import Version from readthedocs.builds.views import BuildTriggerMixin from readthedocs.projects.models import ImportedFile, Project -from readthedocs.search.indexes import PageIndex +from readthedocs.search.documents import PageDocument from readthedocs.search.views import LOG_TEMPLATE from .base import ProjectOnboardMixin @@ -219,6 +219,7 @@ def elastic_project_search(request, project_slug): project = get_object_or_404(queryset, slug=project_slug) version_slug = request.GET.get('version', LATEST) query = request.GET.get('q', None) + results = None if query: user = '' if request.user.is_authenticated: @@ -234,48 +235,11 @@ def elastic_project_search(request, project_slug): )) if query: - - kwargs = {} - body = { - 'query': { - 'bool': { - 'should': [ - {'match': {'title': {'query': query, 'boost': 10}}}, - {'match': {'headers': {'query': query, 'boost': 5}}}, - {'match': {'content': {'query': query}}}, - ] - } - }, - 'highlight': { - 'fields': { - 'title': {}, - 'headers': {}, - 'content': {}, - } - }, - 'fields': ['title', 'project', 'version', 'path'], - 'filter': { - 'and': [ - {'term': {'project': project_slug}}, - {'term': {'version': version_slug}}, - ] - }, - 'size': 50, # TODO: Support pagination. - } - - # Add routing to optimize search by hitting the right shard. - kwargs['routing'] = project_slug - - results = PageIndex().search(body, **kwargs) - else: - results = {} - - if results: - # pre and post 1.0 compat - for num, hit in enumerate(results['hits']['hits']): - for key, val in list(hit['fields'].items()): - if isinstance(val, list): - results['hits']['hits'][num]['fields'][key] = val[0] + req = PageDocument.simple_search(query=query) + filtered_query = (req.filter('term', project=project.slug) + .filter('term', version=version_slug)) + paginated_query = filtered_query[:50] + results = paginated_query.execute() return render( request, diff --git a/readthedocs/restapi/urls.py b/readthedocs/restapi/urls.py index c8cdf6cd21e..eb67521c0d9 100644 --- a/readthedocs/restapi/urls.py +++ b/readthedocs/restapi/urls.py @@ -14,15 +14,7 @@ from rest_framework import routers from readthedocs.constants import pattern_opts -from readthedocs.restapi import views -from readthedocs.restapi.views import ( - core_views, - footer_views, - integrations, - search_views, - task_views, -) - +from readthedocs.restapi.views import (core_views, footer_views, task_views, integrations) from .views.model_views import ( BuildCommandViewSet, BuildViewSet, @@ -67,25 +59,6 @@ url(r'footer_html/', footer_views.footer_html, name='footer_html'), ] -search_urls = [ - url( - r'index_search/', - search_views.index_search, - name='index_search', - ), - url(r'search/$', views.search_views.search, name='api_search'), - url( - r'search/project/$', - search_views.project_search, - name='api_project_search', - ), - url( - r'search/section/$', - search_views.section_search, - name='api_section_search', - ), -] - task_urls = [ url( r'jobs/status/(?P[^/]+)/', @@ -134,18 +107,11 @@ ), ] + urlpatterns += function_urls -urlpatterns += search_urls urlpatterns += task_urls urlpatterns += integration_urls -if 'readthedocsext.search' in settings.INSTALLED_APPS: - # pylint: 
disable=import-error - from readthedocsext.search.docsearch import DocSearch - api_search_urls = [ - url(r'^docsearch/$', DocSearch.as_view(), name='doc_search'), - ] - urlpatterns += api_search_urls if 'readthedocsext.donate' in settings.INSTALLED_APPS: # pylint: disable=import-error diff --git a/readthedocs/restapi/utils.py b/readthedocs/restapi/utils.py index 8637cd1779b..9e7bae5bc18 100644 --- a/readthedocs/restapi/utils.py +++ b/readthedocs/restapi/utils.py @@ -8,7 +8,6 @@ unicode_literals, ) -import hashlib import logging from rest_framework.pagination import PageNumberPagination @@ -23,7 +22,6 @@ TAG, ) from readthedocs.builds.models import Version -from readthedocs.search.indexes import PageIndex, ProjectIndex, SectionIndex log = logging.getLogger(__name__) @@ -174,119 +172,6 @@ def delete_versions(project, version_data): return set() -def index_search_request( - version, page_list, commit, project_scale, page_scale, section=True, - delete=True): - """ - Update search indexes with build output JSON. - - In order to keep sub-projects all indexed on the same shard, indexes will be - updated using the parent project's slug as the routing value. - """ - # TODO refactor this function - # pylint: disable=too-many-locals - project = version.project - - log_msg = ' '.join([page['path'] for page in page_list]) - log.info( - 'Updating search index: project=%s pages=[%s]', - project.slug, - log_msg, - ) - - project_obj = ProjectIndex() - project_obj.index_document( - data={ - 'id': project.pk, - 'name': project.name, - 'slug': project.slug, - 'description': project.description, - 'lang': project.language, - 'author': [user.username for user in project.users.all()], - 'url': project.get_absolute_url(), - 'tags': None, - 'weight': project_scale, - }) - - page_obj = PageIndex() - section_obj = SectionIndex() - index_list = [] - section_index_list = [] - routes = [project.slug] - routes.extend([p.parent.slug for p in project.superprojects.all()]) - for page in page_list: - log.debug('Indexing page: %s:%s', project.slug, page['path']) - to_hash = '-'.join([project.slug, version.slug, page['path']]) - page_id = hashlib.md5(to_hash.encode('utf-8')).hexdigest() - index_list.append({ - 'id': page_id, - 'project': project.slug, - 'version': version.slug, - 'path': page['path'], - 'title': page['title'], - 'headers': page['headers'], - 'content': page['content'], - 'taxonomy': None, - 'commit': commit, - 'weight': page_scale + project_scale, - }) - if section: - for sect in page['sections']: - id_to_hash = '-'.join([ - project.slug, - version.slug, - page['path'], - sect['id'], - ]) - section_index_list.append({ - 'id': (hashlib.md5(id_to_hash.encode('utf-8')).hexdigest()), - 'project': project.slug, - 'version': version.slug, - 'path': page['path'], - 'page_id': sect['id'], - 'title': sect['title'], - 'content': sect['content'], - 'weight': page_scale, - }) - for route in routes: - section_obj.bulk_index( - section_index_list, - parent=page_id, - routing=route, - ) - - for route in routes: - page_obj.bulk_index(index_list, parent=project.slug, routing=route) - - if delete: - log.info('Deleting files not in commit: %s', commit) - # TODO: AK Make sure this works - delete_query = { - 'query': { - 'bool': { - 'must': [ - { - 'term': { - 'project': project.slug, - }, - }, - { - 'term': { - 'version': version.slug, - }, - }, - ], - 'must_not': { - 'term': { - 'commit': commit, - }, - }, - }, - }, - } - page_obj.delete_document(body=delete_query) - - class RemoteOrganizationPagination(PageNumberPagination): 
page_size = 25 diff --git a/readthedocs/restapi/views/search_views.py b/readthedocs/restapi/views/search_views.py deleted file mode 100644 index abe36174097..00000000000 --- a/readthedocs/restapi/views/search_views.py +++ /dev/null @@ -1,151 +0,0 @@ -"""Endpoints related to searching through projects, sections, etc.""" - -from __future__ import absolute_import -import logging - -from rest_framework import decorators, permissions, status -from rest_framework.renderers import JSONRenderer -from rest_framework.response import Response - -from readthedocs.builds.constants import LATEST -from readthedocs.builds.models import Version -from readthedocs.projects.models import Project, ProjectRelationship -from readthedocs.search.lib import search_file, search_project, search_section -from readthedocs.restapi import utils - - -log = logging.getLogger(__name__) - - -@decorators.api_view(['POST']) -@decorators.permission_classes((permissions.IsAdminUser,)) -@decorators.renderer_classes((JSONRenderer,)) -def index_search(request): - """Add things to the search index.""" - data = request.data['data'] - version_pk = data['version_pk'] - commit = data.get('commit') - version = Version.objects.get(pk=version_pk) - - project_scale = 1 - page_scale = 1 - - utils.index_search_request( - version=version, page_list=data['page_list'], commit=commit, - project_scale=project_scale, page_scale=page_scale) - - return Response({'indexed': True}) - - -@decorators.api_view(['GET']) -@decorators.permission_classes((permissions.AllowAny,)) -@decorators.renderer_classes((JSONRenderer,)) -def search(request): - """Perform search, supplement links by resolving project domains.""" - project_slug = request.GET.get('project', None) - version_slug = request.GET.get('version', LATEST) - query = request.GET.get('q', None) - if project_slug is None or query is None: - return Response({'error': 'Need project and q'}, - status=status.HTTP_400_BAD_REQUEST) - try: - project = Project.objects.get(slug=project_slug) - except Project.DoesNotExist: - return Response({'error': 'Project not found'}, - status=status.HTTP_404_NOT_FOUND) - log.debug("(API Search) %s", query) - results = search_file(request=request, project_slug=project_slug, - version_slug=version_slug, query=query) - - if results is None: - return Response({'error': 'Project not found'}, - status=status.HTTP_404_NOT_FOUND) - - # Supplement result paths with domain information on project - hits = results.get('hits', {}).get('hits', []) - for (n, hit) in enumerate(hits): - fields = hit.get('fields', {}) - search_project = fields.get('project')[0] - search_version = fields.get('version')[0] - path = fields.get('path')[0] - canonical_url = project.get_docs_url(version_slug=version_slug) - if search_project != project_slug: - try: - subproject = project.subprojects.get(child__slug=search_project) - canonical_url = subproject.child.get_docs_url( - version_slug=search_version - ) - except ProjectRelationship.DoesNotExist: - pass - results['hits']['hits'][n]['fields']['link'] = ( - canonical_url + path - ) - - return Response({'results': results}) - - -@decorators.api_view(['GET']) -@decorators.permission_classes((permissions.AllowAny,)) -@decorators.renderer_classes((JSONRenderer,)) -def project_search(request): - query = request.GET.get('q', None) - if query is None: - return Response({'error': 'Need project and q'}, status=status.HTTP_400_BAD_REQUEST) - log.debug("(API Project Search) %s", (query)) - results = search_project(request=request, query=query) - return 
Response({'results': results}) - - -@decorators.api_view(['GET']) -@decorators.permission_classes((permissions.AllowAny,)) -@decorators.renderer_classes((JSONRenderer,)) -def section_search(request): - """ - Section search. - - Queries with query ``q`` across all documents and projects. Queries can be - limited to a single project or version by using the ``project`` and - ``version`` GET arguments in your request. - - When you search, you will have a ``project`` facet, which includes the - number of matching sections per project. When you search inside a project, - the ``path`` facet will show the number of matching sections per page. - - Possible GET args - ----------------- - - q **(required)** - The query string **Required** - - project - A project slug - - version - A version slug - - path - A file path slug - - - Example:: - - GET /api/v2/search/section/?q=virtualenv&project=django - """ - query = request.GET.get('q', None) - if not query: - return Response( - {'error': 'Search term required. Use the "q" GET arg to search. '}, - status=status.HTTP_400_BAD_REQUEST) - project_slug = request.GET.get('project', None) - version_slug = request.GET.get('version', LATEST) - path = request.GET.get('path', None) - log.debug("(API Section Search) [%s:%s] %s", project_slug, version_slug, - query) - results = search_section( - request=request, - query=query, - project_slug=project_slug, - version_slug=version_slug, - path=path, - ) - return Response({'results': results}) diff --git a/readthedocs/search/__init__.py b/readthedocs/search/__init__.py index e69de29bb2d..552c9337386 100644 --- a/readthedocs/search/__init__.py +++ b/readthedocs/search/__init__.py @@ -0,0 +1 @@ +default_app_config = 'readthedocs.search.apps.SearchConfig' diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py new file mode 100644 index 00000000000..cea004334a6 --- /dev/null +++ b/readthedocs/search/api.py @@ -0,0 +1,58 @@ +from rest_framework import generics +from rest_framework import exceptions +from rest_framework.exceptions import ValidationError + +from readthedocs.projects.models import Project +from readthedocs.search.documents import PageDocument +from readthedocs.search.filters import SearchFilterBackend +from readthedocs.search.pagination import SearchPagination +from readthedocs.search.serializers import PageSearchSerializer +from readthedocs.search.utils import get_project_list_or_404 + + +class PageSearchAPIView(generics.ListAPIView): + pagination_class = SearchPagination + filter_backends = [SearchFilterBackend] + serializer_class = PageSearchSerializer + + def get_queryset(self): + """ + Return Elasticsearch DSL Search object instead of Django Queryset. + + Django Queryset and elasticsearch-dsl ``Search`` object is similar pattern. + So for searching, its possible to return ``Search`` object instead of queryset. 
+ The ``filter_backends`` and ``pagination_class`` are compatible with ``Search``. + """ + # Validate all the required params are there + self.validate_query_params() + query = self.request.query_params.get('q', '') + queryset = PageDocument.simple_search(query=query) + return queryset + + def validate_query_params(self): + required_query_params = {'q', 'project', 'version'} # python `set` literal is `{}` + request_params = set(self.request.query_params.keys()) + missing_params = required_query_params - request_params + if missing_params: + errors = {} + for param in missing_params: + errors[param] = ["This query param is required"] + + raise ValidationError(errors) + + def get_serializer_context(self): + context = super(PageSearchAPIView, self).get_serializer_context() + context['projects_url'] = self.get_all_projects_url() + return context + + def get_all_projects_url(self): + version_slug = self.request.query_params.get('version') + project_slug = self.request.query_params.get('project') + all_projects = get_project_list_or_404(project_slug=project_slug, user=self.request.user) + projects_url = {} + + for project in all_projects: + projects_url[project.slug] = project.get_docs_url(version_slug=version_slug) + + return projects_url diff --git a/readthedocs/search/apps.py b/readthedocs/search/apps.py new file mode 100644 index 00000000000..108a2ecc69f --- /dev/null +++ b/readthedocs/search/apps.py @@ -0,0 +1,10 @@ +"""Project app config""" + +from django.apps import AppConfig + + +class SearchConfig(AppConfig): + name = 'readthedocs.search' + + def ready(self): + from .signals import index_html_file, remove_html_file diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py new file mode 100644 index 00000000000..770c9cb9384 --- /dev/null +++ b/readthedocs/search/documents.py @@ -0,0 +1,125 @@ +from django.conf import settings +from django_elasticsearch_dsl import DocType, Index, fields +from elasticsearch_dsl.query import SimpleQueryString, Bool + +from readthedocs.projects.models import Project, HTMLFile +from readthedocs.search.faceted_search import ProjectSearch, FileSearch +from .mixins import RTDDocTypeMixin + +project_conf = settings.ES_INDEXES['project'] +project_index = Index(project_conf['name']) +project_index.settings(**project_conf['settings']) + +page_conf = settings.ES_INDEXES['page'] +page_index = Index(page_conf['name']) +page_index.settings(**page_conf['settings']) + + +@project_index.doc_type +class ProjectDocument(RTDDocTypeMixin, DocType): + + class Meta(object): + model = Project + fields = ('name', 'slug', 'description') + + url = fields.TextField(attr='get_absolute_url') + users = fields.NestedField(properties={ + 'username': fields.TextField(), + 'id': fields.IntegerField(), + }) + language = fields.KeywordField() + + @classmethod + def faceted_search(cls, query, language=None, using=None, index=None): + kwargs = { + 'using': using or cls._doc_type.using, + 'index': index or cls._doc_type.index, + 'doc_types': [cls], + 'model': cls._doc_type.model, + 'query': query + } + + if language: + kwargs['filters'] = {'language': language} + + return ProjectSearch(**kwargs) + + +@page_index.doc_type +class PageDocument(RTDDocTypeMixin, DocType): + + class Meta(object): + model = HTMLFile + fields = ('commit',) + ignore_signals = settings.ES_PAGE_IGNORE_SIGNALS + + project = fields.KeywordField(attr='project.slug') + version = fields.KeywordField(attr='version.slug') + + title = 
fields.TextField(attr='processed_json.title') + headers = fields.TextField(attr='processed_json.headers') + content = fields.TextField(attr='processed_json.content') + path = fields.KeywordField(attr='processed_json.path') + + # Fields to perform search with weight + search_fields = ['title^10', 'headers^5', 'content'] + # Exclude some files to not index + excluded_files = ['search.html', 'genindex.html', 'py-modindex.html'] + + @classmethod + def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None): + es_query = cls.get_es_query(query=query) + kwargs = { + 'using': using or cls._doc_type.using, + 'index': index or cls._doc_type.index, + 'doc_types': [cls], + 'model': cls._doc_type.model, + 'query': es_query, + 'fields': cls.search_fields + } + filters = {} + + if projects_list: + filters['project'] = projects_list + if versions_list: + filters['version'] = versions_list + + kwargs['filters'] = filters + + return FileSearch(**kwargs) + + @classmethod + def simple_search(cls, query, using=None, index=None): + es_search = cls.search(using=using, index=index) + es_query = cls.get_es_query(query=query) + highlighted_fields = [f.split('^', 1)[0] for f in cls.search_fields] + + es_search = es_search.query(es_query).highlight(*highlighted_fields) + return es_search + + @classmethod + def get_es_query(cls, query): + """Return the Elasticsearch query generated from the query string""" + all_queries = [] + + # Need to search for both 'AND' and 'OR' operations + # The score of AND should be higher as it satisfies both OR and AND + for operator in ['AND', 'OR']: + query_string = SimpleQueryString(query=query, fields=cls.search_fields, + default_operator=operator) + all_queries.append(query_string) + + # Run bool query with should, so it returns result where either of the query matches + bool_query = Bool(should=all_queries) + + return bool_query + + def get_queryset(self): + """Overwrite default queryset to filter certain files to index""" + queryset = super(PageDocument, self).get_queryset() + + # Do not index files that belong to non sphinx project + # Also do not index certain files + queryset = (queryset.filter(project__documentation_type__contains='sphinx') + .exclude(name__in=self.excluded_files)) + return queryset diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py new file mode 100644 index 00000000000..4a14cb8c541 --- /dev/null +++ b/readthedocs/search/faceted_search.py @@ -0,0 +1,44 @@ +from elasticsearch_dsl import FacetedSearch, TermsFacet +from elasticsearch_dsl.query import SimpleQueryString, Bool + + +class RTDFacetedSearch(FacetedSearch): + + """Overwrite the initialization in order too meet our needs""" + + # TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916 + # See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916 + + def __init__(self, using, index, doc_types, model, fields=None, **kwargs): + self.using = using + self.index = index + self.doc_types = doc_types + self._model = model + if fields: + self.fields = fields + super(RTDFacetedSearch, self).__init__(**kwargs) + + +class ProjectSearch(RTDFacetedSearch): + fields = ['name^5', 'description'] + facets = { + 'language': TermsFacet(field='language') + } + + +class FileSearch(RTDFacetedSearch): + facets = { + 'project': TermsFacet(field='project'), + 'version': TermsFacet(field='version') + } + + def query(self, search, query): + """ + Add query part to ``search`` + + Overriding because we pass ES Query object instead of 
string + """ + if query: + search = search.query(query) + + return search diff --git a/readthedocs/search/filters.py b/readthedocs/search/filters.py new file mode 100644 index 00000000000..ba0154def93 --- /dev/null +++ b/readthedocs/search/filters.py @@ -0,0 +1,20 @@ +from rest_framework import filters + + +class SearchFilterBackend(filters.BaseFilterBackend): + + """Filter search result with project""" + + def filter_queryset(self, request, queryset, view): + """Overwrite the method to compatible with Elasticsearch DSL Search object.""" + # ``queryset`` is actually a Elasticsearch DSL ``Search`` object. + # So change the variable name + es_search = queryset + version_slug = request.query_params.get('version') + projects_info = view.get_all_projects_url() + project_slug_list = list(projects_info.keys()) + # Elasticsearch ``terms`` query can take multiple values as list, + # while ``term`` query takes single value. + filtered_es_search = (es_search.filter('terms', project=project_slug_list) + .filter('term', version=version_slug)) + return filtered_es_search diff --git a/readthedocs/search/indexes.py b/readthedocs/search/indexes.py deleted file mode 100644 index 48e4baecc5e..00000000000 --- a/readthedocs/search/indexes.py +++ /dev/null @@ -1,374 +0,0 @@ -""" -Search indexing classes to index into Elasticsearch. - -Django settings that should be defined: - - `ES_HOSTS`: A list of hosts where Elasticsearch lives. E.g. - ['192.168.1.1:9200', '192.168.2.1:9200'] - - `ES_DEFAULT_NUM_REPLICAS`: An integer of the number of replicas. - - `ES_DEFAULT_NUM_SHARDS`: An integer of the number of shards. - - -TODO: Handle page removal case in Page. - -""" -from __future__ import absolute_import -from builtins import object - -from django.utils import timezone - -from elasticsearch import Elasticsearch, exceptions -from elasticsearch.helpers import bulk_index - -from django.conf import settings - - -class Index(object): - - """Base class to define some common methods across indexes.""" - - # The _index and _type define the URL path to Elasticsearch, e.g.: - # http://localhost:9200/{_index}/{_type}/_search - _index = 'readthedocs' - _type = None - - def __init__(self): - self.es = Elasticsearch(settings.ES_HOSTS) - - def get_settings(self, settings_override=None): - """ - Returns settings to be passed to ES create_index. - - If `settings_override` is provided, this will use `settings_override` - to override the defaults defined here. - - """ - default_settings = { - 'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS, - 'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS, - 'refresh_interval': '5s', - 'store.compress.tv': True, - 'store.compress.stored': True, - 'analysis': self.get_analysis(), - } - if settings_override: - default_settings.update(settings_override) - - return default_settings - - def get_analysis(self): - """ - Returns the analysis dict to be used in settings for create_index. - - For languages that ES supports we define either the minimal or light - stemming, which isn't as aggressive as the snowball stemmer. We also - define the stopwords for that language. - - For all languages we've customized we're using the ICU plugin. - - """ - analyzers = {} - filters = {} - - # The default is used for fields that need ICU but are composed of - # many languages. - analyzers['default_icu'] = { - 'type': 'custom', - 'tokenizer': 'icu_tokenizer', - 'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'], - } - - # Customize the word_delimiter filter to set various options. 
- filters['custom_word_delimiter'] = { - 'type': 'word_delimiter', - 'preserve_original': True, - } - - return { - 'analyzer': analyzers, - 'filter': filters, - } - - def timestamped_index(self): - return '{0}-{1}'.format( - self._index, timezone.now().strftime('%Y%m%d%H%M%S')) - - def create_index(self, index=None): - """ - Creates index. - - This uses `get_settings` and `get_mappings` to define the index. - - """ - index = index or self._index - body = { - 'settings': self.get_settings(), - } - self.es.indices.create(index=index, body=body) - - def refresh_index(self, index=None): - index = index or self._index - self.es.indices.refresh(index=index) - - def put_mapping(self, index=None): - index = index or self._index - self.es.indices.put_mapping(self._type, self.get_mapping(), index) - - def bulk_index(self, data, index=None, chunk_size=500, parent=None, - routing=None): - """ - Given a list of documents, uses Elasticsearch bulk indexing. - - For each doc this calls `extract_document`, then indexes. - - `chunk_size` defaults to the elasticsearch lib's default. Override per - your document size as needed. - - """ - index = index or self._index - docs = [] - for d in data: - source = self.extract_document(d) - doc = { - '_index': index, - '_type': self._type, - '_id': source['id'], - '_source': source, - } - if parent: - doc['_parent'] = parent - if routing: - doc['_routing'] = routing - docs.append(doc) - - # TODO: This doesn't work with the new ES setup. - bulk_index(self.es, docs, chunk_size=chunk_size) - - def index_document(self, data, index=None, parent=None, routing=None): - doc = self.extract_document(data) - kwargs = { - 'index': index or self._index, - 'doc_type': self._type, - 'body': doc, - 'id': doc['id'] - } - if parent: - kwargs['parent'] = parent - if routing: - kwargs['routing'] = routing - self.es.index(**kwargs) - - def delete_index(self, index_name): - - self.es.indices.delete(index=index_name) - - def delete_document(self, body, index=None, parent=None, routing=None): - kwargs = { - 'index': index or self._index, - 'doc_type': self._type, - 'body': body, - } - if parent: - kwargs['parent'] = parent - if routing: - kwargs['routing'] = routing - return self.es.delete_by_query(**kwargs) - - def get_mapping(self): - """Returns the mapping for this _index and _type.""" - raise NotImplementedError() - - def extract_document(self, data): - """Extracts the Elasticsearch document for this object instance.""" - raise NotImplementedError() - - def update_aliases(self, new_index, delete=True): - """ - Points `_index` to `new_index` and deletes `_index` if delete=True. - - The ES `update_aliases` is atomic. - """ - old_index = None - - # Get current alias, if any. - try: - aliases = self.es.indices.get_alias(name=self._index) - if aliases and list(aliases.keys()): - old_index = list(aliases.keys())[0] - except exceptions.NotFoundError: - pass - - actions = [] - if old_index: - actions.append({'remove': {'index': old_index, - 'alias': self._index}}) - actions.append({'add': {'index': new_index, 'alias': self._index}}) - - self.es.indices.update_aliases(body={'actions': actions}) - - # Delete old index if any and if specified. 
- if delete and old_index: - self.es.indices.delete(index=old_index) - - def search(self, body, **kwargs): - return self.es.search(index=self._index, doc_type=self._type, - body=body, **kwargs) - - -class ProjectIndex(Index): - - """Search index configuration for Projects""" - - _type = 'project' - - def get_mapping(self): - mapping = { - self._type: { - # Disable _all field to reduce index size. - '_all': {'enabled': False}, - 'properties': { - 'id': {'type': 'long'}, - 'name': {'type': 'string', 'analyzer': 'default_icu'}, - 'description': {'type': 'string', 'analyzer': 'default_icu'}, - - 'slug': {'type': 'string', 'index': 'not_analyzed'}, - 'lang': {'type': 'string', 'index': 'not_analyzed'}, - 'tags': {'type': 'string', 'index': 'not_analyzed'}, - 'privacy': {'type': 'string', 'index': 'not_analyzed'}, - 'author': { - 'type': 'string', - 'analyzer': 'default_icu', - 'fields': { - 'raw': { - 'type': 'string', - 'index': 'not_analyzed', - }, - }, - }, - 'url': {'type': 'string', 'index': 'not_analyzed'}, - # Add a weight field to enhance relevancy scoring. - 'weight': {'type': 'float'}, - } - } - } - - return mapping - - def extract_document(self, data): - doc = {} - - attrs = ('id', 'name', 'slug', 'description', 'lang', 'tags', 'author', 'url') - for attr in attrs: - doc[attr] = data.get(attr, '') - - # Add project boost. - doc['weight'] = data.get('weight', 1.0) - - return doc - - -class PageIndex(Index): - - """Search index configuration for Pages""" - - _type = 'page' - _parent = 'project' - - def get_mapping(self): - mapping = { - self._type: { - # Disable _all field to reduce index size. - '_all': {'enabled': False}, - # Associate a page with a project. - '_parent': {'type': self._parent}, - 'properties': { - 'id': {'type': 'string', 'index': 'not_analyzed'}, - 'sha': {'type': 'string', 'index': 'not_analyzed'}, - 'project': {'type': 'string', 'index': 'not_analyzed'}, - 'version': {'type': 'string', 'index': 'not_analyzed'}, - 'path': {'type': 'string', 'index': 'not_analyzed'}, - 'taxonomy': {'type': 'string', 'index': 'not_analyzed'}, - 'commit': {'type': 'string', 'index': 'not_analyzed'}, - - 'title': {'type': 'string', 'analyzer': 'default_icu'}, - 'headers': {'type': 'string', 'analyzer': 'default_icu'}, - 'content': {'type': 'string', 'analyzer': 'default_icu'}, - # Add a weight field to enhance relevancy scoring. - 'weight': {'type': 'float'}, - } - } - } - - return mapping - - def extract_document(self, data): - doc = {} - - attrs = ('id', 'project', 'title', 'headers', 'version', 'path', - 'content', 'taxonomy', 'commit') - for attr in attrs: - doc[attr] = data.get(attr, '') - - # Add page boost. - doc['weight'] = data.get('weight', 1.0) - - return doc - - -class SectionIndex(Index): - - """Search index configuration for Sections""" - - _type = 'section' - _parent = 'page' - - def get_mapping(self): - mapping = { - self._type: { - # Disable _all field to reduce index size. - '_all': {'enabled': False}, - # Associate a section with a page. - '_parent': {'type': self._parent}, - # Commenting this out until we need it. 
- # 'suggest': { - # "type": "completion", - # "index_analyzer": "simple", - # "search_analyzer": "simple", - # "payloads": True, - # }, - 'properties': { - 'id': {'type': 'string', 'index': 'not_analyzed'}, - 'project': {'type': 'string', 'index': 'not_analyzed'}, - 'version': {'type': 'string', 'index': 'not_analyzed'}, - 'path': {'type': 'string', 'index': 'not_analyzed'}, - 'page_id': {'type': 'string', 'index': 'not_analyzed'}, - 'commit': {'type': 'string', 'index': 'not_analyzed'}, - 'title': {'type': 'string', 'analyzer': 'default_icu'}, - 'content': {'type': 'string', 'analyzer': 'default_icu'}, - 'blocks': { - 'type': 'object', - 'properties': { - 'code': {'type': 'string', 'analyzer': 'default_icu'} - } - }, - # Add a weight field to enhance relevancy scoring. - 'weight': {'type': 'float'}, - } - } - } - - return mapping - - def extract_document(self, data): - doc = {} - - attrs = ('id', 'project', 'title', 'page_id', 'version', 'path', 'content', 'commit') - for attr in attrs: - doc[attr] = data.get(attr, '') - - # Add page boost. - doc['weight'] = data.get('weight', 1.0) - - return doc diff --git a/readthedocs/search/lib.py b/readthedocs/search/lib.py deleted file mode 100644 index 8500a829b03..00000000000 --- a/readthedocs/search/lib.py +++ /dev/null @@ -1,250 +0,0 @@ -"""Utilities related to searching Elastic.""" -from __future__ import absolute_import -from __future__ import print_function -from pprint import pprint - -from django.conf import settings - -from .indexes import PageIndex, ProjectIndex, SectionIndex - -from readthedocs.builds.constants import LATEST -from readthedocs.projects.models import Project -from readthedocs.search.signals import (before_project_search, - before_file_search, - before_section_search) - - -def search_project(request, query, language=None): - """Search index for projects matching query.""" - body = { - "query": { - "bool": { - "should": [ - {"match": {"name": {"query": query, "boost": 10}}}, - {"match": {"description": {"query": query}}}, - ] - }, - }, - "facets": { - "language": { - "terms": {"field": "lang"}, - }, - }, - "highlight": { - "fields": { - "name": {}, - "description": {}, - } - }, - "fields": ["name", "slug", "description", "lang", "url"], - "size": 50 # TODO: Support pagination. - } - - if language: - body['facets']['language']['facet_filter'] = {"term": {"lang": language}} - body['filter'] = {"term": {"lang": language}} - - before_project_search.send(request=request, sender=ProjectIndex, body=body) - - return ProjectIndex().search(body) - - -def search_file(request, query, project_slug=None, version_slug=LATEST, taxonomy=None): - """ - Search index for files matching query. 
- - Raises a 404 error on missing project - - :param request: request instance - :param query: string to query for - :param project_slug: :py:class:`Project` slug - :param version_slug: slug for :py:class:`Project` version slug - :param taxonomy: taxonomy for search - """ - kwargs = {} - body = { - "query": { - "bool": { - "should": [ - {"match_phrase": { - "title": { - "query": query, - "boost": 10, - "slop": 2, - }, - }}, - {"match_phrase": { - "headers": { - "query": query, - "boost": 5, - "slop": 3, - }, - }}, - {"match_phrase": { - "content": { - "query": query, - "slop": 5, - }, - }}, - ] - } - }, - "facets": { - "taxonomy": { - "terms": {"field": "taxonomy"}, - }, - "project": { - "terms": {"field": "project"}, - }, - "version": { - "terms": {"field": "version"}, - }, - }, - "highlight": { - "fields": { - "title": {}, - "headers": {}, - "content": {}, - } - }, - "fields": ["title", "project", "version", "path"], - "size": 50 # TODO: Support pagination. - } - - if project_slug or version_slug or taxonomy: - final_filter = {"and": []} - - if project_slug: - try: - project = (Project.objects - .api(request.user) - .get(slug=project_slug)) - project_slugs = [project.slug] - # We need to use the obtuse syntax here because the manager - # doesn't pass along to ProjectRelationships - project_slugs.extend(s.slug for s - in Project.objects.public( - request.user).filter( - superprojects__parent__slug=project.slug)) - final_filter['and'].append({"terms": {"project": project_slugs}}) - - # Add routing to optimize search by hitting the right shard. - # This purposely doesn't apply routing if the project has more - # than one parent project. - if project.superprojects.exists(): - if project.superprojects.count() == 1: - kwargs['routing'] = (project.superprojects.first() - .parent.slug) - else: - kwargs['routing'] = project_slug - except Project.DoesNotExist: - return None - - if version_slug: - final_filter['and'].append({'term': {'version': version_slug}}) - - if taxonomy: - final_filter['and'].append({'term': {'taxonomy': taxonomy}}) - - body['filter'] = final_filter - body['facets']['project']['facet_filter'] = final_filter - body['facets']['version']['facet_filter'] = final_filter - body['facets']['taxonomy']['facet_filter'] = final_filter - - if settings.DEBUG: - print("Before Signal") - pprint(body) - before_file_search.send(request=request, sender=PageIndex, body=body) - if settings.DEBUG: - print("After Signal") - pprint(body) - - return PageIndex().search(body, **kwargs) - - -def search_section(request, query, project_slug=None, version_slug=LATEST, - path=None): - """ - Search for a section of content. - - When you search, you will have a ``project`` facet, which includes the - number of matching sections per project. When you search inside a project, - the ``path`` facet will show the number of matching sections per page. 
- - :param request: Request instance - :param query: string to use in query - :param project_slug: :py:class:`Project` instance - :param version_slug: :py:class:`Project` version instance - :param taxonomy: search taxonomy - """ - kwargs = {} - body = { - "query": { - "bool": { - "should": [ - {"match_phrase": { - "title": { - "query": query, - "boost": 10, - "slop": 2, - }, - }}, - {"match_phrase": { - "content": { - "query": query, - "slop": 5, - }, - }}, - ] - } - }, - "facets": { - "project": { - "terms": {"field": "project"}, - "facet_filter": { - "term": {"version": version_slug}, - } - }, - }, - "highlight": { - "fields": { - "title": {}, - "content": {}, - } - }, - "fields": ["title", "project", "version", "path", "page_id", "content"], - "size": 10 # TODO: Support pagination. - } - - if project_slug: - body['filter'] = { - "and": [ - {"term": {"project": project_slug}}, - {"term": {"version": version_slug}}, - ] - } - body['facets']['path'] = { - "terms": {"field": "path"}, - "facet_filter": { - "term": {"project": project_slug}, - } - }, - # Add routing to optimize search by hitting the right shard. - kwargs['routing'] = project_slug - - if path: - body['filter'] = { - "and": [ - {"term": {"path": path}}, - ] - } - - if path and not project_slug: - # Show facets when we only have a path - body['facets']['path'] = { - "terms": {"field": "path"} - } - - before_section_search.send(request=request, sender=PageIndex, body=body) - - return SectionIndex().search(body, **kwargs) diff --git a/readthedocs/search/management/__init__.py b/readthedocs/search/management/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/readthedocs/search/management/commands/__init__.py b/readthedocs/search/management/commands/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py new file mode 100644 index 00000000000..47b15cd4f15 --- /dev/null +++ b/readthedocs/search/management/commands/reindex_elasticsearch.py @@ -0,0 +1,123 @@ +import datetime +import logging + +from celery import chord, chain +from django.apps import apps +from django.conf import settings +from django.core.management import BaseCommand +from django.utils import timezone +from django_elasticsearch_dsl.registries import registry + +from ...tasks import (index_objects_to_es, switch_es_index, create_new_es_index, + index_missing_objects) +from ...utils import get_chunk + +log = logging.getLogger(__name__) + + +class Command(BaseCommand): + + @staticmethod + def _get_indexing_tasks(app_label, model_name, queryset, document_class, index_name): + total = queryset.count() + chunks = get_chunk(total, settings.ES_TASK_CHUNK_SIZE) + + for chunk in chunks: + data = { + 'app_label': app_label, + 'model_name': model_name, + 'document_class': document_class, + 'index_name': index_name, + 'chunk': chunk + } + yield index_objects_to_es.si(**data) + + def _run_reindex_tasks(self, models, queue): + apply_async_kwargs = {'priority': 0} + if queue: + log.info('Adding indexing tasks to queue {0}'.format(queue)) + apply_async_kwargs['queue'] = queue + else: + log.info('Adding indexing tasks to default queue') + + index_time = timezone.now() + timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + + for doc in registry.get_documents(models): + queryset = doc().get_queryset() + # Get latest object from the queryset + + app_label = queryset.model._meta.app_label + model_name = 
queryset.model.__name__ + + index_name = doc._doc_type.index + new_index_name = "{}_{}".format(index_name, timestamp) + + pre_index_task = create_new_es_index.si(app_label=app_label, + model_name=model_name, + index_name=index_name, + new_index_name=new_index_name) + + indexing_tasks = self._get_indexing_tasks(app_label=app_label, model_name=model_name, + queryset=queryset, + document_class=str(doc), + index_name=new_index_name) + + post_index_task = switch_es_index.si(app_label=app_label, model_name=model_name, + index_name=index_name, + new_index_name=new_index_name) + + # Task to run in order to add the objects + # that has been inserted into database while indexing_tasks was running + # We pass the creation time of latest object, so its possible to index later items + missed_index_task = index_missing_objects.si(app_label=app_label, + model_name=model_name, + document_class=str(doc), + index_generation_time=index_time) + + # http://celery.readthedocs.io/en/latest/userguide/canvas.html#chords + chord_tasks = chord(header=indexing_tasks, body=post_index_task) + if queue: + pre_index_task.set(queue=queue) + chord_tasks.set(queue=queue) + missed_index_task.set(queue=queue) + # http://celery.readthedocs.io/en/latest/userguide/canvas.html#chain + chain(pre_index_task, chord_tasks, missed_index_task).apply_async(**apply_async_kwargs) + + message = ("Successfully issued tasks for {}.{}, total {} items" + .format(app_label, model_name, queryset.count())) + log.info(message) + + def add_arguments(self, parser): + parser.add_argument( + '--queue', + dest='queue', + action='store', + help="Set the celery queue name for the task." + ) + parser.add_argument( + '--models', + dest='models', + type=str, + nargs='*', + help=("Specify the model to be updated in elasticsearch." + "The format is .") + ) + + def handle(self, *args, **options): + """ + Index models into Elasticsearch index asynchronously using celery. + + You can specify model to get indexed by passing + `--model .` parameter. 
Otherwise, it will reindex all the models. + """ + models = None + if options['models']: + models = [apps.get_model(model_name) for model_name in options['models']] + + queue = None + if options.get('queue'): + queue = options['queue'] + + self._run_reindex_tasks(models=models, queue=queue) diff --git a/readthedocs/search/mixins.py b/readthedocs/search/mixins.py new file mode 100644 index 00000000000..05a9bb8650b --- /dev/null +++ b/readthedocs/search/mixins.py @@ -0,0 +1,68 @@ +from django.db import models +from django.core.paginator import Paginator + + +class RTDDocTypeMixin(object): + + """ + Override some methods of DED's DocType class + + Changes are as follows: + - Do not index objects that do not exist in the provided queryset + - Take an additional `index_name` argument in the update method to update a specific index + Issues: + - https://github.com/sabricot/django-elasticsearch-dsl/issues/111 + """ + + def _prepare_action(self, object_instance, action, index_name=None): + """Overwrite to take `index_name` as a parameter so the index can be set dynamically""" + return { + '_op_type': action, + '_index': index_name or str(self._doc_type.index), + '_type': self._doc_type.mapping.doc_type, + '_id': object_instance.pk, + '_source': ( + self.prepare(object_instance) if action != 'delete' else None + ), + } + + def _get_actions(self, object_list, action, index_name=None): + """Overwrite to take `index_name` as a parameter so the index can be set dynamically""" + if self._doc_type.queryset_pagination is not None: + paginator = Paginator( + object_list, self._doc_type.queryset_pagination + ) + for page in paginator.page_range: + for object_instance in paginator.page(page).object_list: + yield self._prepare_action(object_instance, action, index_name) + else: + for object_instance in object_list: + yield self._prepare_action(object_instance, action, index_name) + + def update(self, thing, refresh=None, action='index', index_name=None, **kwargs): + """Update each document in ES for a model instance, an iterable of models, or a queryset""" + if refresh is True or ( + refresh is None and self._doc_type.auto_refresh + ): + kwargs['refresh'] = True + + # TODO: remove this overwrite when the issue has been fixed + # https://github.com/sabricot/django-elasticsearch-dsl/issues/111 + if isinstance(thing, models.Model): + # It's a model instance.
+ + # Do not need to check if its a delete action + # Because while delete action, the object is already remove from database + if action != 'delete': + queryset = self.get_queryset() + obj = queryset.filter(pk=thing.pk) + if not obj.exists(): + return None + + object_list = [thing] + else: + object_list = thing + + return self.bulk( + self._get_actions(object_list, action, index_name=index_name), **kwargs + ) diff --git a/readthedocs/search/pagination.py b/readthedocs/search/pagination.py new file mode 100644 index 00000000000..2805ca67e59 --- /dev/null +++ b/readthedocs/search/pagination.py @@ -0,0 +1,7 @@ +from rest_framework.pagination import PageNumberPagination + + +class SearchPagination(PageNumberPagination): + page_size = 25 + page_size_query_param = 'page_size' + max_page_size = 100 diff --git a/readthedocs/search/parse_json.py b/readthedocs/search/parse_json.py index 9b19a7e7cb3..f6205c269fc 100644 --- a/readthedocs/search/parse_json.py +++ b/readthedocs/search/parse_json.py @@ -43,7 +43,7 @@ def process_all_json_files(version, build_dir=True): def process_headers(data, filename): """Read headers from toc data.""" headers = [] - if 'toc' in data: + if data.get('toc', False): for element in PyQuery(data['toc'])('a'): headers.append(recurse_while_none(element)) if None in headers: diff --git a/readthedocs/search/serializers.py b/readthedocs/search/serializers.py new file mode 100644 index 00000000000..7aa8f01c93f --- /dev/null +++ b/readthedocs/search/serializers.py @@ -0,0 +1,23 @@ +from rest_framework import serializers + +from readthedocs.projects.models import Project + + +class PageSearchSerializer(serializers.Serializer): + project = serializers.CharField() + version = serializers.CharField() + title = serializers.CharField() + path = serializers.CharField() + link = serializers.SerializerMethodField() + highlight = serializers.SerializerMethodField() + + def get_link(self, obj): + projects_url = self.context.get('projects_url') + if projects_url: + docs_url = projects_url[obj.project] + return docs_url + obj.path + + def get_highlight(self, obj): + highlight = getattr(obj.meta, 'highlight', None) + if highlight: + return highlight.to_dict() diff --git a/readthedocs/search/signals.py b/readthedocs/search/signals.py index 6abdf64cce9..6e335e06d0a 100644 --- a/readthedocs/search/signals.py +++ b/readthedocs/search/signals.py @@ -1,7 +1,37 @@ """We define custom Django signals to trigger before executing searches.""" from __future__ import absolute_import import django.dispatch +from django.dispatch import receiver +from django_elasticsearch_dsl.apps import DEDConfig +from django_elasticsearch_dsl.registries import registry + +from readthedocs.projects.models import HTMLFile +from readthedocs.projects.signals import bulk_post_create, bulk_post_delete +from readthedocs.search.documents import PageDocument +from readthedocs.search.tasks import index_objects_to_es before_project_search = django.dispatch.Signal(providing_args=["body"]) before_file_search = django.dispatch.Signal(providing_args=["body"]) before_section_search = django.dispatch.Signal(providing_args=["body"]) + + +@receiver(bulk_post_create, sender=HTMLFile) +def index_html_file(instance_list, **_): + kwargs = { + 'app_label': HTMLFile._meta.app_label, + 'model_name': HTMLFile.__name__, + 'document_class': str(PageDocument), + 'index_name': None, # No need to change the index name + 'objects_id': [obj.id for obj in instance_list], + } + + # Do not index if autosync is disabled globally + if 
DEDConfig.autosync_enabled(): + index_objects_to_es(**kwargs) + + +@receiver(bulk_post_delete, sender=HTMLFile) +def remove_html_file(instance_list, **_): + # Do not index if autosync is disabled globally + if DEDConfig.autosync_enabled(): + registry.delete(instance_list) diff --git a/readthedocs/search/tasks.py b/readthedocs/search/tasks.py new file mode 100644 index 00000000000..b9da9a1b743 --- /dev/null +++ b/readthedocs/search/tasks.py @@ -0,0 +1,115 @@ +import logging + +from django.apps import apps +from django_elasticsearch_dsl.registries import registry + +from readthedocs.worker import app + +log = logging.getLogger(__name__) + + +def _get_index(indices, index_name): + """ + Get Index from all the indices + + :param indices: DED indices list + :param index_name: Name of the index + :return: DED Index + """ + for index in indices: + if str(index) == index_name: + return index + + +def _get_document(model, document_class): + """ + Get DED document class object from the model and name of document class + + :param model: The model class to find the document + :param document_class: the name of the document class. + :return: DED DocType object + """ + documents = registry.get_documents(models=[model]) + + for document in documents: + if str(document) == document_class: + return document + + +@app.task(queue='web') +def create_new_es_index(app_label, model_name, index_name, new_index_name): + model = apps.get_model(app_label, model_name) + indices = registry.get_indices(models=[model]) + old_index = _get_index(indices=indices, index_name=index_name) + new_index = old_index.clone(name=new_index_name) + new_index.create() + + +@app.task(queue='web') +def switch_es_index(app_label, model_name, index_name, new_index_name): + model = apps.get_model(app_label, model_name) + indices = registry.get_indices(models=[model]) + old_index = _get_index(indices=indices, index_name=index_name) + new_index = old_index.clone(name=new_index_name) + old_index_actual_name = None + + if old_index.exists(): + # Alias can not be used to delete an index. 
+ # https://www.elastic.co/guide/en/elasticsearch/reference/6.0/indices-delete-index.html + # So get the index's actual name in order to delete it + old_index_info = old_index.get() + # The info is a dictionary and the key is the actual name of the index + old_index_actual_name = list(old_index_info.keys())[0] + + # Point the alias at the new index and delete the old index if it exists + new_index.put_alias(name=index_name) + if old_index_actual_name: + old_index.connection.indices.delete(index=old_index_actual_name) + + +@app.task(queue='web') +def index_objects_to_es(app_label, model_name, document_class, index_name, + chunk=None, objects_id=None): + + assert not (chunk and objects_id), "You cannot pass both chunk and objects_id" + + model = apps.get_model(app_label, model_name) + document = _get_document(model=model, document_class=document_class) + doc_obj = document() + + # WARNING: This must use the exact same queryset that the IDs were taken from + # There is a chance of a race condition here as the IDs may change while the task runs, + # so we need to think through this a bit more and probably pass explicit IDs, + # but there are performance issues with that on large model sets + queryset = doc_obj.get_queryset() + if chunk: + # Chunk is a tuple with the start and end index of the queryset + start = chunk[0] + end = chunk[1] + queryset = queryset[start:end] + elif objects_id: + queryset = queryset.filter(id__in=objects_id) + + log.info("Indexing model: {}, '{}' objects".format(model.__name__, queryset.count())) + doc_obj.update(queryset.iterator(), index_name=index_name) + + +@app.task(queue='web') +def index_missing_objects(app_label, model_name, document_class, index_generation_time): + """ + Task to ensure that no objects are missed from indexing. + + The object IDs are sent to the `index_objects_to_es` task for indexing. + While that task is running, new objects can be created/deleted in the database + and they will not be picked up for indexing into ES. + This task will index all the objects that entered the DB after the `index_generation_time` timestamp + to ensure that everything is in the ES index.
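+ Objects deleted from the database while the indexing tasks were running are not removed here; see the TODO below.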
+ """ + model = apps.get_model(app_label, model_name) + document = _get_document(model=model, document_class=document_class) + queryset = document().get_queryset().exclude(modified_date__lte=index_generation_time) + document().update(queryset.iterator()) + + log.info("Indexed {} missing objects from model: {}'".format(queryset.count(), model.__name__)) + + # TODO: Figure out how to remove the objects from ES index that has been deleted diff --git a/readthedocs/search/tests/conftest.py b/readthedocs/search/tests/conftest.py index 59961f3a7e2..e7981d33df8 100644 --- a/readthedocs/search/tests/conftest.py +++ b/readthedocs/search/tests/conftest.py @@ -1,45 +1,42 @@ -import random -import string +import json +import os from random import shuffle import pytest +from django.core.management import call_command from django_dynamic_fixture import G -from readthedocs.projects.models import Project -from readthedocs.search.indexes import Index, ProjectIndex, PageIndex, SectionIndex -from .dummy_data import DUMMY_PAGE_JSON, ALL_PROJECTS +from readthedocs.projects.models import Project, HTMLFile +from .dummy_data import ALL_PROJECTS, PROJECT_DATA_FILES -@pytest.fixture(autouse=True) -def mock_elastic_index(mocker): - index_name = ''.join([random.choice(string.ascii_letters) for _ in range(5)]) - mocker.patch.object(Index, '_index', index_name.lower()) +@pytest.fixture() +def es_index(): + call_command('search_index', '--delete', '-f') + call_command('search_index', '--create') + + yield + call_command('search_index', '--delete', '-f') @pytest.fixture(autouse=True) -def es_index(mock_elastic_index): - # Create the index. - index = Index() - index_name = index.timestamped_index() - index.create_index(index_name) - index.update_aliases(index_name) - # Update mapping - proj = ProjectIndex() - proj.put_mapping() - page = PageIndex() - page.put_mapping() - sec = SectionIndex() - sec.put_mapping() - - yield index - index.delete_index(index_name=index_name) +def all_projects(es_index, mock_processed_json, db, settings): + settings.ELASTICSEARCH_DSL_AUTOSYNC = True + projects_list = [] + for project_slug in ALL_PROJECTS: + project = G(Project, slug=project_slug, name=project_slug) + for file_basename in PROJECT_DATA_FILES[project.slug]: + # file_basename in config are without extension so add html extension + file_name = file_basename + '.html' + version = project.versions.all()[0] + f = G(HTMLFile, project=project, version=version, name=file_name) + f.save() -@pytest.fixture -def all_projects(): - projects = [G(Project, slug=project_slug, name=project_slug) for project_slug in ALL_PROJECTS] - shuffle(projects) - return projects + projects_list.append(project) + + shuffle(projects_list) + return projects_list @pytest.fixture @@ -48,16 +45,19 @@ def project(all_projects): return all_projects[0] -def get_dummy_page_json(version, *args, **kwargs): - dummy_page_json = DUMMY_PAGE_JSON - project_name = version.project.name - return dummy_page_json.get(project_name) +def get_dummy_processed_json(instance): + project_slug = instance.project.slug + basename = os.path.splitext(instance.name)[0] + file_name = basename + '.json' + current_path = os.path.abspath(os.path.dirname(__file__)) + file_path = os.path.join(current_path, "data", project_slug, file_name) + if os.path.exists(file_path): + with open(file_path) as f: + return json.load(f) -@pytest.fixture(autouse=True) -def mock_parse_json(mocker): - # patch the function from `projects.tasks` because it has been point to there - # 
http://www.voidspace.org.uk/python/mock/patch.html#where-to-patch - mocked_function = mocker.patch('readthedocs.projects.tasks.process_all_json_files') - mocked_function.side_effect = get_dummy_page_json +@pytest.fixture(autouse=True) +def mock_processed_json(mocker): + mocked_function = mocker.patch.object(HTMLFile, 'get_processed_json', autospec=True) + mocked_function.side_effect = get_dummy_processed_json diff --git a/readthedocs/search/tests/data/docs/story.json b/readthedocs/search/tests/data/docs/story.json index 69226b65209..fc191d5e0ea 100644 --- a/readthedocs/search/tests/data/docs/story.json +++ b/readthedocs/search/tests/data/docs/story.json @@ -1,7 +1,6 @@ { - "content": "Philosophy\nRead the Docs is Open Source software. We have licensed the code base as MIT, which provides almost no restrictions on the use of the code.\nHowever, as a project there are things that we care about more than others. We built Read the Docs to support in the Open Source community. The code is open for people to contribute to, so that they may build features into https://readthedocs.org that they want. We also believe sharing the code openly is a valuable learning tool, especially for demonsrating how to collaborate and maintain an enormous website.\nOfficial Support\nThe time of the core developers of Read the Docs is limited. We provide official support for the following things:\nLocal development on the Python code base\nUsage of https://readthedocs.org for Open Source projects\nBug fixes in the code base, as it applies to running it on https://readthedocs.org\nUnsupported\nThere are use cases that we don\u2019t support, because it doesn\u2019t further our goal of promoting in the Open Source Community.\nWe do not support:\nSpecific usage of Sphinx and Mkdocs, that don\u2019t affect our hosting\nCustom s of Read the Docs at your company\n of Read the Docs on other platforms\nAny issues outside of the Read the Docs Python Code\nRationale\nRead the Docs was founded to improve in the Open Source Community. We fully recognize and allow the code to be used for internal installs at companies, but we will not spend our time supporting it. Our time is limited, and we want to spend it on the mission that we set out to originally support.\nIf you feel strongly about installing Read the Docs internal to a company, we will happily link to third party resources on this topic. Please open an issue with a proposal if you want to take on this task.", + "content": "ReadtheDocsPhilosophy\nRead the Docs is Open Source software. We have licensed the code base as MIT, which provides almost no restrictions on the use of the code.\nHowever, as a project there are things that we care about more than others. We built Read the Docs to support in the Open Source community. The code is open for people to contribute to, so that they may build features into https://readthedocs.org that they want. We also believe sharing the code openly is a valuable learning tool, especially for demonsrating how to collaborate and maintain an enormous website.\nOfficial website Support\nThe time of the core developers of Read the Docs is limited. 
We provide official developers support for the following things:\nLocal development on the Python code base\nUsage of https://readthedocs.org for Open Source projects\nBug fixes in the code base, as it applies to running it on https://readthedocs.org\nUnsupported\nThere are use cases that we don\u2019t support, because it doesn\u2019t further our goal of promoting in the Open Source Community.\nWe do not support:\nSpecific usage of Sphinx and Mkdocs, that don\u2019t affect our hosting\nCustom s of Read the Docs at your company\n of Read the Docs on other platforms\nAny issues outside of the Read the Docs Python Code\nRationale\nRead the Docs was founded to improve in the Open Source Community. We fully recognize and allow the code to be used for internal installs at companies, but we will not spend our time supporting it. Our time is limited, and we want to spend it on the mission that we set out to originally support.\nIf you feel strongly about installing Read the Docs internal to a company, we will happily link to third party resources on this topic. Please open an issue with a proposal if you want to take on this task.", "headers": [ - "Official Support", "Unsupported", "Rationale" ], diff --git a/readthedocs/search/tests/data/docs/wiping.json b/readthedocs/search/tests/data/docs/wiping.json index a54889e05fa..ac1cebca67e 100644 --- a/readthedocs/search/tests/data/docs/wiping.json +++ b/readthedocs/search/tests/data/docs/wiping.json @@ -1,5 +1,5 @@ { - "content": "Wiping a Build Environment\nSometimes it happen that your Builds start failing because the build environment where the is created is stale or broken. This could happen for a couple of different reasons like pip not upgrading a package properly or a corrupted cached Python package.\nIn any of these cases (and many others), the solution could be just wiping out the existing build environment files and allow Read the Docs to create a new fresh one.\nFollow these steps to wipe the build environment:\nGo to Versions\nClick on the Edit button of the version you want to wipe on the right side of the page\nGo to the bottom of the page and click the wipe link, next to the \u201cSave\u201d button\nNote\nBy wiping the build environment, all the rst, md, and code files associated with it will be removed but not the already built (HTML and PDF files). Your will still online after wiping the build environment.\nNow you can re-build the version with a fresh build environment!", + "content": "ReadtheDocsWiping a Build Environment\nSometimes it happen that your Builds start failing because the build environment where the is created is stale or broken. This could happen for a couple of different reasons like pip not upgrading a package properly or a corrupted cached Python package.\nIn any of these cases (and many others), the solution could be just wiping out the existing build environment files and allow Read the Docs to create a new fresh one.\nFollow these steps to wipe the build environment:\nGo to Versions\nClick on the Edit button of the version you want to wipe on the right side of the page\nGo to the bottom of the page and click the wipe link, next to the \u201cSave\u201d button\nNote\nBy wiping the build environment, all the rst, md, and code files associated with it will be removed but not the already built (HTML and PDF files). 
Your will still online after wiping the build environment.\nNow you can re-build the version with a fresh build environment!", "headers": [ "Wiping a Build Environment" ], diff --git a/readthedocs/search/tests/data/kuma/docker.json b/readthedocs/search/tests/data/kuma/docker.json index 3f86764073a..eb218b4dfb0 100644 --- a/readthedocs/search/tests/data/kuma/docker.json +++ b/readthedocs/search/tests/data/kuma/docker.json @@ -1,5 +1,5 @@ { - "content": "kuma-Docker Docker is used for development and (soon) for deployment.\nDocker Images\nDocker images are used in development, usually with the local working files mounted in the images to set behaviour.\nImages are built by Jenkins, after tests pass, and are published to quay.io. We try to store the configuration in the environment, so that the published images can be used in deployments by setting environment variables to deployment-specific values.\nHere are some of the images used in the Kuma project:\nkuma\nThe kuma Docker image builds on the kuma_base image, installing a kuma branch and building the assets needed for running as a webservice. The environment can be customized for different deployments.\nThe image can be recreated locally with make build-kuma.\nThe image tagged latest is used by default for development. It can be created locally with make build-kuma VERSION=latest. The latest image is created from the master branch in Jenkins and published to quay.io.\nkuma_base\nThe kuma_base Docker image contains the OS and libraries (C, Python, and Node.js) that support the kuma project. The kuma image extends this by installing the kuma source and building assets needed for production.\nThe image can be recreated locally with make build-base.\nThe image tagged latest is used by default for development. It can be created localled with make build-base VERSION=latest. The latest image is created from the master branch in Jenkins and published to quay.io\nkumascript\nThe kumascript Docker image contains the kumascript rendering engine and support files. The environment can be customized for different deployments.\nThe image can be recreated locally with make build-kumascript.\nThe image tagged latest is used by default for development. It can be created locally with make build-kumascript KS_VERSION=latest. The latest image is created from the master branch in Jenkins and published to quay.io.\nintegration-tests\nThe integration-tests Docker image contains browser-based integration tests that check the functionality of a running Kuma deployment.\nThe image can be recreated locally with docker build -f docker/images/integration-tests/ ., but this is only necessary for image development. Most developer will follow the Client-side testing to develop and run these integration tests.\nThe image is built and used in Jenkins in the stage-integration-tests and prod-integration-tests pipelines, configured by scripts in the Jenkinsfiles folder. It is not published to quay.io.", + "content": "kumadocker Docker is used for development and (soon) for deployment.\nDocker Images\nDocker images are used in development, usually with the local working files mounted in the images to set behaviour.\nImages are built by Jenkins, after tests pass, and are published to quay.io. 
We try to store the configuration in the environment, so that the published images can be used in deployments by setting environment variables to deployment-specific values.\nHere are some of the images used in the Kuma project:\nkuma\nThe kuma Docker image builds on the kuma_base image, installing a kuma branch and building the assets needed for running as a webservice. The environment can be customized for different deployments.\nThe image can be recreated locally with make build-kuma.\nThe image tagged latest is used by default for development. It can be created locally with make build-kuma VERSION=latest. The latest image is created from the master branch in Jenkins and published to quay.io.\nkuma_base\nThe kuma_base Docker image contains the OS and libraries (C, Python, and Node.js) that support the kuma project. The kuma image extends this by installing the kuma source and building assets needed for production.\nThe image can be recreated locally with make build-base.\nThe image tagged latest is used by default for development. It can be created localled with make build-base VERSION=latest. The latest image is created from the master branch in Jenkins and published to quay.io\nkumascript\nThe kumascript Docker image contains the kumascript rendering engine and support files. The environment can be customized for different deployments.\nThe image can be recreated locally with make build-kumascript.\nThe image tagged latest is used by default for development. It can be created locally with make build-kumascript KS_VERSION=latest. The latest image is created from the master branch in Jenkins and published to quay.io.\nintegration-tests\nThe integration-tests Docker image contains browser-based integration tests that check the functionality of a running Kuma deployment.\nThe image can be recreated locally with docker build -f docker/images/integration-tests/ ., but this is only necessary for image development. Most developer will follow the Client-side testing to develop and run these integration tests.\nThe image is built and used in Jenkins in the stage-integration-tests and prod-integration-tests pipelines, configured by scripts in the Jenkinsfiles folder. It is not published to quay.io.", "headers": [ "Docker", "Docker Images", diff --git a/readthedocs/search/tests/data/kuma/documentation.json b/readthedocs/search/tests/data/kuma/documentation.json index 310a01d05c8..6add1596dc3 100644 --- a/readthedocs/search/tests/data/kuma/documentation.json +++ b/readthedocs/search/tests/data/kuma/documentation.json @@ -1,5 +1,5 @@ { - "content": "kuma-Documentation This documentation is generated and published at Read the Docs whenever the master branch is updated. GitHub can render our .rst documents as ReStructuredText, which is close enough to Sphinx for most code reviews, without features like links between documents.\nIt is occasionally necessary to generate the documentation locally. It is easiest to do this with a virtualenv on the host system, using only to regenerate the MDN Sphinx template. If you are not comfortable with that style of development, it can be done entirely in using -compose.\nGenerating documentation\nSphinx uses a Makefile in the docs subfolder to build documentation in several formats. 
MDN only uses the HTML format, and the generated document index is at docs/_build/html/index.html.\nTo generate the documentation in a virtualenv on the host machine, first install the requirements:\npip install -r requirements/docs.txt\nThen switch to the docs folder to use the Makefile:\ncd docs make html python -m webbrowser file://${PWD}/_build/html/index.html\nTo generate the documentation with :\n-compose run --rm --user $(id -u) web sh -c \"\\ virtualenv /tmp/.venvs/docs && \\ . /tmp/.venvs/docs/bin/activate && \\ pip install -r /app/requirements/docs.txt && \\ cd /app/docs && \\ make html\" python -m webbrowser file://${PWD}/docs/_build/html/index.html\nA virtualenv is required, to avoid a pip bug when changing the version of a system-installed package.", + "content": "kumadocumentation This documentation is generated and published at Read the Docs whenever the master branch is updated. GitHub can render our .rst documents as ReStructuredText, which is close enough to Sphinx for most code reviews, without features like links between documents.\nIt is occasionally necessary to generate the documentation locally. It is easiest to do this with a virtualenv on the host system, using only to regenerate the MDN Sphinx template. If you are not comfortable with that style of development, it can be done entirely in using -compose.\nGenerating documentation\nSphinx uses a Makefile in the docs subfolder to build documentation in several formats. MDN only uses the HTML format, and the generated document index is at docs/_build/html/index.html.\nTo generate the documentation in a virtualenv on the host machine, first install the requirements:\npip install -r requirements/docs.txt\nThen switch to the docs folder to use the Makefile:\ncd docs make html python -m webbrowser file://${PWD}/_build/html/index.html\nTo generate the documentation with :\n-compose run --rm --user $(id -u) web sh -c \"\\ virtualenv /tmp/.venvs/docs && \\ . 
/tmp/.venvs/docs/bin/activate && \\ pip install -r /app/requirements/docs.txt && \\ cd /app/docs && \\ make html\" python -m webbrowser file://${PWD}/docs/_build/html/index.html\nA virtualenv is required, to avoid a pip bug when changing the version of a system-installed package.", "headers": [ "Documentation", "Generating documentation" diff --git a/readthedocs/search/tests/data/pipeline/installation.json b/readthedocs/search/tests/data/pipeline/installation.json index 30fb78d1d78..c3a1bb7a5f1 100644 --- a/readthedocs/search/tests/data/pipeline/installation.json +++ b/readthedocs/search/tests/data/pipeline/installation.json @@ -1,5 +1,5 @@ { - "content": "Pipeline-Installation Either check out Pipeline from GitHub or to pull a release off PyPI\npip install django-pipeline\nAdd \u2018pipeline\u2019 to your INSTALLED_APPS\nINSTALLED_APPS = ( 'pipeline', )\nUse a pipeline storage for STATICFILES_STORAGE\nSTATICFILES_STORAGE = 'pipeline.storage.PipelineCachedStorage'\nAdd the PipelineFinder to STATICFILES_FINDERS\nSTATICFILES_FINDERS = ( 'django.contrib.staticfiles.finders.FileSystemFinder', 'django.contrib.staticfiles.finders.AppDirectoriesFinder', 'pipeline.finders.PipelineFinder', )\nNote\nYou need to use Django>=1.7 to be able to use this version of pipeline.\nUpgrading from 1.3\nTo upgrade from pipeline 1.3, you will need to follow these steps:\nUpdate templates to use the new syntax\n{# pipeline<1.4 #} {% load compressed %} {% compressed_js 'group' %} {% compressed_css 'group' %}\n{# pipeline>=1.4 #} {% load pipeline %} {% javascript 'group' %} {% stylesheet 'group' %}\nAdd the PipelineFinder to STATICFILES_FINDERS\nSTATICFILES_FINDERS = ( 'django.contrib.staticfiles.finders.FileSystemFinder', 'django.contrib.staticfiles.finders.AppDirectoriesFinder', 'pipeline.finders.PipelineFinder', )\nUpgrading from 1.5\nTo upgrade from pipeline 1.5, you will need update all your PIPELINE_* settings and move them under the new PIPELINE setting. See Configuration.\nRecommendations\nPipeline\u2019s default CSS and JS compressor is Yuglify. Yuglify wraps UglifyJS and cssmin, applying the default YUI configurations to them. 
It can be downloaded from: https://github.com/yui/yuglify/.\nIf you do not install yuglify, make sure to disable the compressor in your settings.", + "content": "PipelineInstallation Official Either check out Pipeline from GitHub or to pull a release off PyPI\npip install django-pipeline\nAdd \u2018pipeline\u2019 to your INSTALLED_APPS\nINSTALLED_APPS = ( 'pipeline', )\nUse a pipeline storage for STATICFILES_STORAGE\nSTATICFILES_STORAGE = 'pipeline.storage.PipelineCachedStorage'\nAdd the PipelineFinder to STATICFILES_FINDERS\nSTATICFILES_FINDERS = ( 'django.contrib.staticfiles.finders.FileSystemFinder', 'django.contrib.staticfiles.finders.AppDirectoriesFinder', 'pipeline.finders.PipelineFinder', )\nNote\nYou need to use Django>=1.7 to be able to use this version of pipeline.\nUpgrading from 1.3\nTo upgrade from pipeline 1.3, you will need to follow these steps:\nUpdate templates to use the new syntax\n{# pipeline<1.4 #} {% load compressed %} {% compressed_js 'group' %} {% compressed_css 'group' %}\n{# pipeline>=1.4 #} {% load pipeline %} {% javascript 'group' %} {% stylesheet 'group' %}\nAdd the PipelineFinder to STATICFILES_FINDERS\nSTATICFILES_FINDERS = ( 'django.contrib.staticfiles.finders.FileSystemFinder', 'django.contrib.staticfiles.finders.AppDirectoriesFinder', 'pipeline.finders.PipelineFinder', )\nUpgrading from 1.5\nTo upgrade from pipeline 1.5, you will need update all your PIPELINE_* settings and move them under the new PIPELINE setting. See Configuration.\nRecommendations\nPipeline\u2019s default CSS and JS compressor is Yuglify. Yuglify wraps UglifyJS and cssmin, applying the default YUI configurations to them. It can be downloaded from: https://github.com/yui/yuglify/.\nIf you do not install yuglify, make sure to disable the compressor in your settings.", "headers": [ "Installation", "Upgrading from 1.3", diff --git a/readthedocs/search/tests/data/pipeline/signals.json b/readthedocs/search/tests/data/pipeline/signals.json index 3bf3a80537c..b8113de979f 100644 --- a/readthedocs/search/tests/data/pipeline/signals.json +++ b/readthedocs/search/tests/data/pipeline/signals.json @@ -1,5 +1,5 @@ { - "content": "pipeline-Signals List of all signals sent by pipeline.\ncss_compressed\npipeline.signals.css_compressed\nWhenever a css package is compressed, this signal is sent after the compression.\nArguments sent with this signal :\nsender:\nThe Packager class that compressed the group.\npackage:\nThe package actually compressed.\njs_compressed\npipeline.signals.js_compressed\nWhenever a js package is compressed, this signal is sent after the compression.\nArguments sent with this signal :\nsender:\nThe Packager class that compressed the group.\npackage:\nThe package actually compressed.", + "content": "pipelineSignals List of all signals sent by pipeline.\ncss_compressed\npipeline.signals.css_compressed\nWhenever a css package is compressed, this signal is sent after the compression.\nArguments sent with this signal :\nsender:\nThe Packager class that compressed the group.\npackage:\nThe package actually compressed.\njs_compressed\npipeline.signals.js_compressed\nWhenever a js package is compressed, this signal is sent after the compression.\nArguments sent with this signal :\nsender:\nThe Packager class that compressed the group.\npackage:\nThe package actually compressed.", "headers": [ "Signals", "css_compressed", diff --git a/readthedocs/search/tests/dummy_data.py b/readthedocs/search/tests/dummy_data.py index fbd4eed1f11..ed1d5c7e2f6 100644 --- 
a/readthedocs/search/tests/dummy_data.py +++ b/readthedocs/search/tests/dummy_data.py @@ -1,28 +1,7 @@ -import json -import os - -_DATA_FILES = { - 'pipeline': ['installation.json', 'signals.json'], - 'kuma': ['documentation.json', 'docker.json'], - 'docs': ['story.json', 'wiping.json'], +PROJECT_DATA_FILES = { + 'pipeline': ['installation', 'signals'], + 'kuma': ['documentation', 'docker'], + 'docs': ['story', 'wiping'], } - -def _get_dummy_json(): - dictionary = {} - for key, value in _DATA_FILES.items(): - data = [] - for file_name in value: - current_path = os.path.abspath(os.path.dirname(__file__)) - path = os.path.join(current_path, "data", key, file_name) - with open(path) as f: - content = json.load(f) - data.append(content) - - dictionary[key] = data - - return dictionary - - -DUMMY_PAGE_JSON = _get_dummy_json() -ALL_PROJECTS = DUMMY_PAGE_JSON.keys() +ALL_PROJECTS = PROJECT_DATA_FILES.keys() diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py new file mode 100644 index 00000000000..2e4219f169e --- /dev/null +++ b/readthedocs/search/tests/test_api.py @@ -0,0 +1,135 @@ +import pytest +from django.core.urlresolvers import reverse +from django_dynamic_fixture import G + + +from readthedocs.builds.models import Version +from readthedocs.projects.models import HTMLFile +from readthedocs.search.tests.utils import get_search_query_from_project_file + + +@pytest.mark.django_db +@pytest.mark.search +class TestDocumentSearch(object): + + def __init__(self): + # This reverse needs to be inside the ``__init__`` method because from + # the Corporate site we don't define this URL if ``-ext`` module is not + # installed + self.url = reverse('doc_search') + + @pytest.mark.parametrize('data_type', ['content', 'headers', 'title']) + @pytest.mark.parametrize('page_num', [0, 1]) + def test_search_works(self, api_client, project, data_type, page_num): + query = get_search_query_from_project_file(project_slug=project.slug, page_num=page_num, + data_type=data_type) + + version = project.versions.all()[0] + search_params = {'project': project.slug, 'version': version.slug, 'q': query} + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + data = resp.data['results'] + assert len(data) == 1 + project_data = data[0] + assert project_data['project'] == project.slug + + # Check highlight return correct object + all_highlights = project_data['highlight'][data_type] + for highlight in all_highlights: + # Make it lower because our search is case insensitive + assert query.lower() in highlight.lower() + + def test_doc_search_filter_by_project(self, api_client): + """Test Doc search result are filtered according to project""" + + # `Github` word is present both in `kuma` and `pipeline` files + # so search with this phrase but filter through `kuma` project + search_params = {'q': 'GitHub', 'project': 'kuma', 'version': 'latest'} + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + data = resp.data['results'] + assert len(data) == 1 + assert data[0]['project'] == 'kuma' + + def test_doc_search_filter_by_version(self, api_client, project): + """Test Doc search result are filtered according to version""" + query = get_search_query_from_project_file(project_slug=project.slug) + latest_version = project.versions.all()[0] + # Create another version + dummy_version = G(Version, project=project) + # Create HTMLFile same as the latest version + latest_version_files = HTMLFile.objects.all().filter(version=latest_version) + 
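# Copy each file from the latest version onto the dummy version, so the search below should only match the new version's documents +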
for f in latest_version_files: + f.version = dummy_version + # Make primary key to None, so django will create new object + f.pk = None + f.save() + + search_params = {'q': query, 'project': project.slug, 'version': dummy_version.slug} + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + data = resp.data['results'] + assert len(data) == 1 + assert data[0]['project'] == project.slug + + def test_doc_search_pagination(self, api_client, project): + """Test Doc search result can be paginated""" + latest_version = project.versions.all()[0] + html_file = HTMLFile.objects.filter(version=latest_version)[0] + title = html_file.processed_json['title'] + query = title.split()[0] + + # Create 30 more same html file + for _ in range(30): + # Make primary key to None, so django will create new object + html_file.pk = None + html_file.save() + + search_params = {'q': query, 'project': project.slug, 'version': latest_version.slug} + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + # Check the count is 31 (1 existing and 30 new created) + assert resp.data['count'] == 31 + # Check there are next url + assert resp.data['next'] is not None + # There should be only 25 data as the pagination is 25 by default + assert len(resp.data['results']) == 25 + + # Add `page_size` parameter and check the data is paginated accordingly + search_params['page_size'] = 5 + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + assert len(resp.data['results']) == 5 + + def test_doc_search_without_parameters(self, api_client, project): + """Hitting Document Search endpoint without query parameters should return error""" + resp = api_client.get(self.url) + assert resp.status_code == 400 + # Check error message is there + assert sorted(['q', 'project', 'version']) == sorted(resp.data.keys()) + + def test_doc_search_subprojects(self, api_client, all_projects): + """Test Document search return results from subprojects also""" + project = all_projects[0] + subproject = all_projects[1] + version = project.versions.all()[0] + # Add another project as subproject of the project + project.add_subproject(subproject) + + # Now search with subproject content but explicitly filter by the parent project + query = get_search_query_from_project_file(project_slug=subproject.slug) + search_params = {'q': query, 'project': project.slug, 'version': version.slug} + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + data = resp.data['results'] + assert len(data) == 1 + assert data[0]['project'] == subproject.slug + # Check the link is the subproject document link + document_link = subproject.get_docs_url(version_slug=version.slug) + assert document_link in data[0]['link'] diff --git a/readthedocs/search/tests/test_faceted_search.py b/readthedocs/search/tests/test_faceted_search.py new file mode 100644 index 00000000000..1c523e11b16 --- /dev/null +++ b/readthedocs/search/tests/test_faceted_search.py @@ -0,0 +1,50 @@ +import pytest + +from readthedocs.search.documents import PageDocument + + +@pytest.mark.django_db +@pytest.mark.search +class TestFileSearch(object): + + @pytest.mark.parametrize('case', ['upper', 'lower', 'title']) + def test_search_exact_match(self, client, project, case): + """Check quoted query match exact phrase with case insensitively + + Making a query with quoted text like ``"foo bar"`` should match + exactly ``foo bar`` or ``Foo Bar`` etc + """ + # `Github` word is present both in `kuma` and `pipeline` 
diff --git a/readthedocs/search/tests/test_faceted_search.py b/readthedocs/search/tests/test_faceted_search.py
new file mode 100644
index 00000000000..1c523e11b16
--- /dev/null
+++ b/readthedocs/search/tests/test_faceted_search.py
@@ -0,0 +1,50 @@
+import pytest
+
+from readthedocs.search.documents import PageDocument
+
+
+@pytest.mark.django_db
+@pytest.mark.search
+class TestFileSearch(object):
+
+    @pytest.mark.parametrize('case', ['upper', 'lower', 'title'])
+    def test_search_exact_match(self, client, project, case):
+        """Check that a quoted query matches the exact phrase, case-insensitively
+
+        Making a query with quoted text like ``"foo bar"`` should match
+        exactly ``foo bar``, ``Foo Bar``, etc.
+        """
+        # The word `GitHub` is present in both the `kuma` and `pipeline` files,
+        # but the phrase `GitHub can` appears only in the kuma docs,
+        # so search with this phrase to check exact matching.
+        query_text = r'"GitHub can"'
+        cased_query = getattr(query_text, case)
+        query = cased_query()
+
+        page_search = PageDocument.faceted_search(query=query)
+        results = page_search.execute()
+
+        assert len(results) == 1
+        assert results[0]['project'] == 'kuma'
+        assert results[0]['path'] == 'documentation'
+
+    def test_search_combined_result(self, client, project):
+        """Check that search results combine the `AND` and `OR` operators
+
+        If the query is `Foo Bar`, then the results should be in the following order:
+
+        - Pages where both `Foo` and `Bar` are present
+        - Pages where either `Foo` or `Bar` is present
+        """
+        query = 'Official Support'
+        page_search = PageDocument.faceted_search(query=query)
+        results = page_search.execute()
+        assert len(results) == 3
+
+        result_paths = [r.path for r in results]
+        # The ``open-source-philosophy`` page has both ``Official Support`` words
+        # The ``docker`` page has the ``Support`` word
+        # The ``installation`` page has the ``Official`` word
+        expected_paths = ['open-source-philosophy', 'docker', 'installation']
+
+        assert result_paths == expected_paths
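`PageDocument.faceted_search` (defined in readthedocs/search/faceted_search.py elsewhere in this patch) is what gives the behaviour these tests check. As a loose sketch only, not the implementation in this patch and with illustrative index/field names, a quoted query could be mapped to an exact-phrase match with elasticsearch-dsl roughly like this:

from elasticsearch_dsl import Q, Search


def build_page_query(query):
    """Illustrative only: phrase match for quoted queries, term match otherwise."""
    search = Search(index='page_index')
    if query.startswith('"') and query.endswith('"'):
        # Quoted text: match the exact phrase; the analyzer lowercases terms,
        # so "GitHub can" also matches "github can".
        return search.query(Q('match_phrase', content=query.strip('"')))
    # Unquoted text: pages matching more of the terms score higher, which
    # yields the combined AND/OR ordering asserted above.
    return search.query(Q('match', content=query))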
diff --git a/readthedocs/search/tests/test_views.py b/readthedocs/search/tests/test_views.py
index 07444a731fb..99852f16239 100644
--- a/readthedocs/search/tests/test_views.py
+++ b/readthedocs/search/tests/test_views.py
@@ -1,26 +1,21 @@
-# -*- coding: utf-8 -*-
 import pytest
-from django.core.management import call_command
-from django.urls import reverse_lazy
+from django.core.urlresolvers import reverse
 from django_dynamic_fixture import G
+
 from pyquery import PyQuery as pq
+
 from readthedocs.builds.constants import LATEST
 from readthedocs.builds.models import Version
-from readthedocs.projects.models import Project
+from readthedocs.projects.models import Project, HTMLFile
 from readthedocs.search.tests.utils import get_search_query_from_project_file
 
 
 @pytest.mark.django_db
 @pytest.mark.search
-class TestElasticSearch(object):
-
-    url = reverse_lazy('search')
-
-    def _reindex_elasticsearch(self, es_index):
-        call_command('reindex_elasticsearch')
-        es_index.refresh_index()
+class TestProjectSearch(object):
+    url = reverse('search')
 
     def _get_search_result(self, url, client, search_params):
         resp = client.get(url, search_params)
@@ -30,21 +25,16 @@ def _get_search_result(self, url, client, search_params):
         result = page.find('.module-list-wrapper .module-item-title')
         return result, page
 
-    @pytest.fixture(autouse=True)
-    def elastic_index(self, mock_parse_json, all_projects, es_index):
-        self._reindex_elasticsearch(es_index=es_index)
-
     def test_search_by_project_name(self, client, project):
         result, _ = self._get_search_result(url=self.url, client=client,
                                             search_params={'q': project.name})
 
         assert project.name.encode('utf-8') in result.text().encode('utf-8')
 
-    def test_search_project_show_languages(self, client, project, es_index):
+    def test_search_project_show_languages(self, client, project):
         """Test that searching project should show all available languages"""
         # Create a project in bn and add it as a translation
         G(Project, language='bn', name=project.name)
-        self._reindex_elasticsearch(es_index=es_index)
 
         result, page = self._get_search_result(url=self.url, client=client,
                                                search_params={'q': project.name})
@@ -54,11 +44,10 @@ def test_search_project_show_languages(self, client, project, es_index):
         assert len(content) == 2
         assert 'bn' in content.text()
 
-    def test_search_project_filter_language(self, client, project, es_index):
+    def test_search_project_filter_language(self, client, project):
         """Test that searching project filtered according to language"""
         # Create a project in bn and add it as a translation
         translate = G(Project, language='bn', name=project.name)
-        self._reindex_elasticsearch(es_index=es_index)
 
         search_params = {'q': project.name, 'language': 'bn'}
 
         result, page = self._get_search_result(url=self.url, client=client,
@@ -68,21 +57,86 @@ def test_search_project_filter_language(self, client, project, es_index):
         assert len(result) == 1
 
         content = page.find('.navigable .language-list')
-        # There should be 1 languages
-        assert len(content) == 1
+        # There should be 2 languages because both `en` and `bn` should show up there
+        assert len(content) == 2
         assert 'bn' in content.text()
 
+
+@pytest.mark.django_db
+@pytest.mark.search
+class TestPageSearch(object):
+    url = reverse('search')
+
+    def _get_search_result(self, url, client, search_params):
+        resp = client.get(url, search_params)
+        assert resp.status_code == 200
+
+        page = pq(resp.content)
+        result = page.find('.module-list-wrapper .search-result-item')
+        return result, page
+
     @pytest.mark.parametrize('data_type', ['content', 'headers', 'title'])
     @pytest.mark.parametrize('page_num', [0, 1])
-    def test_search_by_file_content(self, client, project, data_type, page_num):
+    def test_file_search(self, client, project, data_type, page_num):
         query = get_search_query_from_project_file(project_slug=project.slug, page_num=page_num,
                                                    data_type=data_type)
 
         result, _ = self._get_search_result(url=self.url, client=client,
                                             search_params={'q': query, 'type': 'file'})
         assert len(result) == 1
+        assert query in result.text()
+
+    @pytest.mark.parametrize('case', ['upper', 'lower', 'title'])
+    def test_file_search_case_insensitive(self, client, project, case):
+        """Check that File search is case-insensitive
+
+        It tests with uppercase, lowercase and title case
+        """
+        query_text = get_search_query_from_project_file(project_slug=project.slug)
+
+        cased_query = getattr(query_text, case)
+        query = cased_query()
+
+        result, _ = self._get_search_result(url=self.url, client=client,
+                                            search_params={'q': query, 'type': 'file'})
+
+        assert len(result) == 1
+        # Check the original (not the cased) text is in the result
+        assert query_text in result.text()
+
+    def test_file_search_exact_match(self, client, project):
+        """Check that a quoted query matches the exact phrase
+
+        Making a query with quoted text like ``"foo bar"`` should match
+        exactly the ``foo bar`` phrase.
+        """
+
+        # The word `GitHub` is present in both the `kuma` and `pipeline` files,
+        # but the phrase `GitHub can` appears only in the kuma docs,
+        # so search with this phrase to check exact matching.
+        query = r'"GitHub can"'
+
+        result, _ = self._get_search_result(url=self.url, client=client,
+                                            search_params={'q': query, 'type': 'file'})
+
+        assert len(result) == 1
+
+    def test_page_search_not_return_removed_page(self, client, project):
+        """Check that removed pages are not in the search index"""
+        query = get_search_query_from_project_file(project_slug=project.slug)
+        # Make a query to check it returns a result
+        result, _ = self._get_search_result(url=self.url, client=client,
+                                            search_params={'q': query, 'type': 'file'})
+        assert len(result) == 1
+
+        # Delete all the HTML files of the project
+        HTMLFile.objects.filter(project=project).delete()
+        # Run the query again; this time there should not be any results
+        result, _ = self._get_search_result(url=self.url, client=client,
+                                            search_params={'q': query, 'type': 'file'})
+        assert len(result) == 0
 
-    def test_file_search_show_projects(self, client):
+    def test_file_search_show_projects(self, client, all_projects):
         """Test that search result page shows list of projects while searching for files"""
 
         # `Github` word is present both in `kuma` and `pipeline` files
@@ -134,7 +188,6 @@ def test_file_search_show_versions(self, client, all_projects, es_index, setting
         project = all_projects[0]
         # Create some versions of the project
         versions = [G(Version, project=project) for _ in range(3)]
-        self._reindex_elasticsearch(es_index=es_index)
 
         query = get_search_query_from_project_file(project_slug=project.slug)
 
@@ -166,7 +219,6 @@ def test_file_search_subprojects(self, client, all_projects, es_index):
         subproject = all_projects[1]
         # Add another project as subproject of the project
         project.add_subproject(subproject)
-        self._reindex_elasticsearch(es_index=es_index)
 
         # Now search with subproject content but explicitly filter by the parent project
         query = get_search_query_from_project_file(project_slug=subproject.slug)
diff --git a/readthedocs/search/tests/utils.py b/readthedocs/search/tests/utils.py
index a48ea83dd74..0a049944ea0 100644
--- a/readthedocs/search/tests/utils.py
+++ b/readthedocs/search/tests/utils.py
@@ -1,4 +1,4 @@
-from readthedocs.search.tests.dummy_data import DUMMY_PAGE_JSON
+from readthedocs.projects.models import HTMLFile
 
 
 def get_search_query_from_project_file(project_slug, page_num=0, data_type='title'):
@@ -6,8 +6,9 @@ def get_search_query_from_project_file(project_slug, page_num=0, data_type='titl
 
     Query is generated from the value of `data_type`
     """
-    all_pages = DUMMY_PAGE_JSON[project_slug]
-    file_data = all_pages[page_num]
+    html_file = HTMLFile.objects.filter(project__slug=project_slug).order_by('id')[page_num]
+
+    file_data = html_file.processed_json
 
     query_data = file_data[data_type]
     if data_type in ['headers']:
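The helper above now derives its query from the `processed_json` of a real `HTMLFile` instead of the old dummy-data dict, so the tests always search for text that is actually indexed. A minimal illustration of how the tests use it; the literal `/search/` path stands in for `reverse('search')`, and `client` and `project` are the pytest fixtures used throughout this test suite.

from readthedocs.search.tests.utils import get_search_query_from_project_file


def example_file_search(client, project):
    # Derive a query from the title of the project's first indexed HTML file...
    query = get_search_query_from_project_file(
        project_slug=project.slug,
        page_num=0,
        data_type='title',
    )
    # ...and run a file search with it, as the view tests above do.
    return client.get('/search/', {'q': query, 'type': 'file'})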
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index a742a341912..f935ef3de65 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -11,8 +11,10 @@
 import json
 
 from builtins import next, range
+from django.shortcuts import get_object_or_404
 from pyquery import PyQuery
 
+from readthedocs.projects.models import Project
 
 log = logging.getLogger(__name__)
 
@@ -306,3 +308,24 @@ def parse_sections(documentation_type, content):
         return ''
 
     return sections
+
+
+# TODO: Rewrite all the views that use this as class-based views,
+# and move this function to a mixin
+def get_project_list_or_404(project_slug, user):
+    """Return a list of the project and its subprojects."""
+    queryset = Project.objects.api(user).only('slug')
+
+    project = get_object_or_404(queryset, slug=project_slug)
+    subprojects = queryset.filter(superprojects__parent_id=project.id)
+
+    project_list = list(subprojects) + [project]
+    return project_list
+
+
+def get_chunk(total, chunk_size):
+    """Yield successive ``(start, end)`` ranges of size `chunk_size`"""
+    # Based on https://stackoverflow.com/a/312464
+    # licensed under cc by-sa 3.0
+    for i in range(0, total, chunk_size):
+        yield (i, i + chunk_size)
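`get_chunk` only yields `(start, end)` offsets; callers are expected to slice their querysets with them, for example when the reindexing Celery tasks split work using the `ES_TASK_CHUNK_SIZE` setting added elsewhere in this patch. A small worked example follows; the `iter_chunked_querysets` helper is illustrative and not part of the patch.

from readthedocs.search.utils import get_chunk


def iter_chunked_querysets(queryset, chunk_size=100):
    """Slice a queryset into chunks using the (start, end) pairs from get_chunk()."""
    total = queryset.count()
    for start, end in get_chunk(total, chunk_size):
        yield queryset[start:end]


# For example, get_chunk(250, 100) yields (0, 100), (100, 200) and (200, 300);
# slicing past the end of the queryset simply returns the remaining objects.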
diff --git a/readthedocs/search/views.py b/readthedocs/search/views.py
index bac1969e80e..a5a2e0c332b 100644
--- a/readthedocs/search/views.py
+++ b/readthedocs/search/views.py
@@ -11,7 +11,8 @@
 from django.shortcuts import render
 
 from readthedocs.builds.constants import LATEST
-from readthedocs.search import lib as search_lib
+from readthedocs.search.documents import ProjectDocument, PageDocument
+from readthedocs.search.utils import get_project_list_or_404
 
 log = logging.getLogger(__name__)
 LOG_TEMPLATE = u'(Elastic Search) [{user}:{type}] [{project}:{version}:{language}] {msg}'
@@ -45,26 +46,23 @@ def elastic_search(request):
 
     if user_input.query:
         if user_input.type == 'project':
-            results = search_lib.search_project(
-                request, user_input.query, language=user_input.language)
+            project_search = ProjectDocument.faceted_search(query=user_input.query,
+                                                            language=user_input.language)
+            results = project_search.execute()
+            facets = results.facets
 
         elif user_input.type == 'file':
-            results = search_lib.search_file(
-                request, user_input.query, project_slug=user_input.project,
-                version_slug=user_input.version, taxonomy=user_input.taxonomy)
+            kwargs = {}
+            if user_input.project:
+                projects_list = get_project_list_or_404(project_slug=user_input.project,
+                                                        user=request.user)
+                project_slug_list = [project.slug for project in projects_list]
+                kwargs['projects_list'] = project_slug_list
+            if user_input.version:
+                kwargs['versions_list'] = user_input.version
 
-    if results:
-        # pre and post 1.0 compat
-        for num, hit in enumerate(results['hits']['hits']):
-            for key, val in list(hit['fields'].items()):
-                if isinstance(val, list):
-                    results['hits']['hits'][num]['fields'][key] = val[0]
-
-        if 'facets' in results:
-            for facet_type in ['project', 'version', 'taxonomy', 'language']:
-                if facet_type in results['facets']:
-                    facets[facet_type] = collections.OrderedDict()
-                    for term in results['facets'][facet_type]['terms']:
-                        facets[facet_type][term['term']] = term['count']
+            page_search = PageDocument.faceted_search(query=user_input.query, **kwargs)
+            results = page_search.execute()
+            facets = results.facets
 
     if settings.DEBUG:
         print(pprint(results))
@@ -87,7 +85,7 @@ def elastic_search(request):
     template_vars = user_input._asdict()
     template_vars.update({
         'results': results,
-        'facets': facets,
+        'facets': facets
     })
     return render(
         request,
diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py
index 1f55e560474..cd01eb3b499 100644
--- a/readthedocs/settings/base.py
+++ b/readthedocs/settings/base.py
@@ -84,6 +84,7 @@ def INSTALLED_APPS(self): # noqa
         'django_extensions',
         'messages_extends',
         'tastypie',
+        'django_elasticsearch_dsl',
 
         # our apps
         'readthedocs.projects',
@@ -99,6 +100,7 @@ def INSTALLED_APPS(self): # noqa
         'readthedocs.notifications',
         'readthedocs.integrations',
         'readthedocs.analytics',
+        'readthedocs.search',
 
         # allauth
@@ -319,8 +321,51 @@ def USE_PROMOS(self): # noqa
 
     # Elasticsearch settings.
     ES_HOSTS = ['127.0.0.1:9200']
-    ES_DEFAULT_NUM_REPLICAS = 0
-    ES_DEFAULT_NUM_SHARDS = 5
+    ELASTICSEARCH_DSL = {
+        'default': {
+            'hosts': '127.0.0.1:9200'
+        },
+    }
+    # Chunk size for the Elasticsearch reindex Celery tasks
+    ES_TASK_CHUNK_SIZE = 100
+    ES_PAGE_IGNORE_SIGNALS = True
+
+    # ANALYZER = 'analysis': {
+    #     'analyzer': {
+    #         'default_icu': {
+    #             'type': 'custom',
+    #             'tokenizer': 'icu_tokenizer',
+    #             'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'],
+    #         }
+    #     }
+    # }
+
+    ES_INDEXES = {
+        'project': {
+            'name': 'project_index',
+            # We do not have much data in the project index, therefore 1 shard with
+            # 1 replica is appropriate for the project index
+            'settings': {'number_of_shards': 1,
+                         'number_of_replicas': 1
+                         }
+        },
+        'page': {
+            'name': 'page_index',
+            'settings': {
+                # We have 3 nodes, so 3 shards with 2 replicas each is a good fit
+                # for our infrastructure. All 9 (3*3) shard copies will be spread
+                # across the 3 nodes, so if one node fails, the data is still
+                # available on the other nodes and Elasticsearch can keep serving.
+                'number_of_shards': 3,
+                'number_of_replicas': 2,
+                "index": {
+                    "sort.field": ["project", "version"]
+                }
+            }
+        },
+    }
+    # Disable auto refresh to increase indexing performance
+    ELASTICSEARCH_DSL_AUTO_REFRESH = False
 
     ALLOWED_HOSTS = ['*']
diff --git a/readthedocs/settings/dev.py b/readthedocs/settings/dev.py
index 7fa4dafe959..93b08d6bbc6 100644
--- a/readthedocs/settings/dev.py
+++ b/readthedocs/settings/dev.py
@@ -48,6 +48,9 @@ def DATABASES(self): # noqa
         'test:8000',
     )
 
+    # Disable auto-syncing of Elasticsearch documents in development
+    ELASTICSEARCH_DSL_AUTOSYNC = False
+
     @property
     def LOGGING(self): # noqa - avoid pep8 N802
         logging = super(CommunityDevSettings, self).LOGGING
diff --git a/readthedocs/settings/test.py b/readthedocs/settings/test.py
index f49dc8584b1..ee8132c053f 100644
--- a/readthedocs/settings/test.py
+++ b/readthedocs/settings/test.py
@@ -16,6 +16,17 @@ class CommunityTestSettings(CommunityDevSettings):
 
     DEBUG = False
     TEMPLATE_DEBUG = False
+    ES_PAGE_IGNORE_SIGNALS = False
+    ELASTICSEARCH_DSL_AUTOSYNC = False
+    ELASTICSEARCH_DSL_AUTO_REFRESH = True
+
+    @property
+    def ES_INDEXES(self): # noqa - avoid pep8 N802
+        es_indexes = super(CommunityTestSettings, self).ES_INDEXES
+        for index_conf in es_indexes.values():
+            index_conf['name'] = "test_{}".format(index_conf['name'])
+
+        return es_indexes
 
     @property
     def LOGGING(self): # noqa - avoid pep8 N802
diff --git a/readthedocs/templates/search/elastic_project_search.html b/readthedocs/templates/search/elastic_project_search.html
index f1e57a6a6bf..1ce48e6e2b1 100644
--- a/readthedocs/templates/search/elastic_project_search.html
+++ b/readthedocs/templates/search/elastic_project_search.html
@@ -44,14 +44,16 @@

    {% blocktrans with query=query|default:"" %}Results for {{ query }}{% endblo