readthedocs · ericholscher · Jun 19, 2018 · Jun 8, 2018 · Jun 8, 2018 · Jun 9, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -4,7 +4,7 @@ python:
   - 3.6
 sudo: false
 env:
-  - ES_VERSION=1.3.9 ES_DOWNLOAD_URL=https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz
+  - ES_VERSION=6.2.4 ES_DOWNLOAD_URL=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz
 matrix:
   include:
     - python: 2.7
@@ -42,3 +42,4 @@ notifications:
 branches:
   only:
   - master
+  - search_upgrade
diff --git a/readthedocs/projects/apps.py b/readthedocs/projects/apps.py
@@ -9,5 +9,7 @@ class ProjectsConfig(AppConfig):
     def ready(self):
         from readthedocs.projects import tasks
         from readthedocs.worker import app
+        from .signals import pre_save_html_file #noqa
+
         app.tasks.register(tasks.SyncRepositoryTask)
         app.tasks.register(tasks.UpdateDocsTask)
diff --git a/readthedocs/projects/managers.py b/readthedocs/projects/managers.py
@@ -0,0 +1,7 @@
+from django.db import models
+
+
+class HTMLFileManager(models.Manager):
+
+    def get_queryset(self):
+        return super(HTMLFileManager, self).get_queryset().filter(is_html=True)
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
@@ -7,8 +7,8 @@
 import fnmatch
 import logging
 import os
-from builtins import object  # pylint: disable=redefined-builtin
 
+from builtins import object  # pylint: disable=redefined-builtin
 from django.conf import settings
 from django.contrib.auth.models import User
 from django.core.urlresolvers import NoReverseMatch, reverse
@@ -24,6 +24,7 @@
 from readthedocs.core.utils import broadcast, slugify
 from readthedocs.projects import constants
 from readthedocs.projects.exceptions import ProjectConfigurationError
+from readthedocs.projects.managers import HTMLFileManager
 from readthedocs.projects.querysets import (
     ChildRelatedProjectQuerySet, FeatureQuerySet, ProjectQuerySet,
     RelatedProjectQuerySet)
@@ -902,6 +903,7 @@ class ImportedFile(models.Model):
     path = models.CharField(_('Path'), max_length=255)
     md5 = models.CharField(_('MD5 checksum'), max_length=255)
     commit = models.CharField(_('Commit'), max_length=255)
+    is_html = models.BooleanField(default=False)
 
     def get_absolute_url(self):
         return resolve(project=self.project, version_slug=self.version.slug, filename=self.path)
@@ -910,6 +912,20 @@ def __str__(self):
         return '%s: %s' % (self.name, self.project)
 
 
+class HTMLFile(ImportedFile):
+
+    """
+    Imported HTML file Proxy model.
+
+    This tracks only the HTML files for indexing to search.
+    """
+
+    class Meta(object):
+        proxy = True
+
+    objects = HTMLFileManager()
+
+
 class Notification(models.Model):
     project = models.ForeignKey(Project,
                                 related_name='%(class)s_notifications')

diff --git a/readthedocs/projects/signals.py b/readthedocs/projects/signals.py
@@ -2,10 +2,11 @@
 
 from __future__ import absolute_import
 import django.dispatch
+from django.db.models.signals import pre_save
 from django.dispatch import receiver
 
 from readthedocs.oauth.utils import attach_webhook
-
+from .models import HTMLFile
 
 before_vcs = django.dispatch.Signal(providing_args=["version"])
 after_vcs = django.dispatch.Signal(providing_args=["version"])
@@ -25,3 +26,8 @@ def handle_project_import(sender, **kwargs):
     request = kwargs.get('request')
 
     attach_webhook(project=project, request=request)
+
+
+@receiver(pre_save, sender=HTMLFile)
+def pre_save_html_file(sender, instance, *args, **kwargs):
+    instance.is_html = True
diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py
@@ -8,6 +8,7 @@
 from __future__ import absolute_import
 
 import datetime
+import fnmatch
 import hashlib
 import json
 import logging
@@ -29,7 +30,7 @@
 
 from .constants import LOG_TEMPLATE
 from .exceptions import RepositoryError
-from .models import ImportedFile, Project, Domain
+from .models import ImportedFile, Project, Domain, HTMLFile
 from .signals import before_vcs, after_vcs, before_build, after_build, files_changed
 from readthedocs.builds.constants import (LATEST,
                                           BUILD_STATE_CLONING,
@@ -943,18 +944,23 @@ def _manage_imported_files(version, path, commit):
     changed_files = set()
     for root, __, filenames in os.walk(path):
         for filename in filenames:
+            if fnmatch.fnmatch(filename, '*.html'):
+                model_class = HTMLFile
+            else:
+                model_class = ImportedFile
+
             dirpath = os.path.join(root.replace(path, '').lstrip('/'),
                                    filename.lstrip('/'))
             full_path = os.path.join(root, filename)
             md5 = hashlib.md5(open(full_path, 'rb').read()).hexdigest()
             try:
-                obj, __ = ImportedFile.objects.get_or_create(
+                obj, __ = model_class.objects.get_or_create(
                     project=version.project,
                     version=version,
                     path=dirpath,
                     name=filename,
                 )
-            except ImportedFile.MultipleObjectsReturned:
+            except model_class.MultipleObjectsReturned:
                 log.warning('Error creating ImportedFile')
                 continue
             if obj.md5 != md5:
@@ -963,6 +969,12 @@ def _manage_imported_files(version, path, commit):
             if obj.commit != commit:
                 obj.commit = commit
             obj.save()
+
+    # Delete the HTMLFile first
+    HTMLFile.objects.filter(project=version.project,
+                            version=version
+                            ).exclude(commit=commit).delete()
+
     # Delete ImportedFiles from previous versions
     ImportedFile.objects.filter(project=version.project,
                                 version=version
@@ -1173,7 +1185,7 @@ def sync_callback(_, version_pk, commit, *args, **kwargs):
     The first argument is the result from previous tasks, which we discard.
     """
     fileify(version_pk, commit=commit)
-    update_search(version_pk, commit=commit)
+    # update_search(version_pk, commit=commit)
 
 
 @app.task()

diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py
@@ -0,0 +1,60 @@
+from django_elasticsearch_dsl import DocType, Index, fields
+
+from readthedocs.projects.models import Project, HTMLFile
+
+from readthedocs.search.faceted_search import ProjectSearch
+
+project_index = Index('project')
+
+project_index.settings(
+    number_of_shards=1,
+    number_of_replicas=0
+)
+
+
+@project_index.doc_type
+class ProjectDocument(DocType):
+
+    class Meta(object):
+        model = Project
+        fields = ('name', 'slug', 'description')
+
+    url = fields.TextField()
+    users = fields.NestedField(properties={
+        'username': fields.TextField(),
+        'id': fields.IntegerField(),
+    })
+    language = fields.KeywordField()
+
+    def prepare_url(self, instance):
+        return instance.get_absolute_url()
+
+    @classmethod
+    def faceted_search(cls, query, language=None, using=None, index=None):
+        kwargs = {
+            'using': using or cls._doc_type.using,
+            'index': index or cls._doc_type.index,
+            'doc_types': [cls],
+            'model': cls._doc_type.model,
+            'query': query
+        }
+
+        if language:
+            kwargs['filters'] = {'language': language}
+
+        return ProjectSearch(**kwargs)
+
+
+page_index = Index('page')
+
+page_index.settings(
+    number_of_shards=1,
+    number_of_replicas=0
+)
+
+
+@page_index.doc_type
+class PageDocument(DocType):
+
+    class Meta(object):
+        model = HTMLFile
diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py
@@ -0,0 +1,15 @@
+from elasticsearch_dsl import FacetedSearch, TermsFacet
+
+
+class ProjectSearch(FacetedSearch):
+    fields = ['name^5', 'description']
+    facets = {
+        'language': TermsFacet(field='language')
+    }
+
+    def __init__(self, using, index, doc_types, model, **kwargs):
+        self.using = using
+        self.index = index
+        self.doc_types = doc_types
+        self._model = model
+        super(ProjectSearch, self).__init__(**kwargs)
diff --git a/readthedocs/search/indexes.py b/readthedocs/search/indexes.py
@@ -19,7 +19,6 @@
 import datetime
 
 from elasticsearch import Elasticsearch, exceptions
-from elasticsearch.helpers import bulk_index
 
 from django.conf import settings
 
@@ -143,7 +142,7 @@ def bulk_index(self, data, index=None, chunk_size=500, parent=None,
             docs.append(doc)
 
         # TODO: This doesn't work with the new ES setup.
-        bulk_index(self.es, docs, chunk_size=chunk_size)
+        # bulk_index(self.es, docs, chunk_size=chunk_size)
 
     def index_document(self, data, index=None, parent=None, routing=None):
         doc = self.extract_document(data)

diff --git a/readthedocs/search/tests/conftest.py b/readthedocs/search/tests/conftest.py
@@ -3,6 +3,7 @@
 from random import shuffle
 
 import pytest
+from django.core.management import call_command
 from django_dynamic_fixture import G
 
 from readthedocs.projects.models import Project
@@ -16,27 +17,17 @@ def mock_elastic_index(mocker):
     mocker.patch.object(Index, '_index', index_name.lower())
 
 
-@pytest.fixture(autouse=True)
-def es_index(mock_elastic_index):
-    # Create the index.
-    index = Index()
-    index_name = index.timestamped_index()
-    index.create_index(index_name)
-    index.update_aliases(index_name)
-    # Update mapping
-    proj = ProjectIndex()
-    proj.put_mapping()
-    page = PageIndex()
-    page.put_mapping()
-    sec = SectionIndex()
-    sec.put_mapping()
-
-    yield index
-    index.delete_index(index_name=index_name)
+@pytest.fixture()
+def es_index():
+    call_command('search_index', '--delete', '-f')
+    call_command('search_index', '--create')
+
+    yield
+    call_command('search_index', '--delete', '-f')
 
 
 @pytest.fixture
-def all_projects():
+def all_projects(es_index):
     projects = [G(Project, slug=project_slug, name=project_slug) for project_slug in ALL_PROJECTS]
     shuffle(projects)
     return projects