
Commit

Merge pull request readthedocs#4615 from safwanrahman/search_fix
fixing the indexing
ericholscher authored and safwanrahman committed Sep 15, 2018
2 parents 9a78698 + ad2d174 commit a508020
Showing 4 changed files with 28 additions and 22 deletions.
13 changes: 4 additions & 9 deletions readthedocs/projects/tasks.py
@@ -29,11 +29,6 @@
from django.utils.translation import ugettext_lazy as _
from slumber.exceptions import HttpClientError

from .constants import LOG_TEMPLATE
from .exceptions import RepositoryError
from .models import ImportedFile, Project, Domain, Feature, HTMLFile
from .signals import before_vcs, after_vcs, before_build, after_build, files_changed, \
bulk_post_create, bulk_post_delete
from readthedocs.builds.constants import (
BUILD_STATE_BUILDING, BUILD_STATE_CLONING, BUILD_STATE_FINISHED,
BUILD_STATE_INSTALLING, LATEST, LATEST_VERBOSE_NAME, STABLE_VERBOSE_NAME)
@@ -59,12 +54,12 @@
from readthedocs.search.parse_json import process_all_json_files
from readthedocs.vcs_support import utils as vcs_support_utils
from readthedocs.worker import app

from .constants import LOG_TEMPLATE
from .exceptions import RepositoryError
from .models import Domain, Feature, ImportedFile, Project
from .signals import (
after_build, after_vcs, before_build, before_vcs, files_changed)
from .models import Domain, ImportedFile, Project
from .models import HTMLFile
from .signals import (after_build, after_vcs, before_build, before_vcs,
bulk_post_create, bulk_post_delete, files_changed)

log = logging.getLogger(__name__)

10 changes: 5 additions & 5 deletions readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -10,7 +10,7 @@

from ...tasks import (index_objects_to_es, switch_es_index, create_new_es_index,
index_missing_objects)
from ...utils import chunk_queryset
from ...utils import get_chunk

log = logging.getLogger(__name__)

@@ -19,16 +19,16 @@ class Command(BaseCommand):

@staticmethod
def _get_indexing_tasks(app_label, model_name, queryset, document_class, index_name):
queryset = queryset.values_list('id', flat=True)
chunked_queryset = chunk_queryset(queryset, settings.ES_TASK_CHUNK_SIZE)
total = queryset.count()
chunks = get_chunk(total, settings.ES_TASK_CHUNK_SIZE)

for chunk in chunked_queryset:
for chunk in chunks:
data = {
'app_label': app_label,
'model_name': model_name,
'document_class': document_class,
'index_name': index_name,
'objects_id': list(chunk)
'chunk': chunk
}
yield index_objects_to_es.si(**data)

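With this change each generated Celery signature carries only a small (start, end) pair instead of a full list of primary keys, so the task payload stays small even for large tables. Below is a minimal, self-contained sketch of the idea; generate_indexing_tasks and make_signature are illustrative stand-ins for _get_indexing_tasks and index_objects_to_es.si, and the string arguments are example values only.

def get_chunk(total, chunk_size):
    """Yield successive (start, end) index pairs, as in readthedocs/search/utils.py."""
    for i in range(0, total, chunk_size):
        yield (i, i + chunk_size)


def generate_indexing_tasks(app_label, model_name, document_class,
                            index_name, total, chunk_size, make_signature):
    # make_signature stands in for index_objects_to_es.si(**data): it receives
    # the kwargs for one chunk and returns a task signature.
    for chunk in get_chunk(total, chunk_size):
        data = {
            'app_label': app_label,
            'model_name': model_name,
            'document_class': document_class,
            'index_name': index_name,
            'chunk': chunk,
        }
        yield make_signature(**data)


# 2,500 rows with a chunk size of 1,000 -> three signatures.
tasks = list(generate_indexing_tasks(
    'projects', 'HTMLFile', 'PageDocument', 'page-index',
    total=2500, chunk_size=1000,
    make_signature=lambda **kwargs: kwargs['chunk']))
assert tasks == [(0, 1000), (1000, 2000), (2000, 3000)]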
20 changes: 16 additions & 4 deletions readthedocs/search/tasks.py
@@ -68,14 +68,26 @@ def switch_es_index(app_label, model_name, index_name, new_index_name):


@app.task(queue='web')
def index_objects_to_es(app_label, model_name, document_class, index_name, objects_id):
def index_objects_to_es(app_label, model_name, document_class, index_name,
chunk=None, objects_id=None):

assert not (chunk and objects_id), "You can not pass both chunk and objects_id"

model = apps.get_model(app_label, model_name)
document = _get_document(model=model, document_class=document_class)

# Use queryset from model as the ids are specific
queryset = model.objects.all().filter(id__in=objects_id).iterator()
log.info("Indexing model: {}, id:'{}'".format(model.__name__, objects_id))
document().update(queryset, index_name=index_name)
queryset = model.objects.all()
if chunk:
# Chunk is a tuple with start and end index of queryset
start = chunk[0]
end = chunk[1]
queryset = queryset[start:end]
elif objects_id:
queryset = queryset.filter(id__in=objects_id)

log.info("Indexing model: {}, '{}' objects".format(model.__name__, queryset.count()))
document().update(queryset.iterator(), index_name=index_name)


@app.task(queue='web')
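A note on the chunk branch above: slicing a Django queryset with chunk[0]:chunk[1] is lazy and translates to a SQL LIMIT/OFFSET, so each task only fetches its own window of rows. The following sketch mimics that branching with a plain list standing in for the queryset; index_chunk and its arguments are illustrative names, not part of the commit.

def index_chunk(objects, chunk=None, objects_id=None):
    # Mirrors the branching in index_objects_to_es: either a (start, end)
    # chunk or an explicit list of ids, never both.
    assert not (chunk and objects_id), "You can not pass both chunk and objects_id"
    if chunk:
        start, end = chunk
        return objects[start:end]  # queryset slicing -> LIMIT/OFFSET
    if objects_id:
        return [obj for obj in objects if obj in objects_id]  # like id__in
    return objects


fake_queryset = list(range(7))
assert index_chunk(fake_queryset, chunk=(0, 5)) == [0, 1, 2, 3, 4]
assert index_chunk(fake_queryset, objects_id=[2, 6]) == [2, 6]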
7 changes: 3 additions & 4 deletions readthedocs/search/utils.py
@@ -323,10 +323,9 @@ def get_project_list_or_404(project_slug, user):
return project_list


def chunk_queryset(queryset, chunk_size):
"""Yield successive `chunk_size` chunks of queryset."""
def get_chunk(total, chunk_size):
"""Yield successive `chunk_size` chunks"""
# Based on https://stackoverflow.com/a/312464
# licensed under cc by-sa 3.0
total = queryset.count()
for i in range(0, total, chunk_size):
yield queryset[i:i + chunk_size]
yield (i, i + chunk_size)
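
For reference, the pairs this generator produces look like the following; the last pair may overshoot total, which is harmless because slicing a queryset (or a list) past its end simply returns the remaining items. A quick standalone sketch, not part of the commit:

def get_chunk(total, chunk_size):
    # Copy of the function above, repeated so this snippet runs on its own.
    for i in range(0, total, chunk_size):
        yield (i, i + chunk_size)


print(list(get_chunk(25, 10)))   # [(0, 10), (10, 20), (20, 30)]
print(list(range(25))[20:30])    # [20, 21, 22, 23, 24] -- overshoot is safe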
