Merge pull request #4615 from safwanrahman/search_fix
fixing the indexing
ericholscher committed Sep 7, 2018
2 parents cbc5322 + c09aced commit 3ca137c
Showing 4 changed files with 25 additions and 14 deletions.
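
In short, the reindex pipeline stops serializing full lists of primary keys into each Celery task and instead passes (start, end) offsets that the worker applies as a queryset slice. A minimal before/after sketch of the task payload shape, assuming a chunk size of 5000 (the values are illustrative, not taken from the diff):

# Before this commit: each task message carried every primary key in the chunk.
old_payload = {
    'objects_id': list(range(1, 5001)),   # 5000 ids serialized into the broker
}

# After this commit: each task message carries only an offset pair.
new_payload = {
    'chunk': (0, 5000),                   # the worker slices queryset[0:5000] itself
}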
2 changes: 1 addition & 1 deletion readthedocs/projects/tasks.py
@@ -59,7 +59,7 @@
 from .exceptions import RepositoryError
 from .models import Domain, Feature, ImportedFile, Project, HTMLFile
 from .signals import (
-    after_build, after_vcs, before_build, before_vcs,
+    after_build, after_vcs, before_build, before_vcs,
     files_changed, bulk_post_create, bulk_post_delete)

 log = logging.getLogger(__name__)
10 changes: 5 additions & 5 deletions readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -10,7 +10,7 @@

 from ...tasks import (index_objects_to_es, switch_es_index, create_new_es_index,
                       index_missing_objects)
-from ...utils import chunk_queryset
+from ...utils import get_chunk

 log = logging.getLogger(__name__)

@@ -19,16 +19,16 @@ class Command(BaseCommand):

     @staticmethod
     def _get_indexing_tasks(app_label, model_name, queryset, document_class, index_name):
-        queryset = queryset.values_list('id', flat=True)
-        chunked_queryset = chunk_queryset(queryset, settings.ES_TASK_CHUNK_SIZE)
+        total = queryset.count()
+        chunks = get_chunk(total, settings.ES_TASK_CHUNK_SIZE)

-        for chunk in chunked_queryset:
+        for chunk in chunks:
             data = {
                 'app_label': app_label,
                 'model_name': model_name,
                 'document_class': document_class,
                 'index_name': index_name,
-                'objects_id': list(chunk)
+                'chunk': chunk
             }
             yield index_objects_to_es.si(**data)

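
The command itself only yields immutable Celery signatures (.si); how they are executed is up to the caller. A hedged sketch of one way to fan them out with Celery's standard group primitive — the orchestration below is an assumption for illustration, not code from this PR, and the document_class/index_name values are placeholders:

from celery import group

from readthedocs.projects.models import HTMLFile
from readthedocs.search.management.commands.reindex_elasticsearch import Command

tasks = Command._get_indexing_tasks(
    app_label='projects',
    model_name='HTMLFile',
    queryset=HTMLFile.objects.all(),
    document_class='PageDocument',    # placeholder
    index_name='new_page_index',      # placeholder
)
group(tasks).apply_async()            # run the per-chunk indexing tasks in parallel

Because each signature now carries only a (start, end) pair, the number of queued tasks can grow with the table size without bloating the broker messages.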
20 changes: 16 additions & 4 deletions readthedocs/search/tasks.py
@@ -68,14 +68,26 @@ def switch_es_index(app_label, model_name, index_name, new_index_name):


 @app.task(queue='web')
-def index_objects_to_es(app_label, model_name, document_class, index_name, objects_id):
+def index_objects_to_es(app_label, model_name, document_class, index_name,
+                        chunk=None, objects_id=None):
+
+    assert not (chunk and objects_id), "You can not pass both chunk and objects_id"
+
     model = apps.get_model(app_label, model_name)
     document = _get_document(model=model, document_class=document_class)

     # Use queryset from model as the ids are specific
-    queryset = model.objects.all().filter(id__in=objects_id).iterator()
-    log.info("Indexing model: {}, id:'{}'".format(model.__name__, objects_id))
-    document().update(queryset, index_name=index_name)
+    queryset = model.objects.all()
+    if chunk:
+        # Chunk is a tuple with start and end index of queryset
+        start = chunk[0]
+        end = chunk[1]
+        queryset = queryset[start:end]
+    elif objects_id:
+        queryset = queryset.filter(id__in=objects_id)
+
+    log.info("Indexing model: {}, '{}' objects".format(model.__name__, queryset.count()))
+    document().update(queryset.iterator(), index_name=index_name)


 @app.task(queue='web')
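
With the new signature the task accepts either an offset chunk or an explicit id list, and the assert rejects calls that pass both. A hedged call sketch, assuming the task is dispatched from project code; the document_class and index_name values are placeholders, not taken from this diff:

from readthedocs.search.tasks import index_objects_to_es

# Offset-based path: the worker evaluates queryset[0:500] itself (LIMIT/OFFSET SQL),
# so the broker message stays tiny no matter how large the chunk is.
index_objects_to_es.delay(
    app_label='projects',
    model_name='HTMLFile',
    document_class='PageDocument',    # placeholder
    index_name='page_index',          # placeholder
    chunk=(0, 500),
)

# The id-based path is kept for small, targeted updates.
index_objects_to_es.delay(
    app_label='projects',
    model_name='HTMLFile',
    document_class='PageDocument',    # placeholder
    index_name='page_index',          # placeholder
    objects_id=[10, 42, 97],
)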
7 changes: 3 additions & 4 deletions readthedocs/search/utils.py
@@ -323,10 +323,9 @@ def get_project_list_or_404(project_slug, user):
     return project_list


-def chunk_queryset(queryset, chunk_size):
-    """Yield successive `chunk_size` chunks of queryset."""
+def get_chunk(total, chunk_size):
+    """Yield successive `chunk_size` chunks"""
     # Based on https://stackoverflow.com/a/312464
     # licensed under cc by-sa 3.0
-    total = queryset.count()
     for i in range(0, total, chunk_size):
-        yield queryset[i:i + chunk_size]
+        yield (i, i + chunk_size)
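
get_chunk now only does offset arithmetic; the final pair may overshoot total, which is harmless because slicing clamps the end index. A quick illustration, assuming it is run in a Django shell of the project:

from readthedocs.search.utils import get_chunk

list(get_chunk(10, 4))        # -> [(0, 4), (4, 8), (8, 12)]

# Applying a pair to a queryset becomes a LIMIT/OFFSET query instead of a huge
# id__in clause; the overshooting end of the last pair is simply clamped.
start, end = (8, 12)
list(range(10))[start:end]    # -> [8, 9]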
