Skip to content

Commit

Permalink
Merge pull request #5939 from dojutsu-user/indexing-speedup
Browse files Browse the repository at this point in the history
Indexing speedup
  • Loading branch information
ericholscher committed Jul 16, 2019
2 parents 80f74e8 + 985488d commit a457fc0
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 20 deletions.
38 changes: 26 additions & 12 deletions readthedocs/search/management/commands/reindex_elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from ...tasks import (index_objects_to_es, switch_es_index, create_new_es_index,
index_missing_objects)
from ...utils import get_chunk

log = logging.getLogger(__name__)

Expand All @@ -19,17 +18,32 @@ class Command(BaseCommand):

@staticmethod
def _get_indexing_tasks(app_label, model_name, index_name, queryset, document_class):
    """
    Yield one ``index_objects_to_es.si(...)`` signature per batch of primary keys.

    The queryset is narrowed with ``.only('pk')`` and streamed with
    ``.iterator()``; the primary keys are grouped into lists of at most
    ``settings.ES_TASK_CHUNK_SIZE`` items, and each list is passed to the
    task as ``objects_id``.

    Batches are never empty: when the queryset is empty, or its size is an
    exact multiple of the chunk size, no trailing task with an empty
    ``objects_id`` list is produced.

    :param app_label: forwarded unchanged to the task as ``app_label``.
    :param model_name: forwarded unchanged to the task as ``model_name``.
    :param index_name: forwarded unchanged to the task as ``index_name``.
    :param queryset: queryset whose primary keys are batched.
    :param document_class: forwarded unchanged to the task as ``document_class``.
    :returns: generator of task signatures, one per non-empty batch.
    :raises ValueError: on first iteration, if ``settings.ES_TASK_CHUNK_SIZE``
        is smaller than 1 (such a value could never make progress).
    """
    from itertools import islice

    chunk_size = settings.ES_TASK_CHUNK_SIZE
    if chunk_size < 1:
        raise ValueError(
            'ES_TASK_CHUNK_SIZE must be a positive integer, got %r' % (chunk_size,)
        )

    # Keyword arguments shared by every task; only ``objects_id`` differs.
    task_kwargs = {
        'app_label': app_label,
        'model_name': model_name,
        'document_class': document_class,
        'index_name': index_name,
    }

    # A single lazy stream of primary keys, consumed batch by batch below.
    pk_stream = (obj.pk for obj in queryset.only('pk').iterator())

    while True:
        objects_id = []

        for pk in islice(pk_stream, chunk_size):
            objects_id.append(pk)

            # Rough progress marker: logs the pk value itself (not a running
            # count) whenever it is divisible by 5000.
            # NOTE(review): assumes integer primary keys -- confirm.
            if pk % 5000 == 0:
                log.info('Total: %s', pk)

        if not objects_id:
            # Stream exhausted; do not emit a task with nothing to index.
            return

        yield index_objects_to_es.si(objects_id=objects_id, **task_kwargs)

def _run_reindex_tasks(self, models, queue):
Expand Down
8 changes: 0 additions & 8 deletions readthedocs/search/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,6 @@ def get_project_list_or_404(project_slug, user, version_slug=None):
return project_list


def get_chunk(total, chunk_size):
    """
    Generate ``(start, end)`` index pairs that cover ``range(total)``.

    Each pair spans ``chunk_size`` positions; the ``end`` of the last pair is
    not clamped, so it may be larger than ``total``.
    """
    # Based on https://stackoverflow.com/a/312464
    # licensed under cc by-sa 3.0
    yield from (
        (start, start + chunk_size)
        for start in range(0, total, chunk_size)
    )


def _get_index(indices, index_name):
"""
Get Index from all the indices.
Expand Down

0 comments on commit a457fc0

Please sign in to comment.