Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixing the indexing #4615

Merged
merged 5 commits into from Sep 7, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 5 additions & 5 deletions readthedocs/search/management/commands/reindex_elasticsearch.py
Expand Up @@ -10,7 +10,7 @@

from ...tasks import (index_objects_to_es, switch_es_index, create_new_es_index,
index_missing_objects)
from ...utils import chunk_queryset
from ...utils import get_chunk

log = logging.getLogger(__name__)

Expand All @@ -19,16 +19,16 @@ class Command(BaseCommand):

@staticmethod
def _get_indexing_tasks(app_label, model_name, queryset, document_class, index_name):
queryset = queryset.values_list('id', flat=True)
chunked_queryset = chunk_queryset(queryset, settings.ES_TASK_CHUNK_SIZE)
total = queryset.count()
chunks = get_chunk(total, settings.ES_TASK_CHUNK_SIZE)

for chunk in chunked_queryset:
for chunk in chunks:
data = {
'app_label': app_label,
'model_name': model_name,
'document_class': document_class,
'index_name': index_name,
'objects_id': list(chunk)
'chunk': chunk
}
yield index_objects_to_es.si(**data)

Expand Down
20 changes: 16 additions & 4 deletions readthedocs/search/tasks.py
Expand Up @@ -68,14 +68,26 @@ def switch_es_index(app_label, model_name, index_name, new_index_name):


@app.task(queue='web')
def index_objects_to_es(app_label, model_name, document_class, index_name, objects_id):
def index_objects_to_es(app_label, model_name, document_class, index_name,
chunk=None, objects_id=None):

assert not chunk and objects_id, "You can not pass both chunk and objects_id"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this needs parentheses, i.e. `not (chunk and objects_id)`, so that the `not` applies to the whole expression rather than only to `chunk`.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry! my bad!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ericholscher fixed!


model = apps.get_model(app_label, model_name)
document = _get_document(model=model, document_class=document_class)

# Use queryset from model as the ids are specific
queryset = model.objects.all().filter(id__in=objects_id).iterator()
log.info("Indexing model: {}, id:'{}'".format(model.__name__, objects_id))
document().update(queryset, index_name=index_name)
queryset = model.objects.all()
if chunk:
# Chunk is a tuple with start and end index of queryset
start = chunk[0]
end = chunk[1]
queryset = queryset[start:end]
elif objects_id:
queryset = queryset.filter(id__in=objects_id)

log.info("Indexing model: {}, '{}' objects".format(model.__name__, queryset.count()))
document().update(queryset.iterator(), index_name=index_name)


@app.task(queue='web')
Expand Down
7 changes: 3 additions & 4 deletions readthedocs/search/utils.py
Expand Up @@ -323,10 +323,9 @@ def get_project_list_or_404(project_slug, user):
return project_list


def chunk_queryset(queryset, chunk_size):
"""Yield successive `chunk_size` chunks of queryset."""
def get_chunk(total, chunk_size):
"""Yield successive `chunk_size` chunks"""
# Based on https://stackoverflow.com/a/312464
# licensed under cc by-sa 3.0
total = queryset.count()
for i in range(0, total, chunk_size):
yield queryset[i:i + chunk_size]
yield (i, i + chunk_size)