Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix #4265] Port Document search API for Elasticsearch 6.x #4309

Merged
merged 9 commits into from Jul 6, 2018
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions conftest.py
@@ -1,6 +1,7 @@
import logging

import pytest
from rest_framework.test import APIClient


def pytest_addoption(parser):
Expand All @@ -17,3 +18,8 @@ def pytest_configure(config):
@pytest.fixture(autouse=True)
def settings_modification(settings):
settings.CELERY_ALWAYS_EAGER = True


@pytest.fixture
def api_client():
return APIClient()
14 changes: 3 additions & 11 deletions readthedocs/restapi/urls.py
Expand Up @@ -48,7 +48,7 @@
url(r'index_search/',
search_views.index_search,
name='index_search'),
url(r'search/$', views.search_views.search, name='api_search'),
url(r'^search/$', views.search_views.search, name='api_search'),
url(r'search/project/$',
search_views.project_search,
name='api_project_search'),
Expand Down Expand Up @@ -85,20 +85,12 @@
name='api_webhook'),
]


urlpatterns += function_urls
urlpatterns += search_urls
urlpatterns += task_urls
urlpatterns += search_urls
urlpatterns += integration_urls

try:
from readthedocsext.search.docsearch import DocSearch
api_search_urls = [
url(r'^docsearch/$', DocSearch.as_view(), name='doc_search'),
]
urlpatterns += api_search_urls
except ImportError:
pass

try:
from readthedocsext.donate.restapi.urls import urlpatterns as sustainability_urls

Expand Down
67 changes: 67 additions & 0 deletions readthedocs/search/api.py
@@ -0,0 +1,67 @@
from rest_framework import generics
from rest_framework import exceptions
from rest_framework.exceptions import ValidationError

from readthedocs.projects.models import Project
from readthedocs.search.documents import PageDocument
from readthedocs.search.filters import SearchFilterBackend
from readthedocs.search.pagination import SearchPagination
from readthedocs.search.serializers import PageSearchSerializer


class PageSearchAPIView(generics.ListAPIView):
pagination_class = SearchPagination
filter_backends = [SearchFilterBackend]
serializer_class = PageSearchSerializer

def get_queryset(self):
"""
Return Elasticsearch DSL Search object instead of Django Queryset.

Django Queryset and elasticsearch-dsl ``Search`` object is similar pattern.
So for searching, its possible to return ``Search`` object instead of queryset.
The ``filter_backends`` and ``pagination_class`` is compatible with ``Search``
"""
# Validate all the required params are there
self.validate_query_params()
query = self.request.query_params.get('query', '')
queryset = PageDocument.simple_search(query=query)
return queryset

def validate_query_params(self):
required_query_params = {'query', 'project', 'version'} # python `set` literal is `{}`
request_params = set(self.request.query_params.keys())
missing_params = required_query_params - request_params
if missing_params:
errors = {}
for param in missing_params:
errors[param] = ["This query param is required"]

raise ValidationError(errors)

def get_serializer_context(self):
context = super(PageSearchAPIView, self).get_serializer_context()
context['projects_info'] = self.get_projects_info()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

projects_info I think is a bad name. It seems like we're just getting the docs_url from it for now, so we should probably be more clear about that. Perhaps context['project_urls'] or something?

return context

def _get_all_projects(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this the same logic as in get_project_slug_list_or_404? It should probably use the util function for this I think? Perhaps it could be get_project_list_or_404 and we can pass slug_only param to it or something?

"""Return list of project and its subprojects."""
project_slug = self.request.query_params.get('project')
queryset = Project.objects.api(self.request.user).only('slug')

project = generics.get_object_or_404(queryset, slug=project_slug)
subprojects = queryset.filter(superprojects__parent_id=project.id)

project_list = list(subprojects) + [project]
return project_list

def get_projects_info(self):
version_slug = self.request.query_params.get('version')
all_projects = self._get_all_projects()
projects_info = {}

for project in all_projects:
data = {'docs_url': project.get_docs_url(version_slug=version_slug)}
projects_info[project.slug] = data

return projects_info
34 changes: 33 additions & 1 deletion readthedocs/search/documents.py
@@ -1,5 +1,6 @@
from django.conf import settings
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl.query import SimpleQueryString, Bool

from readthedocs.projects.models import Project, HTMLFile
from .conf import SEARCH_EXCLUDED_FILE
Expand Down Expand Up @@ -60,14 +61,19 @@ class Meta(object):
content = fields.TextField(attr='processed_json.content')
path = fields.TextField(attr='processed_json.path')

# Fields to perform search with weight
search_fields = ['title^10', 'headers^5', 'content']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this used by the DocType class directly, or are we storing it here to add later? If the prior, we should probably set it in an __init__ method or somewhere else, so it doesn't get confused with the other data here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its not actually used by DocType class, but we are using it in class method. I dont know if keeping it in __init__ method will make it work in class method!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hrm, good point. I guess it's fine for now then.


@classmethod
def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):
es_query = cls.get_es_query(query=query)
kwargs = {
'using': using or cls._doc_type.using,
'index': index or cls._doc_type.index,
'doc_types': [cls],
'model': cls._doc_type.model,
'query': query
'query': es_query,
'fields': cls.search_fields
}
filters = {}

Expand All @@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non

return FileSearch(**kwargs)

@classmethod
def simple_search(cls, query, using=None, index=None):
es_search = cls.search(using=using, index=index)
es_query = cls.get_es_query(query=query)
highlighted_fields = [f.split('^', 1)[0] for f in cls.search_fields]

es_search = es_search.query(es_query).highlight(*highlighted_fields)
return es_search

@classmethod
def get_es_query(cls, query):
"""Return the Elasticsearch query generated from the query string"""
all_queries = []

# Need to search for both 'AND' and 'OR' operations
# The score of AND should be higher as it satisfies both OR and AND
for operator in ['AND', 'OR']:
query_string = SimpleQueryString(query=query, fields=cls.search_fields,
default_operator=operator)
all_queries.append(query_string)

# Run bool query with should, so it returns result where either of the query matches
bool_query = Bool(should=all_queries)

return bool_query

def get_queryset(self):
"""Overwrite default queryset to filter certain files to index"""
queryset = super(PageDocument, self).get_queryset()
Expand Down
24 changes: 9 additions & 15 deletions readthedocs/search/faceted_search.py
Expand Up @@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch):
# TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916
# See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916

def __init__(self, using, index, doc_types, model, **kwargs):
def __init__(self, using, index, doc_types, model, fields=None, **kwargs):
self.using = using
self.index = index
self.doc_types = doc_types
self._model = model
if fields:
self.fields = fields
super(RTDFacetedSearch, self).__init__(**kwargs)


Expand All @@ -25,26 +27,18 @@ class ProjectSearch(RTDFacetedSearch):


class FileSearch(RTDFacetedSearch):
fields = ['title^10', 'headers^5', 'content']
facets = {
'project': TermsFacet(field='project'),
'version': TermsFacet(field='version')
}

def query(self, search, query):
"""Add query part to ``search``"""
"""
Add query part to ``search``

Overriding because we pass ES Query object instead of string
"""
if query:
all_queries = []

# Need to search for both 'AND' and 'OR' operations
# The score of AND should be higher as it comes first
for operator in ['AND', 'OR']:
query_string = SimpleQueryString(query=query, fields=self.fields,
default_operator=operator)
all_queries.append(query_string)

# Run bool query with should, so it returns result where either of the query matches
bool_query = Bool(should=all_queries)
search = search.query(bool_query)
search = search.query(query)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels like we shouldn't be overwriting the object here. Will we return something invalid if there is no query because we have the same object name?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have actually a mixed feeling about this. Do you have any idea about how to do this withtout overriding the search method?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean mostly just the name of the object. search = search.query(query) -- is that similar to a queryset = queryset.filter()? It just feels like a weird bit of syntax to rewrite the name of the object over again.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. search = search.query(query) -- is that similar to a queryset = queryset.filter(). If we do not overwrite the same object, then another variable need to be assigned and check if that is not None


return search
20 changes: 20 additions & 0 deletions readthedocs/search/filters.py
@@ -0,0 +1,20 @@
from rest_framework import filters


class SearchFilterBackend(filters.BaseFilterBackend):

"""Filter search result with project"""

def filter_queryset(self, request, queryset, view):
"""Overwrite the method to compatible with Elasticsearch DSL Search object."""
# ``queryset`` is actually a Elasticsearch DSL ``Search`` object.
# So change the variable name
es_search = queryset
version_slug = request.query_params.get('version')
projects_info = view.get_projects_info()
project_slug_list = projects_info.keys()
# Elasticsearch ``terms`` query can take multiple values as list,
# while ``term`` query takes single value.
filtered_es_search = (es_search.filter('terms', project=project_slug_list)
.filter('term', version=version_slug))
return filtered_es_search
7 changes: 7 additions & 0 deletions readthedocs/search/pagination.py
@@ -0,0 +1,7 @@
from rest_framework.pagination import PageNumberPagination


class SearchPagination(PageNumberPagination):
page_size = 10
page_size_query_param = 'page_size'
max_page_size = 100
24 changes: 24 additions & 0 deletions readthedocs/search/serializers.py
@@ -0,0 +1,24 @@
from rest_framework import serializers

from readthedocs.projects.models import Project


class PageSearchSerializer(serializers.Serializer):
project = serializers.CharField()
version = serializers.CharField()
title = serializers.CharField()
path = serializers.CharField()
link = serializers.SerializerMethodField()
highlight = serializers.SerializerMethodField()

def get_link(self, obj):
projects_info = self.context.get('projects_info')
if projects_info:
project_data = projects_info[obj.project]
docs_url = project_data['docs_url']
return docs_url + obj.path

def get_highlight(self, obj):
highlight = getattr(obj.meta, 'highlight', None)
if highlight:
return highlight.to_dict()