/
documents.py
128 lines (102 loc) · 4.06 KB
/
documents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import structlog
from django.conf import settings
from django_elasticsearch_dsl import Document, Index, fields
from elasticsearch import Elasticsearch
from readthedocs.projects.models import HTMLFile, Project
log = structlog.get_logger(__name__)

# Elasticsearch index for Project documents, name and settings come from
# the ``ES_INDEXES`` Django setting.
project_conf = settings.ES_INDEXES["project"]
project_index = Index(project_conf["name"])
project_index.settings(**project_conf["settings"])

# Elasticsearch index for HTMLFile ("page") documents, configured the same way.
page_conf = settings.ES_INDEXES["page"]
page_index = Index(page_conf["name"])
page_index.settings(**page_conf["settings"])
class RTDDocTypeMixin:

    """Mixin that works around broken Elasticsearch connection pooling."""

    def update(self, *args, **kwargs):
        """Rebuild the ES connection, then delegate to ``Document.update``.

        Opening a brand-new connection on every indexing request is
        wasteful, but it sidesteps our broken connection pooling and
        actually works.
        """
        log.debug("Hacking Elastic indexing to fix connection pooling")
        self.using = Elasticsearch(**settings.ELASTICSEARCH_DSL["default"])
        super().update(*args, **kwargs)
@project_index.document
class ProjectDocument(RTDDocTypeMixin, Document):

    """Search document mirroring a :class:`Project`."""

    # Metadata fields (not searched directly, returned with results).
    url = fields.TextField(attr="get_absolute_url")
    users = fields.NestedField(
        properties={
            "username": fields.TextField(),
            "id": fields.IntegerField(),
        }
    )
    language = fields.KeywordField()

    # Searchable fields.
    name = fields.TextField(attr="name")
    slug = fields.TextField(attr="slug")
    description = fields.TextField(attr="description")

    # Model field used to detect stale documents when re-indexing.
    modified_model_field = "modified_date"

    def get_queryset(self):
        """
        Additional filtering of default queryset.

        Don't include delisted projects.
        This will also break in-doc search for these projects,
        but it's not a priority to find a solution for this as long as "delisted" projects are
        understood to be projects with a negative reason for being delisted.
        """
        queryset = super().get_queryset()
        queryset = queryset.exclude(delisted=True)
        queryset = queryset.exclude(is_spam=True)
        return queryset

    class Django:
        model = Project
        fields = []
        ignore_signals = True
@page_index.document
class PageDocument(RTDDocTypeMixin, Document):

    """
    Document representation of a Page.

    Some text fields use the simple analyzer instead of the default (standard).
    Simple analyzer will break the text in non-letter characters,
    so a text like ``python.submodule`` will be broken like [python, submodule]
    instead of [python.submodule].
    See more at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html # noqa

    Some text fields use the ``with_positions_offsets`` term vector,
    this is to have faster highlighting on big documents.
    See more at https://www.elastic.co/guide/en/elasticsearch/reference/7.9/term-vector.html
    """

    # Metadata fields used to scope queries (project/version) and to
    # link results back to the file on disk.
    project = fields.KeywordField(attr="project.slug")
    version = fields.KeywordField(attr="version.slug")
    doctype = fields.KeywordField(attr="version.documentation_type")
    path = fields.KeywordField(attr="processed_json.path")
    full_path = fields.KeywordField(attr="path")
    rank = fields.IntegerField()

    # Searchable content extracted from the processed JSON artifact.
    title = fields.TextField(
        attr="processed_json.title",
    )
    sections = fields.NestedField(
        attr="processed_json.sections",
        properties={
            "id": fields.KeywordField(),
            "title": fields.TextField(),
            "content": fields.TextField(
                # Faster highlighting on large documents.
                term_vector="with_positions_offsets",
            ),
        },
    )

    # Model field used to detect stale documents when re-indexing.
    modified_model_field = "modified_date"

    class Django:
        model = HTMLFile
        fields = ("commit", "build")
        ignore_signals = True

    def prepare_rank(self, html_file):
        """Index the page rank, treating out-of-range values as neutral (0)."""
        rank = html_file.rank
        return rank if -10 <= rank <= 10 else 0

    def get_queryset(self):
        """Don't include ignored files and delisted projects."""
        return (
            super()
            .get_queryset()
            .exclude(ignore=True)
            .exclude(project__delisted=True)
            .exclude(project__is_spam=True)
            .select_related("version", "project")
        )