Skip to content

Commit

Permalink
Analytics: don't record page views for PR previews (#11065)
Browse files Browse the repository at this point in the history
* Analytics: don't record page views for PR previews

* Tests

* Fix log

* Comments
  • Loading branch information
stsewd committed Jan 29, 2024
1 parent 7fcd568 commit 96fc90b
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 31 deletions.
8 changes: 4 additions & 4 deletions readthedocs/analytics/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,25 @@ class PageViewManager(models.Manager):

"""Manager for PageView model."""

def register_page_view(self, project, version, path, full_path, status):
def register_page_view(self, project, version, filename, path, status):
"""Track page view with the given parameters."""
# TODO: remove after the migration of duplicate records has been completed.
if project.has_feature(Feature.DISABLE_PAGEVIEWS):
return

# Normalize paths to avoid duplicates.
filename = "/" + filename.lstrip("/")
path = "/" + path.lstrip("/")
full_path = "/" + full_path.lstrip("/")

page_view, created = self.get_or_create(
project=project,
version=version,
path=path,
path=filename,
date=timezone.now().date(),
status=status,
defaults={
"view_count": 1,
"full_path": full_path,
"full_path": path,
},
)
if not created:
Expand Down
24 changes: 19 additions & 5 deletions readthedocs/analytics/proxied_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from functools import lru_cache
from urllib.parse import urlparse

import structlog
from django.http import JsonResponse
from django.shortcuts import get_object_or_404
from rest_framework import status
Expand All @@ -13,8 +14,10 @@
from readthedocs.core.mixins import CDNCacheControlMixin
from readthedocs.core.unresolver import UnresolverError, unresolve
from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.core.utils.requests import is_suspicious_request
from readthedocs.projects.models import Project

log = structlog.get_logger(__name__) # noqa

class BaseAnalyticsView(CDNCacheControlMixin, APIView):

Expand Down Expand Up @@ -51,6 +54,7 @@ def _get_version(self):
return version

def get(self, request, *args, **kwargs):
# TODO: Use absolute_uri only, we don't need project and version.
project = self._get_project()
version = self._get_version()
absolute_uri = self.request.GET.get("absolute_uri")
Expand All @@ -69,24 +73,34 @@ def get(self, request, *args, **kwargs):

def increase_page_view_count(self, project, version, absolute_uri):
"""Increase the page view count for the given project."""
if is_suspicious_request(self.request):
log.info(
"Suspicious request, not recording pageview.",
url=absolute_uri,
)
return

# Don't allow tracking page views from external domains.
if self.request.unresolved_domain.is_from_external_domain:
return

try:
unresolved = unresolve(absolute_uri)
except UnresolverError:
# If we were unable to resolve the URL, it
# isn't pointing to a valid RTD project.
return

if not unresolved.filename:
# Don't track external versions.
if version.is_external or not unresolved.filename:
return

path = unresolved.filename
full_path = urlparse(absolute_uri).path

path = urlparse(absolute_uri).path
PageView.objects.register_page_view(
project=project,
version=version,
filename=unresolved.filename,
path=path,
full_path=full_path,
status=200,
)

Expand Down
19 changes: 18 additions & 1 deletion readthedocs/analytics/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from django.utils import timezone
from django_dynamic_fixture import get

from readthedocs.builds.constants import EXTERNAL
from readthedocs.builds.models import Version
from readthedocs.projects.constants import PUBLIC
from readthedocs.projects.models import Project
Expand Down Expand Up @@ -99,7 +100,9 @@ def test_get_client_ip_with_remote_addr(self):


@pytest.mark.proxito
@override_settings(PUBLIC_DOMAIN="readthedocs.io")
@override_settings(
PUBLIC_DOMAIN="readthedocs.io", RTD_EXTERNAL_VERSION_DOMAIN="readthedocs.build"
)
class AnalyticsPageViewsTests(TestCase):
def setUp(self):
self.project = get(
Expand Down Expand Up @@ -190,3 +193,17 @@ def test_increase_page_view_count(self):
assert (
PageView.objects.all().order_by("-date").first().view_count == 1
), f"'{self.absolute_uri}' has 1 view tomorrow"

def test_dont_track_external_domains(self):
self.assertEqual(PageView.objects.all().count(), 0)
get(
Version,
slug="123",
type=EXTERNAL,
built=True,
active=True,
)
host = f"{self.project.slug}--123.readthedocs.build"
r = self.client.get(self.url, headers={"host": host})
self.assertEqual(r.status_code, 204)
self.assertEqual(PageView.objects.all().count(), 0)
24 changes: 24 additions & 0 deletions readthedocs/core/utils/requests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import structlog

log = structlog.get_logger(__name__) # noqa


def is_suspicious_request(request) -> bool:
"""
Returns True if the request is suspicious.
This function is used to detect bots and spammers,
we use Cloudflare to detect them.
"""
# This header is set from Cloudflare,
# it goes from 0 to 100, 0 being low risk,
# and values above 10 are bots/spammers.
# https://developers.cloudflare.com/ruleset-engine/rules-language/fields/#dynamic-fields.
threat_score = int(request.headers.get("X-Cloudflare-Threat-Score", 0))
if threat_score > 10:
log.info(
"Suspicious threat score",
threat_score=threat_score,
)
return True
return False
28 changes: 28 additions & 0 deletions readthedocs/proxito/tests/test_full.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,7 @@ def test_track_downloads(self):
@override_settings(
PYTHON_MEDIA=False,
PUBLIC_DOMAIN='readthedocs.io',
RTD_EXTERNAL_VERSION_DOMAIN="dev.readthedocs.build",
)
# We are overriding the storage class instead of using RTD_BUILD_MEDIA_STORAGE,
# since the setting is evaluated just once (first test to use the storage
Expand Down Expand Up @@ -1466,6 +1467,33 @@ def test_track_broken_link(self, storage_exists):
self.assertEqual(pageview.view_count, 1)
self.assertEqual(pageview.status, 404)

@mock.patch.object(BuildMediaFileSystemStorageTest, "exists")
def test_dont_track_external_domains(self, storage_exists):
storage_exists.return_value = False
get(
Feature,
feature_id=Feature.RECORD_404_PAGE_VIEWS,
projects=[self.project],
)
get(
Version,
slug="123",
type=EXTERNAL,
built=True,
active=True,
)
self.assertEqual(PageView.objects.all().count(), 0)

resp = self.client.get(
reverse(
"proxito_404_handler",
kwargs={"proxito_path": "/en/123/"},
),
headers={"host": "project--123.dev.readthedocs.build"},
)
self.assertEqual(resp.status_code, 404)
self.assertEqual(PageView.objects.all().count(), 0)

@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_track_broken_link_custom_404(self, storage_open):
get(
Expand Down
37 changes: 16 additions & 21 deletions readthedocs/proxito/views/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
unresolver,
)
from readthedocs.core.utils.extend import SettingsOverrideObject
from readthedocs.core.utils.requests import is_suspicious_request
from readthedocs.projects.constants import OLD_LANGUAGES_CODE_MAPPING, PRIVATE
from readthedocs.projects.models import Domain, Feature, HTMLFile
from readthedocs.projects.templatetags.projects_tags import sort_version_aware
Expand Down Expand Up @@ -497,13 +498,14 @@ def get(self, request, proxito_path):
# and we don't want to issue infinite redirects.
pass

# Register 404 pages into our database for user's analytics
self._register_broken_link(
project=project,
version=version,
path=filename,
full_path=proxito_path,
)
# Register 404 pages into our database for user's analytics.
if not unresolved_domain.is_from_external_domain:
self._register_broken_link(
project=project,
version=version,
filename=filename,
path=proxito_path,
)

response = self._get_custom_404_page(
request=request,
Expand All @@ -521,33 +523,26 @@ def get(self, request, proxito_path):
path_not_found=proxito_path,
)

def _register_broken_link(self, project, version, path, full_path):
def _register_broken_link(self, project, version, filename, path):
try:
if not project.has_feature(Feature.RECORD_404_PAGE_VIEWS):
return

# This header is set from Cloudflare,
# it goes from 0 to 100, 0 being low risk,
# and values above 10 are bots/spammers.
# https://developers.cloudflare.com/ruleset-engine/rules-language/fields/#dynamic-fields.
threat_score = int(self.request.headers.get("X-Cloudflare-Threat-Score", 0))
if threat_score > 10:
if is_suspicious_request(self.request):
log.info(
"Suspicious threat score, not recording 404.",
threat_score=threat_score,
"Suspicious request, not recording 404.",
)
return

# If the path isn't attached to a version
# it should be the same as the full_path,
# If we don't have a version, the filename is the path,
# otherwise it would be empty.
if not version:
path = full_path
filename = path
PageView.objects.register_page_view(
project=project,
version=version,
filename=filename,
path=path,
full_path=full_path,
status=404,
)
except Exception:
Expand All @@ -556,7 +551,7 @@ def _register_broken_link(self, project, version, path, full_path):
log.exception(
"Error while recording the broken link",
project_slug=project.slug,
full_path=full_path,
path=path,
)

def _get_custom_404_page(self, request, project, version=None):
Expand Down

0 comments on commit 96fc90b

Please sign in to comment.