Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent rate limit errors in wayback machine API #339

Merged
merged 1 commit into from Sep 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
65 changes: 53 additions & 12 deletions bookmarks/services/tasks.py
Expand Up @@ -5,8 +5,9 @@
from django.conf import settings
from django.contrib.auth import get_user_model
from django.contrib.auth.models import User
from waybackpy.exceptions import WaybackError
from waybackpy.exceptions import WaybackError, TooManyRequestsError, NoCDXRecordFound

import bookmarks.services.wayback
from bookmarks.models import Bookmark, UserProfile
from bookmarks.services.website_loader import DEFAULT_USER_AGENT

Expand All @@ -26,6 +27,32 @@ def create_web_archive_snapshot(user: User, bookmark: Bookmark, force_update: bo
_create_web_archive_snapshot_task(bookmark.id, force_update)


def _load_newest_snapshot(bookmark: Bookmark):
try:
logger.debug(f'Load existing snapshot for bookmark. url={bookmark.url}')
cdx_api = bookmarks.services.wayback.CustomWaybackMachineCDXServerAPI(bookmark.url)
existing_snapshot = cdx_api.newest()

if existing_snapshot:
bookmark.web_archive_snapshot_url = existing_snapshot.archive_url
bookmark.save()
logger.debug(f'Using newest snapshot. url={bookmark.url} from={existing_snapshot.datetime_timestamp}')

except NoCDXRecordFound:
logger.error(f'Could not find any snapshots for bookmark. url={bookmark.url}')
except WaybackError as error:
logger.error(f'Failed to load existing snapshot. url={bookmark.url}', exc_info=error)


def _create_snapshot(bookmark: Bookmark):
logger.debug(f'Create new snapshot for bookmark. url={bookmark.url}...')
archive = waybackpy.WaybackMachineSaveAPI(bookmark.url, DEFAULT_USER_AGENT, max_tries=1)
archive.save()
bookmark.web_archive_snapshot_url = archive.archive_url
bookmark.save()
logger.debug(f'Successfully created new snapshot for bookmark:. url={bookmark.url}')


@background()
def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
try:
Expand All @@ -37,19 +64,31 @@ def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
if bookmark.web_archive_snapshot_url and not force_update:
return

logger.debug(f'Create web archive link for bookmark: {bookmark}...')
# Create new snapshot
try:
_create_snapshot(bookmark)
return
except TooManyRequestsError:
logger.error(
f'Failed to create snapshot due to rate limiting, trying to load newest snapshot as fallback. url={bookmark.url}')
except WaybackError:
logger.error(f'Failed to create snapshot, trying to load newest snapshot as fallback. url={bookmark.url}')

# Load the newest snapshot as fallback
_load_newest_snapshot(bookmark)

archive = waybackpy.WaybackMachineSaveAPI(bookmark.url, DEFAULT_USER_AGENT)

@background()
def _load_web_archive_snapshot_task(bookmark_id: int):
try:
archive.save()
except WaybackError as error:
logger.exception(f'Error creating web archive link for bookmark: {bookmark}...', exc_info=error)
raise

bookmark.web_archive_snapshot_url = archive.archive_url
bookmark.save()
logger.debug(f'Successfully created web archive link for bookmark: {bookmark}...')
bookmark = Bookmark.objects.get(id=bookmark_id)
except Bookmark.DoesNotExist:
return
# Skip if snapshot exists
if bookmark.web_archive_snapshot_url:
return
# Load the newest snapshot
_load_newest_snapshot(bookmark)


def schedule_bookmarks_without_snapshots(user: User):
Expand All @@ -63,4 +102,6 @@ def _schedule_bookmarks_without_snapshots_task(user_id: int):
bookmarks_without_snapshots = Bookmark.objects.filter(web_archive_snapshot_url__exact='', owner=user)

for bookmark in bookmarks_without_snapshots:
_create_web_archive_snapshot_task(bookmark.id, False)
# To prevent rate limit errors from the Wayback API only try to load the latest snapshots instead of creating
# new ones when processing bookmarks in bulk
_load_web_archive_snapshot_task(bookmark.id)
40 changes: 40 additions & 0 deletions bookmarks/services/wayback.py
@@ -0,0 +1,40 @@
import time
from typing import Dict

import waybackpy
import waybackpy.utils
from waybackpy.exceptions import NoCDXRecordFound


class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI):
"""
Customized WaybackMachineCDXServerAPI to work around some issues with retrieving the newest snapshot.
See https://github.com/akamhy/waybackpy/issues/176
"""

def newest(self):
unix_timestamp = int(time.time())
self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp(unix_timestamp)
self.sort = 'closest'
self.limit = -5

newest_snapshot = None
for snapshot in self.snapshots():
newest_snapshot = snapshot
break

if not newest_snapshot:
raise NoCDXRecordFound(
"Wayback Machine's CDX server did not return any records "
+ "for the query. The URL may not have any archives "
+ " on the Wayback Machine or the URL may have been recently "
+ "archived and is still not available on the CDX server."
)

return newest_snapshot

def add_payload(self, payload: Dict[str, str]) -> None:
super().add_payload(payload)
# Set fastLatest query param, as we are only using this API to get the latest snapshot and using fastLatest
# makes searching for latest snapshots faster
payload['fastLatest'] = 'true'