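"""
Cached requests for MAL API metadata about anime/manga entries.

Uses malexport's authenticated MalSession to request data from the MAL v2
API, and url_cache to persist each response locally, handling token
refresh, rate limiting, and falling back to previously cached data when
a request fails.
"""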
import os
import time
import logging
from typing import Any
from functools import cache
from pathlib import Path
from datetime import datetime
from threading import Lock

import click
import backoff
import requests
from malexport.exporter.mal_session import MalSession
from malexport.exporter.account import Account
from url_cache.core import URLCache, Summary

from mal_id.common import backoff_handler
from mal_id.paths import metadatacache_dir
from mal_id.log import logger

MAL_API_LOCK = Lock()


@backoff.on_exception(
    lambda: backoff.constant(5),
    requests.exceptions.RequestException,
    max_tries=3,
    on_backoff=backoff_handler,
)
def api_request(session: MalSession, url: str, recursed_times: int = 0) -> Any:
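    """request a MAL API URL, retrying up to 3 times (constant 5s backoff) on request errors"""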
return _api_request(session, url, recursed_times)


def _api_request(session: MalSession, url: str, recursed_times: int = 0) -> Any:
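    """
    Make a single request against the MAL API, special-casing known status codes:
    400 with alternative_titles in the URL (retry without that field),
    401 (refresh the auth token), 429 (wait out the rate limit), and any
    other non-404 error (wait a minute, then retry, up to 5 times).
    """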
    with MAL_API_LOCK:
        time.sleep(1)
        resp: requests.Response = session.session.get(url)
        # sometimes a 400 happens if the alternative titles are empty
        if resp.status_code == 400 and "alternative_titles," in url:
            if recursed_times > 2:
                resp.raise_for_status()
            logger.warning("removing alternative titles and re-requesting")
            url = url.replace("alternative_titles,", "")
            return api_request(session, url, recursed_times + 1)
        # if the token expired, refresh it and raise (api_request's backoff will retry)
        if resp.status_code == 401:
            logger.warning("token expired, refreshing")
            refresh_token()
            resp.raise_for_status()
        # if rate limited, wait a while before raising, so the retry has a chance
        if resp.status_code == 429:
            logger.warning("API rate limit exceeded, waiting")
            time.sleep(60)
            resp.raise_for_status()
        # for any other unexpected error (not an expected 404), back off for a
        # minute and then retry; once we've recursed more than 5 times, raise instead
        if (
            recursed_times < 5
            and resp.status_code >= 400
            and resp.status_code not in (404,)
        ):
            click.echo(f"Error {resp.status_code}: {resp.text}", err=True)
            time.sleep(60)
            return api_request(session, url, recursed_times + 1)
        # fallthrough raises an error (e.g. a 404) if none of the conditions above matched
        resp.raise_for_status()
        # if we get here, we have a successful response
        return resp.json()


@cache
def mal_api_session() -> MalSession:
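    """create (and memoize) an authenticated MAL API session for the MAL_USERNAME account"""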
assert "MAL_USERNAME" in os.environ
acc = Account.from_username(os.environ["MAL_USERNAME"])
acc.mal_api_authenticate()
assert acc.mal_session is not None
return acc.mal_session


def refresh_token() -> None:
    mal_api_session().refresh_token()


def check_mal() -> bool:
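    """request a known entry (anime 1, Cowboy Bebop) to check whether the MAL API is up"""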
try:
logger.info("checking if MAL API is up...")
resp = mal_api_session().session.get("https://api.myanimelist.net/v2/anime/1")
if resp.status_code == 401:
refresh_token()
return check_mal()
resp.raise_for_status()
data = resp.json()
assert data["id"] == 1
assert data["title"] == "Cowboy Bebop"
logger.info("MAL API is up")
return True
except requests.exceptions.RequestException as e:
logger.warning("MAL API is down!", exc_info=e)
return False


class MetadataCache(URLCache):
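    """
    A URLCache keyed on myanimelist.net URLs whose Summary metadata stores
    the corresponding MAL API response (or a cached error).
    """
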
BASE_ANIME_URL = "https://api.myanimelist.net/v2/anime/{}?nsfw=true"
ANIME_FIELDS = "fields=id,title,main_picture,alternative_titles,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,start_season,broadcast,source,average_episode_duration,rating,pictures,background,related_anime,related_manga,recommendations,studios,statistics"
BASE_MANGA_URL = r"https://api.myanimelist.net/v2/manga/{}?nsfw=true"
MANGA_FIELDS = "fields=id,title,main_picture,alternative_titles,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_volumes,num_chapters,authors{first_name,last_name},pictures,background,related_anime,related_manga,recommendations,serialization{name}"

    def __init__(
        self, cache_dir: Path = metadatacache_dir, loglevel: int = logging.INFO
    ) -> None:
        self.mal_session = mal_api_session()
        super().__init__(cache_dir=cache_dir, loglevel=loglevel)

    def request_data(self, url: str, preprocess_url: bool = True) -> Summary:
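        """
        Given a myanimelist.net URL (the cache key), request the corresponding
        MAL API data and wrap it in a Summary; on failure, fall back to any
        previously cached data, or else cache the error.
        """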
        mal_id = int(url.split("/")[-1])
        media_type = url.split("/")[-2]
        assert media_type in ("anime", "manga")
        # this is the URL we use as the cache key, but not the one we request
        myanimelist_url = url
        del url  # to be safe
        # this is the actual URL we want to request
if media_type == "anime":
api_url = self.BASE_ANIME_URL.format(mal_id) + "&" + self.ANIME_FIELDS
else:
api_url = self.BASE_MANGA_URL.format(mal_id) + "&" + self.MANGA_FIELDS
api_url = self.preprocess_url(api_url) if preprocess_url else api_url
logger.info(f"requesting {api_url}")
try:
if "skip_retry" in self.options and self.options["skip_retry"] is True:
json_data = _api_request(self.mal_session, api_url)
else:
json_data = api_request(self.mal_session, api_url)
# succeeded, return the data
return Summary(
url=myanimelist_url,
data={},
metadata=json_data,
timestamp=datetime.now(),
)
        except requests.exceptions.RequestException as ex:
            logger.exception(f"error requesting {api_url}", exc_info=ex)
            # ex.response can be None (e.g. on a connection error)
            if ex.response is not None:
                logger.warning(ex.response.text)
            logger.warning(
                "Couldn't cache info; the entry may have been deleted, or its data is broken/unapproved, causing the MAL API request to fail"
            )
            # TODO: this needs more testing to make sure we never overwrite good data
            # prevent a broken entry from removing old, valid data
            #
            # if the entry was valid before but failed now, we should just keep the old valid data
            if self.summary_cache.has(myanimelist_url):
                logger.warning("using existing cached data for this entry")
                sc = self.summary_cache.get(myanimelist_url)
                assert sc is not None
                logger.info("Updating timestamp to prevent re-requesting this entry")
                # check whether this has real data, i.e., isn't just {"error": 404}
                if "error" in sc.metadata:
                    # if we had cached an error, then just return the error
                    # TODO: should we update the timestamp here? I don't think it hurts to, as this
                    # is just an error where we have no data; it just prevents possible re-requests
                    # of the same error in the future
                    sc.timestamp = datetime.now()
                    return sc
                else:
                    # we failed to get new data, but have old data,
                    # so just return the old data
                    assert "error" not in sc.metadata and MetadataCache.has_data(
                        sc
                    ), f"{sc.metadata} does not have data"
                    # reusing old data is fine, but we should update the timestamp so
                    # we don't try to refresh it again for a while
                    sc.timestamp = datetime.now()
                    return sc
            else:
                # there is no existing data, and we failed to get new data,
                # so save an error to the cache
                # sanity check to make sure we're not overwriting good data
                assert not self.summary_cache.has(myanimelist_url)
                logger.warning(
                    "no existing cached data for this entry, saving error to cache"
                )
                # this entry just doesn't exist (deleted a long time ago etc.?)
                # -- there's no way to get data for it
                return Summary(
                    url=myanimelist_url,
                    data={},
                    metadata={
                        "error": ex.response.status_code
                        if ex.response is not None
                        else None
                    },
                    timestamp=datetime.now(),
                )

    def refresh_data(self, url: str) -> Summary:
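        """bypass the cache and re-request this URL, overwriting the cached Summary"""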
uurl = self.preprocess_url(url)
summary = self.request_data(uurl)
self.summary_cache.put(uurl, summary)
return summary

    @staticmethod
def is_404(summary: Summary) -> bool:
if "error" in summary.metadata:
return bool(summary.metadata["error"] == 404)
return False

    @staticmethod
def has_data(summary: Summary) -> bool:
return all(k in summary.metadata for k in ("title", "id"))


@cache
def metadata_cache() -> MetadataCache:
    return MetadataCache()


def request_metadata(
    id_: int,
    entry_type: str,
    /,
    *,
    rerequest_failed: bool = False,
    force_rerequest: bool = False,
    mcache: MetadataCache = metadata_cache(),
) -> Summary:
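    """
    Request metadata for an anime/manga ID, hitting the cache if possible.

    rerequest_failed re-requests entries which previously failed with a
    non-404 error; force_rerequest re-requests unconditionally, skipping
    the retry/backoff wrapper so a broken entry doesn't block for minutes.
    """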
    assert entry_type in {"anime", "manga"}
    # use this as the key for the cache
    url_key = "https://myanimelist.net/{}/{}".format(entry_type, id_)
    # if this had failed previously, try again
    #
    # this may never actually be the case, but just to make sure that doesn't
    # happen if we ever add some refresh mechanism...
    if rerequest_failed:
        sdata = mcache.get(url_key)
        # if there's no data and this isn't a 404, retry
        if not MetadataCache.has_data(sdata) and not MetadataCache.is_404(sdata):
            logger.info("re-requesting failed entry: {}".format(sdata.metadata))
            return mcache.refresh_data(url_key)
elif force_rerequest:
logger.info("re-requesting entry")
try:
mcache.options["skip_retry"] = True
dat = mcache.refresh_data(url_key)
finally:
mcache.options["skip_retry"] = False
return dat
return mcache.get(url_key)


def has_metadata(
    id_: int,
    entry_type: str,
) -> bool:
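    """return whether or not this anime/manga ID already has cached metadata"""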
assert entry_type in {"anime", "manga"}
# use this as the key for the cache
url_key = "https://myanimelist.net/{}/{}".format(entry_type, id_)
return metadata_cache().summary_cache.has(url_key)
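

# a minimal usage sketch (illustrative only, not part of the module's API):
# this assumes MAL_USERNAME is set in the environment, and uses anime ID 1
# (Cowboy Bebop, the same entry check_mal() verifies against)
if __name__ == "__main__":
    if check_mal():
        summary = request_metadata(1, "anime")
        if MetadataCache.has_data(summary):
            print(summary.metadata["title"])  # -> "Cowboy Bebop"
        print(has_metadata(1, "anime"))  # True once cached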