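"""
Cached requests for MAL API metadata about anime/manga entries.

Uses malexport's authenticated MalSession to request data from the MAL v2
API, and url_cache to persist each response locally, handling token
refresh, rate limiting, and falling back to previously cached data when
a request fails.
"""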
import os
import time
import logging
from typing import Any
from functools import cache
from pathlib import Path
from datetime import datetime
from threading import Lock

import click
import backoff
import requests
from malexport.exporter.mal_session import MalSession
from malexport.exporter.account import Account
from url_cache.core import URLCache, Summary

from mal_id.common import backoff_handler
from mal_id.paths import metadatacache_dir
from mal_id.log import logger

MAL_API_LOCK = Lock()


@backoff.on_exception(
    lambda: backoff.constant(5),
    requests.exceptions.RequestException,
    max_tries=3,
    on_backoff=backoff_handler,
)
def api_request(session: MalSession, url: str, recursed_times: int = 0) -> Any:
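    """request a MAL API URL, retrying up to 3 times (constant 5s backoff) on request errors"""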
return _api_request(session, url, recursed_times)


def _api_request(session: MalSession, url: str, recursed_times: int = 0) -> Any:
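    """
    Make a single request against the MAL API, special-casing known status codes:
    400 with alternative_titles in the URL (retry without that field),
    401 (refresh the auth token), 429 (wait out the rate limit), and any
    other non-404 error (wait a minute, then retry, up to 5 times).
    """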
    with MAL_API_LOCK:
        time.sleep(1)
        resp: requests.Response = session.session.get(url)
        # sometimes a 400 happens if the alternative titles are empty
        if resp.status_code == 400 and "alternative_titles," in url:
            if recursed_times > 2:
                resp.raise_for_status()
            logger.warning("removing alternative titles and re-requesting")
            url = url.replace("alternative_titles,", "")
            return api_request(session, url, recursed_times + 1)
        # if the token expired, refresh it and raise (api_request's backoff will retry)
        if resp.status_code == 401:
            logger.warning("token expired, refreshing")
            refresh_token()
            resp.raise_for_status()
        # if rate limited, wait a while before raising, so the retry has a chance
        if resp.status_code == 429:
            logger.warning("API rate limit exceeded, waiting")
            time.sleep(60)
            resp.raise_for_status()
        # for any other unexpected error (not an expected 404), back off for a
        # minute and then retry; once we've recursed more than 5 times, raise instead
        if (
            recursed_times < 5
            and resp.status_code >= 400
            and resp.status_code not in (404,)
        ):
            click.echo(f"Error {resp.status_code}: {resp.text}", err=True)
            time.sleep(60)
            return api_request(session, url, recursed_times + 1)
        # fallthrough raises an error (e.g. a 404) if none of the conditions above matched
        resp.raise_for_status()
        # if we get here, we have a successful response
        return resp.json()


@cache
def mal_api_session() -> MalSession:
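    """create (and memoize) an authenticated MAL API session for the MAL_USERNAME account"""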
assert "MAL_USERNAME" in os.environ
acc = Account.from_username(os.environ["MAL_USERNAME"])
acc.mal_api_authenticate()
assert acc.mal_session is not None
return acc.mal_session


def refresh_token() -> None:
    mal_api_session().refresh_token()


def check_mal() -> bool:
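    """request a known entry (anime 1, Cowboy Bebop) to check whether the MAL API is up"""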
try:
logger.info("checking if MAL API is up...")
resp = mal_api_session().session.get("https://api.myanimelist.net/v2/anime/1")
if resp.status_code == 401:
refresh_token()
return check_mal()
resp.raise_for_status()
data = resp.json()
assert data["id"] == 1
assert data["title"] == "Cowboy Bebop"
logger.info("MAL API is up")
return True
except requests.exceptions.RequestException as e:
logger.warning("MAL API is down!", exc_info=e)
return False


class MetadataCache(URLCache):
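    """
    A URLCache keyed on myanimelist.net URLs whose Summary metadata stores
    the corresponding MAL API response (or a cached error).
    """
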
BASE_ANIME_URL = "https://api.myanimelist.net/v2/anime/{}?nsfw=true"
ANIME_FIELDS = "fields=id,title,main_picture,alternative_titles,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,start_season,broadcast,source,average_episode_duration,rating,pictures,background,related_anime,related_manga,recommendations,studios,statistics"
BASE_MANGA_URL = r"https://api.myanimelist.net/v2/manga/{}?nsfw=true"
MANGA_FIELDS = "fields=id,title,main_picture,alternative_titles,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_volumes,num_chapters,authors{first_name,last_name},pictures,background,related_anime,related_manga,recommendations,serialization{name}"

    def __init__(
        self, cache_dir: Path = metadatacache_dir, loglevel: int = logging.INFO
    ) -> None:
        self.mal_session = mal_api_session()
        super().__init__(cache_dir=cache_dir, loglevel=loglevel)

    def request_data(self, url: str, preprocess_url: bool = True) -> Summary:
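        """
        Given a myanimelist.net URL (the cache key), request the corresponding
        MAL API data and wrap it in a Summary; on failure, fall back to any
        previously cached data, or else cache the error.
        """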
        mal_id = int(url.split("/")[-1])
        media_type = url.split("/")[-2]
        assert media_type in ("anime", "manga")
        # this is the URL we use as the cache key, but not the one we request
        myanimelist_url = url
        del url  # to be safe
        # this is the actual URL we want to request
if media_type == "anime":
api_url = self.BASE_ANIME_URL.format(mal_id) + "&" + self.ANIME_FIELDS
else:
api_url = self.BASE_MANGA_URL.format(mal_id) + "&" + self.MANGA_FIELDS
api_url = self.preprocess_url(api_url) if preprocess_url else api_url
logger.info(f"requesting {api_url}")
try:
if "skip_retry" in self.options and self.options["skip_retry"] is True:
json_data = _api_request(self.mal_session, api_url)
else:
json_data = api_request(self.mal_session, api_url)
# succeeded, return the data
return Summary(
url=myanimelist_url,
data={},
metadata=json_data,
timestamp=datetime.now(),
)
        except requests.exceptions.RequestException as ex:
            logger.exception(f"error requesting {api_url}", exc_info=ex)
            # ex.response can be None (e.g. on a connection error)
            if ex.response is not None:
                logger.warning(ex.response.text)
            logger.warning(
                "Couldn't cache info; the entry may have been deleted, or its data is broken/unapproved, causing the MAL API request to fail"
            )
            # TODO: this needs more testing to make sure we never overwrite good data
            # prevent a broken entry from removing old, valid data
            #
            # if the entry was valid before but failed now, we should just keep the old valid data
            if self.summary_cache.has(myanimelist_url):
                logger.warning("using existing cached data for this entry")
                sc = self.summary_cache.get(myanimelist_url)
                assert sc is not None
                logger.info("Updating timestamp to prevent re-requesting this entry")
                # check whether this has real data, i.e., isn't just {"error": 404}
                if "error" in sc.metadata:
                    # if we had cached an error, then just return the error
                    # TODO: should we update the timestamp here? I don't think it hurts to, as this
                    # is just an error where we have no data; it just prevents possible re-requests
                    # of the same error in the future
                    sc.timestamp = datetime.now()
                    return sc
                else:
                    # we failed to get new data, but have old data,
                    # so just return the old data
                    assert "error" not in sc.metadata and MetadataCache.has_data(
                        sc
                    ), f"{sc.metadata} does not have data"
                    # reusing old data is fine, but we should update the timestamp so
                    # we don't try to refresh it again for a while
                    sc.timestamp = datetime.now()
                    return sc
            else:
                # there is no existing data, and we failed to get new data,
                # so save an error to the cache
                # sanity check to make sure we're not overwriting good data
                assert not self.summary_cache.has(myanimelist_url)
                logger.warning(
                    "no existing cached data for this entry, saving error to cache"
                )
                # this entry just doesn't exist (deleted a long time ago etc.?)
                # -- there's no way to get data for it
                return Summary(
                    url=myanimelist_url,
                    data={},
                    metadata={
                        "error": ex.response.status_code
                        if ex.response is not None
                        else None
                    },
                    timestamp=datetime.now(),
                )

    def refresh_data(self, url: str) -> Summary:
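        """bypass the cache and re-request this URL, overwriting the cached Summary"""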
uurl = self.preprocess_url(url)
summary = self.request_data(uurl)
self.summary_cache.put(uurl, summary)
return summary

    @staticmethod
def is_404(summary: Summary) -> bool:
if "error" in summary.metadata:
return bool(summary.metadata["error"] == 404)
return False

    @staticmethod
def has_data(summary: Summary) -> bool:
return all(k in summary.metadata for k in ("title", "id"))


@cache
def metadata_cache() -> MetadataCache:
    return MetadataCache()


def request_metadata(
    id_: int,
    entry_type: str,
    /,
    *,
    rerequest_failed: bool = False,
    force_rerequest: bool = False,
    mcache: MetadataCache = metadata_cache(),
) -> Summary:
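    """
    Request metadata for an anime/manga ID, hitting the cache if possible.

    rerequest_failed re-requests entries which previously failed with a
    non-404 error; force_rerequest re-requests unconditionally, skipping
    the retry/backoff wrapper so a broken entry doesn't block for minutes.
    """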
    assert entry_type in {"anime", "manga"}
    # use this as the key for the cache
    url_key = "https://myanimelist.net/{}/{}".format(entry_type, id_)
    # if this had failed previously, try again
    #
    # this may never actually be the case, but just to make sure that doesn't
    # happen if we ever add some refresh mechanism...
    if rerequest_failed:
        sdata = mcache.get(url_key)
        # if there's no data and this isn't a 404, retry
        if not MetadataCache.has_data(sdata) and not MetadataCache.is_404(sdata):
            logger.info("re-requesting failed entry: {}".format(sdata.metadata))
            return mcache.refresh_data(url_key)
elif force_rerequest:
logger.info("re-requesting entry")
try:
mcache.options["skip_retry"] = True
dat = mcache.refresh_data(url_key)
finally:
mcache.options["skip_retry"] = False
return dat
return mcache.get(url_key)


def has_metadata(
    id_: int,
    entry_type: str,
) -> bool:
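    """return whether or not this anime/manga ID already has cached metadata"""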
assert entry_type in {"anime", "manga"}
# use this as the key for the cache
url_key = "https://myanimelist.net/{}/{}".format(entry_type, id_)
return metadata_cache().summary_cache.has(url_key)
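

# a minimal usage sketch (illustrative only, not part of the module's API):
# this assumes MAL_USERNAME is set in the environment, and uses anime ID 1
# (Cowboy Bebop, the same entry check_mal() verifies against)
if __name__ == "__main__":
    if check_mal():
        summary = request_metadata(1, "anime")
        if MetadataCache.has_data(summary):
            print(summary.metadata["title"])  # -> "Cowboy Bebop"
        print(has_metadata(1, "anime"))  # True once cached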