-
Notifications
You must be signed in to change notification settings - Fork 3k
Expand file tree
/
Copy pathpublic_domain_image_archive.py
More file actions
151 lines (111 loc) · 4.5 KB
/
Copy pathpublic_domain_image_archive.py
File metadata and controls
151 lines (111 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Public domain image archive"""
import re
from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
from json import dumps
from searx.network import get
from searx.utils import extr
from searx.exceptions import SearxEngineAccessDeniedException, SearxEngineException
# Query string appended to a parameter-free image url to build the thumbnail
# link of a result (see its use in response()). Presumably these are imgix
# sizing parameters limiting the image to 360x360 -- TODO confirm.
THUMBNAIL_SUFFIX = "?fit=max&h=360&w=360"
"""
Example thumbnail urls (from requests & html):
- https://the-public-domain-review.imgix.net
/shop/nov-2023-prints-00043.jpg
?fit=max&h=360&w=360
- https://the-public-domain-review.imgix.net
/collections/the-history-of-four-footed-beasts-and-serpents-1658/
8616383182_5740fa7851_o.jpg
?fit=max&h=360&w=360
Example full image urls (from html)
- https://the-public-domain-review.imgix.net/shop/
nov-2023-prints-00043.jpg
?fit=clip&w=970&h=800&auto=format,compress
- https://the-public-domain-review.imgix.net/collections/
the-history-of-four-footed-beasts-and-serpents-1658/8616383182_5740fa7851_o.jpg
?fit=clip&w=310&h=800&auto=format,compress
The thumbnail url from the request will be cleaned for the full image link
The cleaned thumbnail url will have THUMBNAIL_SUFFIX added to them, based on the original thumbnail parameters
"""
# about
# Engine metadata: the site is queried without an official API or API key and
# the search results are delivered as JSON.
about = {
"website": 'https://pdimagearchive.org',
"use_official_api": False,
"require_api_key": False,
"results": 'JSON',
}
# Root of the website; the search page and the script that contains the API
# url are both fetched from here by _get_algolia_api_url().
pdia_base_url = 'https://pdimagearchive.org'
# Delimiters around the variable part of the script's file name: the text
# found between them in the search page is used to rebuild the script url.
pdia_config_start = "/_astro/InfiniteSearch."
pdia_config_end = ".js"
categories = ['images']
# Number of hits requested per page (sent as 'hitsPerPage' in request()).
page_size = 20
paging = True
# Module-level cache for the discovered API url; filled by
# _get_algolia_api_url() and reset by _clear_cached_api_url().
__CACHED_API_URL = None
# Matches a double-quoted https url ending in "/search-proxy"; group 1 is the
# url without the surrounding quotes.
_API_URL_RE = re.compile(r"\"(https://.*?/search-proxy)\"")
def _clean_url(url):
    """Return *url* without its ``ixid`` and ``s`` query parameters.

    The url is split into its components, the query string is parsed into
    key/value pairs, the two unwanted keys are filtered out and the url is
    reassembled.  The remaining parameters keep their order but are
    re-encoded by ``urlencode``.
    """
    scheme, netloc, path, path_params, raw_query, fragment = urlparse(url)

    kept_pairs = []
    for key, value in parse_qsl(raw_query):
        if key in ['ixid', 's']:
            continue
        kept_pairs.append((key, value))

    return urlunparse((scheme, netloc, path, path_params, urlencode(kept_pairs), fragment))
def _get_algolia_api_url():
    """Return the search API url, discovering and caching it on first use.

    When no url is cached, two requests are made: the search page is fetched
    to find the file name of the search script, then that script is fetched
    and scanned with ``_API_URL_RE`` for the quoted ``.../search-proxy`` url.
    The url found is stored in the module-level cache before it is returned.

    Raises:
        LookupError: if either request does not answer with status 200 or the
            script does not contain a matching url.
    """
    global __CACHED_API_URL  # pylint:disable=global-statement

    if __CACHED_API_URL:
        return __CACHED_API_URL

    # fake request (empty query) to extract the location of the search script
    search_page = get(f"{pdia_base_url}/search/?q=", timeout=3)
    if search_page.status_code != 200:
        raise LookupError("Failed to fetch config location (and as such the API url) for PDImageArchive")

    # The script url is rebuilt from the text found between the two markers.
    script_name_part = extr(search_page.text, pdia_config_start, pdia_config_end)
    script_url = "".join([pdia_base_url, pdia_config_start, script_name_part, pdia_config_end])

    script = get(script_url)
    if script.status_code != 200:
        raise LookupError("Failed to obtain AWS api url for PDImageArchive")

    found = _API_URL_RE.search(script.text)
    if found is None:
        raise LookupError("Couldn't obtain AWS api url for PDImageArchive")

    discovered_url = found.group(1)
    __CACHED_API_URL = discovered_url
    return discovered_url
def _clear_cached_api_url():
    """Forget the cached API url by resetting the module-level cache to ``None``."""
    global __CACHED_API_URL  # pylint:disable=global-statement

    __CACHED_API_URL = None
def request(query, params):
    """Fill *params* with the POST request for one page of image hits.

    The target url comes from ``_get_algolia_api_url()``.  The JSON body
    carries the search terms, the page index (``params['pageno']`` minus
    one), the page size and the index name.  The updated *params* dict is
    returned.
    """
    params['url'] = _get_algolia_api_url()
    params['method'] = 'POST'

    page_index = params['pageno'] - 1
    body = {
        'page': page_index,
        'query': query,
        'hitsPerPage': page_size,
        'indexName': 'prod_all-images',
    }

    params['headers'] = {'Content-Type': 'application/json'}
    params['data'] = dumps(body)

    # http errors are handled manually (in response()) to be able to reset
    # the cached api url
    params['raise_for_httperror'] = False

    return params
def response(resp):
    """Convert the search API's HTTP response into a list of image results.

    Args:
        resp: response object providing ``status_code`` and ``json()``.

    Returns:
        A list of result dicts for the ``images.html`` template; an empty
        list when the payload has no ``results`` key or an empty one.

    Raises:
        SearxEngineAccessDeniedException: on status 403; the cached API url
            is cleared first.
        SearxEngineException: on any other status different from 200.
    """
    # Inspect the status before decoding the body: an error reply is not
    # guaranteed to contain JSON, and a decoding failure must not prevent the
    # cached api url from being reset on a 403.
    if resp.status_code == 403:
        _clear_cached_api_url()
        raise SearxEngineAccessDeniedException()

    if resp.status_code != 200:
        raise SearxEngineException()

    json_data = resp.json()

    # A missing or empty 'results' list means there is nothing to report.
    if 'results' not in json_data or not json_data['results']:
        return []

    results = []
    for result in json_data['results'][0]['hits']:
        content = []

        if result.get("themes"):
            content.append("Themes: " + result['themes'])

        if result.get("encompassingWork"):
            content.append("Encompassing work: " + result['encompassingWork'])

        # Strip the query string delivered with the thumbnail; the bare url
        # is used for the full image and, with THUMBNAIL_SUFFIX appended,
        # for the thumbnail.
        base_image_url = result['thumbnail'].split("?")[0]

        results.append(
            {
                'template': 'images.html',
                'url': _clean_url(f"{about['website']}/images/{result['objectID']}"),
                'img_src': _clean_url(base_image_url),
                'thumbnail_src': _clean_url(base_image_url + THUMBNAIL_SUFFIX),
                'title': f"{result['title'].strip()} by {result['artist']} {result.get('displayYear', '')}",
                'content': "\n".join(content),
            }
        )

    return results