Skip to content

Commit

Permalink
[fix] engine: google has changed the layout of its response
Browse files Browse the repository at this point in the history
Since 28 March, Google has changed the layout of its response; this patch fixes
the google engine to scrape the results & images from the newly designed response.

closes: #2287

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
  • Loading branch information
return42 committed Mar 28, 2023
1 parent 1498202 commit 6f9e678
Showing 1 changed file with 26 additions and 45 deletions.
71 changes: 26 additions & 45 deletions searx/engines/google.py
Expand Up @@ -59,13 +59,10 @@
# specific xpath variables
# ------------------------

results_xpath = './/div[@data-sokoban-container]'
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-content-feature]'

# google *sections* are not usual *results*, we ignore them
g_section_with_header = './g-section-with-header'
content_xpath = './/div[@data-sncf]'

# Suggestions are links placed in a *card-section*; we extract only the text
# from the links, not the links themselves.
Expand Down Expand Up @@ -303,21 +300,18 @@ def request(query, params):
return params


# (function(){var s='data:image/jpeg;base64,/9j/4AAQ ...
# ... DX/Ff5XSpSgdU32xSlKDJ//9k\x3d';var ii=['dimg_21'];_setImagesSrc(ii,s);})();
RE_DATA_IMAGE = re.compile(r"'(data:image[^']*)'[^']*ii=\['([^']*)'\];_setImagesSrc")
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')


def _parse_data_images(dom):
data_image_map = {}
for _script in eval_xpath_list(dom, "//script[@nonce]"):
script = _script.text
if not script:
continue
script = RE_DATA_IMAGE.search(script)
if not script:
continue
data_image_map[script.group(2)] = script.group(1).replace(r'\x3d', '=')
for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
end_pos = data_image.rfind('=')
if end_pos > 0:
data_image = data_image[: end_pos + 1]
data_image_map[img_id] = data_image
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
return data_image_map

Expand All @@ -331,11 +325,7 @@ def response(resp):

# convert the text to dom
dom = html.fromstring(resp.text)

data_image_map = {}
if '_fmt:html' in UI_ASYNC:
# in this format images are embedded as a base64 encoded 'data:image'
data_image_map = _parse_data_images(dom)
data_image_map = _parse_data_images(dom)

# results --> answer
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
Expand All @@ -349,45 +339,36 @@ def response(resp):

for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks

# google *sections*
if extract_text(eval_xpath(result, g_section_with_header)):
logger.debug("ignoring <g-section-with-header>")
continue

try:
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
if title_tag is None:
# this is not one of the common google result *sections*
logger.debug('ignoring item from the result_xpath list: missing title')
continue
title = extract_text(title_tag)

url = eval_xpath_getindex(result, href_xpath, 0, None)
if url is None:
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
continue

content = []
img_list = []
for content_feature in eval_xpath(result, content_xpath):
val = content_feature.attrib['data-content-feature']
if val in ['1', '2']:
txt = extract_text(content_feature, allow_none=True)
if txt:
content.append(txt)
elif '0' in val:
img = content_feature.xpath('.//img/@src')
if img:
img = img[0]
if img.startswith('data:image'):
img_id = content_feature.xpath('.//img/@id')
if img_id:
img = data_image_map.get(img_id[0])
img_list.append(img)
content_nodes = eval_xpath(result, content_xpath)
content = extract_text(content_nodes)

if not content:
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
continue
content = ' / '.join(content)
img_src = img_list[0] if img_list else None

img_src = content_nodes[0].xpath('.//img/@src')
if img_src:
img_src = img_src[0]
if img_src.startswith('data:image'):
img_id = content_nodes[0].xpath('.//img/@id')
if img_id:
img_src = data_image_map.get(img_id[0])
else:
img_src = None

results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})

except Exception as e: # pylint: disable=broad-except
Expand Down

0 comments on commit 6f9e678

Please sign in to comment.