Skip to content
This repository has been archived by the owner on Sep 7, 2023. It is now read-only.

[fix] get YouTube results #1544

Merged
merged 2 commits into from
Apr 9, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 34 additions & 36 deletions searx/engines/youtube_noapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
# @stable no
# @parse url, title, content, publishedDate, thumbnail, embedded

from lxml import html
from functools import reduce
from json import loads
from searx.engines.xpath import extract_text
from searx.utils import list_get
from searx.url_utils import quote_plus
Expand All @@ -34,20 +35,6 @@

# Watch-page URL prefix; a video id is appended to it to build a result URL.
base_youtube_url = 'https://www.youtube.com/watch?v='

# specific xpath variables
# Absolute expression: every <div> directly under an <ol>/<li> whose class
# attribute contains the full lockup class string below (substring match).
results_xpath = "//ol/li/div[contains(@class, 'yt-lockup yt-lockup-tile yt-lockup-video vve-check')]"
# The remaining expressions are relative (leading './/') and are meant to be
# evaluated against a single node.
# href attribute of the link inside an <h3>.
url_xpath = './/h3/a/@href'
# The <a> element inside the <h3> of the "yt-lockup-content" <div>
# (the class attribute must equal that value exactly).
title_xpath = './/div[@class="yt-lockup-content"]/h3/a'
# The description <div> nested in the "yt-lockup-content" <div>; again an
# exact match on the complete class attribute value.
content_xpath = './/div[@class="yt-lockup-content"]/div[@class="yt-lockup-description yt-ui-ellipsis yt-ui-ellipsis-2"]'


def extract_text_from_dom(result, xpath):
    """Return the text of the first node matched by ``xpath``, or None.

    :param result: object exposing an ``xpath()`` method; the expression is
        evaluated against it.
    :param xpath: XPath expression to evaluate.
    :returns: ``extract_text`` applied to the first match, or ``None`` when
        the expression matches nothing.
    """
    matches = result.xpath(xpath)
    # Guard clause: nothing matched, so there is no text to extract.
    if not matches:
        return None
    return extract_text(matches[0])


# do search-request
def request(query, params):
Expand All @@ -63,27 +50,38 @@ def request(query, params):
def _join_runs(runs):
    """Concatenate the ``text`` value of every entry in a ``runs`` list."""
    return ''.join(run.get('text', '') for run in runs)


def response(resp):
    """Build the list of video results from a YouTube search page.

    The page embeds its data as a JSON object assigned to ``ytInitialData``.
    That object is located in ``resp.text``, decoded, and every
    ``videoRenderer`` entry that carries a ``videoId`` becomes one result.

    :param resp: response object; only its ``text`` attribute is read.
    :returns: list of result dicts with the keys ``url``, ``title``,
        ``content``, ``template``, ``embedded`` and ``thumbnail``. The list
        is empty when the page has no ``ytInitialData`` object.
    :raises ValueError: when the text following the marker is not valid JSON.
    """
    # Local import: the module header only brings ``loads`` into scope.
    from json import JSONDecoder

    results = []

    page = resp.text
    marker = page.find('ytInitialData')
    if marker == -1:
        return results
    start = page.find('{', marker)
    if start == -1:
        return results

    # raw_decode stops at the end of the first complete JSON value, so the
    # result does not depend on what follows the object (';\n', ';</script>'
    # or further statements on the same line).
    results_json, _ = JSONDecoder().raw_decode(page[start:])

    sections = results_json.get('contents', {})\
                           .get('twoColumnSearchResultsRenderer', {})\
                           .get('primaryContents', {})\
                           .get('sectionListRenderer', {})\
                           .get('contents', [])

    for section in sections:
        for video_container in section.get('itemSectionRenderer', {}).get('contents', []):
            video = video_container.get('videoRenderer', {})
            videoid = video.get('videoId')
            # Entries that are not videos carry no videoRenderer/videoId.
            if videoid is None:
                continue

            url = base_youtube_url + videoid
            thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg'

            # The title accepts the same two shapes as the description;
            # 'simpleText' keeps priority and the video id stays the fallback.
            title_node = video.get('title', {})
            if 'simpleText' in title_node:
                title = title_node['simpleText']
            else:
                title = _join_runs(title_node.get('runs', [])) or videoid

            description_snippet = video.get('descriptionSnippet', {})
            if 'runs' in description_snippet:
                content = _join_runs(description_snippet['runs'])
            else:
                content = description_snippet.get('simpleText', '')

            embedded = embedded_url.format(videoid=videoid)

            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content,
                            'template': 'videos.html',
                            'embedded': embedded,
                            'thumbnail': thumbnail})

    # return results
    return results
162 changes: 56 additions & 106 deletions tests/unit/engines/test_youtube_noapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,121 +46,71 @@ def test_response(self):
self.assertEqual(youtube_noapi.response(response), [])

html = """
<ol id="item-section-063864" class="item-section">
<li>
<div class="yt-lockup yt-lockup-tile yt-lockup-video vve-check clearfix yt-uix-tile"
data-context-item-id="DIVZCPfAOeM"
data-visibility-tracking="CBgQ3DAYACITCPGXnYau6sUCFZEIHAod-VQASCj0JECx_-GK5uqMpcIB">
<div class="yt-lockup-dismissable"><div class="yt-lockup-thumbnail contains-addto">
<a aria-hidden="true" href="/watch?v=DIVZCPfAOeM" class=" yt-uix-sessionlink pf-link"
data-sessionlink="itct=CBgQ3DAYACITCPGXnYau6sUCFZEIHAod-VQASCj0JFIEdGVzdA">
<div class="yt-thumb video-thumb"><img src="//i.ytimg.com/vi/DIVZCPfAOeM/mqdefault.jpg"
width="196" height="110"/></div><span class="video-time" aria-hidden="true">11:35</span></a>
<span class="thumb-menu dark-overflow-action-menu video-actions">
</span>
</div>
<div class="yt-lockup-content">
<h3 class="yt-lockup-title">
<a href="/watch?v=DIVZCPfAOeM"
class="yt-uix-tile-link yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link"
data-sessionlink="itct=CBgQ3DAYACITCPGXnYau6sUCFZEIHAod-VQASCj0JFIEdGVzdA"
title="Top Speed Test Kawasaki Ninja H2 (Thailand) By. MEHAY SUPERBIKE"
aria-describedby="description-id-259079" rel="spf-prefetch" dir="ltr">
Title
</a>
<span class="accessible-description" id="description-id-259079"> - Durée : 11:35.</span>
</h3>
<div class="yt-lockup-byline">de
<a href="/user/mheejapan" class=" yt-uix-sessionlink spf-link g-hovercard"
data-sessionlink="itct=CBgQ3DAYACITCPGXnYau6sUCFZEIHAod-VQASCj0JA" data-ytid="UCzEesu54Hjs0uRKmpy66qeA"
data-name="">MEHAY SUPERBIKE</a></div><div class="yt-lockup-meta">
<ul class="yt-lockup-meta-info">
<li>il y a 20 heures</li>
<li>8 424 vues</li>
</ul>
</div>
<div class="yt-lockup-description yt-ui-ellipsis yt-ui-ellipsis-2" dir="ltr">
Description
</div>
<div class="yt-lockup-badges">
<ul class="yt-badge-list ">
<li class="yt-badge-item" >
<span class="yt-badge">Nouveauté</span>
</li>
<li class="yt-badge-item" ><span class="yt-badge " >HD</span></li>
</ul>
</div>
<div class="yt-lockup-action-menu yt-uix-menu-container">
<div class="yt-uix-menu yt-uix-videoactionmenu hide-until-delayloaded"
data-video-id="DIVZCPfAOeM" data-menu-content-id="yt-uix-videoactionmenu-menu">
</div>
</div>
</div>
</div>
</div>
</li>
</ol>
<div></div>
<script>
window["ytInitialData"] = {
"contents": {
"twoColumnSearchResultsRenderer": {
"primaryContents": {
"sectionListRenderer": {
"contents": [
{
"itemSectionRenderer": {
"contents": [
{
"videoRenderer": {
"videoId": "DIVZCPfAOeM",
"title": {
"simpleText": "Title"
},
"descriptionSnippet": {
"runs": [
{
"text": "Des"
},
{
"text": "cription"
}
]
}
}
},
{
"videoRenderer": {
"videoId": "9C_HReR_McQ",
"title": {
"simpleText": "Title"
},
"descriptionSnippet": {
"simpleText": "Description"
}
}
}
]
}
}
]
}
}
}
}
};
</script>
"""
response = mock.Mock(text=html)
results = youtube_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 1)
self.assertEqual(len(results), 2)
self.assertEqual(results[0]['title'], 'Title')
self.assertEqual(results[0]['url'], 'https://www.youtube.com/watch?v=DIVZCPfAOeM')
self.assertEqual(results[0]['content'], 'Description')
self.assertEqual(results[0]['thumbnail'], 'https://i.ytimg.com/vi/DIVZCPfAOeM/hqdefault.jpg')
self.assertTrue('DIVZCPfAOeM' in results[0]['embedded'])

html = """
<ol id="item-section-063864" class="item-section">
<li>
<div class="yt-lockup yt-lockup-tile yt-lockup-video vve-check clearfix yt-uix-tile"
data-context-item-id="DIVZCPfAOeM"
data-visibility-tracking="CBgQ3DAYACITCPGXnYau6sUCFZEIHAod-VQASCj0JECx_-GK5uqMpcIB">
<div class="yt-lockup-dismissable"><div class="yt-lockup-thumbnail contains-addto">
<a aria-hidden="true" href="/watch?v=DIVZCPfAOeM" class=" yt-uix-sessionlink pf-link"
data-sessionlink="itct=CBgQ3DAYACITCPGXnYau6sUCFZEIHAod-VQASCj0JFIEdGVzdA">
<div class="yt-thumb video-thumb"><img src="//i.ytimg.com/vi/DIVZCPfAOeM/mqdefault.jpg"
width="196" height="110"/></div><span class="video-time" aria-hidden="true">11:35</span></a>
<span class="thumb-menu dark-overflow-action-menu video-actions">
</span>
</div>
<div class="yt-lockup-content">
<h3 class="yt-lockup-title">
<span class="accessible-description" id="description-id-259079"> - Durée : 11:35.</span>
</h3>
<div class="yt-lockup-byline">de
<a href="/user/mheejapan" class=" yt-uix-sessionlink spf-link g-hovercard"
data-sessionlink="itct=CBgQ3DAYACITCPGXnYau6sUCFZEIHAod-VQASCj0JA" data-ytid="UCzEesu54Hjs0uRKmpy66qeA"
data-name="">MEHAY SUPERBIKE</a></div><div class="yt-lockup-meta">
<ul class="yt-lockup-meta-info">
<li>il y a 20 heures</li>
<li>8 424 vues</li>
</ul>
</div>
<div class="yt-lockup-badges">
<ul class="yt-badge-list ">
<li class="yt-badge-item" >
<span class="yt-badge">Nouveauté</span>
</li>
<li class="yt-badge-item" ><span class="yt-badge " >HD</span></li>
</ul>
</div>
<div class="yt-lockup-action-menu yt-uix-menu-container">
<div class="yt-uix-menu yt-uix-videoactionmenu hide-until-delayloaded"
data-video-id="DIVZCPfAOeM" data-menu-content-id="yt-uix-videoactionmenu-menu">
</div>
</div>
</div>
</div>
</div>
</li>
</ol>
"""
response = mock.Mock(text=html)
results = youtube_noapi.response(response)
self.assertEqual(type(results), list)
self.assertEqual(len(results), 1)
self.assertEqual(results[1]['title'], 'Title')
self.assertEqual(results[1]['url'], 'https://www.youtube.com/watch?v=9C_HReR_McQ')
self.assertEqual(results[1]['content'], 'Description')
self.assertEqual(results[1]['thumbnail'], 'https://i.ytimg.com/vi/9C_HReR_McQ/hqdefault.jpg')
self.assertTrue('9C_HReR_McQ' in results[1]['embedded'])

html = """
<ol id="item-section-063864" class="item-section">
Expand Down