-
Notifications
You must be signed in to change notification settings - Fork 436
/
extractor.js
114 lines (95 loc) 路 3.02 KB
/
extractor.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import { extractFromMeta } from 'utils/dom';
import { cleanImage } from 'cleaners';
import {
LEAD_IMAGE_URL_META_TAGS,
LEAD_IMAGE_URL_SELECTORS,
} from './constants';
import {
scoreImageUrl,
scoreAttr,
scoreByParents,
scoreBySibling,
scoreByDimensions,
scoreByPosition,
} from './score-image';
// Given a resource, try to find the lead image URL from within
// it. Like content and next page extraction, uses a scoring system
// to determine what the most likely image may be. Short circuits
// on really probable things like og:image meta tags.
//
// Potential signals to still take advantage of:
// * domain
// * weird aspect ratio
const GenericLeadImageUrlExtractor = {
  /**
   * Find the lead image URL, trying three sources in order and returning the
   * first candidate that `cleanImage` accepts:
   *   1. a meta tag listed in LEAD_IMAGE_URL_META_TAGS,
   *   2. the highest-scoring <img> found under `content`,
   *   3. the src / href / value attribute of a LEAD_IMAGE_URL_SELECTORS node.
   *
   * @param {Object} opts
   * @param {Function} opts.$ - jQuery-style selector function; a truthy
   *   `$.browser` skips the missing-<head> fix-up.
   * @param {*} opts.content - Passed to `$()`; only images under it are scored.
   * @param {*} opts.metaCache - Forwarded untouched to `extractFromMeta`.
   * @param {*} opts.html - Markup prepended to the document's first element
   *   when no <head> is present.
   * @returns {*} The first truthy `cleanImage` result, or null when no
   *   candidate survives cleaning.
   */
  extract({ $, content, metaCache, html }) {
    // Outside the browser, a document with no <head> gets the raw html
    // prepended to its first element before any lookup runs.
    // NOTE(review): presumably this makes the meta tags in `html` queryable;
    // confirm against the callers.
    if (!$.browser && $('head').length === 0) {
      const $firstNode = $('*').first();
      $firstNode.prepend(html);
    }

    // Meta tags go first: Open Graph / Twitter card images are commonly the
    // large, intended lead image, so a clean hit here short-circuits.
    const metaUrl = extractFromMeta(
      $,
      LEAD_IMAGE_URL_META_TAGS,
      metaCache,
      false
    );
    const cleanedMetaUrl = metaUrl ? cleanImage(metaUrl) : null;
    if (cleanedMetaUrl) return cleanedMetaUrl;

    // Next, score every <img> in the content. Analysing attributes, ancestry
    // and position avoids having to fetch each image to learn its dimensions.
    const imgs = $('img', $(content)).toArray();
    const scoreBySrc = {};
    imgs.forEach((img, position) => {
      const $img = $(img);
      const src = $img.attr('src');
      if (src) {
        // Keyed by src, so a later <img> with the same src replaces the
        // score recorded for an earlier one.
        scoreBySrc[src] =
          scoreImageUrl(src) +
          scoreAttr($img) +
          scoreByParents($img) +
          scoreBySibling($img) +
          scoreByDimensions($img) +
          scoreByPosition(imgs, position);
      }
    });

    // Keep the first key with the strictly highest score; the zero baseline
    // means only positively scored images can win.
    let bestSrc = null;
    let bestScore = 0;
    Reflect.ownKeys(scoreBySrc).forEach(candidate => {
      const candidateScore = scoreBySrc[candidate];
      if (candidateScore > bestScore) {
        bestSrc = candidate;
        bestScore = candidateScore;
      }
    });

    if (bestScore > 0) {
      const cleanedBest = cleanImage(bestSrc);
      if (cleanedBest) return cleanedBest;
    }

    // Last resort: highly probable nodes elsewhere in the document, such as
    // <link rel="image_src" />. For each selector's first match, try the
    // attributes that may hold the URL, in this order.
    const urlAttributes = ['src', 'href', 'value'];
    // eslint-disable-next-line no-restricted-syntax
    for (const selector of LEAD_IMAGE_URL_SELECTORS) {
      const $node = $(selector).first();
      // eslint-disable-next-line no-restricted-syntax
      for (const attribute of urlAttributes) {
        const rawUrl = $node.attr(attribute);
        if (rawUrl) {
          const cleanedUrl = cleanImage(rawUrl);
          if (cleanedUrl) return cleanedUrl;
        }
      }
    }

    return null;
  },
};
export default GenericLeadImageUrlExtractor;