Skip to content

Commit

Permalink
feat: update nytimes extractor (#506)
Browse files Browse the repository at this point in the history
* feat: update custom extractor for nytimes.com
  • Loading branch information
WajeehZantout committed Oct 17, 2019
1 parent 3fb8526 commit b0e708a
Show file tree
Hide file tree
Showing 8 changed files with 1,170 additions and 87 deletions.
2 changes: 0 additions & 2 deletions fixtures/www.nytimes.com/1474061823854.html

This file was deleted.

1 change: 0 additions & 1 deletion fixtures/www.nytimes.com/1474318141888.html

This file was deleted.

69 changes: 0 additions & 69 deletions fixtures/www.nytimes.com/1539194812689.html

This file was deleted.

1,016 changes: 1,016 additions & 0 deletions fixtures/www.nytimes.com/1571223287888.html

Large diffs are not rendered by default.

75 changes: 75 additions & 0 deletions fixtures/www.nytimes.com/1571223477873.html

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions fixtures/www.nytimes.com/1571224616991.html

Large diffs are not rendered by default.

16 changes: 13 additions & 3 deletions src/extractors/custom/www.nytimes.com/index.js
Expand Up @@ -2,15 +2,25 @@ export const NYTimesExtractor = {
domain: 'www.nytimes.com',

title: {
- selectors: ['h1.g-headline', 'h1[itemprop="headline"]', 'h1.headline'],
+ selectors: [
+ 'h1.g-headline',
+ 'h1[itemprop="headline"]',
+ 'h1.headline',
+ 'h1 .balancedHeadline',
+ ],
},

author: {
- selectors: [['meta[name="author"]', 'value'], '.g-byline', '.byline'],
+ selectors: [
+ ['meta[name="author"]', 'value'],
+ '.g-byline',
+ '.byline',
+ ['meta[name="byl"]', 'value'],
+ ],
},

content: {
- selectors: ['div.g-blocks', 'article#story'],
+ selectors: ['div.g-blocks', 'section[name="articleBody"]', 'article#story'],

transforms: {
'img.g-lazy': $node => {
Expand Down
28 changes: 16 additions & 12 deletions src/extractors/custom/www.nytimes.com/index.test.js
Expand Up @@ -14,9 +14,9 @@ describe('NYTimesExtractor', () => {
let url;
beforeAll(() => {
url =
- 'http://www.nytimes.com/interactive/2016/09/15/arts/design/national-museum-of-african-american-history-and-culture.html';
+ 'https://www.nytimes.com/2016/09/20/nyregion/nyc-nj-explosions-ahmad-khan-rahami.html';
const html = fs.readFileSync(
- './fixtures/www.nytimes.com/1474318141888.html'
+ './fixtures/www.nytimes.com/1571224616991.html'
);
result = Mercury.parse(url, { html, fallback: false });
});
Expand Down Expand Up @@ -76,7 +76,7 @@ describe('NYTimesExtractor', () => {
// the article.
assert.equal(
lead_image_url,
- 'https://static01.nyt.com/images/2016/09/20/nyregion/20MANHUNT1/20MANHUNT1-facebookJumbo.jpg'
+ 'https://static01.nyt.com/images/2016/09/20/nyregion/Manhunt/Manhunt-facebookJumbo-v2.jpg'
);
});

Expand All @@ -100,34 +100,38 @@ describe('NYTimesExtractor', () => {
// the article.
assert.equal(
first13,
- 'The man believed to be responsible for the explosion in Manhattan on Saturday'
+ 'The man who the police said sowed terror across two states, setting off'
);
});
});

it('works with a feature story', async () => {
const html = fs.readFileSync(
- './fixtures/www.nytimes.com/1474061823854.html'
+ './fixtures/www.nytimes.com/1571223287888.html'
);
const uri =
'http://www.nytimes.com/interactive/2016/09/15/arts/design/national-museum-of-african-american-history-and-culture.html';

const { content, title, author } = await Mercury.parse(uri, { html });
const $ = cheerio.load(content);
- const text = $('*')
- .first()
- .text()
- .trim()
- .slice(0, 20);
+ const text = excerptContent(
+ $('*')
+ .first()
+ .text(),
+ 13
+ );

assert.equal(title, 'I, Too, Sing America');
assert.equal(author, 'The New York Times');
- assert.equal(text, 'T he Smithsonian’s N');
+ assert.equal(
+ text,
+ 'T he Smithsonian’s National Museum of African American History and Culture opens on'
+ );
});

it('returns the title on most recent articles', async () => {
const html = fs.readFileSync(
- './fixtures/www.nytimes.com/1539194812689.html'
+ './fixtures/www.nytimes.com/1571223477873.html'
);
const uri =
'https://www.nytimes.com/2018/10/09/us/politics/nikki-haley-united-nations.html';
Expand Down

0 comments on commit b0e708a

Please sign in to comment.