Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 66 additions & 108 deletions dist/mercury.js
Original file line number Diff line number Diff line change
Expand Up @@ -2227,41 +2227,29 @@ var LittleThingsExtractor = {
excerpt: null
};

// Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
var PoliticoExtractor = {
domain: 'www.politico.com',
title: {
selectors: [// enter title selectors
['meta[name="og:title"]', 'value']]
selectors: [['meta[name="og:title"]', 'value']]
},
author: {
selectors: ['.story-main-content .byline .vcard']
selectors: [['div[itemprop="author"] meta[itemprop="name"]', 'value'], '.story-meta__authors .vcard', '.story-main-content .byline .vcard']
},
content: {
selectors: [// enter content selectors
'.story-main-content', '.content-group', '.story-core', '.story-text'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
selectors: [['.story-text'], '.story-main-content', '.story-core'],
transforms: [],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['figcaption']
clean: ['figcaption', '.story-meta', '.ad']
},
date_published: {
selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']]
selectors: [['time[itemprop="datePublished"]', 'datetime'], ['.story-meta__details time[datetime]', 'datetime'], ['.story-main-content .timestamp time[datetime]', 'datetime']],
timezone: 'America/New_York'
},
lead_image_url: {
selectors: [// enter lead_image_url selectors
['meta[name="og:image"]', 'value']]
selectors: [['meta[name="og:image"]', 'value']]
},
dek: {
selectors: []
},
next_page_url: null,
excerpt: null
selectors: [['meta[name="og:description"]', 'value']]
}
};

var DeadspinExtractor = {
Expand Down Expand Up @@ -3980,33 +3968,6 @@ var WwwCnetComExtractor = {
}
};

/**
 * Custom extractor definition for www.cinemablend.com.
 *
 * Every field carries the list of selectors used to locate that piece of
 * the article. Two-element entries look like [selector, attribute] pairs --
 * TODO confirm against the extraction engine, which is not visible here.
 */
var WwwCinemablendComExtractor = {
  domain: 'www.cinemablend.com',

  title: {
    selectors: [
      '.story_title'
    ]
  },

  author: {
    selectors: [
      '.author'
    ]
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value']
    ],
    timezone: 'EST'
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  content: {
    selectors: [
      'div#wrap_left_content'
    ],

    // Hook for reshaping selected content before it is consumable
    // (e.g. unusual lazy-loaded images). Nothing is configured here.
    transforms: {},

    // Anything matching these selectors is removed from the result.
    // The list is empty for this site.
    clean: []
  }
};

var WwwTodayComExtractor = {
domain: 'www.today.com',
title: {
Expand All @@ -4033,33 +3994,6 @@ var WwwTodayComExtractor = {
}
};

/**
 * Custom extractor definition for www.howtogeek.com.
 *
 * Every field carries the list of selectors used to locate that piece of
 * the article. Two-element entries look like [selector, attribute] pairs --
 * TODO confirm against the extraction engine, which is not visible here.
 */
var WwwHowtogeekComExtractor = {
  domain: 'www.howtogeek.com',

  title: {
    selectors: [
      'title'
    ]
  },

  author: {
    selectors: [
      '#authorinfobox a'
    ]
  },

  date_published: {
    selectors: [
      '#authorinfobox + div li'
    ],
    timezone: 'GMT'
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  content: {
    selectors: [
      '.thecontent'
    ],

    // Hook for reshaping selected content before it is consumable
    // (e.g. unusual lazy-loaded images). Nothing is configured here.
    transforms: {},

    // Anything matching these selectors is removed from the result.
    // The list is empty for this site.
    clean: []
  }
};

var WwwAlComExtractor = {
domain: 'www.al.com',
title: {
Expand Down Expand Up @@ -4286,33 +4220,6 @@ var ThoughtcatalogComExtractor = {
}
};

/**
 * Custom extractor definition for www.nj.com.
 *
 * Title, author, publication date and lead image are all read from <meta>
 * selectors; the body comes from a content container selector. The
 * two-element entries look like [selector, attribute] pairs -- TODO confirm
 * against the extraction engine, which is not visible here.
 */
var WwwNjComExtractor = {
  domain: 'www.nj.com',

  title: {
    selectors: [
      ['meta[name="title"]', 'value']
    ]
  },

  author: {
    selectors: [
      ['meta[name="article_author"]', 'value']
    ]
  },

  date_published: {
    selectors: [
      ['meta[name="article_date_original"]', 'value']
    ],
    timezone: 'America/New_York'
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  content: {
    selectors: [
      '.entry-content'
    ],

    // Hook for reshaping selected content before it is consumable
    // (e.g. unusual lazy-loaded images). Nothing is configured here.
    transforms: {},

    // Anything matching these selectors is removed from the result.
    // The list is empty for this site.
    clean: []
  }
};

var WwwInquisitrComExtractor = {
domain: 'www.inquisitr.com',
title: {
Expand Down Expand Up @@ -6185,14 +6092,66 @@ var PostlightComExtractor = {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['article.body'],
selectors: ['main.post'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['section.pl-post-link', 'aside', 'section.insights_featured_case_studies']
}
};

/**
 * Custom extractor definition for www.investmentexecutive.com.
 *
 * Every field carries the list of selectors used to locate that piece of
 * the article. Two-element entries look like [selector, attribute] pairs --
 * TODO confirm against the extraction engine, which is not visible here.
 */
var WwwInvestmentexecutiveComExtractor = {
  domain: 'www.investmentexecutive.com',

  title: {
    selectors: [
      'h1'
    ]
  },

  author: {
    selectors: [
      'div[itemprop="author"]'
    ]
  },

  // No timezone is configured for this site's publication date.
  date_published: {
    selectors: [
      ['meta[itemprop="datePublished"]', 'value']
    ]
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value']
    ]
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  content: {
    selectors: [
      'section.article-body'
    ],

    // Selectors listed under `clean`; presumably matches are stripped from
    // the extracted content -- confirm against the extraction engine.
    clean: [
      '.hidden'
    ]
  }
};

var WwwCbcCaExtractor = {
domain: 'www.cbc.ca',
title: {
selectors: ['h1']
},
author: {
selectors: ['.authorText', '.bylineDetails']
},
date_published: {
selectors: [['.timeStamp[datetime]', 'datetime']]
},
dek: {
selectors: ['.deck']
},
lead_image_url: {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['.story'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['section.pl-post-link']
clean: []
}
};

Expand Down Expand Up @@ -6265,9 +6224,7 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
WwwSiComExtractor: WwwSiComExtractor,
WwwRawstoryComExtractor: WwwRawstoryComExtractor,
WwwCnetComExtractor: WwwCnetComExtractor,
WwwCinemablendComExtractor: WwwCinemablendComExtractor,
WwwTodayComExtractor: WwwTodayComExtractor,
WwwHowtogeekComExtractor: WwwHowtogeekComExtractor,
WwwAlComExtractor: WwwAlComExtractor,
WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
Expand All @@ -6276,7 +6233,6 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
ScienceflyComExtractor: ScienceflyComExtractor,
HellogigglesComExtractor: HellogigglesComExtractor,
ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
WwwNjComExtractor: WwwNjComExtractor,
WwwInquisitrComExtractor: WwwInquisitrComExtractor,
WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
FortuneComExtractor: FortuneComExtractor,
Expand Down Expand Up @@ -6343,7 +6299,9 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
ArstechnicaComExtractor: ArstechnicaComExtractor,
WwwNdtvComExtractor: WwwNdtvComExtractor,
SpektrumExtractor: SpektrumExtractor,
PostlightComExtractor: PostlightComExtractor
PostlightComExtractor: PostlightComExtractor,
WwwInvestmentexecutiveComExtractor: WwwInvestmentexecutiveComExtractor,
WwwCbcCaExtractor: WwwCbcCaExtractor
});

var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
Expand Down
2 changes: 1 addition & 1 deletion dist/mercury.web.js

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion fixtures/www.cinemablend.com/1482432215722.html

This file was deleted.

7 changes: 0 additions & 7 deletions fixtures/www.howtogeek.com/1482438125052.html

This file was deleted.

56 changes: 0 additions & 56 deletions fixtures/www.nj.com/1481666201503.html

This file was deleted.

3 changes: 0 additions & 3 deletions src/extractors/custom/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,7 @@ export * from './www.androidcentral.com';
export * from './www.si.com';
export * from './www.rawstory.com';
export * from './www.cnet.com';
export * from './www.cinemablend.com';
export * from './www.today.com';
export * from './www.howtogeek.com';
export * from './www.al.com';
export * from './www.thepennyhoarder.com';
export * from './www.westernjournalism.com';
Expand All @@ -75,7 +73,6 @@ export * from './www.americanow.com';
export * from './sciencefly.com';
export * from './hellogiggles.com';
export * from './thoughtcatalog.com';
export * from './www.nj.com';
export * from './www.inquisitr.com';
export * from './www.nbcnews.com';
export * from './fortune.com';
Expand Down
34 changes: 0 additions & 34 deletions src/extractors/custom/www.cinemablend.com/index.js

This file was deleted.

Loading