From 23b44c4c358c29718e6a8486faad99878bec9116 Mon Sep 17 00:00:00 2001 From: Collin Wu Date: Wed, 10 Apr 2024 05:37:04 -0700 Subject: [PATCH] fix(various): fix formatting/lints - etc --- src/cleaner.ts | 6 +++--- src/extractor.ts | 54 ++++++++++++++++++++++++------------------------ src/index.ts | 6 +++--- 3 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/cleaner.ts b/src/cleaner.ts index 2d519be..4040ab5 100644 --- a/src/cleaner.ts +++ b/src/cleaner.ts @@ -64,7 +64,7 @@ function cleanErrantLineBreaks(doc: cheerio.Root) { const contentsNode = doc(cElement); if (contentsNode && contentsNode[0] && contentsNode[0].type === "text") { contentsNode.replaceWith( - contentsNode.text().replace(/([^\n])\n([^\n])/g, "$1 $2") + contentsNode.text().replace(/([^\n])\n([^\n])/g, "$1 $2"), ); } }); @@ -135,7 +135,7 @@ function divToPara(doc: cheerio.Root, domType: string) { function getReplacementNodes( doc: cheerio.Root, - div: cheerio.Cheerio + div: cheerio.Cheerio, ): cheerio.Cheerio[] { let replacementText: string[] = []; const nodesToReturn: cheerio.Cheerio[] = []; @@ -269,7 +269,7 @@ function removeScriptsStyles(doc: cheerio.Root): cheerio.Cheerio { function replaceWithPara( doc: cheerio.Root, - div: cheerio.Element + div: cheerio.Element, ): cheerio.Root { const divContent = doc(div).html(); doc(div).replaceWith(`

<p>${divContent}</p>

`); diff --git a/src/extractor.ts b/src/extractor.ts index 8de2875..2445166 100644 --- a/src/extractor.ts +++ b/src/extractor.ts @@ -33,7 +33,7 @@ export interface Extractor { links: ( doc: cheerio.Root, topNode: cheerio.Cheerio, - lang: string + lang: string, ) => LinkObj[]; locale: (doc: cheerio.Root) => string; publisher: (doc: cheerio.Root) => string; @@ -49,7 +49,7 @@ export interface Extractor { function addSiblings( doc: cheerio.Root, topNode: cheerio.Cheerio, - lang: string + lang: string, ): cheerio.Cheerio { const baselineScoreSiblingsPara = getSiblingsScore(doc, topNode, lang); const sibs = topNode.prevAll(); @@ -60,7 +60,7 @@ function addSiblings( doc, lang, currentNode, - baselineScoreSiblingsPara + baselineScoreSiblingsPara, ); if (ps) { @@ -124,7 +124,7 @@ function cleanTitle(title: string, delimiters: string[]): string { function doesNodeListContainNode( list: cheerio.Cheerio[], - node: cheerio.Cheerio + node: cheerio.Cheerio, ): boolean { let contains = false; for (let i = 0; i < list.length; i++) { @@ -151,7 +151,7 @@ function getSiblingsContent( doc: cheerio.Root, lang: string, currentSibling: cheerio.Cheerio, - baselineScoreSiblingsPara: number + baselineScoreSiblingsPara: number, ) { if ( currentSibling.get(0).tagName === "p" && @@ -188,7 +188,7 @@ function getSiblingsContent( function getSiblingsScore( doc: cheerio.Root, topNode: cheerio.Cheerio, - lang: string + lang: string, ): number { const nodesToCheck = topNode.find("p"); let base = 100000; @@ -249,7 +249,7 @@ function isAbsoluteUrl(url: string): boolean { function isBoostable( doc: cheerio.Root, node: cheerio.Cheerio, - lang: string + lang: string, ): boolean { const minimumStopWordCount = 5; const maxStepsAwayFromNode = 3; @@ -312,14 +312,14 @@ function isHighLinkDensity(doc: cheerio.Root, node: cheerio.Cheerio): boolean { function isNodeScoreThresholdMet( _doc: cheerio.Root, node: cheerio.Cheerio, - e: cheerio.Cheerio + e: cheerio.Cheerio, ): boolean { const topNodeScore = 
getScore(node); const currentNodeScore = getScore(e); const thresholdScore = topNodeScore * 0.08; const elIsTdUlOlOrBlockQ = ["td", "ul", "ol", "blockquote"].includes( - e.get(0).tagName + e.get(0).tagName, ); if (currentNodeScore < thresholdScore && !elIsTdUlOlOrBlockQ) { return false; @@ -356,7 +356,7 @@ function isValidDate(d: string): boolean { function postCleanup( doc: cheerio.Root, targetNode: cheerio.Cheerio, - lang: string + lang: string, ): cheerio.Cheerio { const node = addSiblings(doc, targetNode, lang); @@ -423,7 +423,7 @@ function updateScore(node: cheerio.Cheerio, addToScore: number): void { const extractor: Extractor = { author: (doc: cheerio.Root): string[] => { const authorCandidates = doc( - "meta[property='article:author'], meta[property='og:article:author'], meta[name='author'], meta[name='dcterms.creator'], meta[name='DC.creator'], meta[name='DC.Creator'], meta[name='dc.creator'], meta[name='creator']" + "meta[property='article:author'], meta[property='og:article:author'], meta[name='author'], meta[name='dcterms.creator'], meta[name='DC.creator'], meta[name='DC.Creator'], meta[name='dc.creator'], meta[name='creator']", ); const authorList = []; @@ -552,7 +552,7 @@ const extractor: Extractor = { // if it gets to the end without one of these links or meta tags, return the original url as canonical canonicalLink: (doc: cheerio.Root, resourceUrl: string): string => { const canonicalLinkTag = doc( - "link[rel='canonical'], meta[property='og:url']" + "link[rel='canonical'], meta[property='og:url']", ); if (canonicalLinkTag) { const resourceUrlObj = new URL(resourceUrl); @@ -562,7 +562,7 @@ const extractor: Extractor = { canonicalLinkTag.get(0).tagName === "link" ) { const cleanedCanonicalLink = cleanNull( - canonicalLinkTag.first().attr("href") + canonicalLinkTag.first().attr("href"), ); // check if link is a relative url, if so, append origin if (!isAbsoluteUrl(cleanedCanonicalLink)) { @@ -579,7 +579,7 @@ const extractor: Extractor = { if 
(urlProtocol === "https:") { cleanedCanonicalMeta = cleanedCanonicalMeta.replace( /^http:\/\//i, - "https://" + "https://", ); return cleanedCanonicalMeta; } @@ -591,7 +591,7 @@ const extractor: Extractor = { }, copyright: (doc: cheerio.Root): string => { const copyrightCandidates = doc( - "p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']" + "p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']", ); let text = copyrightCandidates?.first()?.text(); if (!text) { @@ -632,17 +632,17 @@ const extractor: Extractor = { time, \ span[class*='date'], \ p[class*='date'], \ - div[class*='date']" + div[class*='date']", ); let dateToReturn = ""; if (dateCandidates) { const dateContentCandidate = cleanNull( - dateCandidates.first().attr("content") + dateCandidates.first().attr("content"), ); const dateTimeCandidate = cleanNull( - dateCandidates.first().attr("datetime") + dateCandidates.first().attr("datetime"), ); const dateTextCandidate = cleanText(dateCandidates.first().text()); @@ -669,11 +669,11 @@ const extractor: Extractor = { }, description: (doc: cheerio.Root): string => { const descriptionTag = doc( - "meta[name=description], meta[property='og:description']" + "meta[name=description], meta[property='og:description']", ); if (descriptionTag) { const cleanedDescription = cleanNull( - descriptionTag.first().attr("content") + descriptionTag.first().attr("content"), ); if (cleanedDescription) { return replaceCharacters(cleanedDescription.trim(), false, true); @@ -685,7 +685,7 @@ const extractor: Extractor = { const tag = doc("link").filter( (_index, el) => doc(el).attr("rel")?.toLowerCase() === "shortcut icon" || - doc(el).attr("rel")?.toLowerCase() === "icon" + doc(el).attr("rel")?.toLowerCase() === "icon", ); const 
faviconLink = tag.attr("href") || ""; // ensure the url returned from favicon is absolute url @@ -697,7 +697,7 @@ const extractor: Extractor = { }, image: (doc: cheerio.Root): string => { const images = doc( - "meta[property='og:image'], meta[property='og:image:url'], meta[itemprop=image], meta[name='twitter:image:src'], meta[name='twitter:image'], meta[name='twitter:image0']" + "meta[property='og:image'], meta[property='og:image:url'], meta[itemprop=image], meta[name='twitter:image:src'], meta[name='twitter:image'], meta[name='twitter:image0']", ); if (images.length > 0 && cleanNull(images.first().attr("content"))) { @@ -753,7 +753,7 @@ const extractor: Extractor = { links: ( doc: cheerio.Root, topNode: cheerio.Cheerio, - lang: string + lang: string, ): LinkObj[] => { const links: LinkObj[] = []; @@ -789,11 +789,11 @@ const extractor: Extractor = { }, publisher: (doc: cheerio.Root): string => { const publisherCandidates = doc( - "meta[property='og:site_name'], meta[itemprop=name], meta[name='dc.publisher'], meta[name='DC.publisher'], meta[name='DC.Publisher']" + "meta[property='og:site_name'], meta[itemprop=name], meta[name='dc.publisher'], meta[name='DC.publisher'], meta[name='DC.Publisher']", ); if (publisherCandidates) { const cleanedPublisher = cleanNull( - publisherCandidates.first().attr("content") + publisherCandidates.first().attr("content"), ); if (cleanedPublisher) { return cleanedPublisher.trim(); @@ -803,7 +803,7 @@ const extractor: Extractor = { }, siteName: (doc: cheerio.Root): string => { const siteNameTag = doc( - "meta[property='og:site_name'], meta[itemprop=name]" + "meta[property='og:site_name'], meta[itemprop=name]", ); if (siteNameTag) { const cleanedSiteName = cleanNull(siteNameTag.first().attr("content")); @@ -822,7 +822,7 @@ const extractor: Extractor = { let elements = doc("a[rel='tag']"); if (elements.length === 0) { elements = doc( - "a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']" + "a[href*='/tag/'], 
a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']", ); if (elements.length === 0) { return []; diff --git a/src/index.ts b/src/index.ts index f61614a..64a5ab3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -58,7 +58,7 @@ export interface LazyExtractor { const siteMetadataExtractor = ( markup: string, resourceUrl: string, - lang = "en" + lang = "en", ): PageData => { const resourceUrlObj = new URL(resourceUrl); const doc = cheerio.load(markup, { xmlMode: true }); @@ -114,7 +114,7 @@ export default siteMetadataExtractor; export const lazy = ( html: string, resourceUrl: string, - language = "en" + language = "en", ): LazyExtractor => { const resourceUrlObj = new URL(resourceUrl); global.lazyPageData = global.lazyPageData || {}; @@ -129,7 +129,7 @@ export const lazy = ( const doc = getParsedDoc.call(global, html); global.lazyPageData.canonicalLink = extractor.canonicalLink( doc, - resourceUrl + resourceUrl, ); return global.lazyPageData.canonicalLink; },