Skip to content

Commit

Permalink
fix(various): fix formatting/lints - etc
Browse files (browse the repository at this point in the history)
  • Loading branch information
collinwu committed Apr 10, 2024
1 parent b63aa89 commit 23b44c4
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 33 deletions.
6 changes: 3 additions & 3 deletions src/cleaner.ts
Expand Up @@ -64,7 +64,7 @@ function cleanErrantLineBreaks(doc: cheerio.Root) {
const contentsNode = doc(cElement);
if (contentsNode && contentsNode[0] && contentsNode[0].type === "text") {
contentsNode.replaceWith(
contentsNode.text().replace(/([^\n])\n([^\n])/g, "$1 $2")
contentsNode.text().replace(/([^\n])\n([^\n])/g, "$1 $2"),
);
}
});
Expand Down Expand Up @@ -135,7 +135,7 @@ function divToPara(doc: cheerio.Root, domType: string) {

function getReplacementNodes(
doc: cheerio.Root,
div: cheerio.Cheerio
div: cheerio.Cheerio,
): cheerio.Cheerio[] {
let replacementText: string[] = [];
const nodesToReturn: cheerio.Cheerio[] = [];
Expand Down Expand Up @@ -269,7 +269,7 @@ function removeScriptsStyles(doc: cheerio.Root): cheerio.Cheerio {

function replaceWithPara(
doc: cheerio.Root,
div: cheerio.Element
div: cheerio.Element,
): cheerio.Root {
const divContent = doc(div).html();
doc(div).replaceWith(`<p>${divContent}</p>`);
Expand Down
54 changes: 27 additions & 27 deletions src/extractor.ts
Expand Up @@ -33,7 +33,7 @@ export interface Extractor {
links: (
doc: cheerio.Root,
topNode: cheerio.Cheerio,
lang: string
lang: string,
) => LinkObj[];
locale: (doc: cheerio.Root) => string;
publisher: (doc: cheerio.Root) => string;
Expand All @@ -49,7 +49,7 @@ export interface Extractor {
function addSiblings(
doc: cheerio.Root,
topNode: cheerio.Cheerio,
lang: string
lang: string,
): cheerio.Cheerio {
const baselineScoreSiblingsPara = getSiblingsScore(doc, topNode, lang);
const sibs = topNode.prevAll();
Expand All @@ -60,7 +60,7 @@ function addSiblings(
doc,
lang,
currentNode,
baselineScoreSiblingsPara
baselineScoreSiblingsPara,
);

if (ps) {
Expand Down Expand Up @@ -124,7 +124,7 @@ function cleanTitle(title: string, delimiters: string[]): string {

function doesNodeListContainNode(
list: cheerio.Cheerio[],
node: cheerio.Cheerio
node: cheerio.Cheerio,
): boolean {
let contains = false;
for (let i = 0; i < list.length; i++) {
Expand All @@ -151,7 +151,7 @@ function getSiblingsContent(
doc: cheerio.Root,
lang: string,
currentSibling: cheerio.Cheerio,
baselineScoreSiblingsPara: number
baselineScoreSiblingsPara: number,
) {
if (
currentSibling.get(0).tagName === "p" &&
Expand Down Expand Up @@ -188,7 +188,7 @@ function getSiblingsContent(
function getSiblingsScore(
doc: cheerio.Root,
topNode: cheerio.Cheerio,
lang: string
lang: string,
): number {
const nodesToCheck = topNode.find("p");
let base = 100000;
Expand Down Expand Up @@ -249,7 +249,7 @@ function isAbsoluteUrl(url: string): boolean {
function isBoostable(
doc: cheerio.Root,
node: cheerio.Cheerio,
lang: string
lang: string,
): boolean {
const minimumStopWordCount = 5;
const maxStepsAwayFromNode = 3;
Expand Down Expand Up @@ -312,14 +312,14 @@ function isHighLinkDensity(doc: cheerio.Root, node: cheerio.Cheerio): boolean {
function isNodeScoreThresholdMet(
_doc: cheerio.Root,
node: cheerio.Cheerio,
e: cheerio.Cheerio
e: cheerio.Cheerio,
): boolean {
const topNodeScore = getScore(node);
const currentNodeScore = getScore(e);
const thresholdScore = topNodeScore * 0.08;

const elIsTdUlOlOrBlockQ = ["td", "ul", "ol", "blockquote"].includes(
e.get(0).tagName
e.get(0).tagName,
);
if (currentNodeScore < thresholdScore && !elIsTdUlOlOrBlockQ) {
return false;
Expand Down Expand Up @@ -356,7 +356,7 @@ function isValidDate(d: string): boolean {
function postCleanup(
doc: cheerio.Root,
targetNode: cheerio.Cheerio,
lang: string
lang: string,
): cheerio.Cheerio {
const node = addSiblings(doc, targetNode, lang);

Expand Down Expand Up @@ -423,7 +423,7 @@ function updateScore(node: cheerio.Cheerio, addToScore: number): void {
const extractor: Extractor = {
author: (doc: cheerio.Root): string[] => {
const authorCandidates = doc(
"meta[property='article:author'], meta[property='og:article:author'], meta[name='author'], meta[name='dcterms.creator'], meta[name='DC.creator'], meta[name='DC.Creator'], meta[name='dc.creator'], meta[name='creator']"
"meta[property='article:author'], meta[property='og:article:author'], meta[name='author'], meta[name='dcterms.creator'], meta[name='DC.creator'], meta[name='DC.Creator'], meta[name='dc.creator'], meta[name='creator']",
);

const authorList = [];
Expand Down Expand Up @@ -552,7 +552,7 @@ const extractor: Extractor = {
// if it gets to the end without one of these links or meta tags, return the original url as canonical
canonicalLink: (doc: cheerio.Root, resourceUrl: string): string => {
const canonicalLinkTag = doc(
"link[rel='canonical'], meta[property='og:url']"
"link[rel='canonical'], meta[property='og:url']",
);
if (canonicalLinkTag) {
const resourceUrlObj = new URL(resourceUrl);
Expand All @@ -562,7 +562,7 @@ const extractor: Extractor = {
canonicalLinkTag.get(0).tagName === "link"
) {
const cleanedCanonicalLink = cleanNull(
canonicalLinkTag.first().attr("href")
canonicalLinkTag.first().attr("href"),
);
// check if link is a relative url, if so, append origin
if (!isAbsoluteUrl(cleanedCanonicalLink)) {
Expand All @@ -579,7 +579,7 @@ const extractor: Extractor = {
if (urlProtocol === "https:") {
cleanedCanonicalMeta = cleanedCanonicalMeta.replace(
/^http:\/\//i,
"https://"
"https://",
);
return cleanedCanonicalMeta;
}
Expand All @@ -591,7 +591,7 @@ const extractor: Extractor = {
},
copyright: (doc: cheerio.Root): string => {
const copyrightCandidates = doc(
"p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']"
"p[class*='copyright'], div[class*='copyright'], span[class*='copyright'], li[class*='copyright'], p[id*='copyright'], div[id*='copyright'], span[id*='copyright'], li[id*='copyright']",
);
let text = copyrightCandidates?.first()?.text();
if (!text) {
Expand Down Expand Up @@ -632,17 +632,17 @@ const extractor: Extractor = {
time, \
span[class*='date'], \
p[class*='date'], \
div[class*='date']"
div[class*='date']",
);

let dateToReturn = "";

if (dateCandidates) {
const dateContentCandidate = cleanNull(
dateCandidates.first().attr("content")
dateCandidates.first().attr("content"),
);
const dateTimeCandidate = cleanNull(
dateCandidates.first().attr("datetime")
dateCandidates.first().attr("datetime"),
);
const dateTextCandidate = cleanText(dateCandidates.first().text());

Expand All @@ -669,11 +669,11 @@ const extractor: Extractor = {
},
description: (doc: cheerio.Root): string => {
const descriptionTag = doc(
"meta[name=description], meta[property='og:description']"
"meta[name=description], meta[property='og:description']",
);
if (descriptionTag) {
const cleanedDescription = cleanNull(
descriptionTag.first().attr("content")
descriptionTag.first().attr("content"),
);
if (cleanedDescription) {
return replaceCharacters(cleanedDescription.trim(), false, true);
Expand All @@ -685,7 +685,7 @@ const extractor: Extractor = {
const tag = doc("link").filter(
(_index, el) =>
doc(el).attr("rel")?.toLowerCase() === "shortcut icon" ||
doc(el).attr("rel")?.toLowerCase() === "icon"
doc(el).attr("rel")?.toLowerCase() === "icon",
);
const faviconLink = tag.attr("href") || "";
// ensure the url returned from favicon is absolute url
Expand All @@ -697,7 +697,7 @@ const extractor: Extractor = {
},
image: (doc: cheerio.Root): string => {
const images = doc(
"meta[property='og:image'], meta[property='og:image:url'], meta[itemprop=image], meta[name='twitter:image:src'], meta[name='twitter:image'], meta[name='twitter:image0']"
"meta[property='og:image'], meta[property='og:image:url'], meta[itemprop=image], meta[name='twitter:image:src'], meta[name='twitter:image'], meta[name='twitter:image0']",
);

if (images.length > 0 && cleanNull(images.first().attr("content"))) {
Expand Down Expand Up @@ -753,7 +753,7 @@ const extractor: Extractor = {
links: (
doc: cheerio.Root,
topNode: cheerio.Cheerio,
lang: string
lang: string,
): LinkObj[] => {
const links: LinkObj[] = [];

Expand Down Expand Up @@ -789,11 +789,11 @@ const extractor: Extractor = {
},
publisher: (doc: cheerio.Root): string => {
const publisherCandidates = doc(
"meta[property='og:site_name'], meta[itemprop=name], meta[name='dc.publisher'], meta[name='DC.publisher'], meta[name='DC.Publisher']"
"meta[property='og:site_name'], meta[itemprop=name], meta[name='dc.publisher'], meta[name='DC.publisher'], meta[name='DC.Publisher']",
);
if (publisherCandidates) {
const cleanedPublisher = cleanNull(
publisherCandidates.first().attr("content")
publisherCandidates.first().attr("content"),
);
if (cleanedPublisher) {
return cleanedPublisher.trim();
Expand All @@ -803,7 +803,7 @@ const extractor: Extractor = {
},
siteName: (doc: cheerio.Root): string => {
const siteNameTag = doc(
"meta[property='og:site_name'], meta[itemprop=name]"
"meta[property='og:site_name'], meta[itemprop=name]",
);
if (siteNameTag) {
const cleanedSiteName = cleanNull(siteNameTag.first().attr("content"));
Expand All @@ -822,7 +822,7 @@ const extractor: Extractor = {
let elements = doc("a[rel='tag']");
if (elements.length === 0) {
elements = doc(
"a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']"
"a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'], a[href*='?keyword=']",
);
if (elements.length === 0) {
return [];
Expand Down
6 changes: 3 additions & 3 deletions src/index.ts
Expand Up @@ -58,7 +58,7 @@ export interface LazyExtractor {
const siteMetadataExtractor = (
markup: string,
resourceUrl: string,
lang = "en"
lang = "en",
): PageData => {
const resourceUrlObj = new URL(resourceUrl);
const doc = cheerio.load(markup, { xmlMode: true });
Expand Down Expand Up @@ -114,7 +114,7 @@ export default siteMetadataExtractor;
export const lazy = (
html: string,
resourceUrl: string,
language = "en"
language = "en",
): LazyExtractor => {
const resourceUrlObj = new URL(resourceUrl);
global.lazyPageData = global.lazyPageData || {};
Expand All @@ -129,7 +129,7 @@ export const lazy = (
const doc = getParsedDoc.call(global, html);
global.lazyPageData.canonicalLink = extractor.canonicalLink(
doc,
resourceUrl
resourceUrl,
);
return global.lazyPageData.canonicalLink;
},
Expand Down

0 comments on commit 23b44c4

Please sign in to comment.