Skip to content
This repository has been archived by the owner on Sep 5, 2023. It is now read-only.

Commit

Permalink
Browse files: browse the repository at this point in the history
better popularWords support
  • Loading branch information
squallstar authored and Andrew committed Oct 20, 2014
1 parent 4c5caf8 commit 856d51e
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 1 deletion.
1 change: 1 addition & 0 deletions resources/text/stopwords-en.txt
Expand Up @@ -225,6 +225,7 @@ into
inward
is
isn't
isnt
it
it'd
it'll
Expand Down
7 changes: 7 additions & 0 deletions src/Crawler.php
Expand Up @@ -55,6 +55,8 @@ public function crawl($crawlCandidate) {

$article->setTopNode($extractor->calculateBestNodeBasedOnClustering($article));

$txt = $article->getTitle() . $article->getMetaDescription();

if ($article->getTopNode()) {
$article->setMovies($extractor->extractVideos($article->getTopNode()));
$article->setLinks($extractor->extractLinks($article->getTopNode()));
Expand All @@ -72,8 +74,12 @@ public function crawl($crawlCandidate) {
$article->setTopNode($extractor->postExtractionCleanup($article->getTopNode()));
$article->setCleanedArticleText($outputFormatter->getFormattedText($article->getTopNode()));
$article->setHtmlArticle($outputFormatter->cleanupHtml($article->getTopNode()));

$txt .= $article->getCleanedArticleText();
}

$article->setPopularWords($extractor->getPopularWords($txt));

return $article;
}

Expand Down Expand Up @@ -138,3 +144,4 @@ private function getExtractor() {
return $this->config->getContentExtractor();
}
}

2 changes: 1 addition & 1 deletion src/Extractors/ContentExtractor.php
Expand Up @@ -636,7 +636,7 @@ private function getBaselineScoreForSiblings($topNode) {

public function getPopularWords($cleanedText, $limit = 5)
{
$minFrequency = 2;
$minFrequency = 1;

$string = trim(preg_replace('/ss+/i', '', $cleanedText));
$string = preg_replace('/[^a-zA-Z -]/', '', $string); // only take alphabet characters, but keep the spaces and dashes too
Expand Down

0 comments on commit 856d51e

Please sign in to comment.