Skip to content

Commit

Permalink
MINOR: keyword scores
Browse files Browse the repository at this point in the history
  • Loading branch information
spekulatius committed Aug 16, 2020
1 parent 1a36bf4 commit e91bce2
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 8 deletions.
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "spekulatius/phpscraper",
"description": "An oppinionated web to access the web. See tests/ for examples.",
"description": "An oppinionated way to access the web. See tests/ for examples.",
"keywords": [
"PHP scraper",
"PHP scraping",
Expand Down
59 changes: 54 additions & 5 deletions src/phpscraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,8 @@ public function cleanOutlineWithParagraphs()
}

/**
* gets a set of keywords based on the rake approach.
* Internal helper that collects the page's text content for the
* RAKE keyword analysis performed by the calling methods.
*
* Uses:
* - Title
Expand All @@ -580,12 +581,11 @@ public function cleanOutlineWithParagraphs()
* - Meta Title, Description and Keywords
*
* @see https://github.com/Donatello-za/rake-php-plus
* @see https://phpscraper.de/examples/keywords.html
* @see https://phpscraper.de/examples/extract-keywords.html
*
* @param string $locale (default: 'en_US')
* @return array
*/
public function contentKeywords($locale = 'en_US')
protected function prepContent()
{
// Collect content strings
$content = array_merge(
Expand Down Expand Up @@ -617,8 +617,57 @@ public function contentKeywords($locale = 'en_US')
$content[] = $image['alt'];
}

return $content;
}

/**
 * Extracts keyword phrases from the page content using the RAKE approach.
 *
 * The analysed text is assembled by prepContent() and covers:
 *  - Title
 *  - Headings
 *  - Paragraphs/Content
 *  - Link anchors and Titles
 *  - Alt Texts of Images
 *  - Meta Title, Description and Keywords
 *
 * @see https://github.com/Donatello-za/rake-php-plus
 * @see https://phpscraper.de/examples/extract-keywords.html
 *
 * @param string $locale language/locale handed to RakePlus (default: 'en_US')
 * @return array the extracted keyword phrases, sorted ascending
 */
public function contentKeywords($locale = 'en_US')
{
    // Flatten the collected content strings into one text for the analysis
    $text = implode(' ', $this->prepContent());

    // Run the RAKE extraction for the requested locale
    $rake = RakePlus::create($text, $locale);

    // Hand back the phrases in ascending order
    return $rake->sort('asc')->get();
}

/**
 * Gets a set of keywords with their scores based on the rake approach.
 *
 * Uses:
 *  - Title
 *  - Headings
 *  - Paragraphs/Content
 *  - Link anchors and Titles
 *  - Alt Texts of Images
 *  - Meta Title, Description and Keywords
 *
 * @see https://github.com/Donatello-za/rake-php-plus
 * @see https://phpscraper.de/examples/extract-keywords.html
 *
 * @param string $locale language/locale handed to RakePlus (default: 'en_US')
 * @return array keyword phrase => score, ordered by score (highest first)
 */
public function contentKeywordsWithScores($locale = 'en_US')
{
    // Build the text from prepContent() (there is no local $content in this
    // method), then return the phrases with their scores, highest score first.
    return RakePlus::create(join(' ', $this->prepContent()), $locale)
        ->sortByScore('desc')
        ->scores();
}

/**
Expand Down
42 changes: 41 additions & 1 deletion tests/KeywordTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ final class KeywordTest extends TestCase
/**
* @test
*/
public function testExtractionExamples()
public function testKeywordExtraction()
{
$web = new \spekulatius\phpscraper();

Expand Down Expand Up @@ -44,4 +44,44 @@ public function testExtractionExamples()
$this->assertTrue(in_array($keyword, $keywords));
}
}

/**
 * Verifies that the scored keyword extraction returns the expected
 * score for a selection of phrases on the keyword test page.
 *
 * @test
 */
public function testKeywordExtractionWithScores()
{
    $web = new \spekulatius\phpscraper();

    // Navigate to the test page.
    // It contains 3 paragraphs from the English Wikipedia article for "lorem ipsum"
    $web->go('https://test-pages.phpscraper.de/content/keywords.html');

    // Fetch the keywords together with their scores (phrase => score).
    // NOTE(review): accessed as a property, as elsewhere in this test file -
    // presumably resolved to the method of the same name; confirm.
    $keywords = $web->contentKeywordsWithScores;

    // a selected list of keywords to expect
    $shouldKeywords = [
        '1960s' => 1.0,
        'added' => 1.0,
        'adopted lorem ipsum' => 11.0,
        'advertisements' => 1.0,
        'aldus employed' => 4.0,
        'corrupted version' => 4.0,
        'graphic' => 1.0,
        'improper latin' => 4.0,
        'introduced' => 1.0,
        'keyword extraction tests' => 9.0,
        'test' => 1.0,
        'microsoft word' => 5.3333333333333,
        'english wikipedia' => 4.0,
        'lorem ipsum' => 8.0,
        'lorem ipsum text' => 11.0,
    ];

    // check if all are part of the output with the expected score
    foreach ($shouldKeywords as $keyword => $score) {
        // Fail with a readable message instead of an undefined-index notice
        $this->assertArrayHasKey($keyword, $keywords);

        // Expected value goes first. Scores are floats, and a truncated
        // literal such as 5.3333333333333 is not reliably identical to the
        // computed value, so compare within a small tolerance.
        $this->assertEqualsWithDelta(
            $score,
            $keywords[$keyword],
            0.000000001,
            sprintf('Unexpected score for keyword "%s".', $keyword)
        );
    }
}
}
51 changes: 50 additions & 1 deletion websites/examples/extract-keywords.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,53 @@ foreach ($keywords as $keyword) {

::: tip
The default language (locale) for this is `en_US`. Other languages can be passed as a parameter. This currently works only for a selection of languages. Check this [list](https://github.com/Donatello-za/rake-php-plus#currently-supported-languages) for further information.
:::
:::


## Scoring of Keywords

Not every keyword has the same weight in the ranking algorithms of search engines. A mix of several factors and SEO signals determines the weight a search engine assigns to a word. Word frequency, text length, and variations such as synonyms can all lead to different weightings.

The PHPScraper library allows you to get an indication of keyword weights in the form of scores:


```PHP
$web = new \spekulatius\phpscraper();

// Navigate to the test page.
// It contains 3 paragraphs from the English Wikipedia article for "lorem ipsum"
$web->go('https://test-pages.phpscraper.de/content/keywords.html');

// check the number of keywords.
$keywords = $web->contentKeywordsWithScores;
echo "This page contains at least " . count($keywords) . " keywords/phrases.\n\n";

// Loop through the keywords
foreach ($keywords as $keyword => $score) {
echo sprintf(" - %s (%s)\n", $keyword, $score);
}

/**
* Will print out:
*
* This page contains at least 40 keywords/phrases.
*
* [...]
* - 1960s (1.0)
* - added (1.0)
* - adopted lorem ipsum (11.0)
* - advertisements (1.0)
* - aldus employed (4.0)
* - corrupted version (4.0)
* - graphic (1.0)
* - improper latin (4.0)
* - introduced (1.0)
* - keyword extraction tests (9.0)
* - test (1.0)
* - microsoft word (5.3333333333333)
* - english wikipedia (4.0)
* - lorem ipsum (8.0)
* - lorem ipsum text (11.0)
* [...]
*/
```

0 comments on commit e91bce2

Please sign in to comment.