Skip to content

Commit

Permalink
Basic Lists (#36)
Browse files Browse the repository at this point in the history
* Adding example

* wip

* wip

* wip

* wip

* wip

* wip
  • Loading branch information
spekulatius committed Nov 24, 2020
1 parent 47d6f8a commit 0aac528
Show file tree
Hide file tree
Showing 6 changed files with 213 additions and 0 deletions.
44 changes: 44 additions & 0 deletions src/phpscraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,50 @@ public function headings()
];
}

/**
 * Get all lists on the page
 *
 * Every `<ol>` and `<ul>` matched on the current page yields one entry with:
 *  - `type`: the tag name of the list element,
 *  - `children`: the child nodes of the list, passed through untouched,
 *  - `children_plain`: the trimmed, non-empty lines of the list's text
 *    content as a 0-based list.
 *
 * @return array
 */
public function lists()
{
    $lists = [];

    foreach ($this->current_page->filter('ol, ul') as $list) {
        // One candidate per line of text; trim() strips the markup indentation.
        $lines = array_map('trim', explode("\n", $list->textContent));

        $lists[] = [
            'type' => $list->tagName,
            'children' => $list->childNodes,
            // Compare against '' explicitly: the default array_filter() callback
            // would also drop an item reading "0". array_values() closes the key
            // gaps array_filter() leaves behind so callers get a real list.
            'children_plain' => array_values(array_filter($lines, function ($line) {
                return $line !== '';
            })),
        ];
    }

    return $lists;
}

/**
 * Get the ordered lists (`<ol>`) found on the page
 *
 * Narrows the result of lists() down to the entries whose type is `ol`.
 *
 * @return array
 */
public function orderedLists()
{
    $ordered = [];

    // Appending to a fresh array keeps the result sequentially indexed.
    foreach ($this->lists() as $entry) {
        if ($entry['type'] === 'ol') {
            $ordered[] = $entry;
        }
    }

    return $ordered;
}

/**
 * Get the unordered lists (`<ul>`) found on the page
 *
 * Narrows the result of lists() down to the entries whose type is `ul`.
 *
 * @return array
 */
public function unorderedLists()
{
    $unordered = [];

    // Appending to a fresh array keeps the result sequentially indexed.
    foreach ($this->lists() as $entry) {
        if ($entry['type'] === 'ul') {
            $unordered[] = $entry;
        }
    }

    return $unordered;
}

/**
* Get all paragraphs of the page
*
Expand Down
51 changes: 51 additions & 0 deletions tests/ListsTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<?php

namespace Tests;

class ListsTest extends BaseTest
{
    /**
     * The lists test page must expose exactly one unordered and one ordered
     * list, and the plain-text items of each must match the page content.
     *
     * @test
     */
    public function checkCountTest()
    {
        $web = new \spekulatius\phpscraper();

        /**
         * Navigate to the test page. This page contains:
         *
         * <h2>Example 1: Unordered List</h2>
         * <ul>
         *     <li>Unordered list item 1</li>
         *     <li>Unordered list item 2</li>
         *     <li>Unordered list item with <b>HTML</b></li>
         * </ul>
         *
         * <h2>Example 2: Ordered List</h2>
         * <ol>
         *     <li>Ordered list item 1</li>
         *     <li>Ordered list item 2</li>
         *     <li>Ordered list item with <i>HTML</i></li>
         * </ol>
         */
        $web->go($this->url . '/content/lists.html');

        // Check all lists are recognized (expected count first, so a failure
        // message reports expected/actual the right way round).
        $this->assertCount(2, $web->lists);
        $this->assertCount(1, $web->unorderedLists);
        $this->assertCount(1, $web->orderedLists);

        // Check the contents: inline markup is flattened to plain text.
        $this->assertSame([
            'Ordered list item 1',
            'Ordered list item 2',
            'Ordered list item with HTML',
        ], $web->orderedLists[0]['children_plain']);

        $this->assertSame([
            'Unordered list item 1',
            'Unordered list item 2',
            'Unordered list item with HTML',
        ], $web->unorderedLists[0]['children_plain']);
    }
}
24 changes: 24 additions & 0 deletions tests/resources/content/lists.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<!DOCTYPE html>
<!--
Fixture page for the list scraping tests: one unordered and one ordered list
with three items each. The last item of each list contains inline markup.
-->
<html>
<head>
<title>List Tests</title>
</head>
<body>
<h1>We are testing here!</h1>
<p>This page contains example lists.</p>

<h2>Example 1: Unordered List</h2>
<ul>
<li>Unordered list item 1</li>
<li>Unordered list item 2</li>
<li>Unordered list item with <b>HTML</b></li>
</ul>

<h2>Example 2: Ordered List</h2>
<ol>
<li>Ordered list item 1</li>
<li>Ordered list item 2</li>
<li>Ordered list item with <i>HTML</i></li>
</ol>
</body>
</html>
1 change: 1 addition & 0 deletions websites/.vuepress/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ module.exports = {
children: [
'examples/headings',
'examples/paragraphs',
'examples/lists',
'examples/outline',
'examples/extract-keywords',
'examples/scrape-images',
Expand Down
69 changes: 69 additions & 0 deletions websites/examples/lists.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
---
image: https://api.imageee.com/bold?text=PHP:%20Scraping%20Lists&bg_image=https://images.unsplash.com/photo-1542762933-ab3502717ce7
---

# Scraping Lists

Scraping lists follows the same approach as the other scraping examples:

```php
$web = new \spekulatius\phpscraper();

/**
* Navigate to the test page. This page contains:
*
* <h2>Example 1: Unordered List</h2>
* <ul>
* <li>Unordered list item 1</li>
* <li>Unordered list item 2</li>
* <li>Unordered list item with <b>HTML</b></li>
* </ul>
*
* <h2>Example 2: Ordered List</h2>
* <ol>
* <li>Ordered list item 1</li>
* <li>Ordered list item 2</li>
* <li>Ordered list item with <i>HTML</i></li>
* </ol>
*/
$web->go('https://test-pages.phpscraper.de/content/lists.html');

var_dump($web->unorderedLists);
/**
* Only unordered lists (<ul>)
*
* [
* "type" => "ul",
* "children" => ... // List of childNodes
* "children_plain" =>
* [
* "Unordered list item 1"
* "Unordered list item 2"
* "Unordered list item with HTML"
* ]
* ]
*/

var_dump($web->orderedLists);
/**
* Only ordered lists (<ol>)
*
* [
* "type" => "ol",
* "children" => ... // List of childNodes
* "children_plain" =>
* [
* "Ordered list item 1"
* "Ordered list item 2"
* "Ordered list item with HTML"
* ]
* ]
*/

// Both lists combined (as above)
var_dump($web->lists);
```

::: warning Nested Lists
At the moment, nested lists are not handled well: the text of a nested list is merged into the `children_plain` of its parent list. To allow further processing, the nested lists are included in the result as `children`.
:::
24 changes: 24 additions & 0 deletions websites/test-pages/content/lists.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<!DOCTYPE html>
<!--
Example page for list scraping: one unordered and one ordered list with three
items each. The last item of each list contains inline markup.
-->
<html>
<head>
<title>List Tests</title>
</head>
<body>
<h1>We are testing here!</h1>
<p>This page contains example lists.</p>

<h2>Example 1: Unordered List</h2>
<ul>
<li>Unordered list item 1</li>
<li>Unordered list item 2</li>
<li>Unordered list item with <b>HTML</b></li>
</ul>

<h2>Example 2: Ordered List</h2>
<ol>
<li>Ordered list item 1</li>
<li>Ordered list item 2</li>
<li>Ordered list item with <i>HTML</i></li>
</ol>
</body>
</html>

0 comments on commit 0aac528

Please sign in to comment.