Skip to content

Commit

Permalink
Merge pull request #3 from rafaelglikis/dev
Browse files Browse the repository at this point in the history
Spider
  • Loading branch information
rafaelglikis committed Aug 16, 2018
2 parents 06c5959 + 45b9688 commit 479ac66
Show file tree
Hide file tree
Showing 3 changed files with 208 additions and 1 deletion.
46 changes: 45 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,52 @@ $crawler->filter('h3 > a')->each(function ($node) {
print trim($node->text())."\n";
});
```

Now that we have learned enough, let's scrape a site with the Sinama Spider:

```php
use Sinama\Crawler;
use Sinama\Spider as BaseSpider;

/**
 * Example spider: scrapes every article linked from a listing page and
 * queues the pagination links so the following listing pages get parsed too.
 */
class Spider extends BaseSpider
{
    /**
     * Handles one listing page.
     *
     * @param Crawler $crawler the fetched listing page
     */
    public function parse(Crawler $crawler)
    {
        // Each "read more" anchor points at an article: scrape it right away.
        $articleLinks = $crawler->filter('div.read-more > a');
        $articleLinks->each(function (Crawler $link) {
            $this->scrape($link->attr('href'));
        });

        // Pagination anchors are queued so run() parses them later.
        $pageLinks = $crawler->filter('div.blog-pagination > a');
        $pageLinks->each(function ($link) {
            $this->follow($link->attr('href'));
        });
    }

    /**
     * Fetches a single article and prints what the crawler can extract from it.
     *
     * @param $url address of the article to scrape
     */
    public function scrape($url)
    {
        echo "*************************************************** ", $url, "\n";

        $page = $this->client->request('GET', $url);

        echo "Title: ", $page->findTitle(), "\n";
        echo "Main Image: ", $page->findMainImage(), "\n";
        echo "Main Content: \n", $page->findMainContent(), "\n";

        echo "Emails: \n";
        print_r($page->findEmails());

        echo "Links: \n";
        print_r($page->findLinks());
    }

    /**
     * @return array the URLs the spider starts crawling from
     */
    public function getStartUrls(): array
    {
        return ['https://blog.scrapinghub.com'];
    }
}

// Build the spider (no client passed, so it creates its own) and crawl
// every queued URL, beginning with the ones from getStartUrls().
$spider = new Spider();
$spider->run();
```

## TODO
* Crawler::findTags()
* implement Spider


92 changes: 92 additions & 0 deletions src/Spider.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
<?php

namespace Sinama;


/**
 * Base class for spiders.
 *
 * A spider keeps a queue of URLs. run() requests each queued URL through the
 * client and hands the result to parse(); parse() may in turn call follow()
 * to append further URLs to the queue, which run() then picks up as well.
 */
abstract class Spider
{
    /**
     * URLs queued for parsing, in the order they were added.
     *
     * @var array
     */
    protected $followUrls = [];

    /**
     * Index into $followUrls of the URL most recently handed to the client.
     * run() starts from this index, so a repeated run() call continues from
     * (and re-requests) that URL instead of starting over.
     *
     * @var int
     */
    protected $lastIndex = 0;

    /**
     * Client used to perform the requests.
     *
     * @var \Sinama\Client
     */
    protected $client;

    /**
     * Spider constructor.
     *
     * Seeds the queue with the URLs returned by getStartUrls().
     *
     * @param Client|null $client client to use; a new Client is created when omitted
     */
    public function __construct(Client $client = null)
    {
        // Create the default client once, and only when none was supplied.
        $this->client = $client ?? new Client();
        $this->followUrls = $this->getStartUrls();
    }

    /**
     * Starts the spider.
     *
     * Requests every queued URL with GET and passes each result to parse().
     */
    public function run()
    {
        // count() is deliberately re-evaluated on every iteration: parse()
        // may call follow() and grow the queue while the loop is running.
        for ($i = $this->lastIndex; $i < count($this->followUrls); ++$i) {
            $this->lastIndex = $i;
            $crawler = $this->client->request('GET', $this->followUrls[$i]);
            $this->parse($crawler);
        }
    }

    /**
     * Implements how to parse each web page.
     *
     * @param Crawler $crawler
     * @return mixed
     */
    abstract public function parse(Crawler $crawler);

    /**
     * Implements how to scrape each web page.
     *
     * @param $url
     * @return mixed
     */
    abstract public function scrape($url);

    /**
     * Puts url in followUrls to be parsed.
     *
     * Null and empty URLs (e.g. the value of a missing href attribute) are
     * ignored, and a URL that is already queued is not queued again.
     *
     * @param $url
     */
    public function follow($url)
    {
        if ($url === null || $url === '') {
            return;
        }

        // Strict comparison: a loose in_array() treats numeric-looking
        // strings such as '1e3' and '1000' as the same value.
        if (!in_array($url, $this->followUrls, true)) {
            $this->followUrls[] = $url;
        }
    }

    /**
     * Returns a list with the start urls of a spider.
     *
     * @return array
     */
    abstract public function getStartUrls(): array;

    /**
     * Replaces the client used for subsequent requests.
     *
     * @param Client $client
     */
    public function setClient(Client $client)
    {
        $this->client = $client;
    }
}
71 changes: 71 additions & 0 deletions src/Tests/CrawlerTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
<?php

namespace Sinama\Tests;


use PHPUnit\Framework\TestCase;
use Sinama\Crawler;

/**
 * Checks the Crawler finder methods against the HTML fixture page.
 */
class CrawlerTest extends TestCase
{
    /**
     * Crawler built once from the fixture and shared by every test.
     *
     * @var \Sinama\Crawler
     */
    private static $crawler;

    /**
     * Loads the fixture page into the shared crawler.
     */
    public static function setUpBeforeClass()
    {
        $html = file_get_contents("fixtures/test.html");
        self::$crawler = new Crawler($html, "http://www.mfw.com");
    }

    public function testFindTitle()
    {
        $title = self::$crawler->findTitle();

        $this->assertEquals('Motherfucking Website', $title);
    }

    public function testFindMainImage()
    {
        $mainImage = self::$crawler->findMainImage();

        $this->assertEquals('https://www.w3schools.com/html5.gif', $mainImage);
    }

    public function testFindMainContent()
    {
        // The expected content is the same paragraph repeated; surrounding
        // whitespace is trimmed on both sides before comparing.
        $space = ' ';
        $p = $space."<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus vitae pretium augue. Quisque viverra dui non enim commodo auctor. Sed.</p>\n";
        $expected = $p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p.$p;

        $actual = self::$crawler->findMainContent();

        $this->assertEquals(trim($expected), trim($actual));
    }

    public function testFindImages()
    {
        $images = self::$crawler->findImages();

        $this->assertEquals(
            [
                'https://www.w3schools.com/html5.gif',
                'https://www.w3schools.com/html5.gif',
                'https://www.w3schools.com/pic_trulli.jpg',
                'https://www.w3schools.com/img_chania.jpg',
                'https://www.w3schools.com/img_girl.jpg'
            ],
            $images
        );
    }

    public function testFindEmails()
    {
        $emails = self::$crawler->findEmails();

        $this->assertEquals(
            [
                'test@test.com',
                'test1@test.com',
                'test2@test.com',
                'test3@test.com',
                'test4@test.com',
                'test5@test.com',
                'test6@test.com',
                'test7@test.com',
                'test8@test.com',
                'test9@test.com',
                'test10@test.com',
                'test11@test.com'
            ],
            $emails
        );
    }
}

0 comments on commit 479ac66

Please sign in to comment.