-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from rafaelglikis/dev
Spider
- Loading branch information
Showing
3 changed files
with
208 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
<?php | ||
|
||
namespace Sinama; | ||
|
||
|
||
/**
 * Base class for spiders.
 *
 * A spider keeps a queue of URLs, requests each one through the client and
 * hands the resulting crawler to parse(). Concrete spiders supply the start
 * URLs and the parsing/scraping logic.
 */
abstract class Spider
{
    /**
     * URLs queued for crawling, in the order they will be requested.
     *
     * @var array
     */
    protected $followUrls = [];

    /**
     * Index into $followUrls of the URL most recently picked up by run().
     * run() starts from this index, so the URL stored at it is requested
     * again when run() is called a second time.
     *
     * @var int
     */
    protected $lastIndex = 0;

    /**
     * Client used to request every queued URL.
     *
     * @var \Sinama\Client
     */
    protected $client;

    /**
     * Spider constructor.
     *
     * Stores the given client (or a fresh default one) and seeds the queue
     * with the spider's start URLs.
     *
     * @param Client|null $client Client to use; a new Client is created when omitted.
     */
    public function __construct(Client $client = null)
    {
        // Single assignment: the former follow-up if/else repeated this exact
        // decision and built a second, redundant Client when none was given.
        $this->client = $client ?? new Client();
        $this->followUrls = $this->getStartUrls();
    }

    /**
     * Starts the spider.
     *
     * Requests every queued URL with GET, beginning at $lastIndex, and passes
     * each response crawler to parse().
     */
    public function run()
    {
        // count() is deliberately re-evaluated on every pass: follow() can
        // append to the queue while it is being walked (e.g. from parse()).
        for ($i = $this->lastIndex; $i < count($this->followUrls); ++$i) {
            $this->lastIndex = $i;
            $crawler = $this->client->request('GET', $this->followUrls[$i]);
            $this->parse($crawler);
        }
    }

    /**
     * Implements how to parse each web page.
     *
     * @param Crawler $crawler Crawler returned by the client for the requested URL.
     * @return mixed
     */
    abstract public function parse(Crawler $crawler);

    /**
     * Implements how to scrape each web page.
     *
     * @param $url
     * @return mixed
     */
    abstract public function scrape($url);

    /**
     * Puts url in followUrls to be parsed, unless it is already queued.
     *
     * @param $url
     */
    public function follow($url)
    {
        // Strict comparison: with loose in_array(), numeric-looking strings
        // such as '1e3' and '1000' compare equal and would be dropped as
        // duplicates even though they are different URLs.
        if (!in_array($url, $this->followUrls, true)) {
            $this->followUrls[] = $url;
        }
    }

    /**
     * Returns a list with the start urls of a spider.
     *
     * @return array
     */
    abstract public function getStartUrls(): array;

    /**
     * Replaces the client used for subsequent requests.
     *
     * @param Client $client
     */
    public function setClient(Client $client)
    {
        $this->client = $client;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
<?php | ||
|
||
namespace Sinama\Tests; | ||
|
||
|
||
use PHPUnit\Framework\TestCase; | ||
use Sinama\Crawler; | ||
|
||
/**
 * Exercises the Crawler finders against the HTML fixture in fixtures/test.html.
 */
class CrawlerTest extends TestCase
{
    /**
     * Crawler shared by every test; built once from the fixture file.
     *
     * @var \Sinama\Crawler
     */
    private static $crawler;

    /**
     * Loads the fixture page into the shared crawler before any test runs.
     */
    public static function setUpBeforeClass()
    {
        $html = file_get_contents("fixtures/test.html");
        self::$crawler = new Crawler($html, "http://www.mfw.com");
    }

    /**
     * The page title is extracted from the fixture.
     */
    public function testFindTitle()
    {
        $title = self::$crawler->findTitle();

        $this->assertEquals('Motherfucking Website', $title);
    }

    /**
     * The main image URL is extracted from the fixture.
     */
    public function testFindMainImage()
    {
        $mainImage = self::$crawler->findMainImage();

        $this->assertEquals('https://www.w3schools.com/html5.gif', $mainImage);
    }

    /**
     * The main content equals 25 copies of the same paragraph (compared after trimming).
     */
    public function testFindMainContent()
    {
        $indent = ' ';
        $paragraph = $indent."<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus vitae pretium augue. Quisque viverra dui non enim commodo auctor. Sed.</p>\n";
        $expectedContent = str_repeat($paragraph, 25);
        $actualContent = self::$crawler->findMainContent();

        $this->assertEquals(trim($expectedContent), trim($actualContent));
    }

    /**
     * Every image URL is returned, in order, duplicates included.
     */
    public function testFindImages()
    {
        $w3 = 'https://www.w3schools.com';
        $images = [
            $w3.'/html5.gif',
            $w3.'/html5.gif',
            $w3.'/pic_trulli.jpg',
            $w3.'/img_chania.jpg',
            $w3.'/img_girl.jpg'
        ];

        $this->assertEquals($images, self::$crawler->findImages());
    }

    /**
     * All twelve e-mail addresses are returned in order.
     */
    public function testFindEmails()
    {
        $emails = [
            'test@test.com',
            'test1@test.com',
            'test2@test.com',
            'test3@test.com',
            'test4@test.com',
            'test5@test.com',
            'test6@test.com',
            'test7@test.com',
            'test8@test.com',
            'test9@test.com',
            'test10@test.com',
            'test11@test.com'
        ];

        $this->assertEquals($emails, self::$crawler->findEmails());
    }
}