Skip to content

Commit

Permalink
added Spider Interface
Browse files Browse the repository at this point in the history
added more params
 - max_depth
 - verbose
 - start_urls
 removed abstract method getStartUrls
  • Loading branch information
rafaelglikis committed Aug 23, 2018
1 parent beeaf75 commit ec3abb1
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 26 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,11 @@ class Spider extends BaseSpider
}
}

$spider = new Spider();
$spider = new Spider([
'start_urls' => [ 'https://blog.scrapinghub.com' ],
'max_depth' => 2,
'verbose' => true
]);
$spider->run();
```

Expand Down
75 changes: 50 additions & 25 deletions src/Spider.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,32 @@

namespace Sinama;


abstract class Spider
abstract class Spider implements SpiderInterface
{
/**
* @var array
*/
protected $followUrls = [];
private $followUrls = [];

protected $lastIndex = 0;
/**
* @var int
*/
private $maxDepth = -1;

/**
* @var int
*/
private $lastIndex = 0;

/**
* @var float
*/
private $sleepTime = 0;

/**
* @var boolean
*/
private $verbose = false;

/**
* @var \Sinama\Client
Expand All @@ -20,73 +37,81 @@ abstract class Spider
/**
 * Spider constructor.
 *
 * Recognised keys in $params (all optional):
 *  - 'start_urls': URL or list of URLs that seed the crawl queue (default: empty).
 *  - 'max_depth':  run() stops after processing the queue entry whose zero-based
 *                  index equals this value; the default -1 never matches an
 *                  index, so the crawl is unlimited.
 *  - 'verbose':    when true, progress messages are echoed (default: false).
 *
 * @param array $params
 * @param Client|null $client HTTP client to use; a new Client is created when omitted.
 */
public function __construct($params = [], Client $client = null)
{
    $this->client = $client ?? new Client();

    // Setting parameters
    // run() walks the queue by numeric index, so force a 0-based list
    // (this also lets a caller pass a single URL string).
    $this->followUrls = array_values((array) ($params['start_urls'] ?? []));

    // The default must be applied BEFORE the cast: a cast binds tighter than
    // `??`, so `(int)$params['max_depth'] ?? -1` always produced the cast
    // value (0 when the key was missing) and never fell back to -1.
    $this->maxDepth = (int) ($params['max_depth'] ?? -1);
    $this->verbose = (bool) ($params['verbose'] ?? false);
}

/**
 * Starts the spider.
 *
 * Walks the URL queue from the last recorded index, handing each URL to
 * parse(). Stops either when the queue is exhausted or right after the
 * entry whose index equals the configured max depth has been parsed.
 */
public function run()
{
    $this->log('i', 'Spider started');

    // count() is deliberately re-evaluated on every pass: follow() is public
    // and appends to the queue, so it can grow while parse() is running.
    for ($i = $this->lastIndex; $i < count($this->followUrls); ++$i) {
        $this->log('i', 'Parsing ' . $this->followUrls[$i]);

        $this->lastIndex = $i;
        $this->parse($this->followUrls[$i]);

        if ($i === $this->maxDepth) {
            $this->log('i', 'Max depth reached');

            // Return instead of break so the "End of site reached" message
            // below is only logged when the queue was really exhausted.
            return;
        }
    }

    $this->log('i', 'End of site reached');
}

/**
* Implements how to parse each web page.
*
* @param Crawler $crawler
* @param string $url
* @return mixed
*/
abstract public function parse(Crawler $crawler);
abstract public function parse(string $url);

/**
* Implements how to scrape each web page.
*
* @param $url
* @return mixed
*/
abstract public function scrape($url);
abstract public function scrape(string $url);

/**
 * Puts url in followUrls to be parsed, unless it is already queued.
 *
 * @param string $url
 */
public function follow(string $url)
{
    // Strict comparison: with loose comparison two different numeric-looking
    // strings (e.g. "1e3" and "1000") would be treated as the same URL.
    if (!in_array($url, $this->followUrls, true)) {
        $this->followUrls[] = $url;
    }
}

/**
* Returns a list with the start urls of a spider.
*
* @return array
*/
abstract public function getStartUrls(): array;

/**
 * Replaces the client held by this spider.
 *
 * @param Client $client Client instance to store in place of the current one.
 */
public function setClient(Client $client)
{
$this->client = $client;
}

/**
 * Echoes a timestamped message when verbose mode is enabled; otherwise does nothing.
 *
 * Output format: "[<type>] [<timestamp>] <message>" followed by a newline.
 *
 * @param string $type    Short tag printed in the first bracket (run() passes 'i').
 * @param string $message Text to print.
 */
private function log(string $type, string $message)
{
    if (!$this->verbose) {
        return;
    }

    // 'H' is the 24-hour clock. The previous 'h' (12-hour) had no AM/PM
    // marker, so morning and afternoon timestamps were indistinguishable.
    $time = date('Y:m:d:H:i:s');
    echo "[$type] [$time] $message\n";
}
}
16 changes: 16 additions & 0 deletions src/SpiderInterface.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

namespace Sinama;


/**
 * Contract for a spider: it can be started, told which URLs to visit,
 * and defines how a page is parsed and scraped.
 */
interface SpiderInterface
{

/**
 * Implements how to parse the web page at the given URL.
 *
 * @param string $url
 * @return mixed
 */
public function parse(string $url);

/**
 * Implements how to scrape the web page at the given URL.
 *
 * @param string $url
 * @return mixed
 */
public function scrape(string $url);

/**
 * Registers a URL that should be parsed later.
 *
 * @param string $url
 */
public function follow(string $url);

/**
 * Starts the spider.
 */
public function run();
}

0 comments on commit ec3abb1

Please sign in to comment.