Skip to content

sergioska/xcrawler

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

62 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

xCrawler

xCrawler is a FRAMEWORK-AGNOSTIC toolkit for developing spider applications based on XSL stylesheets.

The simple example below shows how to use this toolkit to parse an RSS feed (for example, Yahoo News) through an XSL stylesheet, producing as output an XML document in another format.

<?php

require_once('vendor/autoload.php');

use xcrawler\Bot;
use xcrawler\Processor;


/**
 * Minimal spider: hands fetched content and a stylesheet to a processor.
 */
class Spider
{
    /**
     * Delegates to the processor's process() method and returns its result.
     *
     * @param Processor $processor  Object whose process() method is invoked.
     * @param string    $content    Source passed as the first argument to process().
     * @param string    $stylesheet Stylesheet reference passed as the second argument.
     *
     * @return string The value produced by process().
     */
    public function run(Processor $processor, string $content, string $stylesheet): string
    {
        $transformed = $processor->process($content, $stylesheet);

        return $transformed;
    }
}

// Fetch the source page with a Bot.
$bot = new Bot();
$bot->setUrl('http://www.example.com');
$content = $bot->get();

// Transform the fetched content through the XSL stylesheet.
$stylesheet = 'stylesheet.xsl';
$spider = new Spider();
$output = $spider->run(new Processor\HtmlProcessor(), $content, $stylesheet);

// The terminating semicolon is required: the file has no closing tag after this statement.
echo $output;

The toolkit parses XML or HTML sources transparently, so the single parameter of the setUrl method can be either an XML or an HTML URL. The example above uses the get method to retrieve the source of the page; a post method is also available for POST requests.

A more complex example is:

<?php

require_once('vendor/autoload.php');

use xcrawler\Bot;
use xcrawler\Helpers\Utils;
use xcrawler\Processor;

/**
 * Example spider: requests one listing page per letter A-Z, then requests the
 * details page of every listed item and prints "name, email" for it.
 */
class Spider
{

	const XSL_LIST = 'stylesheets/example.xsl';
	// The "/" between host and path is required; without it the URL is malformed.
	const URL_LIST = 'http://www.example.com/list.php?ltr=%s';
	const XSL_DETAILS = 'stylesheets/example-details.xsl';
	const URL_DETAILS_BASE = 'http://www.example.com/x/details';
	const BUFFER_PATH = 'buffer/';
	// Proxy endpoint shared by the listing bot and the details bots.
	const PROXY = '192.168.99.100:9050';

	/**
	 * Crawls every listing page and echoes one "name, email" line per item
	 * whose details page yields a result.
	 *
	 * @param Processor $processor Processor used to transform the listing pages.
	 *
	 * @return void
	 */
	public function run(Processor $processor)
	{
		// One listing page per letter; $nPage is the zero-based page index.
		foreach (range('A', 'Z') as $nPage => $letter) {

			$bot = new Bot();
			$bot->setUrl(sprintf(self::URL_LIST, $letter));
			$bot->setProxy(self::PROXY);
			$bot->setSOCKS5();

			// NOTE(review): bufferize() presumably caches the callback result in the
			// given file so repeated runs skip the request - confirm against Utils.
			$sPage = Utils::bufferize(
				self::BUFFER_PATH . sprintf('buffer-%d.bak', $nPage),
				array(self::class, 'getData'),
				array($bot)
			);

			$sXml = $processor->process($sPage, self::XSL_LIST);
			$aXml = Utils::xmlToArray($sXml);

			// A page without results must not break the loop.
			$items = $aXml['result']['item'] ?? array();

			// $i numbers only the items that are actually fetched (it feeds the buffer file name).
			$i = 0;
			foreach ($items as $item) {

				if (empty($item['name'])) {
					continue;
				}

				$oDetailsBot = new Bot();
				$oDetailsBot->setProxy(self::PROXY);
				$oDetailsBot->setSOCKS5();
				$oDetailsBot->setUrl(self::URL_DETAILS_BASE . $item['link']);

				$bak = sprintf('buffer-details-%d-%d.bak', $nPage, $i);
				$sDetailsPage = Utils::bufferize(
					self::BUFFER_PATH . $bak,
					array(self::class, 'getData'),
					array($oDetailsBot)
				);

				// NOTE(review): Factory is not imported in this snippet, so the name
				// resolves to the global \Factory - confirm its real namespace and add
				// the matching `use` statement.
				$oDetailsProcessor = Factory::factory($sDetailsPage, self::XSL_DETAILS);
				$sDetailsXml = $oDetailsProcessor->process($sDetailsPage, self::XSL_DETAILS);
				$aEmail = Utils::xmlToArray($sDetailsXml);

				if (!empty($aEmail['result'])) {
					echo $item['name'] . ", " . $aEmail['result']['item']['email'] . "\n";
				}

				$i++;
				$oDetailsBot->close();
			}

			// Close the listing bot created at the top of this iteration.
			$bot->close();
		}

	}

	/**
	 * Callback handed to Utils::bufferize(): performs the request on the given bot.
	 *
	 * @param Bot $oBot Bot on which get() is called.
	 *
	 * @return mixed Whatever $oBot->get() returns.
	 */
	public static function getData($oBot)
	{
		return $oBot->get();
	}

}

// Run the crawl, transforming the listing pages with the HTML processor.
$spider = new Spider();
$htmlProcessor = new Processor\HtmlProcessor();
$spider->run($htmlProcessor);

Install

xCrawler can be installed via Composer as follows:

php composer.phar install

About

XSL-based PHP toolkit for web crawlers

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages