Skip to content

Commit

Permalink
Migrate config after split class, improve chainability (#124)
Browse files Browse the repository at this point in the history
* Improve chainable down to Core

* Tidy up: Code OCD

* Merge traditionally

* Updating website
  • Loading branch information
spekulatius committed Oct 14, 2022
1 parent 31c0cbf commit e19baeb
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 73 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,15 @@ var_dump($web->imagesWithDetails);

### Proxy Support

You can configure proxy support with `setProxy`:
You can configure proxy support with `setConfig`:

```php
$web = new \spekulatius\phpscraper;
$web->setProxy('http://user:password@127.0.0.1:3128');
$web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']);
```

You can call `setConfig` multiple times. Each call stores the given config and merges it with the previous settings. Keep this in mind in the unlikely case that you need to unset a value.

See the full documentation on the website for more information and many more examples.

Installation
Expand Down
80 changes: 25 additions & 55 deletions src/Core.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@

namespace spekulatius;

use Goutte\Client;
/**
* This class organizes the actual scraping calls.
*
* It doesn't handle any client management. That's with phpscraper.php
*/

use Pdp\Cache;
use Pdp\CurlHttpClient;
use Pdp\Manager;
use DonatelloZa\RakePlus\RakePlus;
use Symfony\Component\HttpClient\HttpClient;
use Goutte\Client as GoutteClient;
use Symfony\Component\DomCrawler\Crawler;

class Core
{
Expand All @@ -21,55 +27,18 @@ class Core
/**
* Holds the current page (a Crawler object)
*
* @var Symfony\Component\DomCrawler\Crawler
* @var \Symfony\Component\DomCrawler\Crawler
*/
protected $currentPage = null;

/**
* Constructor
*/
public function __construct()
{
// Goutte Client
$this->client = new Client();

// We assume that we want to follow any redirects.
$this->client->followRedirects(true);
$this->client->followMetaRefresh(true);
$this->client->setMaxRedirects(5);

// Make ourselves known
$this->client->setServerParameter(
'HTTP_USER_AGENT',
'Mozilla/5.0 (compatible; PHP Scraper/0.x; +https://phpscraper.de)'
);
}

/**
* Sets a http proxy
*
* @param string $proxy
*/
public function setProxy(string $proxy)
{
$httpClient = HttpClient::create([
'proxy' => $proxy
]);

// Goutte Client
$this->client = new Client($httpClient);

return $this;
}

/**
* Overwrites the agent
* Overwrites the client
*
* @param string $agent
* @param \Goutte\Client $client
*/
public function setAgent(string $agent)
public function setClient(GoutteClient $client)
{
$this->client->setServerParameter('HTTP_USER_AGENT', $agent);
$this->client = $client;

return $this;
}
Expand All @@ -92,6 +61,8 @@ public function currentURL()
public function go(string $url)
{
    // Issue a GET request through the current client; the crawler it
    // returns becomes the page that the filter methods operate on.
    $crawler = $this->client->request('GET', $url);
    $this->currentPage = $crawler;

    // Hand back the instance so that calls can be chained.
    return $this;
}

/**
Expand All @@ -102,10 +73,9 @@ public function go(string $url)
*/
public function setContent(string $url, string $content)
{
$this->currentPage = new \Symfony\Component\DomCrawler\Crawler(
$content,
$url
);
$this->currentPage = new Crawler($content, $url);

return $this;
}

/**
Expand All @@ -124,7 +94,7 @@ public function fetchAsset(string $url)
* @param string $filter
* @return Crawler
*/
public function filter(string $query)
public function filter(string $query): Crawler
{
return $this->currentPage->filterXPath($query);
}
Expand All @@ -133,9 +103,9 @@ public function filter(string $query)
* Filters the current page by a xPath-query and returns the first one, or null.
*
* @param string $filter
* @return Crawler|null
* @return ?Crawler
*/
public function filterFirst(string $query)
public function filterFirst(string $query): ?Crawler
{
$filteredNodes = $this->filter($query);

Expand All @@ -146,7 +116,7 @@ public function filterFirst(string $query)
* Filters the current page by a xPath-query and returns the first ones content, or null.
*
* @param string $filter
* @return string|null
* @return ?string
*/
public function filterFirstText(string $query): ?string
{
Expand All @@ -171,7 +141,7 @@ public function filterTexts(string $query): array
*
* @param string $filter
* @param array $attributes
* @return array|null
* @return ?array
*/
public function filterExtractAttributes(string $query, array $attributes): ?array
{
Expand All @@ -185,7 +155,7 @@ public function filterExtractAttributes(string $query, array $attributes): ?arra
*
* @param string $filter
* @param array $attributes
* @return string|null
* @return ?string
*/
public function filterFirstExtractAttribute(string $query, array $attributes): ?string
{
Expand All @@ -198,7 +168,7 @@ public function filterFirstExtractAttribute(string $query, array $attributes): ?
* Returns the content attribute for the first result of the query, or null.
*
* @param string $filter
* @return string|null
* @return ?string
*/
public function filterFirstContent(string $query): ?string
{
Expand Down
93 changes: 87 additions & 6 deletions src/phpscraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,92 @@

namespace spekulatius;

/**
* This class manages the Clients and connections.
*
* Most calls are passed through to the Core class.
*/

use Goutte\Client as GoutteClient;
use Symfony\Component\HttpClient\HttpClient as SymfonyHttpClient;

class phpscraper
{
/**
* Holds the client
* Holds the config for the clients.
*
* @var array
*/
protected $config = [];

/**
* Holds the Core class. It handles the actual scraping.
*
* @var \spekulatius\Core
*/
protected $core = null;

/**
* Creates the scraper: instantiates the Core and applies the given config.
*
* @param ?array $config Configuration values handed to setConfig().
*/
public function __construct(?array $config = [])
{
// Prepare the core. It has to exist before setConfig() runs, because
// setConfig() sets the freshly built client on it.
$this->core = new Core();

// And set the config.
$this->setConfig($config);
}

/**
* Constructor
* Sets the config, generates the required Clients and updates the core with the new clients.
*
* @param ?array $config
*/
public function __construct()
public function setConfig(?array $config = []): self
{
    // Default values; every key can be overridden through $config.
    $defaults = [
        // We assume that we want to follow any redirects, within reason.
        'follow_redirects' => true,
        'follow_meta_refresh' => true,
        'max_redirects' => 5,

        /**
         * Agent can be overwritten using:
         *
         * ```php
         * $web->setConfig(['agent' => 'My Agent']);
         * ```
         */
        'agent' => 'Mozilla/5.0 (compatible; PHP Scraper/0.x; +https://phpscraper.de)',

        /**
         * Proxy can be set using:
         *
         * ```php
         * $web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']);
         * ```
         */
        'proxy' => null,
    ];

    // Merge in order of increasing precedence: the defaults, the config
    // stored by earlier calls, then the values passed now. This way a second
    // call does not silently reset settings made by a previous one.
    // `?? []` guards against an explicit null, which array_merge() rejects.
    $this->config = array_merge($defaults, $this->config, $config ?? []);

    // Symfony HttpClient
    $httpClient = SymfonyHttpClient::create([
        'proxy' => $this->config['proxy'],
    ]);

    // Goutte Client and set some config needed for it.
    $client = new GoutteClient($httpClient);
    $client->followRedirects($this->config['follow_redirects']);
    $client->followMetaRefresh($this->config['follow_meta_refresh']);
    $client->setMaxRedirects($this->config['max_redirects']);
    $client->setServerParameter('HTTP_USER_AGENT', $this->config['agent']);

    // Set the client on the core.
    $this->core->setClient($client);

    // Return the instance so that calls can be chained.
    return $this;
}

/**
Expand All @@ -42,9 +113,19 @@ public function __call(string $name, array $arguments = null)
{
if ($name == 'call') {
$name = $arguments[0];
return $this->core->$name();
$result = $this->core->$name();
} else {
return $this->core->$name(...$arguments);
$result = $this->core->$name(...$arguments);
}

// Did we get a Core class element? Keep this.
if ($result instanceof Core) {
$this->core;

return $this;
}

// Otherwise: just return whatever the core returned.
return $result;
}
}
6 changes: 4 additions & 2 deletions websites/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,17 +131,19 @@ Further configuration is optional. Below are the correctly available options.

### Proxy Support

If you require proxies, you can configure the proxy support with `setProxy`:
If you require proxies, you can configure the proxy support with `setConfig`:

```php
$web = new \spekulatius\phpscraper;
$web->setProxy('http://user:password@127.0.0.1:3128');
$web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']);
```

::: tip
If you're looking for a decently priced residential proxy, check [IProyal](https://iproyal.com?r=119987).
:::

You can call `setConfig` multiple times. Each call stores the given config and merges it with the previous settings. Keep this in mind in the unlikely case that you need to unset a value.


Found a bug and fixed it? Awesome!
----------------------------------
Expand Down
4 changes: 3 additions & 1 deletion websites/de/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,15 @@ Falls Sie Proxies benötigen können Sie PHPScraper wie folgt konfigurieren:

```php
$web = new \spekulatius\phpscraper;
$web->setProxy('http://user:password@127.0.0.1:3128');
$web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']);
```

::: tip
Falls Sie nach einem günstigen Proxy suchen, ist [IProyal](https://iproyal.com?r=119987) einen Blick wert.
:::

`setConfig` kann mehrfach aufgerufen werden. Die Konfiguration wird gespeichert und mit den früheren Einstellungen zusammengeführt. Dies sollte bedacht werden, wenn man einen Wert zurücksetzen will.


Ein Problem gefunden und gefixt? Super!
---------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions websites/es/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,11 @@ La configuración adicional es opcional. A continuación se muestran las opcione

### Soporte de Proxy

Si necesita proxies, puede configurar el soporte de proxy con `setProxy`:
Si necesita proxies, puede configurar el soporte de proxy con `setConfig`:

```php
$web = new \spekulatius\phpscraper;
$web->setProxy('http://user:password@127.0.0.1:3128');
$web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']);
```


Expand Down
6 changes: 3 additions & 3 deletions websites/fr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,11 @@ Une configuration supplémentaire est facultative. Voici les options correctemen

### Support des proxys

Si vous avez besoin de proxys, vous pouvez configurer le support des proxys avec `setProxy` :
Si vous avez besoin de proxys, vous pouvez configurer le support des proxys avec `setConfig` :

```php
$web = new \spekulatius\phpscraper ;
$web->setProxy('http://user:password@127.0.0.1:3128') ;
$web = new \spekulatius\phpscraper;
$web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']);
```


Expand Down
4 changes: 2 additions & 2 deletions websites/vi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,11 @@ Các cấu hình sau không bắt buộc. Bên dưới là các tuỳ chọn có

### Hỗ trợ proxy

Nếu bạn cần sử dụng proxy, bạn có thể cấu hình proxy bằng `setProxy`:
Nếu bạn cần sử dụng proxy, bạn có thể cấu hình proxy bằng `setConfig`:

```php
$web = new \spekulatius\phpscraper;
$web->setProxy('http://user:password@127.0.0.1:3128');
$web->setConfig(['proxy' => 'http://user:password@127.0.0.1:3128']);
```


Expand Down

0 comments on commit e19baeb

Please sign in to comment.