/
TikaServerTextExtractor.php
137 lines (120 loc) · 3.07 KB
/
TikaServerTextExtractor.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Rest\TikaRestClient;
/**
 * Enables text extraction of file content via the Tika Rest Server.
 *
 * Support is decided purely by mime type (see {@see supportsMime()});
 * extension-based lookup is intentionally disabled.
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaServerTextExtractor extends FileTextExtractor
{
    /**
     * Tika server is pretty efficient so use it immediately if available
     *
     * @var integer
     * @config
     */
    private static $priority = 80;

    /**
     * Server endpoint. Only used when the SS_TIKA_ENDPOINT environment
     * variable is not set (see {@see getServerEndpoint()}).
     *
     * @var string
     * @config
     */
    private static $server_endpoint;

    /**
     * Lazily created REST client; populated on first call to getClient().
     *
     * @var TikaRestClient
     */
    protected $client = null;

    /**
     * Cache of supported mime types, as returned by the client and cast to
     * an array. Filled on the first supportsMime() call that finds it empty.
     *
     * @var array
     */
    protected $supportedMimes = [];

    /**
     * Returns the REST client, creating it through the Injector on first use
     * with the current server endpoint as its only constructor argument.
     *
     * @return TikaRestClient
     */
    public function getClient()
    {
        if (!$this->client) {
            $this->client = Injector::inst()->createWithArgs(
                TikaRestClient::class,
                [$this->getServerEndpoint()]
            );
        }

        return $this->client;
    }

    /**
     * Resolves the Tika server endpoint.
     *
     * The SS_TIKA_ENDPOINT environment variable wins when it holds a truthy
     * value; otherwise the `server_endpoint` config value is returned.
     *
     * @return string
     */
    public function getServerEndpoint()
    {
        $fromEnvironment = Environment::getEnv('SS_TIKA_ENDPOINT');
        if ($fromEnvironment) {
            return $fromEnvironment;
        }

        // Default to configured endpoint
        return $this->config()->get('server_endpoint');
    }

    /**
     * Get the version of Tika installed, or 0 if not installed
     *
     * Delegates directly to the REST client.
     *
     * @return float version of Tika
     */
    public function getVersion()
    {
        return $this->getClient()->getVersion();
    }

    /**
     * Whether this extractor can be used: an endpoint must be configured, the
     * client must report itself available, and the reported version must be
     * at least 1.7.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        // Cheap check first: without an endpoint there is no point in
        // talking to the client at all.
        if (!$this->getServerEndpoint()) {
            return false;
        }

        if (!$this->getClient()->isAvailable()) {
            return false;
        }

        // version_compare() works on strings; cast explicitly instead of
        // relying on implicit coercion of whatever the client reports.
        $version = (string) ($this->getVersion() ?? '');

        return version_compare($version, '1.7') >= 0;
    }

    /**
     * Extension-based support is never claimed by this extractor.
     *
     * @param string $extension
     * @return boolean always false
     */
    public function supportsExtension($extension)
    {
        // Determine support via mime type only
        return false;
    }

    /**
     * Checks whether the given mime type is supported, either as a key of the
     * supported-mimes list or as one of the entries in any item's `alias`.
     *
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime)
    {
        if (!$this->supportedMimes) {
            $this->supportedMimes = (array) $this->getClient()->getSupportedMimes();
        }

        // Check if supported (most common / quickest lookup)
        if (isset($this->supportedMimes[$mime])) {
            return true;
        }

        // Check aliases. Strict comparison avoids numeric-string juggling,
        // and the array cast tolerates an alias given as a single value.
        foreach ($this->supportedMimes as $info) {
            if (isset($info['alias']) && in_array($mime, (array) $info['alias'], true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Extracts the text content of a file by sending it to the Tika server.
     *
     * @param File|string $file Either a File object, which is first resolved
     *                          to a path via getPathFromFile(), or a path
     *                          that is passed to the client unchanged.
     * @return mixed whatever the client's tika() call returns
     */
    public function getContent($file)
    {
        $resolvedFromFile = $file instanceof File;
        $tempFile = $resolvedFromFile ? $this->getPathFromFile($file) : $file;

        try {
            return $this->getClient()->tika($tempFile);
        } finally {
            // Cleanup temp file, even when the client throws. Only paths
            // obtained from getPathFromFile() are removed; a path supplied by
            // the caller is left alone. The existence check avoids an
            // unlink() warning for a null or already-removed path.
            if ($resolvedFromFile && is_string($tempFile) && file_exists($tempFile)) {
                unlink($tempFile);
            }
        }
    }
}