-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathTikaWrapper.java
127 lines (109 loc) · 3.3 KB
/
TikaWrapper.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/**
*
*/
package uk.bl;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* @author Peter May (The British Library)
*/
public class TikaWrapper {

    /* Singleton Tika instance */
    private static TikaWrapper tika;

    /*
     * Reference to parser. Declared with its concrete type so that getDetector() and the
     * three-argument parse() overload can be called without casting.
     */
    private final AutoDetectParser parser;

    /* Private constructor to enable singleton creation */
    private TikaWrapper() {
        parser = new AutoDetectParser();
    }

    /**
     * Returns the singleton instance of the TikaWrapper class, creating it on first use.
     * The method is synchronized, so at most one instance is ever created.
     *
     * @return the shared TikaWrapper instance
     */
    public static synchronized TikaWrapper getTika() {
        if (tika == null) {
            tika = new TikaWrapper();
        }
        return tika;
    }

    /**
     * Returns a MediaType object representing the mime-type of the specified file.
     * Detection is best-effort: an IOException raised while detecting or while closing
     * the stream is reported on standard error and does not propagate.
     *
     * @param file the file to find the mime-type information of
     * @return MediaType an object representing the mime-type information of the specified
     *         file, or {@code null} if detection failed with an IOException
     * @throws FileNotFoundException if opening the file raises it
     */
    public MediaType getMimeType(File file) throws FileNotFoundException {
        MediaType mediaType = null;
        try {
            Metadata metadata = new Metadata();
            TikaInputStream stream = TikaInputStream.get(file, metadata);
            try {
                mediaType = parser.getDetector().detect(stream, metadata);
            } catch (IOException e) {
                // Best effort: report the failure and fall through to return null.
                e.printStackTrace();
            } finally {
                // Always release the stream, whether or not detection succeeded.
                stream.close();
            }
        } catch (FileNotFoundException fnfe) {
            // A missing file is the one failure callers are expected to handle.
            throw fnfe;
        } catch (IOException ioe) {
            System.err.println("IO Exception: " + ioe);
            ioe.printStackTrace();
        }
        return mediaType;
    }

    /**
     * Parses the specified file using Tika and returns a String containing metadata
     * information. The result has one {@code name: value} line per metadata name, sorted
     * by name, where the value is whatever {@code Metadata.get(name)} returns; each line
     * ends with a newline character.
     *
     * Parsing is best-effort: IO, SAX and Tika exceptions raised while parsing are
     * reported on standard error, and whatever metadata was collected up to that point
     * is still formatted and returned.
     *
     * @param file the file to parse
     * @return String a String containing metadata information
     * @throws FileNotFoundException if opening the file raises it
     */
    public String parse(File file) throws FileNotFoundException {
        Metadata metadata = new Metadata();
        try {
            TikaInputStream stream = TikaInputStream.get(file, metadata);
            // A TeeContentHandler with no delegates: only the metadata is kept.
            ContentHandler handler = new TeeContentHandler();
            try {
                parser.parse(stream, handler, metadata);
            } catch (IOException e) {
                // Best effort: report and continue with the metadata gathered so far.
                e.printStackTrace();
            } catch (SAXException e) {
                e.printStackTrace();
            } catch (TikaException e) {
                e.printStackTrace();
            } finally {
                // Always release the stream, whether or not parsing succeeded.
                stream.close();
            }
        } catch (FileNotFoundException fnfe) {
            // A missing file is the one failure callers are expected to handle.
            throw fnfe;
        } catch (IOException ioe) {
            System.err.println("IO Exception: " + ioe);
            ioe.printStackTrace();
        }

        // Sort the names so the output order is stable.
        String[] names = metadata.names();
        Arrays.sort(names);

        StringBuilder metadataBuffer = new StringBuilder();
        for (String name : names) {
            metadataBuffer.append(name);
            metadataBuffer.append(": ");
            metadataBuffer.append(metadata.get(name));
            metadataBuffer.append("\n");
        }
        // Return the formatted listing built above; the original returned
        // metadata.toString() and discarded this buffer.
        return metadataBuffer.toString();
    }
}