-
Notifications
You must be signed in to change notification settings - Fork 2.7k
/
TransformXML.java
381 lines (341 loc) · 14.7 KB
/
TransformXML.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
package edu.stanford.nlp.process;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.*;
import java.util.*;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.util.function.Function;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.util.XMLUtils;
/**
* Reads XML from an input file or stream and writes XML to an output
* file or stream, while transforming text appearing inside specified
* XML tags by applying a specified {@link Function
* <code>Function</code>}. See TransformXMLApplications for examples.
* <i>Implementation note:</i> This is done using SAX2.
*
* @param <T> The type of the output of the Function (from String to T)
* @author Bill MacCartney
* @author Anna Rafferty (refactoring, making SAXInterface easy to extend elsewhere)
*/
public class TransformXML<T> {
/** A logger for this class. Declared final so the shared channel cannot be reassigned. */
private static final Redwood.RedwoodChannels log = Redwood.channels(TransformXML.class);

/** The SAX parser used by every {@code transformXML} call made on this instance. */
private final SAXParser saxParser;

/**
 * Creates the SAX handler used by the {@code transformXML} overloads that
 * are not given an explicit handler. The method is public and non-final,
 * so a subclass can override it to supply a different handler.
 *
 * @return a new {@link SAXInterface}
 */
public SAXInterface<T> buildSaxInterface() {
  return new SAXInterface<>();
}
/**
 * SAX handler that echoes the XML it receives to {@link #outWriter},
 * replacing the text found inside the elements listed in
 * {@link #elementsToBeTransformed} with the result of applying
 * {@link #function} to that text.
 *
 * @param <T> The type of the output of the function (from String to T)
 */
public static class SAXInterface<T> extends DefaultHandler {

  /** Qualified names of the elements whose text content is transformed. */
  protected List<String> elementsToBeTransformed;

  /** Character data accumulated since the buffer was last emptied. */
  protected StringBuilder textToBeTransformed;

  /** Where the resulting XML is written; stdout until a caller replaces it. */
  protected PrintWriter outWriter = new PrintWriter(System.out, true);

  /** The function applied to the text inside the elements to be transformed. */
  protected Function<String,T> function;

  /**
   * How far down we are in the nested tags.  For example, if we've
   * seen {@code <foo> <bar>} and "foo" and "bar" are both tags
   * we care about, then depth = 2.
   */
  protected int depth = 0;

  public SAXInterface() {
    elementsToBeTransformed = new ArrayList<>();
    depth = 0;
    openingTag = null;
    textToBeTransformed = new StringBuilder();
  }

  /**
   * The first tag from {@link #elementsToBeTransformed}
   * that we saw the last time {@link #depth} was
   * <code>0</code>.
   * <br>
   * You would expect incoming XML to be well-formatted, but just in
   * case it isn't, we keep track of this so we can output the
   * correct closing tag.
   */
  String openingTag;

  /**
   * Resets the per-document parse state.  Without this, a handler that
   * is reused after a parse was aborted inside an element to be
   * transformed would begin the next document with a non-zero
   * {@link #depth} and leftover buffered text, suppressing or
   * corrupting that document's output.
   *
   * @throws SAXException never thrown here; declared so that subclasses
   *         may keep the signature inherited from {@link DefaultHandler}
   */
  @Override
  public void startDocument() throws SAXException {
    depth = 0;
    openingTag = null;
    textToBeTransformed = new StringBuilder();
  }

  /**
   * Echoes the buffered text (escaped) followed by an opening or
   * closing tag, then empties the buffer.
   *
   * @param qName the qualified name of the tag to print
   * @param attributes the attributes to print inside the tag; may be null
   * @param close whether to print a closing tag rather than an opening tag
   */
  private void outputTextAndTag(String qName, Attributes attributes, boolean close) {
    // This is only called while we are not inside an element to be
    // transformed, so first echo the text seen since the previous tag...
    outWriter.print(XMLUtils.escapeXML(textToBeTransformed.toString()));
    textToBeTransformed = new StringBuilder();

    // ... then echo the new tag to outWriter
    outWriter.print('<');
    if (close) {
      outWriter.print('/');
    }
    outWriter.print(qName);
    if (attributes != null) {
      for (int i = 0; i < attributes.getLength(); i++) {
        outWriter.print(' ');
        outWriter.print(attributes.getQName(i));
        outWriter.print("=\"");
        outWriter.print(XMLUtils.escapeXML(attributes.getValue(i)));
        outWriter.print('"');
      }
    }
    outWriter.print(">\n");
  }

  @Override
  public void endDocument() {
    // Theoretically, there shouldn't be anything in the buffer after
    // the last closing tag, but if there is, it's probably better to
    // echo it than ignore it
    outWriter.print(XMLUtils.escapeXML(textToBeTransformed.toString()));
    // we need to flush because nothing else in this handler
    // explicitly flushes the writer
    outWriter.flush();
  }

  /**
   * Called at the beginning of each element.  Outside of any element
   * to be transformed, the pending text and the tag are echoed.  If the
   * tag is on the designated list, the nesting depth is increased and,
   * for the outermost such tag, its name is remembered.
   */
  @Override
  public void startElement(String uri, String localName, String qName,
                           Attributes attributes) throws SAXException {
    if (depth == 0) {
      outputTextAndTag(qName, attributes, false);
    }

    if (elementsToBeTransformed.contains(qName)) {
      if (depth == 0) {
        openingTag = qName;
      }
      ++depth;
    }
  }

  /**
   * Called at the end of each element.  Outside of any element to be
   * transformed, the pending text and the closing tag are echoed.
   * When the outermost element to be transformed ends, the accumulated
   * text is trimmed and passed to {@link #processText(String)}, and the
   * closing tag for {@link #openingTag} is written.
   */
  @Override
  public void endElement(String uri, String localName, String qName)
    throws SAXException
  {
    if (depth == 0) {
      outputTextAndTag(qName, null, true);
    } else {
      if (elementsToBeTransformed.contains(qName)) {
        --depth;
        if (depth == 0) {
          String text = textToBeTransformed.toString().trim();
          // factored out so subclasses can handle the text differently
          processText(text);
          textToBeTransformed = new StringBuilder();
          outWriter.print("</" + openingTag + ">\n");
        }
      }
      // when we're inside a block to be transformed, we ignore
      // elements that don't end the block.
    }
  }

  /**
   * Applies {@link #function} to the given text and writes the escaped
   * result followed by a newline.  Empty text produces no output.
   *
   * @param text the trimmed text found inside an element to be transformed
   */
  public void processText(String text) {
    if (text.length() > 0) {
      text = function.apply(text).toString();
      outWriter.print(XMLUtils.escapeXML(text));
      outWriter.print('\n');
    }
  }

  // Accumulate characters in buffer of text to be transformed
  // (SAX may call this after each line break)
  @Override
  public void characters(char[] buf, int offset, int len) throws SAXException {
    textToBeTransformed.append(buf, offset, len);
  }

} // end static class SAXInterface
/**
 * A variant of SAXInterface that writes the function's output exactly
 * as produced, without XML-escaping it.  Use it when the function
 * already emits well-formed XML.  The Tagger is one such case: it
 * escapes the inner text itself and wraps the words in xml tags.
 */
public static class NoEscapingSAXInterface<T> extends SAXInterface<T> {

  /**
   * Applies the function to the text and writes the raw result
   * followed by a newline.  Empty text produces no output.
   *
   * @param text the text found inside an element to be transformed
   */
  @Override
  public void processText(String text) {
    if (text.isEmpty()) {
      return;
    }
    final String transformed = function.apply(text).toString();
    outWriter.print(transformed);
    outWriter.print('\n');
  }

}
/**
 * Creates a transformer with its own SAX parser.
 * <p>
 * The parser factory is configured so that external general entities,
 * external parameter entities and external DTDs are not loaded.  The
 * XML handed to this class frequently comes from files or streams the
 * caller does not control, and resolving external entities would allow
 * XML External Entity (XXE) attacks such as local file disclosure or
 * unwanted network requests.
 *
 * @throws RuntimeException if the parser cannot be created or rejects
 *         one of the features above
 */
public TransformXML() {
  try {
    SAXParserFactory factory = SAXParserFactory.newInstance();
    // Standard SAX2 features: do not resolve external entities.
    factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    // Xerces feature: do not fetch an external DTD when not validating.
    factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    saxParser = factory.newSAXParser();
  } catch (Exception e) {
    log.info("Error configuring XML parser: " + e);
    throw new RuntimeException(e);
  }
}
/**
 * Transforms the XML stored in a file and writes the result to stdout.
 * The text inside each of the given tags is replaced by the output of
 * the given function; the function receives that text as a
 * <code>String</code>, so wrap any function that expects another
 * input type.
 * <p>
 * Any <code>Exception</code> raised while opening, reading or
 * transforming the file is logged and its stack trace printed; it is
 * not rethrown.  The input stream is handed to
 * <code>IOUtils.closeIgnoringExceptions</code> in all cases.
 *
 * @param tags the names of the XML tags within which the
 *   transformation should be applied
 * @param fn the function to apply to the text inside those tags
 * @param in the <code>File</code> to read from
 */
public void transformXML(String[] tags, Function<String,T> fn, File in) {
  InputStream source = null;
  try {
    final FileInputStream rawInput = new FileInputStream(in);
    source = new BufferedInputStream(rawInput);
    this.transformXML(tags, fn, source, System.out);
  } catch (Exception e) {
    final String problem = "Error reading file " + in + ": " + e;
    log.info(problem);
    e.printStackTrace();
  } finally {
    IOUtils.closeIgnoringExceptions(source);
  }
}
/**
 * Transforms the XML stored in one file and writes the result to
 * another file.  The text inside each of the given tags is replaced by
 * the output of the given function; the function receives that text as
 * a <code>String</code>, so wrap any function that expects another
 * input type.
 * <p>
 * Any <code>Exception</code> raised while opening the files or
 * transforming the content is logged and its stack trace printed; it
 * is not rethrown.  Both streams are handed to
 * <code>IOUtils.closeIgnoringExceptions</code> in all cases.
 *
 * @param tags the names of the XML tags within which the
 *   transformation should be applied
 * @param fn the function to apply to the text inside those tags
 * @param in the <code>File</code> to read from
 * @param out the <code>File</code> to write to
 */
public void transformXML(String[] tags, Function<String,T> fn, File in, File out) {
  InputStream source = null;
  OutputStream sink = null;
  try {
    final FileInputStream rawInput = new FileInputStream(in);
    source = new BufferedInputStream(rawInput);
    final FileOutputStream rawOutput = new FileOutputStream(out);
    sink = new BufferedOutputStream(rawOutput);
    this.transformXML(tags, fn, source, sink);
  } catch (Exception e) {
    final String problem = "Error reading file " + in + " or writing file " + out + ": " + e;
    log.info(problem);
    e.printStackTrace();
  } finally {
    IOUtils.closeIgnoringExceptions(source);
    IOUtils.closeIgnoringExceptions(sink);
  }
}
/**
 * Transforms the XML read from an input stream and writes the result
 * to stdout.  The text inside each of the given tags is replaced by
 * the output of the given function; the function receives that text as
 * a <code>String</code>, so wrap any function that expects another
 * input type.
 *
 * @param tags the names of the XML tags within which the
 *   transformation should be applied
 * @param fn the function to apply to the text inside those tags
 * @param in the <code>InputStream</code> to read from
 */
public void transformXML(String[] tags, Function<String,T> fn, InputStream in) {
  final OutputStream stdout = System.out;
  this.transformXML(tags, fn, in, stdout);
}
/**
 * Transforms the XML read from an input stream and writes the result
 * to an output stream.  The text inside each of the given tags is
 * replaced by the output of the given function; the function receives
 * that text as a <code>String</code>, so wrap any function that
 * expects another input type.
 * <p>
 * The handler comes from <code>buildSaxInterface()</code>.
 *
 * @param tags the names of the XML tags within which the
 *   transformation should be applied
 * @param fn the function to apply to the text inside those tags
 * @param in the <code>InputStream</code> to read from
 * @param out the <code>OutputStream</code> to write to
 */
public void transformXML(String[] tags, Function<String,T> fn, InputStream in, OutputStream out) {
  // NOTE(review): no charset is given, so the writer encodes with the
  // platform default; confirm that this is what callers expect.
  final Writer sink = new OutputStreamWriter(out);
  final SAXInterface<T> handler = buildSaxInterface();
  this.transformXML(tags, fn, in, sink, handler);
}
/**
 * Transforms the XML read from an input stream and writes the result
 * to a writer.  The text inside each of the given tags is replaced by
 * the output of the given function; the function receives that text as
 * a <code>String</code>, so wrap any function that expects another
 * input type.
 * <p><i>Implementation notes:</i> The InputStream is assumed to already
 * be buffered if useful.  A stream (rather than a reader) is taken so
 * that the XML decoder can determine the character encoding of the
 * XML itself.  The provided Writer should likewise be buffered if
 * desirable; internally it is wrapped as a PrintWriter.  The handler
 * comes from <code>buildSaxInterface()</code>.
 *
 * @param tags the names of the XML elements within which the
 *   transformation should be applied
 * @param fn the function to apply to the text inside those elements
 * @param in the <code>InputStream</code> to read from
 * @param w the <code>Writer</code> to write to
 */
public void transformXML(String[] tags, Function<String,T> fn, InputStream in, Writer w) {
  final SAXInterface<T> handler = buildSaxInterface();
  this.transformXML(tags, fn, in, w, handler);
}
/**
 * Wraps the byte stream <code>in</code> in an InputSource and hands
 * it, with the other arguments unchanged, to the fully specified
 * transformXML.
 *
 * @param tags the names of the XML elements within which the
 *   transformation should be applied
 * @param fn the function to apply to the text inside those elements
 * @param in the <code>InputStream</code> to read from
 * @param w the <code>Writer</code> to write to
 * @param handler the SAX handler to use
 */
public void transformXML(String[] tags, Function<String,T> fn, InputStream in, Writer w, SAXInterface<T> handler) {
  final InputSource source = new InputSource(in);
  this.transformXML(tags, fn, source, w, handler);
}
/**
 * Wraps the character stream <code>in</code> in an InputSource and
 * hands it, with the other arguments unchanged, to the fully specified
 * transformXML.
 *
 * @param tags the names of the XML elements within which the
 *   transformation should be applied
 * @param fn the function to apply to the text inside those elements
 * @param in the <code>Reader</code> to read from
 * @param w the <code>Writer</code> to write to
 * @param handler the SAX handler to use
 */
public void transformXML(String[] tags, Function<String,T> fn, Reader in, Writer w, SAXInterface<T> handler) {
  final InputSource source = new InputSource(in);
  this.transformXML(tags, fn, source, w, handler);
}
/**
 * Transforms the XML read from an input source and writes the result
 * to a writer, using the supplied SAX handler.  The text inside each
 * of the given tags is replaced by the output of the given function;
 * the function receives that text as a <code>String</code>, so wrap
 * any function that expects another input type.
 * <br>
 * <p><i>Implementation notes:</i> The InputSource is assumed to already
 * be buffered if useful.  A byte stream lets the XML decoder determine
 * the character encoding of the XML itself.
 * TODO: does that mean there's a bug if you send it a Reader
 * instead of an InputStream?  It seems to work with a Reader...
 * <br>
 * The provided Writer should again be buffered if desirable.
 * Internally, this Writer is wrapped as a PrintWriter.
 * <br>
 * Before parsing, the handler's writer, function and list of elements
 * to be transformed are overwritten with the values given here.
 *
 * @param tags the names of the XML elements within which the
 *   transformation should be applied
 * @param fn the function to apply to the text inside those elements
 * @param in the <code>InputSource</code> to read from
 * @param w the <code>Writer</code> to write to
 * @param saxInterface the sax handler you would like to use (default is SAXInterface, defined in this class, but you may define your own handler)
 * @throws RuntimeException wrapping any <code>Exception</code> raised while parsing
 */
public void transformXML(String[] tags, Function<String,T> fn, InputSource in, Writer w, SAXInterface<T> saxInterface) {
  // Point the handler at this call's output, function and tag list.
  saxInterface.outWriter = new PrintWriter(w, true);
  saxInterface.function = fn;
  final List<String> wantedTags = new ArrayList<>();
  saxInterface.elementsToBeTransformed = wantedTags;
  wantedTags.addAll(Arrays.asList(tags));

  try {
    saxParser.parse(in, saxInterface);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
} // end class TransformXML