Permalink
Browse files

Added support for Plain Text output format on top of the existing XML format.

Use case: extract plain text from Wikipedia dumps for use as a corpus in machine-learning experiments.
  • Loading branch information...
1 parent 60309d3 commit ccc0df530f6887bc3d4d60a633645c18a63e1a88 @smhumayun committed May 9, 2016
@@ -0,0 +1,78 @@
+package se.lth.cs.nlp.io;
+
+import se.lth.cs.nlp.mediawiki.model.WikipediaPage;
+import se.lth.cs.nlp.pipeline.Sink;
+
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.List;
+
+/**
+ * This file/class is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This file/class is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this forked version of wikiforia.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * @author Syed Muhammad Humayun - smhumayun@gmail.com - http://www.smhumayun.com
+ */
+public class PlainTextWikipediaPageWriter implements Sink<WikipediaPage> {
+
+ private final File output;
+ private FileChannel fileChannel;
+
+ /**
+ * Default constructor
+ * @param output which file to write to
+ */
+ public PlainTextWikipediaPageWriter(File output) {
+ try {
+ this.output = output;
+ this.fileChannel = FileChannel.open(Paths.get(output.toURI()), StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING);
+ } catch (IOException e) {
+ throw new IOError(e);
+ }
+ }
+
+ @Override
+ public synchronized void process(List<WikipediaPage> batch) {
+ if(this.fileChannel == null)
+ return;
+
+ try {
+ if(batch.size() == 0) {
+ this.fileChannel.write(ByteBuffer.wrap("\n".getBytes("utf-8")));
+ this.fileChannel.close();
+ this.fileChannel = null;
+ return;
+ }
+
+ for (WikipediaPage wikipediaPage : batch) {
+ if(wikipediaPage.getText().length() > 0) {
+ this.fileChannel.write(ByteBuffer.wrap(wikipediaPage.getText().getBytes("utf-8")));
+ this.fileChannel.write(ByteBuffer.wrap("\n".getBytes("utf-8")));
+ }
+ }
+ } catch (IOException e) {
+ throw new IOError(e);
+ }
+ }
+
+ @Override
+ public String toString() {
+ return String.format("XML Writer { target: %s }", output.getAbsolutePath());
+ }
+}
@@ -20,13 +20,15 @@
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import se.lth.cs.nlp.io.PlainTextWikipediaPageWriter;
import se.lth.cs.nlp.io.SimpleHadoopTextWriter;
import se.lth.cs.nlp.io.XmlWikipediaPageWriter;
import se.lth.cs.nlp.mediawiki.model.Page;
import se.lth.cs.nlp.mediawiki.model.WikipediaPage;
import se.lth.cs.nlp.mediawiki.parser.MultistreamBzip2XmlDumpParser;
import se.lth.cs.nlp.mediawiki.parser.SinglestreamXmlDumpParser;
import se.lth.cs.nlp.pipeline.Filter;
+import se.lth.cs.nlp.pipeline.Sink;
import se.lth.cs.nlp.pipeline.Source;
import se.lth.cs.nlp.wikipedia.lang.EnglishConfig;
import se.lth.cs.nlp.wikipedia.lang.LangFactory;
@@ -35,7 +37,6 @@
import java.io.File;
import java.io.IOError;
import java.io.IOException;
-import java.nio.channels.Pipe;
import java.util.ArrayList;
import java.util.TreeSet;
import java.util.regex.Matcher;
@@ -114,6 +115,17 @@
.withArgName("language")
.create("lang");
    // CLI option selecting the output serialization; value is matched against
    // the OUTPUT_FORMAT_* constants below (see getSink).
    @SuppressWarnings("static-access")
    private static final Option outputFormatOption = OptionBuilder.withLongOpt("output-format")
            .withDescription("Output format : xml or plain-text")
            .hasArg()
            .withArgName("outputformat")
            .create("outputformat");

    // Recognized values for the output-format option.
    private static final String OUTPUT_FORMAT_XML = "xml";
    private static final String OUTPUT_FORMAT_PLAIN_TEXT = "plain-text";
    // XML is the default so existing callers keep their previous behavior.
    private static final String OUTPUT_FORMAT_DEFAULT = OUTPUT_FORMAT_XML;
+
/**
* Used to invoke the hadoop conversion internally
* @param config the language config
@@ -197,18 +209,50 @@ public static void convert(
int numThreads,
int batchsize)
{
+ convert(config, indexPath, pagesPath, outputPath, numThreads, batchsize, OUTPUT_FORMAT_DEFAULT);
+ }
+
+ /**
+ * Used to invoke the conversion internally
+ * @param config the language config
+ * @param indexPath the index path (might be null)
+ * @param pagesPath the pages path (must never be null)
+ * @param outputPath the output path (must never be null)
+ * @param numThreads the number of threads to use
+ * @param batchsize the size of a batch
+ * @param outputFormat format of output i.e. xml or plain-text
+ */
+ public static void convert(
+ TemplateConfig config,
+ File indexPath,
+ File pagesPath,
+ File outputPath,
+ int numThreads,
+ int batchsize,
+ String outputFormat)
+ {
Source<Page,Void> source;
if(index == null)
source = new SinglestreamXmlDumpParser(pagesPath, batchsize);
else
source = new MultistreamBzip2XmlDumpParser(indexPath, pagesPath, batchsize, numThreads);
- Pipeline pipeline = new Pipeline(source, new XmlWikipediaPageWriter(outputPath), config);
+ Pipeline pipeline = new Pipeline(source, getSink(outputFormat, outputPath), config);
pipeline.run();
}
/**
+ * @param outputFormat output format
+ * @param outputPath output path
+ * @return Sink
+ */
+ private static Sink<WikipediaPage> getSink(String outputFormat, File outputPath) {
+ return outputFormat != null && outputFormat.trim().equalsIgnoreCase(OUTPUT_FORMAT_PLAIN_TEXT)
+ ? new PlainTextWikipediaPageWriter(outputPath) : new XmlWikipediaPageWriter(outputPath);
+ }
+
+ /**
* Used to invoke the conversion internally
* @param config the language config
* @param indexPath the index path (might be null)
@@ -227,14 +271,38 @@ public static void convert(
int batchsize,
ArrayList<Filter<WikipediaPage>> filters)
{
+ convert(config, indexPath, pagesPath, outputPath, numThreads, batchsize, filters, OUTPUT_FORMAT_DEFAULT);
+ }
+
+ /**
+ * Used to invoke the conversion internally
+ * @param config the language config
+ * @param indexPath the index path (might be null)
+ * @param pagesPath the pages path (must never be null)
+ * @param outputPath the output path (must never be null)
+ * @param numThreads the number of threads to use
+ * @param batchsize the size of a batch
+ * @param filters All filters to append
+ * @param outputFormat format of output i.e. xml or plain-text
+ */
+ public static void convert(
+ TemplateConfig config,
+ File indexPath,
+ File pagesPath,
+ File outputPath,
+ int numThreads,
+ int batchsize,
+ ArrayList<Filter<WikipediaPage>> filters,
+ String outputFormat)
+ {
Source<Page,Void> source;
if(index == null)
source = new SinglestreamXmlDumpParser(pagesPath, batchsize);
else
source = new MultistreamBzip2XmlDumpParser(indexPath, pagesPath, batchsize, numThreads);
- Pipeline pipeline = new Pipeline(source, new XmlWikipediaPageWriter(outputPath), config);
+ Pipeline pipeline = new Pipeline(source, getSink(outputFormat, outputPath), config);
pipeline.appendAllFilters(filters);
pipeline.run();
}
@@ -276,6 +344,7 @@ public static void main( String[] args )
options.addOption(gzip);
options.addOption(testDecompression);
options.addOption(filterNs);
+ options.addOption(outputFormatOption);
CommandLineParser parser = new PosixParser();
try {
@@ -284,6 +353,7 @@ public static void main( String[] args )
File indexPath = null, pagesPath, outputPath;
int batchsize = 100;
int numThreads = Runtime.getRuntime().availableProcessors();
+ String outputFormat = OUTPUT_FORMAT_DEFAULT;
//Read batch size
if(cmdline.hasOption(batch.getOpt())) {
@@ -295,6 +365,11 @@ public static void main( String[] args )
numThreads = Integer.parseInt(cmdline.getOptionValue(threads.getOpt()));
}
+ //Output format
+ if(cmdline.hasOption(outputFormatOption.getOpt())) {
+ outputFormat = cmdline.getOptionValue(outputFormatOption.getOpt());
+ }
+
//Read required paths
pagesPath = new File(cmdline.getOptionValue(pages.getOpt()));
outputPath = new File(cmdline.getOptionValue(output.getOpt()));
@@ -410,7 +485,7 @@ public String toString() {
test(config, indexPath, pagesPath, numThreads, batchsize);
}
else {
- convert(config,indexPath,pagesPath, outputPath, numThreads, batchsize, filters);
+ convert(config,indexPath,pagesPath, outputPath, numThreads, batchsize, filters, outputFormat);
}
}

0 comments on commit ccc0df5

Please sign in to comment.