minor bugfix and add usage of raw_output option

relwell · Jun 23, 2013 · 33888c3 · 33888c3
1 parent ab2b8a5
commit 33888c3
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 # A Python wrapper for the Java Stanford Core NLP tools
 ---------------------------
 
-This is a fork of [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python)
+This is a fork of Dustin Smith's [stanford-corenlp-python](https://github.com/dasmith/stanford-corenlp-python), a Python interface to [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml). It can either be used as a Python package or run as a JSON-RPC server.
 
 ## Edited
    * Update to Stanford CoreNLP v3.2.0
@@ -130,6 +130,12 @@ If you need to parse long texts (more than 30-50 sentences), you have to use a b
     parsed = batch_parse(raw_text_directory, corenlp_dir)  # It returns a generator object
     print parsed  #=> [{'coref': ..., 'sentences': ..., 'file_name': 'new_sample.txt'}]
 
+The function uses the XML output feature of Stanford CoreNLP, and you can get all of the information with the `raw_output` option. If true, CoreNLP's XML is returned as a dictionary without converting the format.
+
+    parsed = batch_parse(raw_text_directory, corenlp_dir, raw_output=True)
+
+(Note: the function now requires xmltodict; install it with `sudo pip install xmltodict`.)
+
 ## Developer
    * Hiroyoshi Komatsu [hiroyoshi.komat@gmail.com]
    * Johannes Castner [jac2130@columbia.edu]
diff --git a/corenlp/corenlp.py b/corenlp/corenlp.py
@@ -293,18 +293,20 @@ def parse_xml_output(input_dir, corenlp_path=DIRECTORY, memory="3g", raw_output=
     call(command, shell=True)
 
     #reading in the raw xml file:
-    result = []
+    # result = []
     try:
         for output_file in os.listdir(xml_dir):
             with open(xml_dir+'/'+output_file, 'r') as xml:
                 # parsed = xml.read()
                 file_name = re.sub('.xml$', '', os.path.basename(output_file))
-                result.append(parse_parser_xml_results(xml.read(), file_name,
-                                                       raw_output=raw_output))
+                # result.append(parse_parser_xml_results(xml.read(), file_name,
+                #                                        raw_output=raw_output))
+                yield parse_parser_xml_results(xml.read(), file_name,
+                                               raw_output=raw_output)
     finally:
         file_list.close()
         shutil.rmtree(xml_dir)
-    return result
+    # return result
 
 class StanfordCoreNLP:
     """

diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 AUTHOR = "Hiroyoshi Komatsu"
 AUTHOR_EMAIL = "hiroyoshi.komat@gmail.com"
 URL = "https://bitbucket.org/torotoki/corenlp-python"
-VERSION = "3.2.0-0"
+VERSION = "3.2.0-1"
 
 # Utility function to read the README file.
 # Used for the long_description.  It's nice, because now 1) we have a top level