In [153]:
try:
    from xml.etree.cElementTree import XML
except ImportError:
    from xml.etree.ElementTree import XML
import zipfile


"""
Module that extracts text from an MS Word XML document (.docx).
(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
"""

# ElementTree addresses namespaced tags as "{namespace-uri}localname", so the
# WordprocessingML namespace is pre-wrapped in braces and prefixed to each tag.
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'  # paragraph element
TEXT = WORD_NAMESPACE + 't'  # text element nested inside a paragraph


def get_docx_text(path):
    """
    Extract the plain text of a .docx file.

    Args:
        path: Path of the .docx file (anything ``zipfile.ZipFile`` accepts).

    Returns:
        The document text as one string. The text nodes of each paragraph
        are concatenated, paragraphs without any text are dropped, and the
        remaining paragraphs are joined with a blank line (``'\\n\\n'``).
        An empty string is returned when no paragraph contains text.

    Raises:
        KeyError: The archive has no ``word/document.xml`` member.
        Errors raised by ``zipfile.ZipFile`` (missing file, not a zip
        archive) and by the XML parser propagate unchanged.
    """
    # The context manager closes the archive even when read() raises.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)

In [154]:
# Full text of the document; get_docx_text joins paragraphs with '\n\n'.
dump = get_docx_text('TNOTR.docx')

In [155]:
import re

In [156]:
# Matches a run of ASCII letters with a newline directly before and after it;
# the matched text includes both newlines.
# NOTE(review): a word in the very first or very last paragraph of the text
# lacks one of those newlines and is therefore not matched.
s = re.compile(r'\n[a-zA-Z]+\n')

In [157]:
# All non-overlapping matches of the pattern in the document text, in order.
s1 = s.findall(dump)

In [158]:
# Strip the first and last character (the newlines captured by the pattern)
# from every match, leaving only the word itself.
words = [match[1:-1] for match in s1]

In [159]:
# Display the extracted words.
words

['verbatim',
 'concord',
 'consecrate',
 'diaphanous',
 'cheeks',
 'voluptuous',
 'delirium',
 'reticent',
 'penance',
 'colloquy',
 'erudite',
 'prostrating',
 'enamored',
 'trifling',
 'necromantic',
 'omniscient',
 'expounded',
 'exterminated',
 'inundated',
 'injunction',
 'scriptorium',
 'tortuous',
 'inexplicable',
 'despaired',
 'elixir',
 'evangelized',
 'assiduous',
 'imploring',
 'nocturnal',
 'racked',
 'canonical',
 'inopportune',
 'exacerbate',
 'imbecile',
 'firmament',
 'imminent',
 'inscrutable',
 'temporal']

In [168]:
import json
# Save the extracted words to a text file, one word per line.
with open('TNOTR.txt', 'w') as outfile:
    outfile.writelines(word + '\n' for word in words)