In [252]:
import lxml.html
import time
import re
from tqdm import tqdm
from stanfordnlp.server import CoreNLPClient

class WikiDocument:
    def __init__(self, path=''):
        self.path = path
        self.title = ''
        self.namespace = ''
        self.document_id = 0
        self.text =''
        self.body = ''
        # Holds all formulas found within the document.
        # The key of the HashMap is the replacement string in the document and the value contains the TeX String
        self.formulae = [] # [{'hash': foo1, 'src':foo2}, ...]
        # Stores all unique identifiers found in this document
        self.identifiers = []
        self.sentences = []
        self.tagged_sentence_list = []
        self.description_candidate = []
        
    def processor(self):
        title_regexp = re.compile(r'(?:<title>)(.*?)(?:</title>)', re.DOTALL)
        namespace_regexp = re.compile(r'(?:<ns>)(.*?)(?:</ns>)', re.DOTALL)
        id_regexp = re.compile(r'(?:<revision>.*?<id>)(\d+)(?:</id>)', re.DOTALL)
        text_regexp = re.compile(r'(?:<text.*?>)(.*?)(?:</text>)', re.DOTALL)
        body_regexp = re.compile(r'(?:<body.*?>)(.*?)(?:</body>)', re.DOTALL)
        annotation_xml_regexp = re.compile(r'(?:<annotation-xml.*?>)(.*?)(?:</annotation-xml>)', re.DOTALL)
        annotation_regexp = re.compile(r'(?:<annotation.*?>)(.*?)(?:</annotation>)', re.DOTALL)
        
        with open(doc.path, 'r') as document_open:
            document_read = document_open.read()
            title = title_regexp.findall(document_read)
            namespace = namespace_regexp.findall(document_read)
            document_id = id_regexp.findall(document_read)
            text = text_regexp.findall(document_read)
            body = body_regexp.findall(document_read)
        
        if title: 
            self.title = title[0]
        if namespace:
            self.namespace = namespace[0]
        if document_id:
            self.document_id = document_id[0]
        if text:
            self.text = text[0]
        if body:
            body = body[0]
            # remove annotation-xml tag and anntation tag
            body = annotation_xml_regexp.sub('', body)
            body = annotation_regexp.sub('', body)
            self.body = body
        print("process document")    
        
    def extract_identifiers(self):
        tree = lxml.html.parse(self.path)
        html = tree.getroot()
        print('Number of math components is {}'.format(len(html.cssselect('math'))))
        identifiers = []

        def is_identifier(math_component):
            is_mi = (math_component.tag == 'mi')
            is_math_component_len_1 = (len(math_component.text_content())==1)
            is_italic = (math_component.get('mathvariant')=='italic')
            return is_mi and (is_math_component_len_1 or is_italic)

        for html_math in html.cssselect('math'):
            # variable with subscript
            for html_math_msub in html_math.cssselect('msub'):
                html_math_msub_component = [x for x in html_math_msub.iterchildren()]
                math_txt = '_'.join([x.text_content() for x in html_math_msub_component])
                if is_identifier(html_math_msub_component[0]) and (not math_txt in identifiers):
                    identifiers.append(math_txt)
                html_math_msub.drop_tree()
                # replace identifier by 'MATH_'+str(id)
                reg_string = lxml.html.tostring(html_math_msub, encoding='unicode')
                if is_identifier(html_math_msub_component[0]):
                    doc.body = doc.body.replace(reg_string, 'MATH{:04d}'.format(identifiers.index(math_txt)))
                else:
                    doc.body = doc.body.replace(reg_string, math_txt)

            # variable with superscript
            for html_math_msup in html_math.cssselect('msup'):
                html_math_msup_component = [x for x in html_math_msup.iterchildren()]
                math_txt = '^'.join([x.text_content() for x in html_math_msup_component])
                if is_identifier(html_math_msup_component[0]) and (not math_txt in identifiers):
                    identifiers.append(math_txt)
                html_math_msup.drop_tree()
                reg_string = lxml.html.tostring(html_math_msup, encoding='unicode')
                if is_identifier(html_math_msup_component[0]):
                    doc.body = doc.body.replace(reg_string, 'MATH{:04d}'.format(identifiers.index(math_txt)))
                else:
                    doc.body = doc.body.replace(reg_string, math_txt)
                
            # variable with subscript and superscript
            for html_math_msubsup in html_math.cssselect('msubsup'):
                html_math_msubsup_component = [x for x in html_math_msubsup.iterchildren()]
                math_txt = [x.text_content() for x in html_math_msubsup_component]
                math_txt = math_txt[0] + '_' + math_txt[1] + '^' + math_txt[2]
                if is_identifier(html_math_msubsup) and (not math_txt in identifiers):
                    identifiers.append(math_txt)
                html_math_msubsup.drop_tree()
                reg_string = lxml.html.tostring(html_math_msubsup, encoding='unicode')
                if is_identifier(html_math_msubsup):
                    doc.body = doc.body.replace(reg_string, 'MATH{:04d}'.format(identifiers.index(math_txt)))
                else:
                    doc.body = doc.body.replace(reg_string, math_txt)

            # variable without subscript and superscript
            for html_math_mi in html_math.cssselect('mi'):
                math_txt = html_math_mi.text_content()
                if is_identifier(html_math_mi) and (not math_txt in identifiers):
                    identifiers.append(math_txt)
                html_math_mi.drop_tree()
                reg_string = lxml.html.tostring(html_math_mi, encoding='unicode')
                if is_identifier(html_math_mi):
                    doc.body = doc.body.replace(reg_string, 'MATH{:04d}'.format(identifiers.index(math_txt)))
                else:
                    doc.body = doc.body.replace(reg_string, math_txt)
        
        self.identifiers = identifiers
        self.text = lxml.html.fromstring(doc.body).text_content()

    def extract_sentences(self):
        # this extract sentences which contain identifier from text
        sentences = [[]]*len(self.identifiers)
        for i, identifier in enumerate(self.identifiers):
            sentences[i] = re.findall(r'.*?'+identifier+r'.*?\.', self.text)
        self.sentences = sentences
        
    def POS_tagging(self):
        tagged_sentence_list = [[]]*len(self.identifiers)
        with CoreNLPClient(annotators=['tokenize','ssplit','pos'], timeout=600000, memory='16G') as client:
            for i, sentence in enumerate(self.sentences):
                if sentence:
                    for text in sentence:
                        # submit the request to the server
                        ann = client.annotate(text)
                        sentence = ann.sentence[0]
                        word_pos = [(token.word, token.pos) for token in sentence.token]
                    tagged_sentence_list[i].append(word_pos)
        self.tagged_sentence_list = tagged_sentence_list
    
    def extract_description(self):
        count_identifiers = len(self.identifiers)
        math_id=range(count_identifiers)
        description_candidate = [[]] * count_identifiers
        reg_description = re.compile(r'(NN[PS]{0,2}|NP)')
        for math_id_ in math_id:
            tagged_sentence_list_i = self.tagged_sentence_list[math_id_]
            identifier = self.identifiers[math_id_]
            if not tagged_sentence_list_i:
                continue

            description_candidate_ = []
            for tagged_sentence_list_i_ in tagged_sentence_list_i:
                indexes_target = [n for n, v in enumerate(tagged_sentence_list_i_) if v == (identifier, 'NN')]
                for index_target in indexes_target:
                    # 1. <description> <identifier>
                    description=[]
                    for i in range(index_target-1, -1, -1):
                        (description_, pos_) = word_pos[i]
                        if (not 'MATH' in description_) and reg_description.fullmatch(pos_):
                            description.append(description_)
                        else:
                            break
                    if description:
                        description_candidate_.append((' '.join(description), ' '.join(description) + ' ' + identifier))

                    # 2. <identifier> is <description>
                    # 3. <identifier> is the <description>
                    description=[]
                    if word_pos[index_target+1][0]=='is':
                        mid_pattern = ' is '
                        if word_pos[index_target+2][0]=='the':
                            index_start = index_target+3
                            mid_pattern+='the '
                        else:
                            index_start = idnex_target+2
                        for i in range(index_start, len(word_pos)):
                            (description_, pos_) = word_pos[i]
                            if (not 'MATH' in description_) and reg_description.fullmatch(pos_):
                                description.append(description_)
                            else:
                                break
                        if description:
                            description_candidate_.append((' '.join(description), identifier + mid_pattern + ' '.join(description)))

                    # 4. let <identifier> be the <description>
                    if (word_pos[index_target-1][0]=='let') and (word_pos[index_target+1][0]=='be') and (word_pos[index_target+2][0]=='the'):
                        for i in range(index_target+3, len(word_pos)):
                            (description_, pos_) = word_pos[i]
                            if (not 'MATH' in description_) and reg_description.fullmatch(pos_):
                                description.append(description_)
                            else:
                                break
                        if description:
                            description_candidate_.append((' '.join(description), 'let ' + identifier + ' be the ' + ' '.join(description)))

                    # 5. <description> is|are denoted by <identifier>
                    if (word_pos[index_target-1][0]=='by') and (word_pos[index_target-2][0]=='denoted') and (word_pos[index_target-3][0]==('is' or 'are')):
                        for i in range(index_target-4, -1, -1):
                            (description_, pos_) = word_pos[i]
                            if (not 'MATH' in description_) and reg_description.fullmatch(pos_):
                                description.append(description_)
                            else:
                                break
                        if description:
                            description_candidate_.append((' '.join(description), ' '.join(description) + ' is|are denoted by ' + identifier))
                            description=[]
                        
                    # 6. <identifier> denotes */DT <description>
                    if (word_pos[index_target+1][0]=='denotes') and (word_pos[index_target+2][1]=='DT'):
                        for i in range(index_target+3, len(word_pos)):
                            (description_, pos_) = word_pos[i]
                            if (not 'MATH' in description_) and reg_description.fullmatch(pos_):
                                description.append(description_)
                            else:
                                break
                        if description:
                            description_candidate_.append((' '.join(description), identifier + ' denotes */DT ' + ' '.join(description)))

            description_candidate[math_id_] = description_candidate_
        self.description_candidate = description_candidate

In [253]:
document_path = 'data/resources/dataset-arXMLiv-08-2019/process-control_physical-model/1806.09460.html'

start_time = time.time()
doc = WikiDocument(document_path)
print("elapsed time : {0:.7f} seconds ---".format(time.time()-start_time))

doc.processor()
print("elapsed time : {0:.7f} seconds ---".format(time.time()-start_time))

doc.extract_identifiers()
print("elapsed time : {0:.7f} seconds ---".format(time.time()-start_time))

doc.extract_sentences()
print("elapsed time : {0:.7f} seconds ---".format(time.time()-start_time))

doc.POS_tagging()
print("elapsed time : {0:.7f} seconds ---".format(time.time()-start_time))

doc.extract_description()
print("elapsed time : {0:.7f} seconds ---".format(time.time()-start_time)) 

elapsed time : 0.0000591 seconds ---
process document --- 0.0267198 seconds ---
elapsed time : 0.0278871 seconds ---
Number of math components is 235
elapsed time : 0.5598660 seconds ---
elapsed time : 27.8405890 seconds ---
Starting server with command: java -Xmx16G -cp /Users/kato/GoogleDrive/project-mir/data/resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 600000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-cc021bc8f4394a15.props -preload tokenize,ssplit,pos
elapsed time : 92.5149491 seconds ---
elapsed time : 92.5202160 seconds ---


In [254]:
import pickle

In [260]:
with open('doc_1806.09460.binaryfile', 'wb') as f:
    pickle.dump(doc, f)

In [None]:
# load pickle file
# with open('doc.binaryfile', 'rb') as f:
#     doc = pickle.load(doc)

In [198]:
text = doc.sentences[0][0]
if text:
with CoreNLPClient(annotators=['tokenize','ssplit','pos'], timeout=60000, memory='16G') as client:
    # submit the request to the server
    ann = client.annotate(text)

    sentence = ann.sentence[0]
    word_pos = [(token.word, token.pos) for token in sentence.token]


Starting server with command: java -Xmx16G -cp /Users/kato/GoogleDrive/project-mir/data/resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-9c2dc7003f5e4c53.props -preload tokenize,ssplit,pos
---
first token of first sentence
word: "In"
pos: "IN"
value: "In"
before: ""
after: " "
originalText: "In"
beginChar: 0
endChar: 2
tokenBeginIndex: 0
tokenEndIndex: 1
hasXmlContext: false
isNewline: false

---
part of speech tag of token
IN


In [204]:

sentence.token[2].word

'classic'

In [115]:
# text.split()
word_pos = [(token.word, token.pos) for token in sentence.token]
#token.pos
print(word_pos)
indexes_target = [n for n, v in enumerate(word_pos) if v == ('MATH0002', 'NN')]


[('In', 'IN'), ('the', 'DT'), ('classic', 'JJ'), ('optimal', 'JJ'), ('control', 'NN'), ('problem', 'NN'), (',', ','), ('we', 'PRP'), ('begin', 'VBP'), ('with', 'IN'), ('a', 'DT'), ('dynamical', 'JJ'), ('system', 'NN'), ('governed', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('difference', 'NN'), ('equation', 'NN'), ('MATH0000', 'NN'), ('=', 'JJ'), ('MATH0001', 'NN'), ('-LRB-', '-LRB-'), ('MATH0002', 'NN'), (',', ','), ('MATH0003', 'NN'), (',', ','), ('MATH0004', 'NN'), ('-RRB-', '-RRB-'), ('where', 'WRB'), ('MATH0002', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('state', 'NN'), ('of', 'IN'), ('the', 'DT'), ('system', 'NN'), (',', ','), ('MATH0003', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('control', 'NN'), ('action', 'NN'), (',', ','), ('and', 'CC'), ('MATH0004', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('random', 'JJ'), ('disturbance', 'NN'), ('.', '.')]


In [124]:
index_target = indexes_target[0]
i=0
description_ = word_pos[index_target-i][0]
pos_ = word_pos[index_target-i][1]


In [196]:
word_pos = [(token.word, token.pos) for token in sentence.token]
indexes_target = [n for n, v in enumerate(word_pos) if v == ('MATH0002', 'NN')]
# print(index_target)
identifier='MATH0002'

def extract_description(tagged_sentence_list, identifier):
    indexes_target = [n for n, v in enumerate(tagged_sentence_list) if v == (identifier, 'NN')]
    description_candidate = []
    reg_description = re.compile(r'(NN[PS]{0,2}|NP)')
    for index_target in indexes_target:
        # patternに合うか判定
        # 1. <description> <identifier>
        description=[]
        for i in range(index_target-1, -1, -1):
            (description_, pos_) = word_pos[i]
            if (not 'MATH' in description_) and reg_description.fullmatch(pos_):
                description.append(description_)
            else:
                break
        if description:
            description_candidate.append((' '.join(description), ' '.join(description) + ' ' + identifier))
            description=[]

        # 2. <identifier> is <description>
        # 3. <identifier> is the <description>
        if word_pos[index_target+1][0]=='is':
            mid_pattern = ' is '
            if word_pos[index_target+2][0]=='the':
                index_start = index_target+3
                mid_pattern+='the '
            else:
                index_start = idnex_target+2
            for i in range(index_start, len(word_pos)):
                (description_, pos_) = word_pos[i]
                if (not 'MATH' in description_) and reg_description.fullmatch(pos_):
                    description.append(description_)
                else:
                    break
            if description:
                description_candidate.append((' '.join(description), identifier + mid_pattern + ' '.join(description)))

        # 4. let <identifier> be the <description>
        if (word_pos[index_target-1][0]=='let') and (word_pos[index_target+1][0]=='be') and (word_pos[index_target+2][0]=='the'):
            for i in range(index_target+3, len(word_pos)):
                (description_, pos_) = word_pos[i]
                if (not 'MATH' in description_) and reg_description.fullmatch(pos_):
                    description.append(description_)
                else:
                    break
            if description:
                description_candidate.append((' '.join(description), 'let ' + identifier + ' be the ' + ' '.join(description)))

        # 5. <description> is|are denoted by <identifier>
        if (word_pos[index_target-1][0]=='by') and (word_pos[index_target-2][0]=='denoted') and (word_pos[index_target-3][0]==('is' or 'are')):
            for i in range(index_target-4, -1, -1):
                (description_, pos_) = word_pos[i]
                if (not 'MATH' in description_) and reg_description.fullmatch(pos_):
                    description.append(description_)
                else:
                    break
            if description:
                description_candidate.append((' '.join(description), ' '.join(description) + ' is|are denoted by ' + identifier))
                description=[]
            
        # 6. <identifier> denotes */DT <description>
        if (word_pos[index_target+1][0]=='denotes') and (word_pos[index_target+2][1]=='DT'):
            for i in range(index_target+3, len(word_pos)):
                (description_, pos_) = word_pos[i]
                if (not 'MATH' in description_) and reg_description.fullmatch(pos_):
                    description.append(description_)
                else:
                    break
            if description:
                description_candidate.append((' '.join(description), identifier + ' denotes */DT ' + ' '.join(description)))
    return description_candidate

In [197]:
description_candidate

[('state', 'MATH0002 is the state')]

In [193]:
word_pos

[('In', 'IN'),
 ('the', 'DT'),
 ('classic', 'JJ'),
 ('optimal', 'JJ'),
 ('control', 'NN'),
 ('problem', 'NN'),
 (',', ','),
 ('we', 'PRP'),
 ('begin', 'VBP'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('dynamical', 'JJ'),
 ('system', 'NN'),
 ('governed', 'VBN'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('difference', 'NN'),
 ('equation', 'NN'),
 ('MATH0000', 'NN'),
 ('=', 'JJ'),
 ('MATH0001', 'NN'),
 ('-LRB-', '-LRB-'),
 ('MATH0002', 'NN'),
 (',', ','),
 ('MATH0003', 'NN'),
 (',', ','),
 ('MATH0004', 'NN'),
 ('-RRB-', '-RRB-'),
 ('where', 'WRB'),
 ('MATH0002', 'NN'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('state', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('system', 'NN'),
 (',', ','),
 ('MATH0003', 'NN'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('control', 'NN'),
 ('action', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('MATH0004', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('random', 'JJ'),
 ('disturbance', 'NN'),
 ('.', '.')]

In [3]:
with open('output.txt', 'w') as f:
    f.write(doc.text)

In [None]:
print('len body')
print(len(doc.body))
doc.extract_identifiers()
with open('output_1.txt', 'w') as f:
    f.write('\n'.join(doc.identifiers))

print(doc.identifiers)
print('len body after processing')
print(len(doc.body))

In [9]:
tree = lxml.html.fromstring(doc.body)
# html = tree.getroot()

# doc.body.split()

# x = lxml.html.clean.clean_html(tree)


# x.text_content().split()


In [11]:
lxml.html.fromstring(doc.body).text_content().split()

['A',
 'Tour',
 'of',
 'Reinforcement',
 'Learning',
 'The',
 'View',
 'from',
 'Continuous',
 'Control',
 'Benjamin',
 'Recht',
 'Department',
 'of',
 'Electrical',
 'Engineering',
 'and',
 'Computer',
 'Sciences',
 'University',
 'of',
 'California,',
 'Berkeley',
 'June',
 '25,',
 '2018',
 'Abstract',
 'This',
 'manuscript',
 'surveys',
 'reinforcement',
 'learning',
 'from',
 'the',
 'perspective',
 'of',
 'optimization',
 'and',
 'control',
 'with',
 'a',
 'focus',
 'on',
 'continuous',
 'control',
 'applications.',
 'It',
 'surveys',
 'the',
 'general',
 'formulation,',
 'terminology,',
 'and',
 'typical',
 'experimental',
 'implementations',
 'of',
 'reinforcement',
 'learning',
 'and',
 'reviews',
 'competing',
 'solution',
 'paradigms.',
 'In',
 'order',
 'to',
 'compare',
 'the',
 'relative',
 'merits',
 'of',
 'various',
 'techniques,',
 'this',
 'survey',
 'presents',
 'a',
 'case',
 'study',
 'of',
 'the',
 'Linear',
 'Quadratic',
 'Regulator',
 '(LQR)',
 'with',
 'unknown

In [234]:
with open('output_x.txt', 'w') as f:
    f.write(lxml.html.tostring(x, method='text', encoding='unicode'))

In [117]:
x

'asdf;lkjdsaf'

In [None]:
# extract math content from a document
formulaRegexp = re.compile(r'<math(.*?)>(.*?)</math>', re.DOTALL)
formulae = formulaRegexp.findall(doc.body)

doc.formulae = [(s[0], re.sub(r'\n\s+', '', s[1])) for s in formulae]
doc.identifiers=[re.findall(r'<mi>(.*?)</mi>', formula[1]) for formula in doc.formulae]

In [109]:
print(len(doc.body))


804995


In [None]:
with open('output1.txt', 'w') as f:
    f.write(' '.join(doc.body.split()))

In [None]:
tree = lxml.html.parse(document_path)
html = tree.getroot()

path_w = 'output.txt'
with open(path_w, mode='w') as f:
    f.write('## msub ##\n')
    msub_list = []
    for math in html.cssselect('msub'):
        math_txt = ''
        for math_ in math.cssselect('mi'):
            math_txt = '_'.join([x.text_content() for x in x])
            # math_txt = '_'.join(math_txt)
            # math_txt = math_.text.strip()
            # math_txt += '_'
            
        if not math_txt[:-1] in msub_list:
            msub_list.append(math_txt[:-1])
        
    f.write('\n'.join(msub_list))
    f.write('\n')
    f.write('------------------------\n')

    f.write('## msup ##\n')
    msup_list = []
    for math in html.cssselect('msup'):
        math_txt = ''
        for math_ in math.cssselect('mi'):
            math_txt += math_.text.strip()
            math_txt += '_'
        if not math_txt[:-1] in msup_list:
            msup_list.append(math_txt[:-1])
    f.write('\n'.join(msup_list))
    f.write('\n')
    f.write('------------------------\n')

    f.write('## msubsup ##\n')
    msubsup_list = []
    for math in html.cssselect('msubsup'):
        math_txt = ''
        for math_ in math.cssselect('mi'):
            math_txt += math_.text.strip()
            math_txt += '_'
        if not math_txt[:-1] in msubsup_list:
            msubsup_list.append(math_txt[:-1])
    f.write('\n'.join(msubsup_list))
    f.write('\n')
    f.write('------------------------\n')


In [None]:
tree = lxml.html.parse(document_path)
html = tree.getroot()

In [None]:
a = html.cssselect('math')
# a_0_msub = a[0].cssselect('msub')
# a_0_msub_msup = a[0].cssselect(regex)
# a_0_msub[0].drop_tree()
# a_0_msub_mi = [msub.cssselect('mi') for msub in a_0_msub]
# a_0_msub_mi_x = [a_0_msub_mi_x[i].text for a_0_msub_mi_x in a_0_msub_mi for i, _ in enumerate(a_0_msub_mi_x)]
# print([b_.cssselect('mi').text.strip() for b_ in b[0]])
# a_0 = a_0_msub[0]

In [None]:
a[0].getchildren()[0].getchildren()[1].text_content()

In [None]:
print(a_0_msub[0].text_content())
print('text length : {}'.format(len(html.text_content())))

In [None]:
x = [x for x in a_0_msub[0].iterchildren()]

In [None]:
doc.body.split()

In [None]:
len(x[0].text_content())

In [None]:
a_0_mi = a[0].cssselect('mi')

In [None]:
html_tostring = lxml.html.tostring(a_0_mi[0], encoding='unicode')
print(html_tostring)
print(type(html_tostring))
text = 'the variable is represented by ' + html_tostring + '.'
print(text)

In [None]:
len(a[0].text_content())

In [None]:
a = [a for a in a_0.iterchildren()]                      
print(a[0].text_content())

In [37]:
msubsupRegexp = re.compile(r'<msubsup><mi>(.*?)</mi><m.*?>(.*?)</m.*?><m.*?>(.*?)</m.*?></msubsup>', re.DOTALL)
msupRegexp = re.compile(r'<msup><mi>(.*?)</mi><m.*?>(.*?)</m.*?><m.*?>(.*?)</m.*?></msup>', re.DOTALL)
msubRegexp = re.compile(r'<msup><mi>(.*?)</mi><m.*?>(.*?)</m.*?><m.*?>(.*?)</m.*?></msub>', re.DOTALL)

print('--- msubsup ---')
print([msubsupRegexp.findall(formula[1]) for formula in doc.formulae])

print('--- msup ---')
print([msupRegexp.findall(formula[1]) for formula in doc.formulae])

print('--- msub ---')
# このプログラム，msubを抽出するときに数十秒で計算が終わらない．
# jupyter notebookでプログレスバーを出しながら計算するか．
# print([msubRegexp.findall(formula[1]) for formula in tqdm(doc.formulae)])
# print(formulae[0][1])

In [44]:
tree = lxml.html.parse(document_path)
html = tree.getroot()

path_w = 'output.txt'
with open(path_w, mode='w') as f:
    f.write('## msub ##\n')
    msub_list = []
    for math in html.cssselect('msub'):
        math_txt = ''
        for math_ in math.cssselect('mi'):
            math_txt = '_'.join([x.text_content() for x in x])
            # math_txt = '_'.join(math_txt)
            # math_txt = math_.text.strip()
            # math_txt += '_'
            
        if not math_txt[:-1] in msub_list:
            msub_list.append(math_txt[:-1])
        
    f.write('\n'.join(msub_list))
    f.write('\n')
    f.write('------------------------\n')

    f.write('## msup ##\n')
    msup_list = []
    for math in html.cssselect('msup'):
        math_txt = ''
        for math_ in math.cssselect('mi'):
            math_txt += math_.text.strip()
            math_txt += '_'
        if not math_txt[:-1] in msup_list:
            msup_list.append(math_txt[:-1])
    f.write('\n'.join(msup_list))
    f.write('\n')
    f.write('------------------------\n')

    f.write('## msubsup ##\n')
    msubsup_list = []
    for math in html.cssselect('msubsup'):
        math_txt = ''
        for math_ in math.cssselect('mi'):
            math_txt += math_.text.strip()
            math_txt += '_'
        if not math_txt[:-1] in msubsup_list:
            msubsup_list.append(math_txt[:-1])
    f.write('\n'.join(msubsup_list))
    f.write('\n')
    f.write('------------------------\n')


TypeError: sequence item 0: expected str instance, list found

In [165]:
tree = lxml.html.parse(document_path)
html = tree.getroot()

In [166]:
a = html.cssselect('math')
# a_0_msub = a[0].cssselect('msub')
# a_0_msub_msup = a[0].cssselect(regex)
# a_0_msub[0].drop_tree()
# a_0_msub_mi = [msub.cssselect('mi') for msub in a_0_msub]
# a_0_msub_mi_x = [a_0_msub_mi_x[i].text for a_0_msub_mi_x in a_0_msub_mi for i, _ in enumerate(a_0_msub_mi_x)]
# print([b_.cssselect('mi').text.strip() for b_ in b[0]])
# a_0 = a_0_msub[0]

In [216]:
a[0].getchildren()[0].getchildren()[1].text_content()

'subscript𝑥𝑡1subscript𝑓𝑡subscript𝑥𝑡subscript𝑢𝑡subscript𝑒𝑡'

In [51]:
print(a_0_msub[0].text_content())
print('text length : {}'.format(len(html.text_content())))

xt+1
text length : 135239


In [54]:
x = [x for x in a_0_msub[0].iterchildren()]

In [218]:
doc.body.split()

['<div',
 'class="ltx_page_main">',
 '<div',
 'class="ltx_page_content">',
 '<article',
 'class="ltx_document',
 'ltx_authors_1line">',
 '<h1',
 'class="ltx_title',
 'ltx_title_document">A',
 'Tour',
 'of',
 'Reinforcement',
 'Learning',
 '<br',
 'class="ltx_break"><span',
 'id="id1"',
 'class="ltx_text"',
 'style="font-size:144%;">The',
 'View',
 'from',
 'Continuous',
 'Control</span>',
 '</h1>',
 '<div',
 'class="ltx_authors">',
 '<span',
 'class="ltx_creator',
 'ltx_role_author">',
 '<span',
 'class="ltx_personname">Benjamin',
 'Recht',
 '<br',
 'class="ltx_break">Department',
 'of',
 'Electrical',
 'Engineering',
 'and',
 'Computer',
 'Sciences',
 '<br',
 'class="ltx_break">University',
 'of',
 'California,',
 'Berkeley',
 '</span></span>',
 '</div>',
 '<div',
 'class="ltx_date',
 'ltx_role_creation">June',
 '25,',
 '2018</div>',
 '<div',
 'class="ltx_abstract">',
 '<h6',
 'class="ltx_title',
 'ltx_title_abstract">Abstract</h6>',
 '<p',
 'id="id2"',
 'class="ltx_p">This',
 'manusc

In [65]:
len(x[0].text_content())

1

In [96]:
a_0_mi = a[0].cssselect('mi')

In [108]:
html_tostring = lxml.html.tostring(a_0_mi[0], encoding='unicode')
print(html_tostring)
print(type(html_tostring))
text = 'the variable is represented by ' + html_tostring + '.'
print(text)

<mi id="S2.p2.1.m1.1.10" xref="S2.p2.1.m1.1.10.cmml">u</mi>
<class 'str'>
the variable is represented by <mi id="S2.p2.1.m1.1.10" xref="S2.p2.1.m1.1.10.cmml">u</mi>.


In [92]:
len(a[0].text_content())

1

In [70]:
a = [a for a in a_0.iterchildren()]                      
print(a[0].text_content())

x


In [8]:
msubsupRegexp = re.compile(r'<msubsup><mi>(.*?)</mi><m.*?>(.*?)</m.*?><m.*?>(.*?)</m.*?></msubsup>', re.DOTALL)
msupRegexp = re.compile(r'<msup><mi>(.*?)</mi><m.*?>(.*?)</m.*?><m.*?>(.*?)</m.*?></msup>', re.DOTALL)
msubRegexp = re.compile(r'<msup><mi>(.*?)</mi><m.*?>(.*?)</m.*?><m.*?>(.*?)</m.*?></msub>', re.DOTALL)

print('--- msubsup ---')
print([msubsupRegexp.findall(formula[1]) for formula in doc.formulae])

print('--- msup ---')
print([msupRegexp.findall(formula[1]) for formula in doc.formulae])

print('--- msub ---')
# このプログラム，msubを抽出するときに数十秒で計算が終わらない．
# jupyter notebookでプログレスバーを出しながら計算するか．
# print([msubRegexp.findall(formula[1]) for formula in tqdm(doc.formulae)])
# print(formulae[0][1])

--- msubsup ---
[[], [], [], [], [('T', '𝐯', '<mo>-')], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
--- msup ---
[[], [], [], [], [], [], [], [('R', 'T', '=')], [], [], [('x', '′', '<mtd columnalign="center"><msup><mi>y')], [], [], [], [], [], [], [], [], [], [], [], [], []]
--- msub ---


解析対象は，LaTeXMLによって生成したXML形式のファイルとする．
アノテーションは著者がmanualで行った．

関数と変数の分離については行わないものとする．
そもそも人が見て判断しても，f(x+b)が関数なのか乗算なのかは文章を読まなければ判断しかねるため．
（f(x)というように１文字のみを引数にとる場合はfが関数であるというルールにしてもいいと思うが）

In [37]:
import xml.etree.ElementTree as ET
with open(document_path, 'r') as documentOpend:
    document = documentOpend.read()

In [86]:
print(document[:1000])

<!DOCTYPE html>
<html lang="en" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
  <head>
    <meta name="generator" content="HTML Tidy for HTML5 for Linux version 5.1.25" />
    <meta charset="utf-8" />
    <meta content="2D_computer_graphics" name="docid" />
    <meta content="articles/math-pages/wpmath0000001.dat" name="datfile" />
    <meta content="1124" name="offset" />
    <title>
      2D computer graphics
    </title>
    <script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_SVG.js" type="text/javascript">
    </script>
    <style type="text/css">
    /*<![CDATA[*/
    code{white-space: pre;}
    /*]]>*/
    </style><!--[if lt IE 9]>
    <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
  <![endif]-->
  </head>
  <body>
    <blockquote>
      <blockquote>
        <h1>
          2D computer graphics
        </h1>
        <hr />
        <p>
          <strong>2D computer graphics</strong> is the <a class="uri" href="computer

In [208]:
tree = ET.parse(document_path)
root = tree.getroot()
# root = ET.fromstring(document)
xmlNameSpace = re.findall('({.*})', root.tag)[0]

In [213]:
for xmlMathElement in root.iter(xmlNameSpace+'math'):
    print(xmlMath.tag)
    for mathElement in xmlMathElement.iter(xmlNameSpace+'mi'):
        print('-->', mathElement.text.strip())

{http://www.w3.org/1999/xhtml}math
--> T
--> δ
{http://www.w3.org/1999/xhtml}math
--> T
--> δ
--> f
--> 𝐯
--> f
--> 𝐯
--> δ
{http://www.w3.org/1999/xhtml}math
--> T
--> 𝐯
--> v
--> x
--> v
--> y
--> v
--> z
{http://www.w3.org/1999/xhtml}math
--> T
--> 𝐯
--> 𝐩
--> v
--> x
--> v
--> y
--> v
--> z
--> p
--> x
--> p
--> y
--> p
--> z
--> p
--> x
--> v
--> x
--> p
--> y
--> v
--> y
--> p
--> z
--> v
--> z
--> 𝐩
--> 𝐯
{http://www.w3.org/1999/xhtml}math
--> T
--> 𝐯
--> T
--> 𝐯
{http://www.w3.org/1999/xhtml}math
--> T
--> 𝐮
--> T
--> 𝐯
--> T
--> 𝐮
--> 𝐯
{http://www.w3.org/1999/xhtml}math
--> R
--> cos
--> θ
--> sin
--> θ
--> sin
--> θ
--> cos
--> θ
{http://www.w3.org/1999/xhtml}math
--> R
--> T
--> R
--> R
{http://www.w3.org/1999/xhtml}math
--> S
--> O
--> n
{http://www.w3.org/1999/xhtml}math
--> R
--> θ
--> cos
--> θ
--> sin
--> θ
--> sin
--> θ
--> cos
--> θ
{http://www.w3.org/1999/xhtml}math
--> x
--> y
--> cos
--> θ
--> sin
--> θ
--> sin
--> θ
--> cos
--> θ
--> x
--> y
{http://www.w3.org/19

In [184]:
for math in root.iter('{http://www.w3.org/1999/xhtml}math'):
    print(math.tag)

{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math
{http://www.w3.org/1999/xhtml}math


In [164]:
for i, child in enumerate(root[1][0][0][8]):
    print(i, child.tag, child.attrib, '\n')

0 {http://www.w3.org/1999/xhtml}a {'href': 'Euclidean_geometry', 'title': 'wikilink'} 
a
1 {http://www.w3.org/1999/xhtml}strong {} 
a
2 {http://www.w3.org/1999/xhtml}a {'href': 'Euclidean_group', 'title': 'wikilink'} 
a
3 {http://www.w3.org/1999/xhtml}a {'href': 'vector_space', 'title': 'wikilink'} 
a
4 {http://www.w3.org/1999/xhtml}a {'href': 'Origin_(mathematics)', 'title': 'wikilink'} 
a
5 {http://www.w3.org/1999/xhtml}a {'href': 'coordinate_system', 'title': 'wikilink'} 
a
6 {http://www.w3.org/1999/xhtml}strong {} 
a
7 {http://www.w3.org/1999/xhtml}a {'href': 'operator_(mathematics)', 'title': 'wikilink'} 
a
8 {http://www.w3.org/1999/xhtml}math {'display': 'inline', 'id': '2D_computer_graphics:0'} 
a
9 {http://www.w3.org/1999/xhtml}math {'display': 'inline', 'id': '2D_computer_graphics:1'} 
a


In [125]:
for child in root[1][0][0][8][8]:
    print(child.tag, child.attrib)

{http://www.w3.org/1999/xhtml}semantics {}


In [131]:
for child in root.findall("{http://www.w3.org/1999/xhtml}math"):
    print(child.tag)

In [33]:
for value in root.iter('value'):
    print(value.attrib)

- identifierの持っている文字情報自体をDefenition retrieval に活用できないか．
- これは，Wikipediaのような汎用的な文書ではなく，化学プロセスを対象とした論文といった，専門性の高い文書だからこそ取れるアプローチであるといえる．
- 前提として，半径をRadiusの頭文字Rを用いて表したり，速度をVelocityの頭文字であるVを用いて表すように，科学文書では，，Identifierを表すのに，Definitionである英単語の一部を用いることが多い，
- 

In [None]:
import xnl.etree.ElementTree as etree