In [25]:
import os
from pathlib import Path
import stanfordnlp
# import projectmir

# The model directory is resolved against whatever directory the kernel was started in.
currentPath = Path.cwd()
MODELS_DIR = os.path.join(currentPath, 'data/stanfordnlp_resources')

# Only 'tokenize,mwt,pos' is requested; lemma and depparse were left out of the
# processor list on purpose (they used to trail the string as a comment).
nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos', models_dir=MODELS_DIR)

doc = nlp("In natural language, words and phrases themselves imply the semantics. In contrast, the meaning of identifiers in mathematical formulae  is  undefined.")

# One tab-separated line per word: surface text, universal POS tag, treebank POS tag.
tagged_lines = [
    f'word: {token.text+" "}\tupos: {token.upos}\txpos: {token.xpos}'
    for sent in doc.sentences
    for token in sent.words
]
print('\n'.join(tagged_lines))
# print(
#     *[f"index: {word.index.rjust(2)}\t\
#     word: {word.text.ljust(11)}\t\
#     governor index: {word.governor}\t\
#     governor: {(sentence.words[word.governor-1].text if word.governor > 0 else 'root').ljust(11)}\t\
#     deprel: {word.dependency_relation}" for sentence in doc.sentences for word in sentence.words ], sep='\n'\
#     )


Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/kato/GoogleDrive/project-mir/data/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/kato/GoogleDrive/project-mir/data/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/kato/GoogleDrive/project-mir/data/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---
word: In 	upos: ADP	xpos: IN
word: natural 	upos: ADJ	xpos: JJ
word: language 	upos: NOUN	xpos: NN
word: , 	upos: PUNCT	xpos: ,
word: words 	upos: NOUN	xpos: NNS
word: and 	upos: CCONJ	xpos: CC
word: phrases 	upos: NOUN	xpos: NNS
word: themselves 	upos: PRON	xpos: PRP
word: imply 	upos: VERB	xpos: VBP
word: the 	upos: DET	xpos: DT
word: semantics 	upos: NOUN	xpos: NNS
word: . 	upos: PUNCT	xpos: .
word: In 	upos: ADP	xpos: IN


In [62]:
from stanfordnlp.server import CoreNLPClient

# Walk-through of the CoreNLP client: annotate a short text, print several views of
# the first sentence, then run a TokensRegex and a Semgrex query over the same text.
print('---')
print('input text')
print('')

text = "Chris Manning is a nice person. Chris wrote a simple sentence. He also gives oranges to people."

print(text)

print('---')
print('starting up Java Stanford CoreNLP Server...')

# Entering the `with` block is what launches the Java server (see the
# "Starting server with command" line in the recorded output).
with CoreNLPClient(annotators=['tokenize','ssplit','pos','lemma','ner','parse','depparse','coref'], timeout=60000, memory='16G') as client:
    # submit the request to the server
    ann = client.annotate(text)

    # get the first sentence
    sentence = ann.sentence[0]

    # get the dependency parse of the first sentence
    print('---')
    print('dependency parse of first sentence')
    dependency_parse = sentence.basicDependencies
    print(dependency_parse)

    # get the constituency parse of the first sentence
    print('---')
    print('constituency parse of first sentence')
    constituency_parse = sentence.parseTree
    print(constituency_parse)

    # get the first subtree of the constituency parse
    print('---')
    print('first subtree of constituency parse')
    print(constituency_parse.child[0])

    # get the value of the first subtree
    print('---')
    print('value of first subtree of constituency parse')
    print(constituency_parse.child[0].value)

    # get the first token of the first sentence
    print('---')
    print('first token of first sentence')
    token = sentence.token[0]
    print(token)

    # get the part-of-speech tag
    print('---')
    print('part of speech tag of token')
    print(token.pos)

    # get the named entity tag
    print('---')
    print('named entity tag of token')
    print(token.ner)

    # get an entity mention from the first sentence
    print('---')
    print('first entity mention in sentence')
    print(sentence.mentions[0])

    # access the coref chain
    print('---')
    print('coref chains for the example')
    print(ann.corefChain)

    # Use tokensregex patterns to find who wrote a sentence.
    pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/'
    matches = client.tokensregex(text, pattern)
    # "sentences" holds one entry per input sentence.
    assert len(matches["sentences"]) == 3
    # "length" is the number of matches found in that sentence.
    assert matches["sentences"][1]["length"] == 1
    # Matches are addressed like regex groups: "0" is the first match, "1" its first
    # capture group. These were bare comparisons whose results were thrown away;
    # they are asserted now so a wrong value actually fails the cell.
    assert matches["sentences"][1]["0"]["text"] == "Chris wrote a simple sentence"
    assert matches["sentences"][1]["0"]["1"]["text"] == "Chris"

    # Use semgrex patterns to directly find who wrote what.
    pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object'
    matches = client.semgrex(text, pattern)
    # "sentences" holds one entry per input sentence.
    assert len(matches["sentences"]) == 3
    # "length" is the number of matches found in that sentence.
    assert matches["sentences"][1]["length"] == 1
    # Named nodes from the pattern are exposed under "$<name>"; asserted for the
    # same reason as the tokensregex checks above.
    assert matches["sentences"][1]["0"]["text"] == "wrote"
    assert matches["sentences"][1]["0"]["$subject"]["text"] == "Chris"
    assert matches["sentences"][1]["0"]["$object"]["text"] == "sentence"

---
 input text

Chris Manning is a nice person. Chris wrote a simple sentence. He also gives oranges to people.
---
starting up Java Stanford CoreNLP Server...
Starting server with command: java -Xmx16G -cp /Users/kato/GoogleDrive/project-mir/data/resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-8d99acf434404263.props -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,coref
---
dependency parse of first sentence
node {
  sentenceIndex: 0
  index: 1
}
node {
  sentenceIndex: 0
  index: 2
}
node {
  sentenceIndex: 0
  index: 3
}
node {
  sentenceIndex: 0
  index: 4
}
node {
  sentenceIndex: 0
  index: 5
}
node {
  sentenceIndex: 0
  index: 6
}
node {
  sentenceIndex: 0
  index: 7
}
edge {
  source: 2
  target: 1
  dep: "compound"
  isExtra: false
  sourceCopy: 0
  targetCopy: 0
  language: UniversalEnglish
}
edge {
  source: 6
  target: 2

In [1]:
import time
import re

In [2]:
class WikiDocument:
    """Plain record describing one wiki document.

    The constructor takes no arguments and computes nothing: every field is
    created empty, to be filled in by whoever builds the instance.
    """

    def __init__(self):
        # Text-valued fields all start as the empty string.
        self.raw = self.title = self.namespace = ''
        # The id starts as the integer 0.
        self.document_id = 0
        self.text = self.body = ''
        # formulae: starts as an empty list, meant to hold the formulas found in
        #   the document. The original note sketches entries shaped like
        #   {'hash': <replacement string>, 'src': <TeX string>}.
        # identifiers: starts as an empty list, meant to hold the identifiers
        #   found in the document.
        # Each instance gets its own two list objects.
        self.formulae, self.identifiers = [], []
        
# documentPath is one of the following:
# data/_project-mlp-develop/src/test/resources/wikienmathsample.xml
# data/_project-mlp-develop/src/test/resources/augmentendwikitext.xml
# data/_project-mlp-develop/src/test/resources/chartest.xml
# data/resources/NTCIR12_MathIR_WikiCorpus_v2.1.0/MathTagArticles/wpmath0000001/Articles/2D_computer_graphics.html

def DocumentProcessor(documentPath, encoding='utf-8'):
    """Read a document file and extract its first title, namespace, revision id, text and body.

    The whole file is read into memory and searched with one regular expression
    per field. Only the first occurrence of each field is used; a field with no
    match keeps the default that ``WikiDocument()`` gives it. The elapsed time is
    printed before returning.

    Args:
        documentPath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        A ``WikiDocument`` with ``title``, ``namespace``, ``document_id``, ``text``
        and ``body`` filled in where a match was found. ``document_id`` is stored
        as the matched digit string, not converted to ``int``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    # perf_counter is monotonic, so the reported duration cannot go negative or
    # jump if the wall clock is adjusted while the file is being processed.
    start_time = time.perf_counter()

    # WikiDocument attribute -> pattern whose single capture group is the value.
    field_patterns = {
        'title': r'(?:<title>)(.*?)(?:</title>)',
        'namespace': r'(?:<ns>)(.*?)(?:</ns>)',
        # The id that follows <revision>, not the first <id> in the file.
        'document_id': r'(?:<revision>.*?<id>)(\d+)(?:</id>)',
        'text': r'(?:<text.*?>)(.*?)(?:</text>)',
        'body': r'(?:<body.*?>)(.*?)(?:</body>)',
    }

    with open(documentPath, 'r', encoding=encoding) as handle:
        document = handle.read()

    doc = WikiDocument()
    for field, pattern in field_patterns.items():
        # Only the first match is ever used, so stop at it instead of collecting
        # every match in the file.
        match = re.search(pattern, document, re.DOTALL)
        if match:
            setattr(doc, field, match.group(1))

    print("run time--- {0:.7f} seconds ---".format(time.perf_counter()-start_time))
    return doc


In [3]:
# docs = DocumentProcessor('data/_project-mlp-develop/src/test/resources/wikienmathsample.xml')
doc = DocumentProcessor('data/resources/NTCIR12_MathIR_WikiCorpus_v2.1.0/MathTagArticles/wpmath0000001/Articles/2D_computer_graphics.html')

# Collect every <math ...>...</math> element of the body as an
# (attribute string, inner markup) pair.
formulaRegexp = re.compile(r'<math(.*?)>(.*?)</math>', re.DOTALL)
formulae = formulaRegexp.findall(doc.body)

# Store the same pairs on the document, with each newline-plus-indentation run
# removed from the inner markup; `formulae` itself keeps the raw markup.
doc.formulae = [
    (math_attrs, re.sub(r'\n\s+', '', math_markup))
    for math_attrs, math_markup in formulae
]

# NOTE(review): only the first formula is scanned for <mi> contents, duplicates are
# kept, and an empty formula list would raise IndexError here. Confirm this is intended.
first_markup = doc.formulae[0][1]
doc.identifiers.extend(re.findall(r'<mi>(.*?)</mi>', first_markup))


run time--- 0.0056109 seconds ---


In [4]:
# [re.findall(r'<msub><mi>(.*?)</mi><m.*?>(.*?)</m.*?></msub>', formula[1]) for formula in doc.formulae]

# For each stored formula, list the (base, second child, third child) captures of
# every <msubsup> whose first child is an <mi>.
# NOTE(review): the recorded output for this cell has '<mo>-' in the third slot,
# which suggests nested child elements are not captured whole. Verify before relying on it.
msubsup_pattern = re.compile(r'<msubsup><mi>(.*?)</mi><m.*?>(.*?)</m.*?><m.*?>(.*?)</m.*?></msubsup>')
[msubsup_pattern.findall(markup) for _attrs, markup in doc.formulae]

[[],
 [],
 [],
 [],
 [('T', '𝐯', '<mo>-')],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [5]:
# Show the inner markup of the first <math> match exactly as captured, i.e. the raw
# entry from `formulae` with its newlines and indentation still in place.
formulae[0][1]

'\n          <semantics>\n            <msub>\n              <mi>\n                T\n              </mi>\n              <mi>\n                δ\n              </mi>\n            </msub>\n            <annotation-xml encoding="MathML-Content">\n              <apply>\n                <csymbol cd="ambiguous">\n                  subscript\n                </csymbol>\n                <ci>\n                  T\n                </ci>\n                <ci>\n                  δ\n                </ci>\n              </apply>\n            </annotation-xml>\n            <annotation encoding="application/x-tex">\n              T_{\\mathbf{\\delta}}\n            </annotation>\n          </semantics>'