In [1]:
import stanfordnlp

MODELS_DIR = '/pi/ai/corenlp'
nlp = stanfordnlp.Pipeline(processors='tokenize,pos', models_dir=MODELS_DIR, treebank='en_ewt', use_gpu=True, pos_batch_size=3000) # Build the pipeline, specify part-of-speech processor's batch size
doc = nlp("Barack Obama was born in Hawaii.") # Run the pipeline on input text
doc.sentences[0].print_tokens() # Look at the result

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/pi/ai/corenlp/en_ewt_models/en_ewt.pretrain.pt', 'batch_size': 3000, 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---
<Token index=1;words=[<Word index=1;text=Barack;upos=PROPN;xpos=NNP;feats=Number=Sing>]>
<Token index=2;words=[<Word index=2;text=Obama;upos=PROPN;xpos=NNP;feats=Number=Sing>]>
<Token index=3;words=[<Word index=3;text=was;upos=AUX;xpos=VBD;feats=Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin>]>
<Token index=4;words=[<Word index=4;text=born;upos=VERB;xpos=VBN;feats=Tense=Past|VerbForm=Part|Voice=Pass>]>
<Token index=5;words=[<Word index=5;text=in;upos=ADP;xpos=IN;feats=_>]>
<Token index=6;words=[<Word index=6;text=Hawaii;upos=PROPN;xpos=NNP;

In [2]:
nlp = stanfordnlp.Pipeline(models_dir=MODELS_DIR, treebank='en_ewt')
doc = nlp("Barack Obama was born in Hawaii.")
print(*[f'text: {word.text+" "}\tlemma: {word.lemma}\tupos: {word.upos}\txpos: {word.xpos}' for sent in doc.sentences for word in sent.words], sep='\n')

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/pi/ai/corenlp/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/pi/ai/corenlp/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---
text: Barack 	lem

In [3]:
doc = nlp("Barack Obama was born in Hawaii.")
print(*[f'text: {word.text+" "}\tlemma: {word.lemma}\tupos: {word.upos}\txpos: {word.xpos}' for sent in doc.sentences for word in sent.words], sep='\n')

text: Barack 	lemma: Barack	upos: PROPN	xpos: NNP
text: Obama 	lemma: Obama	upos: PROPN	xpos: NNP
text: was 	lemma: be	upos: AUX	xpos: VBD
text: born 	lemma: bear	upos: VERB	xpos: VBN
text: in 	lemma: in	upos: ADP	xpos: IN
text: Hawaii 	lemma: Hawaii	upos: PROPN	xpos: NNP
text: . 	lemma: .	upos: PUNCT	xpos: .


In [4]:
doc.sentences[0].print_dependencies()

('Barack', '4', 'nsubj:pass')
('Obama', '1', 'flat')
('was', '4', 'aux:pass')
('born', '0', 'root')
('in', '6', 'case')
('Hawaii', '4', 'obl')
('.', '4', 'punct')


In [23]:
# 分析依赖关系, 自下而上, 可用于抽取指定关系的子节点集合, 比如此例中的'nsubj:pass'和'obl'
# word.governor即为当前word的parent
sent=doc.sentences[0]
for tok in sent.tokens:
    print(tok.index, tok.text, len(tok.words))
    for word in tok.words:
        print('\t', word.text, word.upos,  word.dependency_relation, 
              word.governor, sent.words[word.governor-1].text)
        # word.parent_token.text)

1 Barack 1
	 Barack PROPN nsubj:pass 4 born
2 Obama 1
	 Obama PROPN flat 1 Barack
3 was 1
	 was AUX aux:pass 4 born
4 born 1
	 born VERB root 0 .
5 in 1
	 in ADP case 6 Hawaii
6 Hawaii 1
	 Hawaii PROPN obl 4 born
7 . 1
	 . PUNCT punct 4 born


In [37]:
import sagas

def equals(a,b):
    return str(a)==str(b)

def get_children(sent, word, rs):
    for c in filter(lambda w: equals(w.governor, word.index), sent.words):
        rs.append((c.index, c.text))
        get_children(sent, c, rs)
        
def get_children_list(sent, word, include_self=True):
    rs=[]
    get_children(sent, word, rs)
    result= [w[1] for w in rs]
    if include_self:
        result.append(word.text)
    return result

def get_verb_domain(sent, filters):
    rs=[]
    for word in filter(lambda w: w.upos == "VERB", sent.words):
        # if money.dep_ in ("attr", "dobj"):
        # print(word.index, word.text)
        domains=[]
        for c in filter(lambda w: equals(w.governor, word.index), sent.words):
            # print('\t', c.index, c.text, get_children_list(sent, c))
            domains.append((c.dependency_relation, c.index, c.text, get_children_list(sent, c)))
        rs.append({'verb':word.text, 'index':word.index, 'domains':domains})
    return rs

r=get_verb_domain(sent, ['obl', 'nsubj:pass'])
# print(json.dumps(r, indent=2, ensure_ascii=False))
print(r[0]['verb'], r[0]['index'])
sagas.to_df(r[0]['domains'], ['rel', 'index', 'text', 'children'])

born 4


Unnamed: 0,rel,index,text,children
0,nsubj:pass,1,Barack,"[Obama, Barack]"
1,aux:pass,3,was,[was]
2,obl,6,Hawaii,"[in, Hawaii]"
3,punct,7,.,[.]


In [12]:
print(doc.sentences[0].dependencies_string())

('Barack', '4', 'nsubj:pass')
('Obama', '1', 'flat')
('was', '4', 'aux:pass')
('born', '0', 'root')
('in', '6', 'case')
('Hawaii', '4', 'obl')
('.', '4', 'punct')


In [5]:
def analyse(sents):
    doc = nlp(sents)
    print(*[f'text: {word.text+" "}\tlemma: {word.lemma}\tupos: {word.upos}\txpos: {word.xpos}' for sent in doc.sentences for word in sent.words], sep='\n')
    doc.sentences[0].print_dependencies()

sents='I am a student'
analyse(sents)

text: I 	lemma: I	upos: PRON	xpos: PRP
text: am 	lemma: be	upos: AUX	xpos: VBP
text: a 	lemma: a	upos: DET	xpos: DT
text: student 	lemma: student	upos: NOUN	xpos: NN
('I', '4', 'nsubj')
('am', '4', 'cop')
('a', '4', 'det')
('student', '0', 'root')


In [6]:
analyse('The car is red.')

text: The 	lemma: the	upos: DET	xpos: DT
text: car 	lemma: car	upos: NOUN	xpos: NN
text: is 	lemma: be	upos: AUX	xpos: VBZ
text: red 	lemma: red	upos: ADJ	xpos: JJ
text: . 	lemma: .	upos: PUNCT	xpos: .
('The', '2', 'det')
('car', '4', 'nsubj')
('is', '4', 'cop')
('red', '0', 'root')
('.', '4', 'punct')


In [1]:
import stanfordnlp

MODELS_DIR = '/pi/ai/corenlp'
nlp = stanfordnlp.Pipeline(processors = "tokenize,mwt,lemma,pos", models_dir=MODELS_DIR)
doc = nlp("""The prospects for Britain’s orderly withdrawal from the European Union on March 29 have receded further, even as MPs rallied to stop a no-deal scenario. An amendment to the draft bill on the termination of London’s membership of the bloc obliges Prime Minister Theresa May to renegotiate her withdrawal agreement with Brussels. A Tory backbencher’s proposal calls on the government to come up with alternatives to the Irish backstop, a central tenet of the deal Britain agreed with the rest of the EU.""")

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: pos
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/pi/ai/corenlp/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [2]:
doc.sentences[0].print_tokens()

<Token index=1;words=[<Word index=1;text=The;lemma=the;upos=DET;xpos=DT;feats=Definite=Def|PronType=Art>]>
<Token index=2;words=[<Word index=2;text=prospects;lemma=prospect;upos=NOUN;xpos=NNS;feats=Number=Plur>]>
<Token index=3;words=[<Word index=3;text=for;lemma=for;upos=ADP;xpos=IN;feats=_>]>
<Token index=4;words=[<Word index=4;text=Britain;lemma=Britain;upos=PROPN;xpos=NNP;feats=Number=Sing>]>
<Token index=5;words=[<Word index=5;text=’s;lemma='s;upos=PART;xpos=POS;feats=_>]>
<Token index=6;words=[<Word index=6;text=orderly;lemma=orderly;upos=ADJ;xpos=JJ;feats=Degree=Pos>]>
<Token index=7;words=[<Word index=7;text=withdrawal;lemma=withdrawal;upos=NOUN;xpos=NN;feats=Number=Sing>]>
<Token index=8;words=[<Word index=8;text=from;lemma=from;upos=ADP;xpos=IN;feats=_>]>
<Token index=9;words=[<Word index=9;text=the;lemma=the;upos=DET;xpos=DT;feats=Definite=Def|PronType=Art>]>
<Token index=10;words=[<Word index=10;text=European;lemma=european;upos=ADJ;xpos=JJ;feats=Degree=Pos>]>
<Token index=

In [3]:
import pandas as pd

#extract lemma
def extract_lemma(doc):
    parsed_text = {'word':[], 'lemma':[]}
    for sent in doc.sentences:
        for wrd in sent.words:
            #extract text and lemma
            parsed_text['word'].append(wrd.text)
            parsed_text['lemma'].append(wrd.lemma)
    #return a dataframe
    return pd.DataFrame(parsed_text)

#call the function on doc
extract_lemma(doc)

Unnamed: 0,lemma,word
0,the,The
1,prospect,prospects
2,for,for
3,Britain,Britain
4,'s,’s
5,orderly,orderly
6,withdrawal,withdrawal
7,from,from
8,the,the
9,european,European


In [5]:
#dictionary that contains pos tags and their explanations
pos_dict = {
'CC': 'coordinating conjunction','CD': 'cardinal digit','DT': 'determiner',
'EX': 'existential there (like: \"there is\" ... think of it like \"there exists\")',
'FW': 'foreign word','IN':  'preposition/subordinating conjunction','JJ': 'adjective \'big\'',
'JJR': 'adjective, comparative \'bigger\'','JJS': 'adjective, superlative \'biggest\'',
'LS': 'list marker 1)','MD': 'modal could, will','NN': 'noun, singular \'desk\'',
'NNS': 'noun plural \'desks\'','NNP': 'proper noun, singular \'Harrison\'',
'NNPS': 'proper noun, plural \'Americans\'','PDT': 'predeterminer \'all the kids\'',
'POS': 'possessive ending parent\'s','PRP': 'personal pronoun I, he, she',
'PRP$': 'possessive pronoun my, his, hers','RB': 'adverb very, silently,',
'RBR': 'adverb, comparative better','RBS': 'adverb, superlative best',
'RP': 'particle give up','TO': 'to go \'to\' the store.','UH': 'interjection errrrrrrrm',
'VB': 'verb, base form take','VBD': 'verb, past tense took',
'VBG': 'verb, gerund/present participle taking','VBN': 'verb, past participle taken',
'VBP': 'verb, sing. present, non-3d take','VBZ': 'verb, 3rd person sing. present takes',
'WDT': 'wh-determiner which','WP': 'wh-pronoun who, what','WP$': 'possessive wh-pronoun whose',
'WRB': 'wh-abverb where, when','QF' : 'quantifier, bahut, thoda, kam (Hindi)','VM' : 'main verb',
'PSP' : 'postposition, common in indian langs','DEM' : 'demonstrative, common in indian langs'
}

#extract parts of speech
def extract_pos(doc):
    parsed_text = {'word':[], 'pos':[], 'exp':[]}
    for sent in doc.sentences:
        for wrd in sent.words:
            if wrd.pos in pos_dict.keys():
                pos_exp = pos_dict[wrd.pos]
            else:
                pos_exp = 'NA'
            parsed_text['word'].append(wrd.text)
            parsed_text['pos'].append(wrd.pos)
            parsed_text['exp'].append(pos_exp)
    #return a dataframe of pos and text
    return pd.DataFrame(parsed_text)

#extract pos
extract_pos(doc)

# The output would be a data frame with three columns 
# – word, pos and exp (explanation). 
# The explanation column gives us the most information 
# about the text (and is hence quite useful).

Unnamed: 0,exp,pos,word
0,determiner,DT,The
1,noun plural 'desks',NNS,prospects
2,preposition/subordinating conjunction,IN,for
3,"proper noun, singular 'Harrison'",NNP,Britain
4,possessive ending parent's,POS,’s
5,adjective 'big',JJ,orderly
6,"noun, singular 'desk'",NN,withdrawal
7,preposition/subordinating conjunction,IN,from
8,determiner,DT,the
9,adjective 'big',JJ,European


In [13]:
nlp = stanfordnlp.Pipeline(processors = "tokenize,mwt,lemma,pos,depparse", 
                           models_dir=MODELS_DIR)

doc = nlp("""The prospects for Britain’s orderly withdrawal from the European Union on March 29 have receded further, even as MPs rallied to stop a no-deal scenario. An amendment to the draft bill on the termination of London’s membership of the bloc obliges Prime Minister Theresa May to renegotiate her withdrawal agreement with Brussels. A Tory backbencher’s proposal calls on the government to come up with alternatives to the Irish backstop, a central tenet of the deal Britain agreed with the rest of the EU.""")
# doc=nlp('I am a student')
print(len(doc.sentences))
doc.sentences[0].print_dependencies()

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: pos
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/pi/ai/corenlp/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: depparse
With settings: 
{'model_path': '/pi/ai/corenlp/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/pi/ai/corenlp/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---
3
('The', '2', 'd