In [3]:
from sagas.nlu.stanza_helper import get_nlp
from spacy_stanza import StanzaLanguage

# Build the English pipeline via the project helper (a commented-out alternative
# in the original used stanza.Pipeline(lang="en") directly) and wrap it so it
# can be called like a spaCy Language object.
snlp = get_nlp('en')
nlp = StanzaLanguage(snlp)

doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")

# One line per token: text, lemma, POS tag, dependency label, entity type.
# The entity type is an empty string for tokens outside any entity.
for token in doc:
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.dep_,
        token.ent_type_,
    )

# Separator, then the entity spans found in the document.
print('*' * 25)
print(doc.ents)

Barack Barack PROPN nsubj:pass PERSON
Obama Obama PROPN flat PERSON
was be AUX aux:pass 
born bear VERB root 
in in ADP case 
Hawaii Hawaii PROPN obl GPE
. . PUNCT punct 
He he PRON nsubj:pass 
was be AUX aux:pass 
elected elect VERB root 
president president PROPN xcomp 
in in ADP case 
2008 2008 NUM obl DATE
. . PUNCT punct 
*************************
(Barack Obama, Hawaii, 2008)


In [2]:
from sagas.nlu.stanza_helper import get_nlp
from spacy_stanza import StanzaLanguage

# Same walkthrough as the English cell, this time with the Russian pipeline.
snlp = get_nlp('ru')
nlp = StanzaLanguage(snlp)

doc = nlp('У Вас есть сигареты?')

# One line per token: text, lemma, POS tag, dependency label, entity type.
for token in doc:
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.dep_,
        token.ent_type_,
    )

# Separator, then the entity spans found in the document.
print('*' * 25)
print(doc.ents)

У у ADP case 
Вас вы PRON obl MISC
есть быть VERB root MISC
сигареты сигарета NOUN nsubj MISC
? ? PUNCT punct 
*************************
(Вас есть сигареты,)


In [5]:
def spacy_doc(sents, lang, simple=True):
    """Run ``sents`` through the model that ``spacy_mgr`` provides for ``lang``.

    Args:
        sents: Text handed to the model callable.
        lang: Language code forwarded to ``spacy_mgr.get_model``.
        simple: Forwarded unchanged as the ``simple`` keyword of ``get_model``;
            its meaning is defined by ``spacy_mgr`` and is not visible here.

    Returns:
        Whatever the model callable returns for ``sents``
        (presumably a spaCy ``Doc`` -- confirm against ``spacy_mgr``).
    """
    # Imported inside the function, so the helper module is only loaded at call time.
    from sagas.nlu.spacy_helper import spacy_mgr

    model = spacy_mgr.get_model(lang, simple=simple)
    return model(sents)
# Parse a sample query with the English model and list (text, label) for each entity span.
doc = spacy_doc("do you have main fax number", 'en')
print([(span.text, span.label_) for span in doc.ents])

[]


In [37]:
import spacy
from spacy.pipeline import EntityRuler
from sagas.nlu.spacy_helper import spacy_mgr

# Wrap the pipeline from get_nlp in StanzaLanguage so spaCy pipeline components
# can be attached. (The original cell also noted spacy_mgr.get_model('en') and
# spacy.load("en_core_web_sm") as commented-out alternatives.)
snlp = get_nlp('en')
nlp = StanzaLanguage(snlp)

# phrase_matcher_attr='LOWER' gives case-insensitive phrase matching, so the
# pattern can be written in any casing ("main fax number" or "Main Fax Number").
ruler = EntityRuler(nlp, phrase_matcher_attr='LOWER')
patterns = [
    {
        "label": "typ",
        "pattern": "Main Fax Number",
        'id': "ContactMechPurposeType.description.FAX_NUMBER",
    },
]
ruler.add_patterns(patterns)

# Register the ruler so it runs whenever `nlp` processes a text.
nlp.add_pipe(ruler)

In [38]:
# Only the text is passed: in spaCy 2.x the second positional parameter of
# Language.__call__ is `disable` (pipe names to skip), not a language code.
doc = nlp("do you have main fax number")

# (text, label, pattern id) for every entity span found in the query.
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents])

[('main fax number', 'typ', 'ContactMechPurposeType.description.FAX_NUMBER')]


In [41]:
# Only the text is passed: in spaCy 2.x the second positional parameter of
# Language.__call__ is `disable` (pipe names to skip), not a language code.
doc = nlp("Do you have Main Fax Number")

# (text, label, pattern id) for every entity span found in the query.
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents])

[('Main Fax Number', 'typ', 'ContactMechPurposeType.description.FAX_NUMBER')]


In [39]:
from spacy import displacy
# Render the current `doc` with displaCy's "dep" style.
displacy.render([doc], style="dep")

In [21]:
from spacy.util import filter_spans
# Gather entity spans and noun chunks, then let filter_spans reduce them to a
# set that can be merged safely (the two sources may overlap).
spans = [*doc.ents, *doc.noun_chunks]
spans = filter_spans(spans)

# Collapse each remaining span into a single token. This mutates `doc` in place.
with doc.retokenize() as retokenizer:
    for span in spans:
        retokenizer.merge(span)

In [23]:
# Render `doc` again with the "dep" style; presumably meant to run after the
# retokenize cell so merged spans appear as single tokens -- TODO confirm run order.
displacy.render([doc], style="dep")

In [26]:
# (index, text, head index) for every token in `doc`. The head is read from the
# comprehension variable `tok`, not from a `token` left over by an earlier loop.
print([(tok.i, tok.text, tok.head.i) for tok in doc])

[(0, 'do', 2), (1, 'you', 2), (2, 'have', 2), (3, 'main fax number', 2)]


In [34]:
import srsly
from pathlib import Path

# Serialize a small dict to a JSON string with srsly and show it.
data = {"foo": "bar", "baz": 123}
json_string = srsly.json_dumps(data)
print(json_string)

# If a pattern file sits next to the notebook, print each entry's label and the
# entry itself. Nothing happens when the file is absent.
# (The entries could instead be handed to ruler.add_patterns.)
depr_patterns_path = Path('patterns.jsonl')
if depr_patterns_path.is_file():
    patterns = srsly.read_jsonl(depr_patterns_path)
    for entry in patterns:
        print(entry['label'], entry)

{"foo":"bar","baz":123}
ORG {'label': 'ORG', 'pattern': 'Apple'}
GPE {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}


In [36]:
# Write the same list of records to disk twice, in the current working directory:
# once through srsly.write_jsonl and once through srsly.write_gzip_json.
data = [{"foo": "bar"}, {"baz": 123}]
srsly.write_jsonl("test.jsonl", data)
srsly.write_gzip_json("test.json.gz", data)