In [None]:
import spacy
from spacy.lang.en import English

# Sentencizer
https://spacy.io/usage/linguistic-features#sbd

In [None]:
# Load the small English pipeline and split a two-sentence sample string.
nlp = spacy.load("en_core_web_sm")
sample = "This is a sentence. This is another sentence."
doc = nlp(sample)
# The doc must carry sentence-start annotation before doc.sents is iterated.
assert doc.has_annotation("SENT_START")
for sentence in doc.sents:
    print(sentence.text)

In [None]:
# Read the whole input file into a single string.
name = "../data/Original/iued_test_original.txt"
#name = "../data/Original/iued_test_original.vrt"
# Decode explicitly as UTF-8 instead of relying on the platform default, so
# non-ASCII characters are read the same way on every machine.
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
with open(name, "r", encoding="utf-8") as myfile:
    # Newlines are removed without a replacement character, so the last word
    # of one line is joined directly to the first word of the next line.
    data = myfile.read().replace('\n', '')

In [None]:
# Display the text that was read from the input file.
print(data)

In [None]:
# Run the pipeline on the loaded text and print every detected sentence.
doc = nlp(data)
# Fail early if no sentence-start annotation was produced.
assert doc.has_annotation("SENT_START")
for sentence in doc.sents:
    # A '***' line after each sentence makes the detected boundaries visible.
    print(sentence.text, '***', sep='\n')

This gives somewhat accurate results, with some errors after numbers. You can also use a trained model; however, it will not work well on uncommon texts.

In [None]:
# Process the loaded text again and print each sentence followed by a
# '***' separator line.
doc = nlp(data)
for span in doc.sents:
    print(f"{span.text}\n***")

This also fails for the example here. There is also a sentence segmenter that is based on a statistical model.

In [None]:
# Segment the loaded text with the statistical "senter" component.
# enable_pipe switches the component on for this nlp object.
nlp.enable_pipe("senter")
# The dependency parser also assigns sentence boundaries. It is switched off
# for this run so that doc.sents reflects the senter's predictions and not
# the parser's; select_pipes restores the parser when the block exits.
with nlp.select_pipes(disable=["parser"]):
    doc = nlp(data)
for sent in doc.sents:
    print(sent.text)
    print('***')

Use the sentencizer directly, without the trained pipeline - this one looks at punctuation.

In [None]:
# Start from the bare English language class (no trained components) and add
# the sentencizer as the only pipeline component.
nlp = English()
nlp.add_pipe("sentencizer")
doc = nlp(data)
for sentence in doc.sents:
    # Each sentence is followed by a '***' separator line.
    print(sentence.text, '***', sep='\n')

This seems to work correctly. What is the difference to the pipeline? In the DW scripts, the other components are excluded via the `exclude` argument of `spacy.load` - this should be faster, as the excluded components are not loaded at all.

In [None]:
# Sample sentences used for the batch-processing examples.
texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

nlp = spacy.load("en_core_web_sm")
# Components passed as `disable` to nlp.pipe for this call.
skipped = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
for doc in nlp.pipe(texts, disable=skipped):
    # Collect (text, label) pairs for the entities found in this doc.
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(entities)

In [None]:
# Reload the pipeline with the listed components excluded at load time, then
# print the sentences of every sample text with a '***' separator line.
excluded = ["tagger", "ner", "attribute_ruler", "lemmatizer"]
nlp = spacy.load("en_core_web_sm", exclude=excluded)
for doc in nlp.pipe(texts):
    for sentence in doc.sents:
        print(f"{sentence.text}\n***")

# Tokenizer
https://spacy.io/usage/linguistic-features#tokenization  
We need to allow for special case rules. 
```
from spacy.symbols import ORTH

special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)
```

Also, there are custom tokenizer libraries that one may want to load. Probably we would want to keep it so that users can specify their custom tokenizers in addition to the standard one from spaCy.

In [None]:
# Tokenize the loaded text and list the token texts, one per line.
doc = nlp(data)
token_texts = (tok.text for tok in doc)
for text in token_texts:
    print(text)

# Lemmatizer
https://spacy.io/usage/linguistic-features#lemmatization

In [None]:
# Reload the full pipeline first: the `nlp` object left over from the sentence
# segmentation experiments was loaded with "lemmatizer" (and the tagger and
# attribute_ruler) in `exclude`, so get_pipe("lemmatizer") would fail on it.
nlp = spacy.load("en_core_web_sm")
lemmatizer = nlp.get_pipe("lemmatizer")
print(lemmatizer.mode)  # 'rule'
# Lemmatize a short sample sentence and show one lemma per token.
doc = nlp("I was reading the paper.")
print([token.lemma_ for token in doc])

In [None]:
# Collect the lemma of every token in the loaded text and print the list.
doc = nlp(data)
lemmas = [tok.lemma_ for tok in doc]
print(lemmas)

Should punctuation be excluded?

# POS tagger
https://spacy.io/usage/linguistic-features#pos-tagging

In [None]:
# Print one row of token attributes per token of a sample sentence.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for tok in doc:
    # Text, lemma, coarse and fine POS tag, dependency label, shape, and the
    # is_alpha / is_stop flags, separated by single spaces.
    columns = (tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
               tok.shape_, tok.is_alpha, tok.is_stop)
    print(*columns)

In [None]:
# Print text, coarse POS tag and fine-grained tag for each token of the
# loaded text.
doc = nlp(data)
for tok in doc:
    print(f"{tok.text} {tok.pos_} {tok.tag_}")

# Morphology
https://spacy.io/usage/linguistic-features#morphology

In [None]:
# Show the active pipeline components, then inspect the morphological
# analysis of the first token of a sample sentence.
print("Pipeline:", nlp.pipe_names)
doc = nlp("I was reading the paper.")
first = doc[0]  # 'I'
morph = first.morph
print(morph)  # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
print(morph.get("PronType"))  # ['Prs']

# Constituency

# Collocation

# Word vectors

# Dependency

# Named entities