In [None]:
import spacy
from spacy.lang.en import English

# Sentencizer
https://spacy.io/usage/linguistic-features#sbd

In [None]:
# Load the small English pipeline and print the sentences it finds in a
# two-sentence sample text.
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence. This is another sentence.")
# Fail early if the pipeline did not set sentence-start annotations.
assert doc.has_annotation("SENT_START")
for sent in doc.sents:
    print(sent.text)

In [None]:
# Read the test document into a single string. The active path points at the
# plain-text version; the commented-out alternative points at the .vrt version.
name = "../data/Original/iued_test_original.txt"
# name = "../data/Original/iued_test_original.vrt"
# Pin the encoding so the text is decoded the same way on every platform
# instead of depending on the locale default.
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open(name, "r", encoding="utf-8") as myfile:
    # NOTE(review): newlines are removed without inserting a space, so the last
    # word of a line is fused with the first word of the next line unless the
    # file keeps a trailing space -- confirm this is intended.
    data = myfile.read().replace("\n", "")

In [None]:
# Show the text exactly as it was read (newlines already removed).
print(data)

In [None]:
# Run the loaded pipeline on the document and print each detected sentence,
# followed by a "***" separator line.
doc = nlp(data)
# Fail early if the pipeline did not set sentence-start annotations.
assert doc.has_annotation("SENT_START")
for sent in doc.sents:
    print(sent.text)
    print("***")

This gives somewhat accurate results, with some errors after numbers. You can also use a trained model; however, this will not work on uncommon texts.

In [None]:
# Process the document again and print each sentence with a "***" separator.
# NOTE(review): this cell contains the same calls as the previous one (minus the
# annotation check) and does not reconfigure `nlp` -- confirm whether a
# different pipeline was meant to be used here.
doc = nlp(data)
for sent in doc.sents:
    print(sent.text)
    print("***")

Also fails for the example here. Then there is the one based on a statistical model.

In [None]:
# Switch sentence segmentation to the statistical sentence recognizer
# ("senter"). The dependency parser also assigns sentence boundaries, so it is
# disabled first; otherwise doc.sents would still reflect the parser and this
# cell would not show what senter produces (the spaCy docs pair senter with an
# excluded/disabled parser).
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")
doc = nlp(data)
# Print each sentence followed by a "***" separator line.
for sent in doc.sents:
    print(sent.text)
    print("***")

Directly use the sentencizer without the pipeline - this one looks at punctuation.

In [None]:
# Build a blank English pipeline and add only the "sentencizer" component,
# then print each sentence of the document with a "***" separator.
nlp = English()  # just the language with no pipeline
nlp.add_pipe("sentencizer")
doc = nlp(data)
for sent in doc.sents:
    print(sent.text)
    print("***")

Seems to work correctly. How does this differ from the full pipeline? In the DW scripts, the other components are left out via the `exclude` argument, which should be faster because the excluded components are not loaded at all.

In [None]:
# Two sample sentences; `texts` is also used by the following cell.
texts = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

# Reload the full pipeline, stream the texts through it with the listed
# components passed as `disable`, and print (text, label) for each entity.
nlp = spacy.load("en_core_web_sm")
for doc in nlp.pipe(
    texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
):
    # Do something with the doc here
    print([(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Reload the pipeline with the listed components passed as `exclude` at load
# time, then print the sentences of each sample text with a "***" separator.
# This replaces the global `nlp` used by the cells that follow.
nlp = spacy.load(
    "en_core_web_sm", exclude=["tagger", "ner", "attribute_ruler", "lemmatizer"]
)
for doc in nlp.pipe(texts):
    for sent in doc.sents:
        print(sent.text)
        print("***")

# Tokenizer
https://spacy.io/usage/linguistic-features#tokenization  
We need to allow for special case rules. 
```
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)
```

Also, there are custom tokenizer libraries that one may want to load. Probably we would want to keep it so that users can specify their custom tokenizers in addition to the standard one from spaCy.

In [None]:
# Tokenize the document with the current `nlp` and print one token per line.
doc = nlp(data)
for token in doc:
    print(token.text)

# Lemmatizer
https://spacy.io/usage/linguistic-features#lemmatization

needs package spacy_lookups_data to run

In [None]:
# Make sure the current pipeline has a lemmatizer and lemmatize a sample
# sentence. add_pipe() raises if a component with that name already exists
# (e.g. when this cell is run twice or `nlp` was loaded with its own
# lemmatizer), so an existing component is reused instead of added again.
if "lemmatizer" in nlp.component_names:
    lemmatizer = nlp.get_pipe("lemmatizer")
else:
    lemmatizer = nlp.add_pipe("lemmatizer")
print(lemmatizer.mode)  # 'rule'
# lookups=None: no lookup tables are passed in explicitly
# (the notebook notes that the spacy_lookups_data package is needed).
lemmatizer.initialize(lookups=None)
doc = nlp("I was reading the paper.")
print([token.lemma_ for token in doc])

In [None]:
# Process the document with the current pipeline and print the lemma of
# every token as one list.
doc = nlp(data)
print([token.lemma_ for token in doc])

Should punctuation be excluded?

# POS tagger
https://spacy.io/usage/linguistic-features#pos-tagging

In [None]:
# Process a sample sentence and print, per token: text, lemma, coarse POS,
# fine-grained tag, dependency label, shape, and the is_alpha / is_stop flags.
# NOTE(review): the values depend on which components the current `nlp`
# contains -- confirm the tagger/parser are present when this cell runs.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )

In [None]:
# Process the document and print text, coarse POS and fine-grained tag for
# every token.
doc = nlp(data)
for token in doc:
    print(token.text, token.pos_, token.tag_)

# Morphology
https://spacy.io/usage/linguistic-features#morphology

In [None]:
# Show the active pipeline components, then inspect the morphological
# features of the first token of a sample sentence.
print("Pipeline:", nlp.pipe_names)
doc = nlp("I was reading the paper.")
token = doc[0]  # 'I'
print(token.morph)  # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
# Look up a single feature by name.
print(token.morph.get("PronType"))  # ['Prs']

# Constituency

# Collocation

# Word vectors

# Dependency

# Named entities

Needs a spaCy pipeline that supports entity recognition. After running the pipeline, the named entities can be accessed via `Doc.ents`.

In [None]:
# defaultdict is used by the helper functions and matcher cells further down.
from collections import defaultdict

# Reload the full pipeline and print each entity of a sample sentence with
# its character offsets and label.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
# in our case
doc = nlp(data)

for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.label_)

In [None]:
def named_entities_spacy(doc):
    """Group the named entities of a processed spaCy Doc by text and label.

    Args:
        doc: Object exposing ``ents``; every entity must provide ``text``,
            ``label_``, ``label``, ``start_char`` and ``end_char``.

    Returns:
        defaultdict(list): Keys have the form
        ``"Text: <text> |Label: <label_> |IOB: <label>"`` (the slot named
        "IOB" is filled with ``ent.label``). Each value holds one
        ``[start_char, end_char]`` pair per occurrence of that entity.
    """
    key_template = "Text: {} |Label: {} |IOB: {}"
    grouped = defaultdict(list)

    for entity in doc.ents:
        # identical text/label combinations share one key and collect spans
        key = key_template.format(entity.text, entity.label_, entity.label)
        grouped[key].append([entity.start_char, entity.end_char])

    return grouped

In [None]:
# Collect the entities of the current `doc` into the grouped dictionary.
named_ent = named_entities_spacy(doc)

In [None]:
# Dump the whole entity dictionary.
print(named_ent)

In [None]:
# Print every occurrence of one entity with up to 20 characters of context on
# each side; the key is hard-coded to the format built by named_entities_spacy.
for idx in named_ent["Text: Audi |Label: ORG |IOB: 383"]:
    # Clamp the left edge: a negative slice start would count from the end of
    # the string and give an empty or wrong snippet for entities that start
    # within the first 20 characters.
    print(data[max(idx[0] - 20, 0) : idx[1] + 20])

In [None]:
# Show the first token of the current `doc`.
print(doc[0])

# Unique Tokens or smthg :)


Nicht das was named entities eigentlich will...
Für einzelne Tokens: [Matcher](https://spacy.io/usage/rule-based-matching) \
Für ganze Sätze: [Phrasematcher](https://spacy.io/usage/rule-based-matching#phrasematcher) \
Ich schätze für den Moment sind wir nur an einzelnen Tokens interessiert? Oder an allen einzigartigen Token im ganzen Text?

In [None]:
from spacy.matcher import Matcher

# Rule-based token search on the same document as above: build the patterns,
# run the matcher and group the hit positions by the matched text.

nlp = spacy.load("en_core_web_sm")
# initialize the matcher; the vocab has to be the same as for the text
matcher = Matcher(nlp.vocab)

# ideally the user specifies what to search for and which attribute to use?
terms = [[{"LOWER": "audi"}], [{"LOWER": "improvements"}], [{"LOWER": "parking"}]]

# also supports regular expressions:
terms += [[{"TEXT": {"REGEX": r"^[Ii](\.?|f)$"}}]]  # matches I, i, I., i., If, if

print("Query: {}".format(terms))

# add the terms to look for to the matcher
matcher.add("Query", terms)

# load the data into doc
doc = nlp(data)
# run the matcher on the text in doc
matches = matcher(doc)

# get the indices (would correspond to corpus position from cwb?)
indices = [[start, end] for _, start, end in matches]

print(indices)

dict_out = defaultdict(
    list
)  # defaultdict creates an empty list for every new key,
# which we can then append to

# Store the found index pairs in a dictionary keyed by the matched text.
# We have to go through all the hits to find out which term they correspond to...
# For large texts with many hits, would it be faster to split the query beforehand? Maybe parallel searching?
for index_pair in indices:
    dict_out["{}".format(doc[index_pair[0] : index_pair[1]])].append(index_pair)

# display the output
for key in dict_out:
    print("{} found at location {}.".format(key, dict_out[key]))

# for index in indices:
#    print('{} found at index location {}'.format(doc[index], index))

# print(matches)

# for match_id, start, end in matches:
#    string_id = nlp.vocab.strings[match_id] #Get string representation
#    span = doc[start:end] # matched span
#    print(match_id, string_id, start, end, span.text)

In [None]:
# can also search for words of a certain length or above/below certain lengths
# (a fresh matcher is created; `nlp` and `doc` come from the previous cell)
matcher = Matcher(nlp.vocab)

pattern = [[{"LENGTH": {"==": 10}}]]  # , [{"LENGTH":{"<=":1}}], [{"LENGTH":{">=":12}}]]

matcher.add("Query", pattern)
matches = matcher(doc)

# one [start, end] pair per match
indices = [[start, end] for _, start, end in matches]

dict_out = defaultdict(list)

# group the index pairs by the matched text
for index_pair in indices:
    dict_out["{}".format(doc[index_pair[0] : index_pair[1]])].append(index_pair)

for key in dict_out:
    print("{} found at location {}.".format(key, dict_out[key]))

In [None]:
# search for a token pattern

matcher = Matcher(nlp.vocab)

# search for the different types of cars with a "wildcard token pattern",
# leaving the last token empty
pattern = [[{"ORTH": "Audi"}, {"ORTH": "A"}, {}]]

matcher.add("Query", pattern)

matches = matcher(doc)

# One [start, end] pair per match. The list is built exactly once so that
# every hit is reported a single time.
indices = [[start, end] for _, start, end in matches]

dict_out = defaultdict(list)

# group the index pairs by the matched text
for index_pair in indices:
    dict_out["{}".format(doc[index_pair[0] : index_pair[1]])].append(index_pair)

for key in dict_out:
    print("{} found at location {}.".format(key, dict_out[key]))

In [None]:
# put it in a function


def search_text(query, nlp, doc):
    """Run a Matcher query on ``doc`` and group the hits by matched text.

    Args:
        query: List of token patterns, registered under the key "Query".
        nlp: Pipeline whose ``vocab`` is used to build the matcher.
        doc: Processed document to search.

    Returns:
        defaultdict(list): Matched text mapped to the list of ``[start, end]``
        pairs (the bounds used to slice ``doc``) of every hit with that text.
    """
    token_matcher = Matcher(nlp.vocab)
    token_matcher.add("Query", query)

    hits = defaultdict(list)
    for _match_id, first, last in token_matcher(doc):
        matched_text = "{}".format(doc[first:last])
        hits[matched_text].append([first, last])

    return hits

In [None]:
# Try search_text with the wildcard pattern alone ...
query = [[{"ORTH": "Audi"}, {"ORTH": "A"}, {}]]

test1 = search_text(query, nlp, doc)

print(test1)

print("*" * 50)

# ... and again after extending the query in place with the patterns in `terms`
# can just add different queries together
query += terms

test2 = search_text(query, nlp, doc)

print(test2)

In [None]:
def get_unique(doc):
    """Return the set of distinct token texts found in ``doc`` (case sensitive)."""
    return {token.text for token in doc}

In [None]:
def search_token(query, nlp, doc):
    """Search ``doc`` for a token pattern and return all found locations.

    Args:
        query: List of token patterns, registered under the key "Query".
        nlp: Pipeline whose ``vocab`` is used to build the matcher.
        doc: Processed document to search.

    Returns:
        defaultdict(list): A single entry keyed by the text of the first hit.
        Its value is a one-element list that contains the list of all
        ``[start, end]`` pairs. The dictionary is empty when nothing matches.
    """
    matcher = Matcher(nlp.vocab)
    matcher.add("Query", query)

    indices = [[start, end] for _, start, end in matcher(doc)]

    dict_out = defaultdict(list)

    # Without any hit there is no first match to take the key from; return the
    # empty mapping instead of failing on indices[0].
    if not indices:
        return dict_out

    first_start, first_end = indices[0]
    dict_out["{}".format(doc[first_start:first_end])].append(indices)

    return dict_out

In [None]:
def unique_tokens(nlp, doc):
    """Get the locations of all unique words in ``doc``, case sensitive.

    The document is walked once instead of running a separate matcher query
    for every distinct token text.

    Args:
        nlp: Unused; kept so existing calls ``unique_tokens(nlp, doc)`` work.
        doc: Iterable of tokens exposing ``text``.

    Returns:
        defaultdict(list): Token text mapped to a one-element list holding the
        list of ``[start, end]`` pairs (``end = start + 1``) of every
        occurrence, in document order. Empty for an empty ``doc``.
    """
    from collections import defaultdict

    # collect the position of every occurrence per token text
    positions = defaultdict(list)
    for index, token in enumerate(doc):
        positions[token.text].append([index, index + 1])

    # keep the value layout callers already receive: all pairs of a token are
    # wrapped together in one outer list
    locations = defaultdict(list)
    for text, pairs in positions.items():
        locations[text].append(pairs)

    return locations

In [None]:
# Build the token -> locations mapping for the current `doc`.
unique_tok = unique_tokens(nlp, doc)

In [None]:
# Print every key of the dictionary together with its stored locations.
# NOTE(review): this iterates `named_ent`, although `unique_tok` was computed
# in the cell just before -- confirm which dictionary was meant.
for key in named_ent:
    print("{}: {}".format(key, named_ent[key]))

In [None]:
# Number of keys in the dictionary.
# NOTE(review): counts `named_ent`; possibly `unique_tok` was meant -- confirm.
print(len(named_ent))

In [None]:
# Look up the locations of the token "the". The token mapping is the one keyed
# by plain token text; the keys of named_ent have the form
# "Text: ... |Label: ... |IOB: ..." and can never equal "the".
print(unique_tok["the"])