# Sentencizer
Is this needed here, given that everything has to be a sentence anyway?

# Tokenizer

# Lemmatizer

# POS Tagger

# Morphology

# Constituency

# Collocation

# Word vectors

# Named entities


In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.tokenization import SegtokSentenceSplitter
from collections import defaultdict

# load the NER model first; creating the sentence does not depend on it
tagger = SequenceTagger.load("ner")
# wrap the example text in a flair Sentence
sentence = Sentence("I love Berlin .")
# apply the model to the sentence
tagger.predict(sentence)

In [None]:
print(sentence)
print("The following NER tags are found:")

# fetch the spans labelled "ner" once, then print them one per line
ner_spans = sentence.get_spans("ner")
for entity in ner_spans:
    print(entity)

In [None]:
# Relative path to the test corpus.
path = "../data/Original/iued_test_original.txt"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open(path, "r", encoding="utf-8") as file:
    # Line breaks are removed (not replaced by spaces), so the whole text
    # ends up on a single line.
    data = file.read().replace("\n", "")

In [None]:
# Print the class docstring of the sentence splitter.
print(SegtokSentenceSplitter.__doc__)

Apparently Flair can only work on individual sentences, it does however provide a function to split text into sentences. For the test-data it seems to be unable to do so, turning it into one long "sentence" instead.  
  
-> The problem seems to be related to the spaces before the punctuation in the text. These seem to prevent the splitter from recognizing the sentences as separate units.  
  
If we have this text split into sentences by a different program, we could also just use those sentences directly.  
  
-> leads to indexing problems as the char positions are then labeled according to the individual sentences, disregarding the original position in the whole text.

In [None]:
# initialize the splitter
splitter = SegtokSentenceSplitter()

# try to split the test-data as it is provided
sentences = splitter.split(data)

# take a look at one long sentence
# NOTE: the loop variable rebinds the global name `sentence` that held the
# "I love Berlin ." example sentence.
for sentence in sentences:
    print(sentence)

In [None]:
# Drop the space in front of " . " so that Flair's sentence splitter can
# recognise the sentence boundaries.
data1 = data.replace(" . ", ". ")

# Split first, then keep only the entries whose len() is not zero.
split_result = splitter.split(data1)
sentences1 = [candidate for candidate in split_result if len(candidate) != 0]

# show the newly split text, one sentence per line
for sentence in sentences1:
    print(sentence)

In [None]:
# Inspect the last sentence: its original text, the length of that text,
# and len() of the Sentence object itself.
last_sentence = sentences1[-1]
last_text = last_sentence.to_original_text()
print(last_text)
print(len(last_text))
print(len(last_sentence))

In [None]:
# Load the "ner-fast" model.
tagger1 = SequenceTagger.load("ner-fast")

# Run the model loaded above on the unsplit text. Previously `tagger` (the
# "ner" model) was called here, so `tagger1` was never actually used.
tagger1.predict(sentences)

In [None]:
# Collect every tagged span, keyed by its text and label.
named_ent = defaultdict(list)

for sentence in sentences:
    for entity in sentence.get_spans():
        # keep only the first whitespace-separated piece of the label's string form
        label = str(entity.labels[0]).split()[0]
        key = "Text: {} |Label: {}".format(entity.text, label)
        # start and end character position of this occurrence
        span = [int(entity.start_pos), int(entity.end_pos)]
        named_ent[key].append(span)

print(named_ent)

Flair seems to be quite slow compared to spaCy and stanza, even when using the supposedly fast 'ner-fast' model. This is because we are working on one really long sentence. If the sentence splitting actually worked, processing should be much faster, as shown in the next cell.

In [None]:
# Tag the list of split sentences with `tagger`, i.e. the model loaded via
# SequenceTagger.load("ner").
tagger.predict(sentences1)

This is significantly faster.

In [None]:
# Same collection as before, but on the split sentences; `idx` is a running
# offset added to the per-sentence positions.
named_ent1 = defaultdict(list)
idx = 0
for sentence in sentences1:
    for entity in sentence.get_spans():
        # keep only the first whitespace-separated piece of the label's string form
        label = str(entity.labels[0]).split()[0]
        key = "Text: {} |Label: {}".format(entity.text, label)
        shifted = [int(entity.start_pos) + idx, int(entity.end_pos) + idx]
        named_ent1[key].append(shifted)
    # advance by the sentence's text length plus two characters
    idx = idx + len(sentence.to_original_text()) + 2

print(named_ent1)

However, the problem of only indexing the individual sentences remains! Do we want to keep track of the index manually as above? If so, we need to be sure that the input data will always have a space before punctuation. We could also strip these spaces beforehand and just work on the "reduced" data.

In [None]:
# Print the tagged-string rendering of every sentence in `sentences`.
for sentence in sentences:
    print(sentence.to_tagged_string())

In [None]:
# for sentence in sentences:
# get the spans for each entity
# for entity in sentence.get_spans():
# print the text associated
# print(entity.text)
# grab the position in the corpus, subtract one to account for
# the indexing starting at 1 in flair (cwb starts at 0)
# pos = [entity.start_pos, entity.end_pos]

# print(pos)
# print(str(entity.labels[0]).split()[0])

In [None]:
# Show the first element of the first sentence (presumably a flair Token).
print(sentences[0][0])

Token Indices start at 1 in flair

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.tokenization import SegtokSentenceSplitter
from collections import defaultdict

In [None]:
def named_entities_flair(data, tagger=None):
    """Collect the named entities flair finds in *data*.

    The text is split into sentences with ``SegtokSentenceSplitter``, empty
    sentences are dropped, and the tagger is applied to the remaining ones.

    Args:
        data: Text to analyse, as a single string.
        tagger: Optional, already loaded tagger exposing ``predict``. When
            omitted, ``SequenceTagger.load("ner")`` is called, which loads
            the model anew on every call.

    Returns:
        A ``defaultdict(list)`` mapping ``"Text: <entity> |Label: <label>"``
        to a list of ``[start, end]`` positions, one pair per occurrence.

    Note:
        Each span position is shifted by a running offset that grows by
        ``len(sentence.to_original_text()) + 2`` per processed sentence.
        The ``+ 2`` presumably covers the space between sentences plus one
        space stripped in front of the final punctuation, so the positions
        only line up with a text in which every sentence ends in " ."
        (TODO confirm against the input data).
    """
    splitter = SegtokSentenceSplitter()
    sentences = [sentence for sentence in splitter.split(data) if len(sentence) != 0]

    # Only load the model when the caller did not hand one in.
    if tagger is None:
        tagger = SequenceTagger.load("ner")
    tagger.predict(sentences, mini_batch_size=32)

    named_entities = defaultdict(list)
    offset = 0
    for sentence in sentences:
        for entity in sentence.get_spans():
            # Read the label name from the attribute instead of parsing the
            # label's string representation.
            label = entity.labels[0].value
            key = "Text: {} |Label: {}".format(entity.text, label)
            named_entities[key].append(
                [int(entity.start_pos + offset), int(entity.end_pos + offset)]
            )
        offset += len(sentence.to_original_text()) + 2
    return named_entities

In [None]:
# Use the already reduced data here, i.e. `data1`, in which " . " has been
# replaced by ". ". Note that this rebinds `named_ent`.
named_ent = named_entities_flair(data1)

In [None]:
# Show the mapping returned by named_entities_flair.
print(named_ent)

In [None]:
# compare to original data, only works if every sentence ends in " ." for the original.
for start, end in named_ent["Text: Audi A |Label: MISC"]:
    # Clamp the left edge: a negative slice start would count from the end of
    # the string and print the wrong (usually empty) excerpt.
    print(data[max(start - 20, 0) : end + 20])

# Try with pre-separated sentences from spacy

In [None]:
import spacy
from spacy.lang.en import English

In [None]:
# Split the raw text into sentences with a blank English spaCy pipeline
# that only contains the "sentencizer" component.
nlp = English()
nlp.add_pipe("sentencizer")
doc = nlp(data)

# keep the sentences as plain strings
sentences = [str(sent) for sent in doc.sents]

In [None]:
# wrap each sentence string in a flair Sentence
new_sent = [Sentence(text) for text in sentences]

# check out the new sentences
for sentence in new_sent:
    print(sentence)

In [None]:
# run the NER with `tagger` on the sentences that were split by spaCy
tagger.predict(new_sent)

In [None]:
# gather the spans per "Text/Label" key; positions are left as reported for
# each individual sentence
temp = defaultdict(list)

for sentence in new_sent:
    for entity in sentence.get_spans():
        # keep only the first whitespace-separated piece of the label's string form
        label = str(entity.labels[0]).split()[0]
        key = "Text: {} |Label: {}".format(entity.text, label)
        span = [int(entity.start_pos), int(entity.end_pos)]
        temp[key].append(span)

In [None]:
# look at the results: "Text: ... |Label: ..." keys mapped to lists of
# [start, end] pairs
print(temp)

Seems to be significantly faster than working on one long "sentence". We do however lose the continuous char indexing and only get the indices for the respective sentences.