In [1]:
from __future__ import print_function, unicode_literals
import spacy
import warnings
from spacy.util import minibatch, compounding
import de_core_news_lg
import pandas as pd
import numpy as np
from random import sample
import io, csv
import re
import random
import de_core_news_lg
import json
nlp = spacy.load('de_core_news_lg')

In [2]:
#out_sentences=open("input/tagged_sentences.csv", "r").readlines()
with open('input/tagged_sentences.json') as json_file:
    out_sentences = json.load(json_file)
out_sentences[:5]

[['Wie viel Eigenkapital  hat  in Bauma  ?', {'entities': []}],
 ['Welches ist das Eigenkapital   in Marthalen ?', {'entities': []}],
 ['Was ist der Anteil an Bruttoverschuldungsanteil   in Kleinandelfingen ?',
  {'entities': []}],
 ['Wie viel Steuerbares Vermögen natürliche Pers.  hat  Birmensdorf  ?',
  {'entities': []}],
 ['Welches ist das Steuerbares Vermögen natürliche Pers.   in Rorbas ?',
  {'entities': []}]]

In [3]:
npr_df = pd.read_csv("external/deu_news_2015_3M-sentences.txt", delimiter = "\t")
npr_df=npr_df.sample(frac=1)
npr_df.head()

Unnamed: 0,1,­
1753803,1851654,Letztes aktuelles Video: Die Jordas-Regel Quel...
1188595,1250516,ElektromobilitätDie Stromverschenker Aldi Süd ...
757695,792404,Der vermutliche Pilot des Phantom wurde damals...
24426,25059,Ab Berlin gelangt man mit United (www.
2559025,2700206,Wirtschafts- & Finanzinfo Banking & Trading » ...


In [4]:
# create an nlp object as we'll use this to seperate the sentences and identify existing entities
#loaded already above
#nlp = spacy.load('de_core_news_lg')

In [5]:
revision_texts = []

# convert the articles to spacy objects to better identify the sentences. Disabled unneeded components. # takes ~ 4 minutes
for doc in nlp.pipe(npr_df.iloc[:10000,1], batch_size=30, disable=["tagger", "ner"]):
    for sentence in doc.sents:
        if  40 < len(sentence.text) < 80:
            # some of the sentences had excessive whitespace in between words, so we're trimming that
            revision_texts.append(" ".join(re.split("\s+", sentence.text, flags=re.UNICODE)))

In [6]:
revisions = []

# Use the existing spaCy model to predict the entities, then append them to revision
for doc in nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser"]):
    
    # don't append sentences that have no entities
    if len(doc.ents) > 0:
        revisions.append((doc.text, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}))

In [7]:
# print an example of the revision sentence
print(revisions[0][0])

# print an example of the revision data
print(revisions[0][1])


ElektromobilitätDie Stromverschenker Aldi
{'entities': [(0, 36, 'ORG'), (37, 41, 'ORG')]}


In [8]:
# create arrays to store the revision data
TRAIN_REVISION_DATA = []
TEST_REVISION_DATA = []

# create dictionaries to keep count of the different entities
TRAIN_ENTITY_COUNTER = {}
TEST_ENTITY_COUNTER = {}

# This will help distribute the entities (i.e. we don't want 1000 PERSON entities, but only 80 ORG entities)
REVISION_SENTENCE_SOFT_LIMIT = 200

# helper function for incrementing the revision counters
def increment_revision_counters(entity_counter, entities):
    for entity in entities:
        label = entity[2]
        if label in entity_counter:
            entity_counter[label] += 1
        else:
            entity_counter[label] = 1

random.shuffle(revisions)
for revision in revisions:
    # get the entities from the revision sentence
    entities = revision[1]["entities"]

    # simple hack to make sure spaCy entities don't get too one-sided
    should_append_to_train_counter = 0
    for _, _, label in entities:
        if label in TRAIN_ENTITY_COUNTER and TRAIN_ENTITY_COUNTER[label] > REVISION_SENTENCE_SOFT_LIMIT:
            should_append_to_train_counter -= 1
        else:
            should_append_to_train_counter += 1

    # simple switch for deciding whether to append to train data or test data
    if should_append_to_train_counter >= 0:
        TRAIN_REVISION_DATA.append(revision)
        increment_revision_counters(TRAIN_ENTITY_COUNTER, entities)
    else:
        TEST_REVISION_DATA.append(revision)
        increment_revision_counters(TEST_ENTITY_COUNTER, entities)

In [9]:
TRAIN_ENTITY_COUNTER

{'MISC': 202, 'LOC': 206, 'ORG': 202, 'PER': 211}

In [10]:
TEST_ENTITY_COUNTER

{'PER': 553, 'LOC': 581, 'ORG': 393, 'MISC': 545}

In [11]:
TRAIN_REVISION_DATA

[('Das hast Du jetzt auch wieder Recht, stimmt.',
  {'entities': [(9, 17, 'MISC')]}),
 ('Gewürdigt wurde auch das Aktienresearch der Bank.',
  {'entities': [(25, 39, 'MISC')]}),
 ('Die Kluft zwischen Arm und Reich darf nicht weiter auseinandergehen.',
  {'entities': [(27, 32, 'LOC')]}),
 ('"Behörden wie die EPA sind inzwischen bis aufs Blut gereizt.',
  {'entities': [(18, 21, 'ORG')]}),
 ('Hier gibt es aktuelle TV-News im Überblick.',
  {'entities': [(22, 29, 'MISC')]}),
 ('Seine Lieder handeln von Schönheit, von der Schönheit Rio de Janeiros.',
  {'entities': [(54, 69, 'LOC')]}),
 ('Gewerkschaft der Polizei findet das bedenklich.',
  {'entities': [(0, 24, 'ORG')]}),
 ('ORF.at / Meldung: "Musik trennt und verbindet"',
  {'entities': [(0, 6, 'MISC')]}),
 ('Guardiola: "Das interessiert mich überhaupt nicht"',
  {'entities': [(0, 9, 'PER')]}),
 ('Er plant offensichtlich lieber mit Leipzig.',
  {'entities': [(35, 42, 'LOC')]}),
 ('Das Windows-Geschäft schrumpfte dagegen im weiterhin schwac

In [12]:
random.shuffle(out_sentences)
TRAIN_STAT_DATA=out_sentences[:len(out_sentences)//2]
TEST_STAT_DATA=out_sentences[len(out_sentences)//2:]

In [13]:
print(len(out_sentences))
print(len(TRAIN_STAT_DATA))
print(len(TEST_STAT_DATA))
print("REVISION", len(TRAIN_REVISION_DATA))
TRAIN_DATA = TRAIN_REVISION_DATA + TRAIN_STAT_DATA
print("COMBINED", len(TRAIN_DATA))

510
255
255
REVISION 562
COMBINED 817


In [None]:
#spacy 3.0 attempt


ner = nlp.get_pipe("ner")
ner.add_label("LOCALITY")
from spacy.training import Example
from spacy.tokens import Doc


# get the names of the components we want to disable during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# start the training loop, only training NER
epochs = 30
optimizer = nlp.resume_training()
#optimizer = nlp.initialize()
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    sizes = compounding(1.0, 4.0, 1.001)
    
    # batch up the examples using spaCy's minibatc
    for epoch in range(epochs):
        random.shuffle(TRAIN_DATA)
        #text = []
        #annots=[]
        examples=[]


        for text,annots in TRAIN_DATA:
            #text.append(t)
            #annots.append(a)
            doc = nlp.make_doc(text)    
            example = Example.from_dict(doc, annots)
            examples.append(example)
        
        losses = {}
        
        nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        print("Losses ({}/{})".format(epoch + 1, epochs), losses)

Losses (1/30) {'ner': 1036.7250943763693}
Losses (2/30) {'ner': 989.7197784991635}
Losses (3/30) {'ner': 826.3420708181119}
Losses (4/30) {'ner': 838.7834664121892}
Losses (5/30) {'ner': 733.5376520684233}
Losses (6/30) {'ner': 724.8222034484029}
Losses (7/30) {'ner': 685.6173731966405}
Losses (8/30) {'ner': 625.7451128661105}
Losses (9/30) {'ner': 656.9089558631244}
Losses (10/30) {'ner': 619.3330717421031}
Losses (11/30) {'ner': 606.5922888267305}
Losses (12/30) {'ner': 549.0530150022387}
Losses (13/30) {'ner': 554.0773956898993}
Losses (14/30) {'ner': 513.9042212301647}
Losses (15/30) {'ner': 551.873147565741}
Losses (16/30) {'ner': 512.5156379520868}
Losses (17/30) {'ner': 473.285986641853}
Losses (18/30) {'ner': 449.35281474152}
Losses (19/30) {'ner': 411.3688000677557}
Losses (20/30) {'ner': 420.41035345885757}
Losses (21/30) {'ner': 413.47846204106463}
Losses (22/30) {'ner': 365.22585047420375}
Losses (23/30) {'ner': 403.39692376903076}
Losses (24/30) {'ner': 362.4690711737503}


In [None]:
spacy.displacy.render(nlp("Ich heisse Christian und war heute in Zürich bei IBM."), style="ent")
spacy.displacy.render(nlp("Wie viele Kühe hat die Gemeinde Bülach?"), style="ent")
spacy.displacy.render(nlp("Wie hoch ist Eigenkapital auf Bezirksebene?"), style="ent")
spacy.displacy.render(nlp("Ich brauche die Daten pro Bezirk"), style="ent")
spacy.displacy.render(nlp("Ich brauche die Daten für den gesamten Kanton."), style="ent")
spacy.displacy.render(nlp("Wie viel Bauinv. EFH 5 Jahre  hat  in Regensdorf  ?"), style="ent")
spacy.displacy.render(nlp("Was ist der Anteil an MIV-Anteil (Modal Split)   auf Bezirksebene ?"), style="ent")
spacy.displacy.render(nlp("Was ist der Anteil an Geb.Vol. Dienstleistungen: Zunahme   in Flaach ?"), style="ent")
spacy.displacy.render(nlp("Welches ist das Schül. Sekundarstufe II   für den gesamten Kanton ?"), style="ent")
spacy.displacy.render(nlp("Welche Gemeinde hat die grösste Bevölkerung?"), style="ent")

In [None]:
# dictionary to hold our evaluation data
stat_evaluation = {
    "locality": {
        "correct": 0,
        "total": 0,
    }
}

word_evaluation = {
    "locality": {
        "correct": 0,
        "total": 0
    }

}

for stat in TEST_STAT_DATA:
    # extract the sentence and correct stat entities according to our test data
    sentence = stat[0]
    entities = stat[1]["entities"]

    # for each entity, use our updated model to make a prediction on the sentence
    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]
        n_worded_stat =  len(correct_text.split())

        # if we find that there's a match for predicted entity and predicted text, increment correct counters
        for ent in doc.ents:
            if ent.label_ == entity[2] and ent.text == correct_text:
                stat_evaluation["locality"]["correct"] += 1
                if n_worded_stat > 0:
                    word_evaluation["locality"]["correct"] += 1

                # this break is important, ensures that we're not double counting on a correct match
                break

        #  increment total counters after each entity loop
        stat_evaluation["locality"]["total"] += 1
        if n_worded_stat > 0:
            word_evaluation["locality"]["total"] += 1

In [None]:
for key in word_evaluation:
    correct = word_evaluation[key]["correct"]
    total = word_evaluation[key]["total"]

    print(f"{key}: {correct / total * 100:.2f}%")

stat_total_sum = 0
stat_correct_sum = 0

print("---")
for key in stat_evaluation:
    correct = stat_evaluation[key]["correct"]
    total = stat_evaluation[key]["total"]
    
    stat_total_sum += total
    stat_correct_sum += correct

    print(f"{key}: {correct / total * 100:.2f}%")

print(f"\nTotal: {stat_correct_sum/stat_total_sum * 100:.2f}%")

In [None]:
# dictionary which will be populated with the entities and result information
entity_evaluation = {}

# helper function to udpate the entity_evaluation dictionary
def update_results(entity, metric):
    if entity not in entity_evaluation:
        entity_evaluation[entity] = {"correct": 0, "total": 0}
    
    entity_evaluation[entity][metric] += 1

# same as before, see if entities from test set match what spaCy currently predicts
for data in TEST_REVISION_DATA:
    sentence = data[0]
    entities = data[1]["entities"]

    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]

        for ent in doc.ents:
            if ent.label_ == entity[2] and ent.text == correct_text:
                update_results(ent.label_, "correct")
                break

        update_results(entity[2], "total")

In [None]:
sum_total = 0
sum_correct = 0

for entity in entity_evaluation:
    total = entity_evaluation[entity]["total"]
    correct = entity_evaluation[entity]["correct"]

    sum_total += total
    sum_correct += correct
    
    print("{} | {:.2f}%".format(entity, correct / total * 100))

print()
print("Overall accuracy: {:.2f}%".format(sum_correct / sum_total * 100))

In [None]:
nlp.meta["name"] = "stat_entity_extractor_v1"
nlp.to_disk("./models/v1")

In [None]:
TRAIN_STAT_DATA[5]


In [None]:
doc = nlp(u'Welche Gemeinde hat die grösste Bevölkerung?')

# show universal pos tags
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.pos_) for t in doc))
# output: Ich/PRON bin/AUX ein/DET Berliner/NOUN ./PUNCT

# show German specific pos tags (STTS)
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.tag_) for t in doc))
# output: Ich/PPER bin/VAFIN ein/ART Berliner/NN ./$.

# show dependency arcs
print('\n'.join('{child:<8} <{label:-^7} {head}'.format(child=t.orth_, label=t.dep_, head=t.head.orth_) for t in doc))
# output: (sb: subject, nk: noun kernel, pd: predicate)

#named entities
print("Named Entity Recognition:")
for ent in doc.ents:
    print(ent.text)
print("Noun chunks:")
for chunk in doc.noun_chunks:
    print(chunk.text)