In [1]:
from __future__ import print_function, unicode_literals
import spacy
import warnings
from spacy.util import minibatch, compounding
import de_core_news_lg
import pandas as pd
import numpy as np
from random import sample
import io, csv
import re
import random
import de_core_news_lg
import json
from spacy.training import Example
from spacy.tokens import Doc
nlp = spacy.load('de_core_news_lg')

In [2]:
# helper function for incrementing the revision counters
def increment_revision_counters(entity_counter, entities):
    for entity in entities:
        label = entity[2]
        if label in entity_counter:
            entity_counter[label] += 1
        else:
            entity_counter[label] = 1

In [3]:
#out_sentences=open("input/tagged_sentences.csv", "r").readlines()
with open('input/tagged_sentences.json') as json_file:
    out_sentences = json.load(json_file)
print("LENGTH OF DATASET: ",len(out_sentences))
dataset_dict={}

for sent in out_sentences:
    entities = sent[1]["entities"]
    increment_revision_counters(dataset_dict, entities)
#    print(entity[1])
    #helper_dict.append(entity[1]['entities'][0][2])
#out_sentences[:5]
print(dataset_dict)

LENGTH OF DATASET:  1020
{'DATA': 1010, 'GRAN': 282}


In [4]:
npr_df = pd.read_csv("external/deu_news_2015_3M-sentences.txt", delimiter = "\t")
npr_df=npr_df.sample(frac=1)
npr_df.head()

Unnamed: 0,1,­
1207161,1269596,"Er führte aus, wie er während seiner Ems-Amtsz..."
411754,427638,"Daher ist fraglich, wie repräsentativ die Erge..."
1570656,1657722,Im vergangenen August machte sie bereits Gebra...
1206548,1268972,"Er fordert: ""Wettbewerb muss fair sein und bra..."
1845540,1949795,Mit Thomas Hügli* sprach Dominique Eigenmann. ...


In [5]:
# create an nlp object as we'll use this to seperate the sentences and identify existing entities
#loaded already above
#nlp = spacy.load('de_core_news_lg')

In [46]:
revision_texts = []

# convert the articles to spacy objects to better identify the sentences. Disabled unneeded components. # takes ~ 4 minutes
for doc in nlp.pipe(npr_df.iloc[:100000,1], batch_size=30, disable=["tagger", "ner"]):
    for sentence in doc.sents:
        if  40 < len(sentence.text) < 80:
            # some of the sentences had excessive whitespace in between words, so we're trimming that
            revision_texts.append(" ".join(re.split("\s+", sentence.text, flags=re.UNICODE)))

In [47]:
revisions = []

# Use the existing spaCy model to predict the entities, then append them to revision
for doc in nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser"]):
    
    # don't append sentences that have no entities
    if len(doc.ents) > 0:
        revisions.append((doc.text, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}))

In [48]:
# print an example of the revision sentence
print(revisions[0][0])

# print an example of the revision data
print(revisions[0][1])


Bente Kraus aus Berlin musste mit Rang zwölf zufrieden sein.
{'entities': [(0, 11, 'PER'), (16, 22, 'LOC')]}


In [49]:
# create arrays to store the revision data
TRAIN_REVISION_DATA = []
TEST_REVISION_DATA = []

# create dictionaries to keep count of the different entities
TRAIN_ENTITY_COUNTER = {}
TEST_ENTITY_COUNTER = {}

# This will help distribute the entities (i.e. we don't want 1000 PERSON entities, but only 80 ORG entities)
REVISION_SENTENCE_SOFT_LIMIT = 100



random.shuffle(revisions)
for revision in revisions:
    # get the entities from the revision sentence
    entities = revision[1]["entities"]

    # simple hack to make sure spaCy entities don't get too one-sided
    should_append_to_train_counter = 0
    for _, _, label in entities:
        if label in TRAIN_ENTITY_COUNTER and TRAIN_ENTITY_COUNTER[label] > REVISION_SENTENCE_SOFT_LIMIT:
            should_append_to_train_counter -= 1
        else:
            should_append_to_train_counter += 1

    # simple switch for deciding whether to append to train data or test data
    if should_append_to_train_counter >= 0:
        TRAIN_REVISION_DATA.append(revision)
        increment_revision_counters(TRAIN_ENTITY_COUNTER, entities)
    else:
        TEST_REVISION_DATA.append(revision)
        increment_revision_counters(TEST_ENTITY_COUNTER, entities)

In [50]:
TRAIN_ENTITY_COUNTER

{'DATA': 101, 'PER': 114, 'ORG': 117, 'MISC': 116, 'LOC': 145, 'GRAN': 101}

In [51]:
TEST_ENTITY_COUNTER

{'PER': 7682, 'ORG': 6820, 'MISC': 7621, 'LOC': 7211, 'DATA': 573, 'GRAN': 76}

In [52]:
TRAIN_REVISION_DATA

[('Warum können die nicht reinschreiben ob man das für 30 oder 60 FPS braucht..',
  {'entities': [(74, 76, 'DATA')]}),
 ('Stefan Aust mag es, Einfluss zu haben und Einfluss zu nehmen.',
  {'entities': [(0, 11, 'PER')]}),
 ('Angela Merkel - herausragende Momente einer Kanzlerin dpa,',
  {'entities': [(0, 13, 'PER')]}),
 ('Titan Aerospace könnte fliegende Internet-Sendestationen bauen.',
  {'entities': [(0, 15, 'ORG'), (33, 56, 'MISC')]}),
 ('Persönliche Nachricht Hat der IS versucht, Geert Wilders zu ermorden?',
  {'entities': [(30, 32, 'ORG'), (43, 56, 'PER')]}),
 ('Die Ferien sind vorbei“ ist das Dolce far niente eher eine Bürde:',
  {'entities': [(32, 37, 'MISC')]}),
 ('Dann ließen die Mächtigen von Hannover 96',
  {'entities': [(30, 41, 'ORG')]}),
 (' 1915867 Minetti gilt als Schlüsselzeugin in der Ruby-Affäre.',
  {'entities': [(26, 41, 'MISC'), (49, 60, 'MISC')]}),
 ('Sie teilen sich das Fleisch mit den Löwen.',
  {'entities': [(36, 41, 'ORG')]}),
 ('Allein an der Elisabeth-Selber

In [53]:
random.shuffle(out_sentences)
TRAIN_STAT_DATA=out_sentences[:len(out_sentences)//2]
TEST_STAT_DATA=out_sentences[len(out_sentences)//2:]

In [54]:
print(len(out_sentences))
print(len(TRAIN_STAT_DATA))
print(len(TEST_STAT_DATA))
print("REVISION", len(TRAIN_REVISION_DATA))
TRAIN_DATA = TRAIN_REVISION_DATA + TRAIN_STAT_DATA
print("COMBINED", len(TRAIN_DATA))

1020
510
510
REVISION 479
COMBINED 989


In [55]:
ner = nlp.get_pipe("ner")
ner.add_label("GRAN")
ner.add_label("DATA")



# get the names of the components we want to disable during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# start the training loop, only training NER
epochs = 30
optimizer = nlp.resume_training()
#optimizer = nlp.initialize()
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    sizes = compounding(1.0, 4.0, 1.001)
    
    # batch up the examples using spaCy's minibatc
    for epoch in range(epochs):
        random.shuffle(TRAIN_DATA)
        #text = []
        #annots=[]
        examples=[]


        for text,annots in TRAIN_DATA:
            #text.append(t)
            #annots.append(a)
            doc = nlp.make_doc(text)    
            example = Example.from_dict(doc, annots)
            examples.append(example)
        
        losses = {}
        
        nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        print("Losses ({}/{})".format(epoch + 1, epochs), losses)

Losses (1/30) {'ner': 603.6519041378497}
Losses (2/30) {'ner': 521.958930143036}
Losses (3/30) {'ner': 528.4129105627999}
Losses (4/30) {'ner': 497.753090302857}
Losses (5/30) {'ner': 532.9863592055688}
Losses (6/30) {'ner': 509.5232007509852}
Losses (7/30) {'ner': 491.3220482238376}
Losses (8/30) {'ner': 447.5450727470882}
Losses (9/30) {'ner': 454.8763724614837}
Losses (10/30) {'ner': 424.7050009609724}
Losses (11/30) {'ner': 462.0955569751394}
Losses (12/30) {'ner': 463.85766683346446}
Losses (13/30) {'ner': 379.27706512248824}
Losses (14/30) {'ner': 333.7394603521992}
Losses (15/30) {'ner': 346.1249449538723}
Losses (16/30) {'ner': 317.42312924982207}
Losses (17/30) {'ner': 341.20657504827284}
Losses (18/30) {'ner': 318.87029372153853}
Losses (19/30) {'ner': 312.57004010324545}
Losses (20/30) {'ner': 285.87410267170617}
Losses (21/30) {'ner': 306.41960576960395}
Losses (22/30) {'ner': 253.47754865200034}
Losses (23/30) {'ner': 246.98746294407502}
Losses (24/30) {'ner': 258.78243863

In [97]:
statbot_colors = {"GRAN": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
                  "DATA": "linear-gradient(90deg, #ffff00, #ff8c00)"}
statbot_options = {"ents": ["PER","LOC","ORG","MISC","GRAN","DATA"], "colors": statbot_colors}
spacy.displacy.render(nlp("Ich heisse Christian und war heute in Zürich bei IBM im Internet."), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie viele Kühe hat die Gemeinde Bülach?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie hoch ist Eigenkapital auf Bezirksebene?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Ich brauche die Daten pro Bezirk"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Ich brauche die Daten für den gesamten Kanton."), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie viel Bauinv. EFH 5 Jahre  hat  in Regensdorf  ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Was ist der Anteil an MIV-Anteil (Modal Split)   auf Bezirksebene ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Was ist der Anteil an Geb.Vol. Dienstleistungen: Zunahme   in Flaach ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Welches ist das Schül. Sekundarstufe II   für den gesamten Kanton ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Welche Gemeinde hat die grösste Bevölkerung?"), style="ent",options=statbot_options)

In [57]:
# dictionary to hold our evaluation data
stat_evaluation = {
    "GRAN": {
        "correct": 0,
        "total": 0,
    },
    "DATA": {
        "correct": 0,
        "total": 0,
    }
}

word_evaluation = {
    "GRAN": {
        "correct": 0,
        "total": 0
    },
    "DATA": {
        "correct": 0,
        "total": 0,
    }

}


for stat in TEST_STAT_DATA:
    # extract the sentence and correct stat entities according to our test data
    sentence = stat[0]
    entities = stat[1]["entities"]

    # for each entity, use our updated model to make a prediction on the sentence
    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]
        n_worded_stat =  len(correct_text.split())
        print(n_worded_stat)

        # if we find that there's a match for predicted entity and predicted text, increment correct counters
        for ent in doc.ents:
            print("ENT_LABEL",ent.label_)
            print("ENTITY2",entity[2])
            print("ENT_TEXT",ent.text)
            print("CORRECT:TEXT",correct_text)
            if ent.label_ == entity[2] and ent.text == correct_text:
                
                stat_evaluation[entity[2]]["correct"] += 1
                if n_worded_stat > 0:
                    word_evaluation[entity[2]]["correct"] += 1

                # this break is important, ensures that we're not double counting on a correct match
                break

        #  increment total counters after each entity loop
        stat_evaluation[entity[2]]["total"] += 1
        if n_worded_stat > 0:
            word_evaluation[entity[2]]["total"] += 1

2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Anteil Gewaesserflaeche
CORRECT:TEXT Anteil Gewaesserflaeche
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Wahlbeteiligung Nationalrat
CORRECT:TEXT Wahlbeteiligung Nationalrat
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT VZÄ im Sekundaersektor
CORRECT:TEXT VZÄ im Sekundaersektor
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Hotels
CORRECT:TEXT Kanton
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT Kanton
CORRECT:TEXT Kanton
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Hotels
CORRECT:TEXT Hotels
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Steuerb.
CORRECT:TEXT Kanton
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT Kanton
CORRECT:TEXT Kanton
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerb.
CORRECT:TEXT Steuerb. Einkommen natuerl. Pers. 25%Quantil
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Kanton
CORRECT:TEXT Steuerb. Einkommen natuerl. Pers. 25%Quantil
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bauinvestitionen
CORRECT:TEXT Region
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT Region
CORRECT:TEXT Region
1
ENT_LABEL DATA
E

2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerung:
CORRECT:TEXT Bevoelkerung: Maenner
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauinv.
CORRECT:TEXT Bauinv. EFH 5 Jahre
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Geb.
CORRECT:TEXT Geb.Vol. Dienstleistungen: Zunahme
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Arbeitslosenanteil an Bevoelkerung 1564 Jahre
CORRECT:TEXT Arbeitslosenanteil an Bevoelkerung 1564 Jahre
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Aerzte
CORRECT:TEXT Aerzte
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerb.
CORRECT:TEXT Steuerb. Einkommen natuerl. Pers. 25%Quantil
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Arbeitsstaetten im Sekundaersektor
CORRECT:TEXT Arbeitsstaetten im Sekundaersektor
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Arbeitslose
CORRECT:TEXT Bezirksebene
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT Bezirksebene
CORRECT:TEXT Bezirksebene
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Arbeitslose
CORRECT:TEXT Arbeitslose
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerb.
CORRECT:TEXT Steuerb. Ein

1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Reingewinn juristische Personen
CORRECT:TEXT Kanton
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT Kanton
CORRECT:TEXT Kanton
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Reingewinn juristische Personen
CORRECT:TEXT Reingewinn juristische Personen
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerkraft berichtigt pro Kopf
CORRECT:TEXT Steuerkraft berichtigt pro Kopf
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT 1Personenhaushalte
CORRECT:TEXT 1Personenhaushalte
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Kleine Betriebe 1049 VZÄ
CORRECT:TEXT Kleine Betriebe 1049 VZÄ
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Geb.Vol.
CORRECT:TEXT Geb.Vol. Wohnen: Zunahme
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerung:
CORRECT:TEXT Bevoelkerung: Frauen
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Hotelbetten pro Hotel
CORRECT:TEXT Hotelbetten pro Hotel
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerb.
CORRECT:TEXT Steuerb. Vermoegen natuerl. Pers. Median
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Sozialhilfe

5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT MIVWege Quell, Ziel und Binnenverkehr
CORRECT:TEXT MIVWege Quell, Ziel und Binnenverkehr
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT PWNeuzulassungen pro 1000 Einw.
CORRECT:TEXT PWNeuzulassungen pro 1000 Einw.
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Arbeitsstaetten im Tertiaersektor
CORRECT:TEXT Arbeitsstaetten im Tertiaersektor
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bestand Dieselmotor
CORRECT:TEXT Bestand Dieselmotor
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Anteil Landwirtschaftsflaeche
CORRECT:TEXT Anteil Landwirtschaftsflaeche
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Sterbefaelle
CORRECT:TEXT Sterbefaelle
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Sterberate
CORRECT:TEXT Sterberate 5J.mittel
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Anteil 5 Zi.
CORRECT:TEXT Anteil 5 Zi.Wohnungen
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Investitionsanteil
CORRECT:TEXT Investitionsanteil
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauzonen ueberbaut
CORRECT:TEXT Bauzonen ueberbaut
1
E

3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT NRW
CORRECT:TEXT NRW Waehleranteil CVP
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Bezirk
CORRECT:TEXT NRW Waehleranteil CVP
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT EFHBestand
CORRECT:TEXT EFHBestand
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerertrag Total
CORRECT:TEXT Steuerertrag Total
6
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerkraft ber. arith.
CORRECT:TEXT Steuerkraft ber. arith. Mittel 3 Jahre
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerungszunahme 5 Jahre
CORRECT:TEXT Bevoelkerungszunahme 5 Jahre
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Baulandpreis Modell Median
CORRECT:TEXT Baulandpreis Modell Median
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bestand anderer Antrieb
CORRECT:TEXT Bestand anderer Antrieb
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauinv.
CORRECT:TEXT Bauinv. ÖV, Kommunik. 5 Jahre
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauzonen nicht ueberbaut
CORRECT:TEXT Bauzonen nicht ueberbaut
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT KRW Waehleranteil

4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauinv.
CORRECT:TEXT Bauinv. Landwirtschaft 5 Jahre
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Region
CORRECT:TEXT Bauinv. Landwirtschaft 5 Jahre
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Wahlbeteiligung
CORRECT:TEXT Bezirk
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT Bezirk
CORRECT:TEXT Bezirk
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Wahlbeteiligung
CORRECT:TEXT Wahlbeteiligung Kantonsrat
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Bezirk
CORRECT:TEXT Wahlbeteiligung Kantonsrat
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bevoelkerung: Heimat
CORRECT:TEXT Bezirk
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT Bezirk
CORRECT:TEXT Bezirk
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerung: Heimat
CORRECT:TEXT Bevoelkerung: Heimat Schweiz
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Bezirk
CORRECT:TEXT Bevoelkerung: Heimat Schweiz
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schweine
CORRECT:TEXT Schweine
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Scheidungen
CORRECT:TEXT Scheidungen
1
ENT_LABEL DATA
ENTITY2 

1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Landwirtschaftsbetriebe
CORRECT:TEXT Landwirtschaftsbetriebe
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Wohnungsbestand
CORRECT:TEXT Wohnungsbestand
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Erwerb des Schweizer Buergerrechts
CORRECT:TEXT Erwerb des Schweizer Buergerrechts
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT MotorradNeuzulassungen
CORRECT:TEXT MotorradNeuzulassungen
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schafe
CORRECT:TEXT Schafe
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Apotheken
CORRECT:TEXT Apotheken
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Steuerkraft ber. arith.
CORRECT:TEXT Kanton
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT Kanton
CORRECT:TEXT Kanton
6
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerkraft ber. arith.
CORRECT:TEXT Steuerkraft ber. arith. Mittel 3 Jahre
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Kanton
CORRECT:TEXT Steuerkraft ber. arith. Mittel 3 Jahre
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Landwirtschaftsbetriebe
CORRECT:TEXT Landwirtschaftsbetriebe


2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Verkaeufe Stockwerkeigentum
CORRECT:TEXT Verkaeufe Stockwerkeigentum
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerungsdichte
CORRECT:TEXT Bevoelkerungsdichte
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Restaurants
CORRECT:TEXT Restaurants
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Total geschaffene Stellen, Beschaeftigte
CORRECT:TEXT Total geschaffene Stellen, Beschaeftigte
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Sterbefaelle
CORRECT:TEXT Sterbefaelle
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Anteil 5 Zi.
CORRECT:TEXT Gemeindeebene
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT Gemeindeebene
CORRECT:TEXT Gemeindeebene
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Anteil 5 Zi.
CORRECT:TEXT Anteil 5 Zi.Wohnungen
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Gemeindeebene
CORRECT:TEXT Anteil 5 Zi.Wohnungen
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT 5Personenhaushalte
CORRECT:TEXT 5Personenhaushalte
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Schuel.
CORRECT:TEXT Gemeinde
ENT_LABEL GRAN
ENTITY2 

In [58]:
for key in word_evaluation:
    correct = word_evaluation[key]["correct"]
    total = word_evaluation[key]["total"]

    print(f"{key}: {correct / total * 100:.2f}%")

stat_total_sum = 0
stat_correct_sum = 0

print("---")
for key in stat_evaluation:
    correct = stat_evaluation[key]["correct"]
    total = stat_evaluation[key]["total"]
    
    stat_total_sum += total
    stat_correct_sum += correct

    print(f"{key}: {correct / total * 100:.2f}%")

print(f"\nTotal: {stat_correct_sum/stat_total_sum * 100:.2f}%")

GRAN: 100.00%
DATA: 68.50%
---
GRAN: 100.00%
DATA: 68.50%

Total: 75.38%


In [34]:
# dictionary which will be populated with the entities and result information
entity_evaluation = {}

# helper function to udpate the entity_evaluation dictionary
def update_results(entity, metric):
    if entity not in entity_evaluation:
        entity_evaluation[entity] = {"correct": 0, "total": 0}
    
    entity_evaluation[entity][metric] += 1

# same as before, see if entities from test set match what spaCy currently predicts
for data in TEST_REVISION_DATA:
    sentence = data[0]
    entities = data[1]["entities"]

    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]

        for ent in doc.ents:
            if ent.label_ == entity[2] and ent.text == correct_text:
                update_results(ent.label_, "correct")
                break

        update_results(entity[2], "total")

In [35]:
sum_total = 0
sum_correct = 0

for entity in entity_evaluation:
    total = entity_evaluation[entity]["total"]
    correct = entity_evaluation[entity]["correct"]

    sum_total += total
    sum_correct += correct
    
    print("{} | {:.2f}%".format(entity, correct / total * 100))

print()
print("Overall accuracy: {:.2f}%".format(sum_correct / sum_total * 100))

MISC | 81.73%
LOC | 82.25%
ORG | 88.79%
PER | 93.97%

Overall accuracy: 85.99%


In [21]:
nlp.meta["name"] = "stat_entity_extractor_v1"
nlp.to_disk("./models/v1")

In [22]:
TRAIN_STAT_DATA[5]


['Ich brauche Schuel. Oberstufe ', {'entities': [[12, 29, 'DATA']]}]

In [23]:
doc = nlp(u'Welche Gemeinde hat die grösste Bevölkerung und welche hatte im 2019 den höchsten Ausländeranteil?')

# show universal pos tags
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.pos_) for t in doc))
# output: Ich/PRON bin/AUX ein/DET Berliner/NOUN ./PUNCT

# show German specific pos tags (STTS)
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.tag_) for t in doc))
# output: Ich/PPER bin/VAFIN ein/ART Berliner/NN ./$.

# show dependency arcs
print('\n'.join('{child:<8} <{label:-^7} {head}'.format(child=t.orth_, label=t.dep_, head=t.head.orth_) for t in doc))
# output: (sb: subject, nk: noun kernel, pd: predicate)

#named entities
print("Named Entity Recognition:")
for ent in doc.ents:
    print(ent.text)
print("Noun chunks:")
for chunk in doc.noun_chunks:
    print(chunk.text)

Welche/DET Gemeinde/NOUN hat/AUX die/DET grösste/ADJ Bevölkerung/NOUN und/CCONJ welche/PRON hatte/AUX im/ADP 2019/PROPN den/DET höchsten/ADJ Ausländeranteil/NOUN ?/PUNCT
Welche/PWAT Gemeinde/NN hat/VAFIN die/ART grösste/ADJA Bevölkerung/NN und/KON welche/PDS hatte/VAFIN im/APPRART 2019/CARD den/ART höchsten/ADJA Ausländeranteil/NN ?/$.
Welche   <--nk--- Gemeinde
Gemeinde <--sb--- hat
hat      <-ROOT-- hat
die      <--nk--- Bevölkerung
grösste  <--nk--- Bevölkerung
Bevölkerung <--oa--- hat
und      <--cd--- Bevölkerung
welche   <--cj--- und
hatte    <--cj--- hat
im       <--mo--- hatte
2019     <--nk--- im
den      <--nk--- Ausländeranteil
höchsten <--nk--- Ausländeranteil
Ausländeranteil <--oa--- hatte
?        <-punct- hat
Named Entity Recognition:
Gemeinde
Noun chunks:
Welche Gemeinde
die grösste Bevölkerung
welche
2019
den höchsten Ausländeranteil
