In [1]:
from __future__ import print_function, unicode_literals
import spacy
import warnings
from spacy.util import minibatch, compounding
import de_core_news_lg
import pandas as pd
import numpy as np
from random import sample
import io, csv
import re
import random
import de_core_news_lg
import json
from spacy.training import Example
from spacy.tokens import Doc
nlp = spacy.load('de_core_news_lg')

In [2]:
# helper function for incrementing the revision counters
def increment_revision_counters(entity_counter, entities):
    for entity in entities:
        label = entity[2]
        if label in entity_counter:
            entity_counter[label] += 1
        else:
            entity_counter[label] = 1

In [3]:
#out_sentences=open("input/tagged_sentences.csv", "r").readlines()
with open('input/tagged_sentences.json') as json_file:
    out_sentences = json.load(json_file)
print("LENGTH OF DATASET: ",len(out_sentences))
dataset_dict={}

for sent in out_sentences:
    entities = sent[1]["entities"]
    increment_revision_counters(dataset_dict, entities)
#    print(entity[1])
    #helper_dict.append(entity[1]['entities'][0][2])
#out_sentences[:5]
print(dataset_dict)

LENGTH OF DATASET:  1020
{'DATA': 1010, 'GRAN': 282}


In [4]:
npr_df = pd.read_csv("external/deu_news_2015_3M-sentences.txt", delimiter = "\t")
npr_df=npr_df.sample(frac=1)
npr_df.head()

Unnamed: 0,1,­
44365,46308,Aber mit den anderen muss man natürlich reden“...
1144264,1203424,"Eine Nutzerin schreibt, niemand sei wütend auf..."
1567360,1654327,Im Sommer 2014 beim überraschenden Aufstieg de...
2450017,2581907,We Care A Lot wurde übrigens für das zweite Al...
2635306,2779648,Zu wenig Individualisierung Im Falle der Locks...


In [5]:
# create an nlp object as we'll use this to seperate the sentences and identify existing entities
#loaded already above
#nlp = spacy.load('de_core_news_lg')

In [6]:
revision_texts = []

# convert the articles to spacy objects to better identify the sentences. Disabled unneeded components. # takes ~ 4 minutes
for doc in nlp.pipe(npr_df.iloc[:100000,1], batch_size=30, disable=["tagger", "ner"]):
    for sentence in doc.sents:
        if  40 < len(sentence.text) < 80:
            # some of the sentences had excessive whitespace in between words, so we're trimming that
            revision_texts.append(" ".join(re.split("\s+", sentence.text, flags=re.UNICODE)))

In [7]:
revisions = []

# Use the existing spaCy model to predict the entities, then append them to revision
for doc in nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser"]):
    
    # don't append sentences that have no entities
    if len(doc.ents) > 0:
        revisions.append((doc.text, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}))

In [8]:
# print an example of the revision sentence
print(revisions[0][0])

# print an example of the revision data
print(revisions[0][1])


Aber mit den anderen muss man natürlich reden“, sagte Oppermann.
{'entities': [(54, 63, 'PER')]}


In [9]:
# create arrays to store the revision data
TRAIN_REVISION_DATA = []
TEST_REVISION_DATA = []

# create dictionaries to keep count of the different entities
TRAIN_ENTITY_COUNTER = {}
TEST_ENTITY_COUNTER = {}

# This will help distribute the entities (i.e. we don't want 1000 PERSON entities, but only 80 ORG entities)
REVISION_SENTENCE_SOFT_LIMIT = 100



random.shuffle(revisions)
for revision in revisions:
    # get the entities from the revision sentence
    entities = revision[1]["entities"]

    # simple hack to make sure spaCy entities don't get too one-sided
    should_append_to_train_counter = 0
    for _, _, label in entities:
        if label in TRAIN_ENTITY_COUNTER and TRAIN_ENTITY_COUNTER[label] > REVISION_SENTENCE_SOFT_LIMIT:
            should_append_to_train_counter -= 1
        else:
            should_append_to_train_counter += 1

    # simple switch for deciding whether to append to train data or test data
    if should_append_to_train_counter >= 0:
        TRAIN_REVISION_DATA.append(revision)
        increment_revision_counters(TRAIN_ENTITY_COUNTER, entities)
    else:
        TEST_REVISION_DATA.append(revision)
        increment_revision_counters(TEST_ENTITY_COUNTER, entities)

In [10]:
TRAIN_ENTITY_COUNTER

{'LOC': 107, 'MISC': 102, 'PER': 101, 'ORG': 102}

In [11]:
TEST_ENTITY_COUNTER

{'LOC': 7993, 'MISC': 7495, 'ORG': 5806, 'PER': 7132}

In [12]:
TRAIN_REVISION_DATA

[('In Ferguson hatte der Behörden\xadrassismus System.',
  {'entities': [(3, 11, 'LOC'), (22, 47, 'MISC')]}),
 ('Doch ganz ohne Klischees kommt auch „Barbara“ nicht aus.',
  {'entities': [(37, 44, 'PER')]}),
 ('Seit Mittwoch verschwunden: Wo ist der kleine Elias (6)?',
  {'entities': [(5, 13, 'PER'), (46, 51, 'PER')]}),
 ('Intel strebt das größte Portfolio für IoT-Komopnenten am Markt an.',
  {'entities': [(0, 5, 'ORG'), (38, 53, 'MISC')]}),
 ('Zum großen Handy-Kamera-Test Rechtsschutz\xadversicherung',
  {'entities': [(11, 54, 'MISC')]}),
 ('Resultate und Tabelle: Sampdoria Genua - Fiorentina 0:2 (0:1).',
  {'entities': [(23, 38, 'ORG'), (41, 51, 'ORG')]}),
 ('Berner Stadtpräsident im freien Fall Aktualisiert um 06:53 7 Kommentare',
  {'entities': [(0, 6, 'LOC'), (37, 49, 'MISC')]}),
 ('KRUGFOTO Peter Zeidler ist nicht länger Trainer von Red Bull Salzburg.',
  {'entities': [(0, 22, 'PER'), (52, 69, 'ORG')]}),
 ('In den USA sind die Bauausgaben im Juni schwächer als erwartet gestiegen.

In [13]:
random.shuffle(out_sentences)
TRAIN_STAT_DATA=out_sentences[:len(out_sentences)//2]
TEST_STAT_DATA=out_sentences[len(out_sentences)//2:]

In [14]:
print(len(out_sentences))
print(len(TRAIN_STAT_DATA))
print(len(TEST_STAT_DATA))
print("REVISION", len(TRAIN_REVISION_DATA))
TRAIN_DATA = TRAIN_REVISION_DATA + TRAIN_STAT_DATA
print("COMBINED", len(TRAIN_DATA))

1020
510
510
REVISION 291
COMBINED 801


In [15]:
ner = nlp.get_pipe("ner")
ner.add_label("GRAN")
ner.add_label("DATA")



# get the names of the components we want to disable during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# start the training loop, only training NER
epochs = 30
#optimizer = nlp.resume_training()
#optimizer = nlp.initialize()
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    sizes = compounding(1.0, 4.0, 1.001)
    
    # batch up the examples using spaCy's minibatc
    for epoch in range(epochs):
        random.shuffle(TRAIN_DATA)
        #text = []
        #annots=[]
        examples=[]


        for text,annots in TRAIN_DATA:
            #text.append(t)
            #annots.append(a)
            doc = nlp.make_doc(text)    
            example = Example.from_dict(doc, annots)
            examples.append(example)
        
        losses = {}
        
        nlp.update(examples, drop=0.35, losses=losses)#,sgd=optimizer)

        print("Losses ({}/{})".format(epoch + 1, epochs), losses)

Losses (1/30) {'ner': 2385.5419125936055}
Losses (2/30) {'ner': 2043.9089004172185}
Losses (3/30) {'ner': 1892.4018475627818}
Losses (4/30) {'ner': 1668.3351902225502}
Losses (5/30) {'ner': 1607.4926412801724}
Losses (6/30) {'ner': 1503.0205603179356}
Losses (7/30) {'ner': 1471.0794919212785}
Losses (8/30) {'ner': 1395.5419513410889}
Losses (9/30) {'ner': 1362.8461099738488}
Losses (10/30) {'ner': 1290.3212392951245}
Losses (11/30) {'ner': 1287.6162469459232}
Losses (12/30) {'ner': 1236.1939815005753}
Losses (13/30) {'ner': 1200.5868435432203}
Losses (14/30) {'ner': 1193.479590804316}
Losses (15/30) {'ner': 1179.2460084240884}
Losses (16/30) {'ner': 1154.7695163995668}
Losses (17/30) {'ner': 1118.532352203736}
Losses (18/30) {'ner': 1080.3043447384553}
Losses (19/30) {'ner': 1070.3354025850113}
Losses (20/30) {'ner': 1055.5522190239863}
Losses (21/30) {'ner': 1000.792888879776}
Losses (22/30) {'ner': 997.667265408214}
Losses (23/30) {'ner': 1121.9321248072183}
Losses (24/30) {'ner': 12

In [16]:
statbot_colors = {"GRAN": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
                  "DATA": "linear-gradient(90deg, #ffff00, #ff8c00)"}
statbot_options = {"ents": ["PER","LOC","ORG","MISC","GRAN","DATA"], "colors": statbot_colors}
spacy.displacy.render(nlp("Ich heisse Christian und war heute in Zürich bei IBM im Internet."), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie viele Kühe hat die Gemeinde Bülach?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie hoch ist Eigenkapital auf Bezirksebene?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Ich brauche die Daten pro Bezirk"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Ich brauche die Daten für den gesamten Kanton."), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie viel Bauinv. EFH 5 Jahre  hat  in Regensdorf  ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Was ist der Anteil an MIV-Anteil (Modal Split)   auf Bezirksebene ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Was ist der Anteil an Geb.Vol. Dienstleistungen: Zunahme   in Flaach ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Welches ist das Schül. Sekundarstufe II   für den gesamten Kanton ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Welche Gemeinde hat die grösste Bevölkerung?"), style="ent",options=statbot_options)

In [17]:
# dictionary to hold our evaluation data
stat_evaluation = {
    "GRAN": {
        "correct": 0,
        "total": 0,
    },
    "DATA": {
        "correct": 0,
        "total": 0,
    }
}

word_evaluation = {
    "GRAN": {
        "correct": 0,
        "total": 0
    },
    "DATA": {
        "correct": 0,
        "total": 0,
    }

}


for stat in TEST_STAT_DATA:
    # extract the sentence and correct stat entities according to our test data
    sentence = stat[0]
    entities = stat[1]["entities"]

    # for each entity, use our updated model to make a prediction on the sentence
    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]
        n_worded_stat =  len(correct_text.split())
        print(n_worded_stat)

        # if we find that there's a match for predicted entity and predicted text, increment correct counters
        for ent in doc.ents:
            print("ENT_LABEL",ent.label_)
            print("ENTITY2",entity[2])
            print("ENT_TEXT",ent.text)
            print("CORRECT:TEXT",correct_text)
            if ent.label_ == entity[2] and ent.text == correct_text:
                
                stat_evaluation[entity[2]]["correct"] += 1
                if n_worded_stat > 0:
                    word_evaluation[entity[2]]["correct"] += 1

                # this break is important, ensures that we're not double counting on a correct match
                break

        #  increment total counters after each entity loop
        stat_evaluation[entity[2]]["total"] += 1
        if n_worded_stat > 0:
            word_evaluation[entity[2]]["total"] += 1

2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schuel.
CORRECT:TEXT Schuel. Tertiaerstufe
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Verkaeufe von unbebautem
CORRECT:TEXT Verkaeufe von unbebautem Land
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerung:
CORRECT:TEXT Bevoelkerung: Frauen
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Apotheken hat  
CORRECT:TEXT Kanton
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Kanton  
CORRECT:TEXT Kanton
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Apotheken hat  
CORRECT:TEXT Apotheken
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Kanton  
CORRECT:TEXT Apotheken
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Nettoaufwand Gesundheit
CORRECT:TEXT Nettoaufwand Gesundheit
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Kuehe
CORRECT:TEXT Kuehe
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Erwerb des Schweizer
CORRECT:TEXT Erwerb des Schweizer Buergerrechts
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schuel.
CORRECT:TEXT Schuel. Mittelschule
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Steuerpflichtige juristische Personen
CORREC

ENTITY2 DATA
ENT_TEXT Steuerertrag von juristischen
CORRECT:TEXT Steuerertrag von juristischen Personen
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauinv.
CORRECT:TEXT Bauinv. MFH 5 Jahre
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT PWNeuzulassungen  
CORRECT:TEXT PWNeuzulassungen
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirksebene  
CORRECT:TEXT Bezirksebene
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bezirksebene  
CORRECT:TEXT Reingewinn juristische Personen
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bevoelkerung:
CORRECT:TEXT Bezirk
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirk Winterthur
CORRECT:TEXT Bezirk
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerung:
CORRECT:TEXT Bevoelkerung: Anteil 1519Jaehrige
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bezirk Winterthur
CORRECT:TEXT Bevoelkerung: Anteil 1519Jaehrige
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Gemeindebeitrag an ZVV
CORRECT:TEXT Gemeindebeitrag an ZVV
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerung:
CORRECT:TEXT Bevoelkerung: Durchschnittsalter
3


1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Neuerstellte Wohnungen
CORRECT:TEXT Bezirk
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirk Buelach
CORRECT:TEXT Bezirk
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Neuerstellte Wohnungen
CORRECT:TEXT Neuerstellte Wohnungen
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Wanderungsbilanz  in
CORRECT:TEXT Wanderungsbilanz
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Flaeche hat  
CORRECT:TEXT Gemeinden
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Gemeinden  
CORRECT:TEXT Gemeinden
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Flaeche hat  
CORRECT:TEXT Flaeche
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Gemeinden  
CORRECT:TEXT Flaeche
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Anteil GK F
CORRECT:TEXT Kanton
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Kanton ?
CORRECT:TEXT Kanton
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Anteil GK F
CORRECT:TEXT Anteil GK F
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT KRW Waehleranteil EVP
CORRECT:TEXT KRW Waehleranteil EVP
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Geburtenueberschus

ENT_TEXT Zuzuege
CORRECT:TEXT Zuzuege
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Arbeitslose
CORRECT:TEXT Arbeitslose
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Arbeitsstaetten im Tertiaersektor
CORRECT:TEXT Arbeitsstaetten im Tertiaersektor
1
ENT_LABEL LOC
ENTITY2 DATA
ENT_TEXT Hoehe
CORRECT:TEXT Hoehe
6
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauinv.
CORRECT:TEXT Bauinv. Spital, Kultur usw. 5 J.
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerungszunahme 1
CORRECT:TEXT Bevoelkerungszunahme 1 Jahr
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schuel.
CORRECT:TEXT Schuel. Tertiaerstufe
1
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerungszunahme
CORRECT:TEXT Bevoelkerungszunahme 5 Jahre
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Nettoaufwand
CORRECT:TEXT Nettoaufwand allgemeine Verwaltung
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Verwaltung hat  
CORRECT:TEXT Nettoaufwand allgemeine Verwaltung
1
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Steuerbares Vermoegen
CORRECT:TEXT Kanton
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT

1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Grosse Betriebe 250
CORRECT:TEXT Bezirk
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirk  ?
CORRECT:TEXT Bezirk
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Beschaeftigte im Sekundaersektor
CORRECT:TEXT Beschaeftigte im Sekundaersektor
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Reformierte Kirchensteuerpflichtige
CORRECT:TEXT Reformierte Kirchensteuerpflichtige
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Gemeindeebene ?
CORRECT:TEXT Gemeindeebene
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Gemeindeebene ?
CORRECT:TEXT Total Arbeitsstaetten
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Anteil GK D
CORRECT:TEXT Anteil GK D
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT 3Personenhaushalte hat  
CORRECT:TEXT 3Personenhaushalte
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Nettoaufwand soziale Sicherheit
CORRECT:TEXT Nettoaufwand soziale Sicherheit
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT VZÄ im Tertiaersektor
CORRECT:TEXT VZÄ im Tertiaersektor
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT MotorradNeuzulass

1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Finanzvermoegen  
CORRECT:TEXT Finanzvermoegen
2
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Steuerertrag
CORRECT:TEXT Steuerertrag Kirchgemeinden
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerungszunahme 1 Jahr in Prozent
CORRECT:TEXT Bevoelkerungszunahme 1 Jahr
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerungszunahme 5 Jahre  
CORRECT:TEXT Bevoelkerungszunahme 5 Jahre
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Beschaeftigte im Tertiaersektor
CORRECT:TEXT Beschaeftigte im Tertiaersektor
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauinv.
CORRECT:TEXT Bauinv. Ver/Entsorg. 5 Jahre
3
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Steuerertrag Politische/
CORRECT:TEXT Bezirk
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirk ?
CORRECT:TEXT Bezirk
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerertrag Politische/
CORRECT:TEXT Steuerertrag Politische/Schulgemeinde
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bezirk ?
CORRECT:TEXT Steuerertrag Politische/Schulgemeinde
1
1
ENT_LABEL GRAN
EN

3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Geb.
CORRECT:TEXT Geb.Vol. Industrie/Lager: Zunahme
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bezirk ?
CORRECT:TEXT Geb.Vol. Industrie/Lager: Zunahme
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT KRW Waehleranteil SP
CORRECT:TEXT Bezirk
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirk ?
CORRECT:TEXT Bezirk
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT KRW Waehleranteil SP
CORRECT:TEXT KRW Waehleranteil SP
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauzonen nicht ueberbaut
CORRECT:TEXT Bauzonen nicht ueberbaut
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT ÖVWege Quell
CORRECT:TEXT ÖVWege Quell, Ziel und Binnenverkehr
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Steuerb.
CORRECT:TEXT Bezirk
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirk  ?
CORRECT:TEXT Bezirk
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerb.
CORRECT:TEXT Steuerb. Einkommen natuerl. Pers. 75%Quantil
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bezirk  ?
CORRECT:TEXT Steuerb. Einkommen natuerl. Pers. 75%Quantil
1
ENT_LABEL DATA
ENTITY2 D

ENT_TEXT Anteil Verkehrsflaeche
CORRECT:TEXT Anteil Verkehrsflaeche
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bauinvestitionen  fuer
CORRECT:TEXT Bezirke
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirke ?
CORRECT:TEXT Bezirke
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauinvestitionen  fuer
CORRECT:TEXT Bauinvestitionen
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bezirke ?
CORRECT:TEXT Bauinvestitionen
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerung:
CORRECT:TEXT Bevoelkerung: Durchschnittsalter
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Geb.Vol.
CORRECT:TEXT Geb.Vol. Wohnen: Zunahme
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Ø Eigenkapital
CORRECT:TEXT Ø Eigenkapital steuerpfl. jur. Pers.
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Steuerb.
CORRECT:TEXT Gemeinden
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Gemeinden ?
CORRECT:TEXT Gemeinden
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerb.
CORRECT:TEXT Steuerb. Vermoegen natuerl. Pers. 75%Quantil
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Gemeinden ?
CORRECT:TEXT Steuerb. Ver

In [18]:
for key in word_evaluation:
    correct = word_evaluation[key]["correct"]
    total = word_evaluation[key]["total"]

    print(f"{key}: {correct / total * 100:.2f}%")

stat_total_sum = 0
stat_correct_sum = 0

print("---")
for key in stat_evaluation:
    correct = stat_evaluation[key]["correct"]
    total = stat_evaluation[key]["total"]
    
    stat_total_sum += total
    stat_correct_sum += correct

    print(f"{key}: {correct / total * 100:.2f}%")

print(f"\nTotal: {stat_correct_sum/stat_total_sum * 100:.2f}%")

GRAN: 15.56%
DATA: 22.51%
---
GRAN: 15.56%
DATA: 22.51%

Total: 21.04%


In [None]:
# dictionary which will be populated with the entities and result information
entity_evaluation = {}

# helper function to udpate the entity_evaluation dictionary
def update_results(entity, metric):
    if entity not in entity_evaluation:
        entity_evaluation[entity] = {"correct": 0, "total": 0}
    
    entity_evaluation[entity][metric] += 1

# same as before, see if entities from test set match what spaCy currently predicts
for data in TEST_REVISION_DATA:
    sentence = data[0]
    entities = data[1]["entities"]

    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]

        for ent in doc.ents:
            if ent.label_ == entity[2] and ent.text == correct_text:
                update_results(ent.label_, "correct")
                break

        update_results(entity[2], "total")

In [None]:
sum_total = 0
sum_correct = 0

for entity in entity_evaluation:
    total = entity_evaluation[entity]["total"]
    correct = entity_evaluation[entity]["correct"]

    sum_total += total
    sum_correct += correct
    
    print("{} | {:.2f}%".format(entity, correct / total * 100))

print()
print("Overall accuracy: {:.2f}%".format(sum_correct / sum_total * 100))

In [None]:
nlp.meta["name"] = "stat_entity_extractor_v1"
nlp.to_disk("./models/v1")

In [None]:
TRAIN_STAT_DATA[5]


In [None]:
doc = nlp(u'Welche Gemeinde hat die grösste Bevölkerung und welche hatte im 2019 den höchsten Ausländeranteil?')

# show universal pos tags
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.pos_) for t in doc))
# output: Ich/PRON bin/AUX ein/DET Berliner/NOUN ./PUNCT

# show German specific pos tags (STTS)
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.tag_) for t in doc))
# output: Ich/PPER bin/VAFIN ein/ART Berliner/NN ./$.

# show dependency arcs
print('\n'.join('{child:<8} <{label:-^7} {head}'.format(child=t.orth_, label=t.dep_, head=t.head.orth_) for t in doc))
# output: (sb: subject, nk: noun kernel, pd: predicate)

#named entities
print("Named Entity Recognition:")
for ent in doc.ents:
    print(ent.text)
print("Noun chunks:")
for chunk in doc.noun_chunks:
    print(chunk.text)