In [1]:
from __future__ import print_function, unicode_literals
import spacy
import warnings
from spacy.util import minibatch, compounding
import de_core_news_lg
import pandas as pd
import numpy as np
from random import sample
import io, csv
import re
import random
import de_core_news_lg
import json
from spacy.training import Example
from spacy.tokens import Doc
nlp = spacy.load('de_core_news_lg')

In [2]:
# helper function for incrementing the revision counters
def increment_revision_counters(entity_counter, entities):
    for entity in entities:
        label = entity[2]
        if label in entity_counter:
            entity_counter[label] += 1
        else:
            entity_counter[label] = 1

In [3]:
#out_sentences=open("input/tagged_sentences.csv", "r").readlines()
with open('input/tagged_sentences.json') as json_file:
    out_sentences = json.load(json_file)
print("LENGTH OF DATASET: ",len(out_sentences))
dataset_dict={}

for sent in out_sentences:
    entities = sent[1]["entities"]
    increment_revision_counters(dataset_dict, entities)
#    print(entity[1])
    #helper_dict.append(entity[1]['entities'][0][2])
#out_sentences[:5]
print(dataset_dict)

LENGTH OF DATASET:  1020
{'DATA': 1010, 'GRAN': 282}


In [4]:
npr_df = pd.read_csv("external/deu_news_2015_3M-sentences.txt", delimiter = "\t")
npr_df=npr_df.sample(frac=1)
npr_df.head()

Unnamed: 0,1,­
2504830,2640453,Wer wirklich hinter diesen Straßenbanden steht...
2278471,2402490,"Um zu retten, was zu retten ist, beantrage Sch..."
717308,749815,Der Minister war laut Yonhap dabei ertappt wor...
2588861,2731655,Zu Beginn der Sommerpause galt der im Juli 201...
1602296,1691821,In der Tabelle baute Besiktas (41 Punkte) die ...


In [5]:
# create an nlp object as we'll use this to seperate the sentences and identify existing entities
#loaded already above
#nlp = spacy.load('de_core_news_lg')

In [6]:
revision_texts = []

# convert the articles to spacy objects to better identify the sentences. Disabled unneeded components. # takes ~ 4 minutes
for doc in nlp.pipe(npr_df.iloc[:100000,1], batch_size=30, disable=["tagger", "ner"]):
    for sentence in doc.sents:
        if  40 < len(sentence.text) < 80:
            # some of the sentences had excessive whitespace in between words, so we're trimming that
            revision_texts.append(" ".join(re.split("\s+", sentence.text, flags=re.UNICODE)))

In [7]:
revisions = []

# Use the existing spaCy model to predict the entities, then append them to revision
for doc in nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser"]):
    
    # don't append sentences that have no entities
    if len(doc.ents) > 0:
        revisions.append((doc.text, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}))

In [8]:
# print an example of the revision sentence
print(revisions[0][0])

# print an example of the revision data
print(revisions[0][1])


Nö, ich zieh bei einer Paywall oder Desinteresse als Spiele-Nomade weiter.
{'entities': [(0, 2, 'MISC')]}


In [9]:
# create arrays to store the revision data
TRAIN_REVISION_DATA = []
TEST_REVISION_DATA = []

# create dictionaries to keep count of the different entities
TRAIN_ENTITY_COUNTER = {}
TEST_ENTITY_COUNTER = {}

# This will help distribute the entities (i.e. we don't want 1000 PERSON entities, but only 80 ORG entities)
REVISION_SENTENCE_SOFT_LIMIT = 100



random.shuffle(revisions)
for revision in revisions:
    # get the entities from the revision sentence
    entities = revision[1]["entities"]

    # simple hack to make sure spaCy entities don't get too one-sided
    should_append_to_train_counter = 0
    for _, _, label in entities:
        if label in TRAIN_ENTITY_COUNTER and TRAIN_ENTITY_COUNTER[label] > REVISION_SENTENCE_SOFT_LIMIT:
            should_append_to_train_counter -= 1
        else:
            should_append_to_train_counter += 1

    # simple switch for deciding whether to append to train data or test data
    if should_append_to_train_counter >= 0:
        TRAIN_REVISION_DATA.append(revision)
        increment_revision_counters(TRAIN_ENTITY_COUNTER, entities)
    else:
        TEST_REVISION_DATA.append(revision)
        increment_revision_counters(TEST_ENTITY_COUNTER, entities)

In [10]:
TRAIN_ENTITY_COUNTER

{'MISC': 108, 'LOC': 106, 'PER': 101, 'ORG': 104}

In [11]:
TEST_ENTITY_COUNTER

{'MISC': 7664, 'LOC': 7929, 'ORG': 6219, 'PER': 7331}

In [12]:
TRAIN_REVISION_DATA

[('Wodka wird trotz Rubel-Krise in Russland billiger',
  {'entities': [(17, 28, 'MISC'), (32, 40, 'LOC')]}),
 ('Die aktuelle Rohstoff-Tagesausgabe finden Sie hier.',
  {'entities': [(13, 34, 'LOC')]}),
 ('Yanis Varoufakis wird griechischer Finanzminister.',
  {'entities': [(0, 16, 'PER'), (22, 34, 'MISC')]}),
 ('Es war eine frustrierende Session, klagt Grosjean über seine Situation:',
  {'entities': [(41, 49, 'PER')]}),
 ('Hinter OpenStack stehen Unternehmen wie Cisco, Dell, Intel, HP und SAP.',
  {'entities': [(7, 16, 'MISC'),
    (40, 45, 'ORG'),
    (47, 51, 'ORG'),
    (53, 58, 'ORG'),
    (60, 62, 'ORG'),
    (67, 70, 'ORG')]}),
 ('Ein Euro-Austritt des Landes werde seitens der Kommission nicht angestrebt.',
  {'entities': [(22, 28, 'LOC'), (47, 57, 'ORG')]}),
 ('Damit sind die Kieler Störche seit zwölf Spielen in Folge ungeschlagen.',
  {'entities': [(15, 29, 'ORG')]}),
 ('Bahrain: Die Highlights des WEC-Qualifyings: (04:44 Min.)',
  {'entities': [(0, 7, 'LOC'), (28, 43, 'MISC')]

In [13]:
random.shuffle(out_sentences)
TRAIN_STAT_DATA=out_sentences[:len(out_sentences)//2]
TEST_STAT_DATA=out_sentences[len(out_sentences)//2:]

In [14]:
print(len(out_sentences))
print(len(TRAIN_STAT_DATA))
print(len(TEST_STAT_DATA))
print("REVISION", len(TRAIN_REVISION_DATA))
TRAIN_DATA = TRAIN_REVISION_DATA + TRAIN_STAT_DATA
print("COMBINED", len(TRAIN_DATA))

1020
510
510
REVISION 279
COMBINED 789


In [24]:
ner = nlp.get_pipe("ner")
ner.add_label("GRAN")
ner.add_label("DATA")



# get the names of the components we want to disable during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# start the training loop, only training NER
epochs = 30
#optimizer = nlp.resume_training()
#optimizer = nlp.initialize()
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    warnings.filterwarnings("once", category=UserWarning, module='spacy')
    sizes = compounding(1.0, 4.0, 1.001)
    
    # batch up the examples using spaCy's minibatc
    for epoch in range(epochs):
        random.shuffle(TRAIN_DATA)
        #text = []
        #annots=[]
        examples=[]


        for text,annots in TRAIN_DATA:
            #text.append(t)
            #annots.append(a)
            doc = nlp.make_doc(text)    
            example = Example.from_dict(doc, annots)
            examples.append(example)
        
        losses = {}
        
        nlp.update(examples, drop=0.35, losses=losses)#,sgd=optimizer)

        print("Losses ({}/{})".format(epoch + 1, epochs), losses)

Losses (1/30) {'ner': 2209.544247150421}
Losses (2/30) {'ner': 1916.1894636750221}
Losses (3/30) {'ner': 1874.5526918172836}
Losses (4/30) {'ner': 1647.1635103821754}
Losses (5/30) {'ner': 1620.7749798893929}
Losses (6/30) {'ner': 1573.4089802503586}
Losses (7/30) {'ner': 1434.5390892624855}
Losses (8/30) {'ner': 1387.2959500292782}
Losses (9/30) {'ner': 1319.2121549844742}
Losses (10/30) {'ner': 1244.5019130408764}
Losses (11/30) {'ner': 1096.9259266243316}
Losses (12/30) {'ner': 998.9878923632205}
Losses (13/30) {'ner': 964.4241520593278}
Losses (14/30) {'ner': 963.216992600821}
Losses (15/30) {'ner': 948.7985789775848}
Losses (16/30) {'ner': 997.9088933165986}
Losses (17/30) {'ner': 1015.0706471204758}
Losses (18/30) {'ner': 962.7696940898895}
Losses (19/30) {'ner': 908.8512578904629}
Losses (20/30) {'ner': 855.6864865246462}
Losses (21/30) {'ner': 826.1343455111255}
Losses (22/30) {'ner': 845.9458135262037}
Losses (23/30) {'ner': 846.8364398970734}
Losses (24/30) {'ner': 821.068152

In [16]:
statbot_colors = {"GRAN": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
                  "DATA": "linear-gradient(90deg, #ffff00, #ff8c00)"}
statbot_options = {"ents": ["PER","LOC","ORG","MISC","GRAN","DATA"], "colors": statbot_colors}
spacy.displacy.render(nlp("Ich heisse Christian und war heute in Zürich bei IBM im Internet."), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie viele Kühe hat die Gemeinde Bülach?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie hoch ist Eigenkapital auf Bezirksebene?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Ich brauche die Daten pro Bezirk"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Ich brauche die Daten für den gesamten Kanton."), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Wie viel Bauinv. EFH 5 Jahre  hat  in Regensdorf  ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Was ist der Anteil an MIV-Anteil (Modal Split)   auf Bezirksebene ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Was ist der Anteil an Geb.Vol. Dienstleistungen: Zunahme   in Flaach ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Welches ist das Schül. Sekundarstufe II   für den gesamten Kanton ?"), style="ent",options=statbot_options)
spacy.displacy.render(nlp("Welche Gemeinde hat die grösste Bevölkerung?"), style="ent",options=statbot_options)



In [17]:
# dictionary to hold our evaluation data
stat_evaluation = {
    "GRAN": {
        "correct": 0,
        "total": 0,
    },
    "DATA": {
        "correct": 0,
        "total": 0,
    }
}

word_evaluation = {
    "GRAN": {
        "correct": 0,
        "total": 0
    },
    "DATA": {
        "correct": 0,
        "total": 0,
    }

}


for stat in TEST_STAT_DATA:
    # extract the sentence and correct stat entities according to our test data
    sentence = stat[0]
    entities = stat[1]["entities"]

    # for each entity, use our updated model to make a prediction on the sentence
    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]
        n_worded_stat =  len(correct_text.split())
        print(n_worded_stat)

        # if we find that there's a match for predicted entity and predicted text, increment correct counters
        for ent in doc.ents:
            print("ENT_LABEL",ent.label_)
            print("ENTITY2",entity[2])
            print("ENT_TEXT",ent.text)
            print("CORRECT:TEXT",correct_text)
            if ent.label_ == entity[2] and ent.text == correct_text:
                
                stat_evaluation[entity[2]]["correct"] += 1
                if n_worded_stat > 0:
                    word_evaluation[entity[2]]["correct"] += 1

                # this break is important, ensures that we're not double counting on a correct match
                break

        #  increment total counters after each entity loop
        stat_evaluation[entity[2]]["total"] += 1
        if n_worded_stat > 0:
            word_evaluation[entity[2]]["total"] += 1

1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Apotheken  in Oberrieden ?
CORRECT:TEXT Apotheken
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Anteil Verkehrsflaeche  fuer alle Gemeinden ?
CORRECT:TEXT Gemeinden
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Anteil Verkehrsflaeche  fuer alle Gemeinden ?
CORRECT:TEXT Anteil Verkehrsflaeche
4
4
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerung:
CORRECT:TEXT Bevoelkerung: Maenner
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Beschaeftigte im Sekundaersektor hat  Glattfelden  ?
CORRECT:TEXT Beschaeftigte im Sekundaersektor
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerb.
CORRECT:TEXT Steuerb. Einkommen natuerl. Pers. 75%Quantil
1
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Steuerkraft
CORRECT:TEXT Steuerkraft
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Nettoaufwand Bildung hat  in Wiesendangen  ?
CORRECT:TEXT Nettoaufwand Bildung
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerkraft pro Kopf
CORRECT:TEXT Steuerkraft pro Kopf
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Zahnaerzte
CORRECT:

1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT 3Personenhaushalte  fuer alle Bezirke ?
CORRECT:TEXT 3Personenhaushalte
1
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT KRW Waehleranteil AL in Prozent  in Bezirk Andelfingen
CORRECT:TEXT Bezirk
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT KRW Waehleranteil AL in Prozent  in Bezirk Andelfingen
CORRECT:TEXT KRW Waehleranteil AL
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Landwirtschaftliche Nutzflaeche  in Egg ?
CORRECT:TEXT Landwirtschaftliche Nutzflaeche
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirksebene ?
CORRECT:TEXT Bezirksebene
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bezirksebene ?
CORRECT:TEXT Total Quellensteuerpflichtige
1
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Neuerstellte EFH  Mettmenstetten ?
CORRECT:TEXT Neuerstellte EFH
1
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Bauzonen
CORRECT:TEXT Bauzonen
1
ENT_LABEL MISC
ENTITY2 GRAN
ENT_TEXT Ø
CORRECT:TEXT Bezirksebene
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirksebene  ?
CORRECT:TEXT Bezirksebene
5
ENT_LABEL MISC
ENT

4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauzonen ueberbaut pro Kopf hat  fuer den gesamten Kanton  ?
CORRECT:TEXT Bauzonen ueberbaut pro Kopf
1
1
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauinv.
CORRECT:TEXT Bauinv. EFH 5 Jahre
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Anteil 6 und mehr Personenhaushalte in Prozent  auf Bezirksebene
CORRECT:TEXT Bezirksebene
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Anteil 6 und mehr Personenhaushalte in Prozent  auf Bezirksebene
CORRECT:TEXT Anteil 6 und mehr Personenhaushalte
1
2
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerung: uebrige/andere/ohne Konfession  in Boppelsen ?
CORRECT:TEXT Bevoelkerung: uebrige/andere/ohne Konfession
2
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Motorraeder hat  in Hofstetten bis 2017  ?
CORRECT:TEXT Motorraeder
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Schuel.
CORRECT:TEXT Kanton
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Kanton Zuerich
CORRECT:TEXT Kanton
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schuel.
CORRECT:TEXT Schuel. Primarschule
E

4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauinv.
CORRECT:TEXT Bauinv. Industr./Lager 5 Jahre
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Baulandpreis Median hat  in Staefa  ?
CORRECT:TEXT Baulandpreis Median
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerb.
CORRECT:TEXT Steuerb. Vermoegen natuerl. Pers. Durchschn.
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Sozialhilfequote in Prozent  in Winterthur
CORRECT:TEXT Sozialhilfequote
1
2
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Verkaeufe von unbebautem Land
CORRECT:TEXT Verkaeufe von unbebautem Land
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT KRW Waehleranteil
CORRECT:TEXT Gemeindeebene
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Gemeindeebene ?
CORRECT:TEXT Gemeindeebene
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT KRW Waehleranteil
CORRECT:TEXT KRW Waehleranteil CVP
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Gemeindeebene ?
CORRECT:TEXT KRW Waehleranteil CVP
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schuel.
CORRECT:TEXT Schuel. Primarschule
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT S

1
1
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT PWNeuzulassungen pro 1000 Einw.
CORRECT:TEXT Bezirk
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirk ?
CORRECT:TEXT Bezirk
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT PWNeuzulassungen pro 1000 Einw.
CORRECT:TEXT PWNeuzulassungen pro 1000 Einw.
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Anteil Verkehrsflaeche in Prozent  auf Bezirksebene
CORRECT:TEXT Bezirksebene
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Anteil Verkehrsflaeche in Prozent  auf Bezirksebene
CORRECT:TEXT Anteil Verkehrsflaeche
3
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Anteil EFH am Wohnungsbestand in Prozent  Schlatt und Glattfelden
CORRECT:TEXT Anteil EFH am Wohnungsbestand
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schuel.
CORRECT:TEXT Schuel. Oberstufe
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Personalsteuerpflichtige hat  Huetten bis 2018 und Niederweningen  ?
CORRECT:TEXT Personalsteuerpflichtige
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Motorraeder
CORRECT:TEXT Motorraeder
3
ENT_LABEL DATA
ENTITY2 DATA

1
1
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bevoelkerung:
CORRECT:TEXT Bevoelkerung: Durchschnittsalter
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Gemeinde  ?
CORRECT:TEXT Gemeinde
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Gemeinde  ?
CORRECT:TEXT Nettoinvestitionen
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Zahnaerzte hat  in Neftenbach  ?
CORRECT:TEXT Zahnaerzte
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Erwerb des Schweizer Buergerrechts
CORRECT:TEXT Erwerb des Schweizer Buergerrechts
5
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Steuerb.
CORRECT:TEXT Bezirk
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Bezirk  ?
CORRECT:TEXT Bezirk
5
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerb.
CORRECT:TEXT Steuerb. Einkommen natuerl. Pers. Median
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bezirk  ?
CORRECT:TEXT Steuerb. Einkommen natuerl. Pers. Median
3
ENT_LABEL LOC
ENTITY2 DATA
ENT_TEXT NRW
CORRECT:TEXT NRW Waehleranteil SP
2
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schuel.
CORRECT:TEXT Schuel. Sekundarstufe I
4
3
ENT_LABEL DATA
EN

3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Region Furttal  ?
CORRECT:TEXT Steuerpflichtige juristische Personen
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schuel.
CORRECT:TEXT Schuel. Kindergarten
1
ENT_LABEL GRAN
ENTITY2 GRAN
ENT_TEXT Bezirk
CORRECT:TEXT Bezirk
3
ENT_LABEL GRAN
ENTITY2 DATA
ENT_TEXT Bezirk
CORRECT:TEXT Anteil unproduktive Flaeche
4
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Bauinv.
CORRECT:TEXT Bauinv. EFH 5 Jahre
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Schuel.
CORRECT:TEXT Gemeinde
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Gemeinde ?
CORRECT:TEXT Gemeinde
2
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Schuel.
CORRECT:TEXT Schuel. Kindergarten
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Gemeinde ?
CORRECT:TEXT Schuel. Kindergarten
1
ENT_LABEL DATA
ENTITY2 GRAN
ENT_TEXT Gemeinden  ?
CORRECT:TEXT Gemeinden
1
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Gemeinden  ?
CORRECT:TEXT Flaeche
3
ENT_LABEL DATA
ENTITY2 DATA
ENT_TEXT Steuerertrag allgemeine Gemeindesteuern hat  in Buelach  ?
CORRECT:TEXT Steuerertrag allg

In [18]:
for key in word_evaluation:
    correct = word_evaluation[key]["correct"]
    total = word_evaluation[key]["total"]

    print(f"{key}: {correct / total * 100:.2f}%")

stat_total_sum = 0
stat_correct_sum = 0

print("---")
for key in stat_evaluation:
    correct = stat_evaluation[key]["correct"]
    total = stat_evaluation[key]["total"]
    
    stat_total_sum += total
    stat_correct_sum += correct

    print(f"{key}: {correct / total * 100:.2f}%")

print(f"\nTotal: {stat_correct_sum/stat_total_sum * 100:.2f}%")

GRAN: 3.42%
DATA: 15.08%
---
GRAN: 3.42%
DATA: 15.08%

Total: 12.46%


In [19]:
# dictionary which will be populated with the entities and result information
entity_evaluation = {}

# helper function to udpate the entity_evaluation dictionary
def update_results(entity, metric):
    if entity not in entity_evaluation:
        entity_evaluation[entity] = {"correct": 0, "total": 0}
    
    entity_evaluation[entity][metric] += 1

# same as before, see if entities from test set match what spaCy currently predicts
for data in TEST_REVISION_DATA:
    sentence = data[0]
    entities = data[1]["entities"]

    for entity in entities:
        doc = nlp(sentence)
        correct_text = sentence[entity[0]:entity[1]]

        for ent in doc.ents:
            if ent.label_ == entity[2] and ent.text == correct_text:
                update_results(ent.label_, "correct")
                break

        update_results(entity[2], "total")

In [20]:
sum_total = 0
sum_correct = 0

for entity in entity_evaluation:
    total = entity_evaluation[entity]["total"]
    correct = entity_evaluation[entity]["correct"]

    sum_total += total
    sum_correct += correct
    
    print("{} | {:.2f}%".format(entity, correct / total * 100))

print()
print("Overall accuracy: {:.2f}%".format(sum_correct / sum_total * 100))

MISC | 65.11%
LOC | 66.60%
ORG | 79.32%
PER | 92.48%

Overall accuracy: 75.43%


In [21]:
nlp.meta["name"] = "stat_entity_extractor_v1"
nlp.to_disk("./models/v1")

In [22]:
TRAIN_STAT_DATA[5]


['Ich brauche Bauzonen ', {'entities': [[12, 20, 'DATA']]}]

In [23]:
doc = nlp(u'Welche Gemeinde hat die grösste Bevölkerung und welche hatte im 2019 den höchsten Ausländeranteil?')

# show universal pos tags
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.pos_) for t in doc))
# output: Ich/PRON bin/AUX ein/DET Berliner/NOUN ./PUNCT

# show German specific pos tags (STTS)
print(' '.join('{word}/{tag}'.format(word=t.orth_, tag=t.tag_) for t in doc))
# output: Ich/PPER bin/VAFIN ein/ART Berliner/NN ./$.

# show dependency arcs
print('\n'.join('{child:<8} <{label:-^7} {head}'.format(child=t.orth_, label=t.dep_, head=t.head.orth_) for t in doc))
# output: (sb: subject, nk: noun kernel, pd: predicate)

#named entities
print("Named Entity Recognition:")
for ent in doc.ents:
    print(ent.text)
print("Noun chunks:")
for chunk in doc.noun_chunks:
    print(chunk.text)

Welche/DET Gemeinde/NOUN hat/AUX die/DET grösste/ADJ Bevölkerung/NOUN und/CCONJ welche/PRON hatte/AUX im/ADP 2019/PROPN den/DET höchsten/ADJ Ausländeranteil/NOUN ?/PUNCT
Welche/PWAT Gemeinde/NN hat/VAFIN die/ART grösste/ADJA Bevölkerung/NN und/KON welche/PDS hatte/VAFIN im/APPRART 2019/CARD den/ART höchsten/ADJA Ausländeranteil/NN ?/$.
Welche   <--nk--- Gemeinde
Gemeinde <--sb--- hat
hat      <-ROOT-- hat
die      <--nk--- Bevölkerung
grösste  <--nk--- Bevölkerung
Bevölkerung <--oa--- hat
und      <--cd--- Bevölkerung
welche   <--cj--- und
hatte    <--cj--- hat
im       <--mo--- hatte
2019     <--nk--- im
den      <--nk--- Ausländeranteil
höchsten <--nk--- Ausländeranteil
Ausländeranteil <--oa--- hatte
?        <-punct- hat
Named Entity Recognition:
Noun chunks:
Welche Gemeinde
die grösste Bevölkerung
welche
2019
den höchsten Ausländeranteil
