# 03 Train

#### Connect to Google Drive

In [None]:
# Set up the connection to your Google Drive (this cell needs the google.colab package).
# Please click on the generated link and enter the authorisation code.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# switch to the project folder on the mounted drive; the relative paths used further
# down (input/, external/, models/) are resolved against this directory
cd /content/drive/MyDrive/Colab/statbot

In [None]:
# Install / update spacy
# update spacy
! pip install -U spacy
! python -m spacy info

In [None]:
# download the large German pipeline (de_core_news_lg); it is loaded with spacy.load further down
! python -m spacy download de_core_news_lg

## DISCLAIMER: 
A large part of this code for training new entities on top of a pre-trained model is taken from Isaac Aderogba: https://deepnote.com/publish/2cc2d19c-c3ac-4321-8853-0bcf2ef565b3. Statistics Canton Zurich humbly tried to adapt his code for our purpose of training statistically relevant entities.

## Script A03
Training is done via transfer learning on top of the pre-trained German spaCy pipeline `de_core_news_lg`. At the end of the script the tags are evaluated. This covers both the previously included NER tags and the new NER tags, as transfer learning can change the accuracy of previously learned tags.

As large parts of this code have been taken from the link mentioned above, we recommend reading it. We consider it the best possible tutorial for this task.  

## Thoughts for improvement
- **add link for POS** (part-of-speech tagging)


In [None]:
from __future__ import print_function, unicode_literals

import io, csv
import json
import random
import re
import warnings
from pathlib import Path
from random import sample

import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc
from spacy.training import Example
from spacy.util import minibatch, compounding
from tqdm import tqdm

import de_core_news_lg
# load the pre-trained German pipeline; all later cells work on this single `nlp` object
nlp = spacy.load('de_core_news_lg')

In [None]:
# helper function for incrementing the revision counters
def increment_revision_counters(entity_counter, entities):
    """Tally entity labels into ``entity_counter`` (modified in place).

    Args:
        entity_counter: dict mapping a label to the number of times it was seen.
        entities: iterable of entity records; the label is read from index 2
            of each record.

    Returns:
        None. The counts are accumulated in ``entity_counter``.
    """
    for annotation in entities:
        tag = annotation[2]
        # a label that has not been seen yet starts from zero
        entity_counter[tag] = entity_counter.get(tag, 0) + 1

In [None]:
# Load the hand-tagged sentences. Every item is accessed as item[1]["entities"],
# and each entity carries its label at index 2.
# The encoding is passed explicitly so that non-ASCII characters (e.g. German umlauts)
# are decoded the same way on every platform instead of using the locale default.
with open('input/tagged_sentences.json', encoding='utf-8') as json_file:
    out_sentences = json.load(json_file)
print("LENGTH OF DATASET: ",len(out_sentences))

# count how often each entity label occurs in the tagged dataset
dataset_dict={}
for sent in out_sentences:
    entities = sent[1]["entities"]
    increment_revision_counters(dataset_dict, entities)
print(dataset_dict)

In [None]:
# Load the external news corpus that supplies the "revision" sentences.
# - header=None: NOTE(review) the corpus file is assumed to be plain "id<TAB>sentence"
#   lines without a header row (the later code reads the text positionally via
#   .iloc[:, 1]); without this the first sentence is consumed as column names.
# - quoting=csv.QUOTE_NONE: double quotes inside sentences are ordinary text; with the
#   default quoting an unbalanced quote can merge several lines into one field.
# - random_state: makes the shuffle reproducible across kernel restarts.
npr_df = pd.read_csv(
    "external/deu_news_2015_3M-sentences.txt",
    delimiter="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)
npr_df = npr_df.sample(frac=1, random_state=42)
npr_df.head()

In [None]:
# *** <- delete this cell?
# create an nlp object as we'll use this to separate the sentences and identify existing entities
# (already loaded above)
#nlp = spacy.load('de_core_news_lg')

In [None]:
revision_texts = []

#STAT: Important: THIS IS A HYPER-PARAMETER: Changing it will affect the accuracy of the result
hyper_para_how_many=100000

# the text is read positionally from the second column of the (shuffled) corpus frame
news_sentences = npr_df.iloc[:hyper_para_how_many,1]

# convert the articles to spacy objects to better identify the sentences. Disabled unneeded components. # takes ~ 4 minutes
# `total` lets tqdm show real progress, since nlp.pipe is a generator without a length
for doc in tqdm(nlp.pipe(news_sentences, batch_size=30, disable=["tagger", "ner"]), total=len(news_sentences)):
    for sentence in doc.sents:

        # keep only medium-length sentences (41 to 79 characters)
        if 40 < len(sentence.text) < 80:
            # some of the sentences had excessive whitespace in between words, so we're trimming that
            # (raw string: "\s" in a normal string literal is an invalid escape sequence)
            revision_texts.append(" ".join(re.split(r"\s+", sentence.text, flags=re.UNICODE)))

In [None]:
revisions = []

# Let the existing spaCy model label the revision sentences and keep its predictions
# in the (text, {"entities": [(start_char, end_char, label), ...]}) layout.
for doc in nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser"]):

    # sentences without any predicted entity are skipped
    if not doc.ents:
        continue

    predicted_spans = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
    revisions.append((doc.text, {"entities": predicted_spans}))

In [None]:
# show the first revision item: the sentence first, then its entity annotations
example_text, example_annotations = revisions[0]
print(example_text)
print(example_annotations)


In [None]:
# lists holding the revision sentences of each split
TRAIN_REVISION_DATA, TEST_REVISION_DATA = [], []

# per-label entity counts of each split
TRAIN_ENTITY_COUNTER, TEST_ENTITY_COUNTER = {}, {}

# Soft cap per label in the training split, to keep the label distribution from
# becoming too one-sided (i.e. we don't want 1000 PERSON entities, but only 80 ORG entities)
REVISION_SENTENCE_SOFT_LIMIT = 100

random.shuffle(revisions)
for revision in revisions:
    entities = revision[1]["entities"]

    # Vote per entity: -1 when its label is already above the soft limit in the
    # training counter, +1 otherwise.
    should_append_to_train_counter = sum(
        -1 if TRAIN_ENTITY_COUNTER.get(label, 0) > REVISION_SENTENCE_SOFT_LIMIT else 1
        for _, _, label in entities
    )

    # a negative vote sends the sentence to the test split, anything else to training
    if should_append_to_train_counter < 0:
        TEST_REVISION_DATA.append(revision)
        increment_revision_counters(TEST_ENTITY_COUNTER, entities)
    else:
        TRAIN_REVISION_DATA.append(revision)
        increment_revision_counters(TRAIN_ENTITY_COUNTER, entities)

In [None]:
# per-label entity counts of the revision sentences assigned to the training split
TRAIN_ENTITY_COUNTER

In [None]:
# per-label entity counts of the revision sentences assigned to the test split
TEST_ENTITY_COUNTER

In [None]:
# show only the first few items; dumping the whole list bloats the notebook output
TRAIN_REVISION_DATA[:5]

In [None]:
# shuffle the tagged sentences in place, then cut the list in the middle:
# the first half is used for training, the second half for testing
random.shuffle(out_sentences)
halfway = len(out_sentences) // 2
TRAIN_STAT_DATA, TEST_STAT_DATA = out_sentences[:halfway], out_sentences[halfway:]

In [None]:
# sizes of the full tagged set and of its train / test halves
for size in (len(out_sentences), len(TRAIN_STAT_DATA), len(TEST_STAT_DATA)):
    print(size)

# the final training set is the revision sentences followed by the tagged training half
print("REVISION", len(TRAIN_REVISION_DATA))
TRAIN_DATA = TRAIN_REVISION_DATA + TRAIN_STAT_DATA
print("COMBINED", len(TRAIN_DATA))

In [None]:
#STAT: below is the heart piece of this script, and the code was heavily changed compared to the original
#script taken out of the code on deepnote.com. The reason is that this code has been adapted to spacy 3 -
#while the old code was running on spacy 2.X
#central command is nlp.update

# register the two new entity labels on the existing NER component
ner = nlp.get_pipe("ner")
ner.add_label("GRAN")
ner.add_label("DATA")

# get the names of the components we want to disable during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

epochs = 30

# Build the Example objects once: they depend only on the tokenizer (nlp.make_doc)
# and the annotations, so re-creating them in every epoch is wasted work.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annots)
    for text, annots in TRAIN_DATA
]

# start the training loop, only training NER
# (select_pipes is the spaCy 3 replacement for the deprecated disable_pipes)
with nlp.select_pipes(disable=other_pipes), warnings.catch_warnings():
    warnings.filterwarnings("once", category=UserWarning, module='spacy')

    # batch-size schedule: starts at 1 and grows by a factor of 1.001 per batch up to 4
    sizes = compounding(1.0, 4.0, 1.001)

    for epoch in range(epochs):
        # the Example list is shuffled instead of TRAIN_DATA, so TRAIN_DATA keeps its order
        random.shuffle(train_examples)
        losses = {}

        # Batch up the examples using spaCy's minibatch and update once per batch.
        # Previously `sizes` was never used and the whole dataset went into a single
        # nlp.update call, i.e. only one weight update per epoch.
        for batch in minibatch(train_examples, size=sizes):
            nlp.update(batch, drop=0.35, losses=losses)

        print("Losses ({}/{})".format(epoch + 1, epochs), losses)

In [None]:
# colours for the two custom labels; the remaining labels use displaCy's defaults
statbot_colors = {
    "GRAN": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "DATA": "linear-gradient(90deg, #ffff00, #ff8c00)",
}
statbot_options = {
    "ents": ["PER","LOC","ORG","MISC","GRAN","DATA"],
    "colors": statbot_colors,
}

# sample sentences, rendered one after another with the entities found by the updated model
demo_sentences = [
    "Ich heisse Christian und war heute in Zürich bei IBM im Internet.",
    "Wie viele Kühe hat die Gemeinde Bülach?",
    "Wie hoch ist Eigenkapital auf Bezirksebene?",
    "Ich brauche die Daten pro Bezirk",
    "Ich brauche die Daten für den gesamten Kanton.",
    "Wie viel Bauinv. EFH 5 Jahre  hat  in Regensdorf  ?",
    "Was ist der Anteil an MIV-Anteil (Modal Split)   auf Bezirksebene ?",
    "Was ist der Anteil an Geb.Vol. Dienstleistungen: Zunahme   in Flaach ?",
    "Welches ist das Schül. Sekundarstufe II   für den gesamten Kanton ?",
    "Welche Gemeinde hat die grösste Bevölkerung?",
]
for demo_sentence in demo_sentences:
    spacy.displacy.render(nlp(demo_sentence), style="ent",options=statbot_options)

In [None]:
#now check the accuracy of our NERs in this plus the next code chunks!

# dictionaries to hold our evaluation data, keyed by entity label
stat_evaluation = {
    "GRAN": {"correct": 0, "total": 0},
    "DATA": {"correct": 0, "total": 0},
}

word_evaluation = {
    "GRAN": {"correct": 0, "total": 0},
    "DATA": {"correct": 0, "total": 0},
}


for stat in TEST_STAT_DATA:
    # extract the sentence and correct stat entities according to our test data
    sentence = stat[0]
    entities = stat[1]["entities"]

    # Run the updated model once per sentence: the prediction does not depend on which
    # gold entity is being checked, so it is hoisted out of the entity loop.
    doc = nlp(sentence)
    predicted = {(ent.label_, ent.text) for ent in doc.ents}

    for entity in entities:
        label = entity[2]
        correct_text = sentence[entity[0]:entity[1]]
        n_worded_stat = len(correct_text.split())

        # a gold entity counts as correct (at most once) when some predicted entity has
        # the same label and exactly the same text
        is_correct = (label, correct_text) in predicted

        # setdefault keeps a label other than GRAN / DATA in the test data from
        # raising a KeyError
        stat_counts = stat_evaluation.setdefault(label, {"correct": 0, "total": 0})
        word_counts = word_evaluation.setdefault(label, {"correct": 0, "total": 0})

        stat_counts["total"] += 1
        if is_correct:
            stat_counts["correct"] += 1

        # NOTE(review): this condition holds for every entity text that contains at least
        # one non-whitespace character, so word_evaluation currently mirrors
        # stat_evaluation - confirm whether a per-word-count breakdown was intended.
        if n_worded_stat > 0:
            word_counts["total"] += 1
            if is_correct:
                word_counts["correct"] += 1

In [None]:
def _format_accuracy(correct, total):
    """Return correct/total as a percentage string, or "n/a" when total is 0.

    The zero check avoids a ZeroDivisionError for labels that have no entity
    in the test data.
    """
    if total == 0:
        return "n/a (no test entities)"
    return f"{correct / total * 100:.2f}%"


# accuracy per label according to word_evaluation
for key in word_evaluation:
    correct = word_evaluation[key]["correct"]
    total = word_evaluation[key]["total"]

    print(f"{key}: {_format_accuracy(correct, total)}")

stat_total_sum = 0
stat_correct_sum = 0

# accuracy per label according to stat_evaluation, plus the overall figure
print("---")
for key in stat_evaluation:
    correct = stat_evaluation[key]["correct"]
    total = stat_evaluation[key]["total"]

    stat_total_sum += total
    stat_correct_sum += correct

    print(f"{key}: {_format_accuracy(correct, total)}")

print(f"\nTotal: {_format_accuracy(stat_correct_sum, stat_total_sum)}")

In [None]:
#now test the accuracy of all the old NERs - was there amnesia on them?

# dictionary which will be populated with the entities and result information
entity_evaluation = {}

# helper function to update the entity_evaluation dictionary
def update_results(entity, metric):
    """Increment one counter of ``entity`` in the global ``entity_evaluation`` dict.

    Args:
        entity: entity label used as key; its entry is created with both
            counters at zero on first use.
        metric: name of the counter to increment, "correct" or "total".
    """
    if entity not in entity_evaluation:
        entity_evaluation[entity] = {"correct": 0, "total": 0}

    entity_evaluation[entity][metric] += 1

# same as before, see if entities from test set match what spaCy currently predicts
for data in TEST_REVISION_DATA:
    sentence = data[0]
    entities = data[1]["entities"]

    # one model run per sentence is enough; the prediction is the same for every
    # gold entity of that sentence
    doc = nlp(sentence)
    predicted = {(ent.label_, ent.text) for ent in doc.ents}

    for entity in entities:
        correct_text = sentence[entity[0]:entity[1]]

        # correct when a predicted entity has the same label and the same text
        if (entity[2], correct_text) in predicted:
            update_results(entity[2], "correct")

        update_results(entity[2], "total")

In [None]:
sum_total = 0
sum_correct = 0

# accuracy per previously known entity label
for entity, counts in entity_evaluation.items():
    total = counts["total"]
    correct = counts["correct"]

    sum_total += total
    sum_correct += correct

    if total:
        print("{} | {:.2f}%".format(entity, correct / total * 100))
    else:
        print("{} | n/a (no test entities)".format(entity))

print()
# entity_evaluation is empty when TEST_REVISION_DATA is empty; guard the division so
# this cell reports that instead of raising ZeroDivisionError
if sum_total:
    print("Overall accuracy: {:.2f}%".format(sum_correct / sum_total * 100))
else:
    print("Overall accuracy: n/a (no revision test entities)")

In [None]:
# name the updated pipeline and write it to disk
nlp.meta["name"] = "stat_entity_extractor_v1"

# Create the target folder including missing parents before saving, so the save does
# not depend on "./models" already existing in the working directory.
model_dir = Path("./models/v1")
model_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(model_dir)

In [None]:
# peek at a single item of the tagged training half (sentence text plus its entity annotations)
TRAIN_STAT_DATA[5]


In [None]:
# run the full pipeline on one sample question and inspect its linguistic annotations
doc = nlp('Welche Gemeinde hat die grösste Bevölkerung und welche hatte im 2019 den höchsten Ausländeranteil?')

# universal POS tags, printed as word/TAG pairs on one line
# (format illustrated with another sentence: Ich/PRON bin/AUX ein/DET Berliner/NOUN ./PUNCT)
print(' '.join(f'{t.orth_}/{t.pos_}' for t in doc))

# German-specific POS tags (STTS), same layout
# (format illustrated with another sentence: Ich/PPER bin/VAFIN ein/ART Berliner/NN ./$.)
print(' '.join(f'{t.orth_}/{t.tag_}' for t in doc))

# dependency arcs, one "child <-label- head" line per token
# (labels are e.g. sb: subject, nk: noun kernel, pd: predicate)
print('\n'.join(f'{t.orth_:<8} <{t.dep_:-^7} {t.head.orth_}' for t in doc))

# named entities
print("Named Entity Recognition:")
for ent in doc.ents:
    print(ent.text)

# noun chunks
print("Noun chunks:")
for chunk in doc.noun_chunks:
    print(chunk.text)