# Info fields via machine learning

Extract persons from the info fields StartEntryInfo and EndEntryInfo of the [slave registers of Suriname](https://datasets.iisg.amsterdam/dataset.xhtml?persistentId=hdl:10622/CSPBHO) via machine learning.

See: https://www.freecodecamp.org/news/getting-started-with-ner-models-using-huggingface/

## 1. Annotating info fields

In [None]:
import nltk
import pandas as pd
import random
import regex

### 1.1 Read data that needs to be classified

In [None]:
# Path of the CSV file with the register data (read by make_info_data_train).
DATA_FILE = "../../data/suriname/Dataset Suriname Slave and Emancipation Registers Version 1.1.csv"
# Name of the info column to process; used as default argument by the functions below.
DATA_COLUMN = "EndEntryInfo"

In [None]:
def add_column_tokens(train):
    """Add a "tokens" column holding the NLTK word tokens of each "text" value.

    The data frame is modified in place and also returned.
    """
    tokenized_texts = []
    for text in train["text"]:
        tokenized_texts.append(nltk.word_tokenize(text))
    train["tokens"] = tokenized_texts
    return train

In [None]:
def add_column_labels(train):
    """Add a "labels" column with one "O" (outside) label per token.

    Every row gets its own list with the same length as its "tokens" list.
    The data frame is modified in place and also returned.
    """
    outside_labels = []
    for tokens in train["tokens"]:
        outside_labels.append(["O"] * len(tokens))
    train["labels"] = outside_labels
    return train

In [None]:
def add_column_numeric_labels(train, numeric_labels):
    """Add a "numeric_labels" column: each label mapped through numeric_labels.

    numeric_labels is a mapping from label string to number; a label that is
    missing from it raises KeyError. The data frame is modified in place and
    also returned.
    """
    converted_rows = []
    for labels in train["labels"]:
        converted_rows.append([numeric_labels[label] for label in labels])
    train["numeric_labels"] = converted_rows
    return train

In [None]:
def is_date(day, month, year):
    """Tell whether three consecutive tokens look like "day month year".

    Only day (one or two digits) and year (starting with four digits) are
    inspected; the month token is accepted as is. Returns True for a date
    and None otherwise.
    """
    if not regex.search(r"^\d\d\d\d\b", year):
        return None
    if not regex.search(r"^\d\d?$", day):
        return None
    return True

In [None]:
def add_date_tags_to_labels(labels, index):
    """Mark the three labels ending at position index as one DATE entity.

    The label two positions before index becomes "B-DATE", the next two
    become "I-DATE". The list is changed in place and also returned.
    """
    for distance, tag in ((2, "B-DATE"), (1, "I-DATE"), (0, "I-DATE")):
        labels[index - distance] = tag
    return labels

In [None]:
def label_dates(train):
    """Tag every "day month year" token triple in the data with DATE labels.

    Each window of three consecutive tokens is tested with is_date; on a hit
    the row's label list is updated in place. Returns the data frame.
    """
    for _, row in train.iterrows():
        tokens = row["tokens"]
        for last in range(2, len(tokens)):
            if is_date(tokens[last - 2], tokens[last - 1], tokens[last]):
                add_date_tags_to_labels(row["labels"], last)
    return train

In [None]:
def show_annotations(train):
    """Print every text on one line, as tokens with their non-"O" labels.

    A labelled token is shown as "token/LABEL"; every token is followed by
    a space. Rows are looked up by the index labels 0 .. len(train) - 1.
    """
    for row_id in range(len(train)):
        labels = train["labels"][row_id]
        for position, label in enumerate(labels):
            suffix = "" if label == "O" else "/" + label
            print(train["tokens"][row_id][position], suffix, sep="", end=" ")
        print("")

In [None]:
def make_train(data, data_column=DATA_COLUMN, nbr_of_lines=100):
    """Build a frequency table of the distinct values of one data column.

    Parameters:
        data: data frame holding the column data_column.
        data_column: name of the column whose values are counted.
        nbr_of_lines: keep only this many most frequent values; a value of
            0 or less keeps all values.

    Returns a data frame with the columns "frequency" (number of
    occurrences) and "text" (the value), ordered from most to least
    frequent, with an index named "index" running from 0.
    """
    counts = data[data_column].value_counts()
    if nbr_of_lines > 0:
        counts = counts.iloc[:nbr_of_lines]
    # Build the columns explicitly instead of renaming the value_counts
    # result: its name differs between pandas versions (the column name
    # before 2.0, "count" from 2.0 on), so a rename can silently do nothing.
    train = pd.DataFrame({"frequency": counts.to_numpy(),
                          "text": counts.index.to_numpy()})
    train.index.name = "index"
    return train

In [None]:
def make_info_data_train(data_column=DATA_COLUMN):
    """Read DATA_FILE and return the token/label table for one info column.

    All distinct values of data_column are kept (no frequency cut-off); the
    result has the columns produced by make_train plus "tokens" and "labels".
    """
    raw_data = pd.read_csv(DATA_FILE, low_memory=False)
    frequency_table = make_train(raw_data, data_column, nbr_of_lines=0)
    tokenized_table = add_column_tokens(frequency_table)
    return add_column_labels(tokenized_table)

In [None]:
# Build the table of distinct info texts (with tokens and default labels) for the chosen column.
info_data_train = make_info_data_train(data_column=DATA_COLUMN)

### 1.2 Make data for initial annotation

In [None]:
def make_selected_data_ids(info_data_train, selected_frequent, selected_random):
    """Select row ids: the first selected_frequent rows plus random others.

    Parameters:
        info_data_train: the data; only its length is used.
        selected_frequent: number of leading rows (ids 0, 1, ...) to take.
        selected_random: number of additional distinct ids to draw at random
            from the remaining rows.

    Returns the list of selected ids, leading ids first.

    Raises ValueError when more random ids are requested than there are
    remaining rows (the draw could otherwise never finish).
    """
    nbr_of_rows = len(info_data_train)
    if selected_random > 0 and nbr_of_rows - selected_frequent < selected_random:
        raise ValueError(
            f"cannot select {selected_random} random rows: only "
            f"{max(nbr_of_rows - selected_frequent, 0)} rows are available"
        )
    selected_data_ids = list(range(0, selected_frequent))
    # Set copy of the ids for constant-time membership tests; the draws are
    # made exactly as before, so a fixed seed yields the same selection.
    seen_ids = set(selected_data_ids)
    target_size = selected_frequent + selected_random
    while len(selected_data_ids) < target_size:
        selected_data_id = random.randint(selected_frequent, nbr_of_rows - 1)
        if selected_data_id not in seen_ids:
            seen_ids.add(selected_data_id)
            selected_data_ids.append(selected_data_id)
    return selected_data_ids

In [None]:
def make_selected_data_flags(info_data_train, selected_data_ids):
    """Turn a list of selected row ids into a boolean mask over all rows.

    Returns a list with one flag per row of info_data_train; the flag is
    True exactly for the positions listed in selected_data_ids.
    """
    flags = [False for _ in range(len(info_data_train))]
    for selected_id in selected_data_ids:
        flags[selected_id] = True
    return flags

In [None]:
def save_annotated_data(info_data_train, selected_data_flags):
    """Write the selected rows to "outfile.json" and return them as dicts.

    Parameters:
        info_data_train: data frame with a "tokens" column.
        selected_data_flags: boolean mask selecting the rows to export.

    Each exported item has the keys "eid" (first letter of DATA_COLUMN
    followed by the row index), "text" (tokens joined by spaces) and
    "label" (an empty list). One item is written per line.

    NOTE(review): the items are written with print(), i.e. as Python dict
    representations, not as strict JSON; confirm that the consumer of
    outfile.json accepts this before changing it.
    """
    selected_data = []
    # The with-block closes the file even when a row cannot be processed.
    with open("outfile.json", "w") as out_file:
        for index, row in info_data_train[selected_data_flags].iterrows():
            text = " ".join(row["tokens"])
            selected_data.append({ "eid": DATA_COLUMN[0] + str(index), "text": text, "label": [] })
            print(selected_data[-1], file=out_file)
    return selected_data

In [None]:
def make_data(info_data_train, selected_frequent, selected_random):
    """Select rows for annotation, save them and return them.

    The random generator is seeded with 42 first, so the same arguments
    select the same rows. The selection is written by save_annotated_data.
    """
    random.seed(42)
    ids = make_selected_data_ids(info_data_train, selected_frequent, selected_random)
    flags = make_selected_data_flags(info_data_train, ids)
    return save_annotated_data(info_data_train, flags)

In [None]:
# Number of most frequent texts to select for the initial annotation round.
SELECTED_FREQUENT = 100
# Number of additional, randomly drawn texts to select.
SELECTED_RANDOM = 100

# selected_data = make_data(info_data_train, selected_frequent=SELECTED_FREQUENT, selected_random=SELECTED_RANDOM)

## 2. Machine learning

Based on tutorial https://huggingface.co/transformers/v3.2.0/custom_datasets.html#token-classification-with-w-nut-emerging-entities

In [None]:
import joblib
import json
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import regex
from sklearn.model_selection import train_test_split
from spacy import displacy
import torch
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModel
from transformers import BertForTokenClassification
from transformers import pipeline

In [None]:
def render_text(text, entities):
    """Display a text with highlighted entities using displaCy.

    Newlines in the text are replaced by spaces. entities is passed on as
    the "ents" list of displaCy's manual mode.
    """
    document = {
        "text": regex.sub("\\n", " ", text),
        "ents": entities,
    }
    highlight_options = { "colors": { "fuzzy_match": "yellow"} }
    displacy.render(document, style="ent", manual=True, options=highlight_options)

### 2.1 Read annotated data

In [None]:
# Path of the JSON-lines file with the manually annotated texts.
ANNOTATIONS_FILE = "../../data/annotated/2000.jsonl"

In [None]:
def make_offset2label_pos(text):
    """Map the start offset of every token to the token's position.

    Tokens are the whitespace-separated parts of the text. The offsets are
    computed as if exactly one character separates consecutive tokens and
    the text starts with a token.
    """
    offset2label_pos = {}
    next_offset = 0
    for position, token in enumerate(text.split()):
        offset2label_pos[next_offset] = position
        next_offset += len(token) + 1
    return offset2label_pos

In [None]:
def fix_label_start_not_token_initial(text, label_start):
    """Move an annotation start offset to the first character of its token.

    A start on a space is moved forward to the next token; a start inside a
    token is moved back to the token's first character.

    Parameters:
        text: the annotated text, with tokens separated by spaces.
        label_start: character offset where the annotation starts.

    Returns the corrected offset. When no token starts at or after
    label_start (only spaces follow, or the offset is at or past the end of
    the text) an offset of at least len(text) is returned instead of
    raising IndexError, so the caller can report the bad annotation.
    """
    text_length = len(text)
    # Skip spaces, but never run past the end of the text.
    while label_start < text_length and text[label_start] == " ":
        label_start += 1
    if label_start >= text_length:
        return label_start
    # Walk back to the first character of the token.
    while label_start > 0 and text[label_start - 1] != " ":
        label_start -= 1
    return label_start

In [None]:
def make_labels(data):
    """Convert the character-span annotations of one text to token labels.

    data["data"] is the text and data["label"] a list of
    [start offset, end offset, entity type] annotations. The first token of
    an annotation gets "B-<type>", further tokens starting inside the span
    get "I-<type>", all other tokens get "O". The start offsets in
    data["label"] are corrected in place to token starts.

    Raises Exception when a corrected start offset is not a token start.

    NOTE: a second function named make_labels is defined further down in
    this file (section 2.3); once that definition has been executed it
    replaces this one.
    """
    text = data["data"]
    offset2label_pos = make_offset2label_pos(text)
    labels = ["O"] * len(text.split())
    for label in data["label"]:
        label[0] = fix_label_start_not_token_initial(text, label[0])
        if label[0] not in offset2label_pos:
            raise Exception(f"{label[0]} not found in labels {offset2label_pos} of text {text}")
        entity_type = label[2]
        labels[offset2label_pos[label[0]]] = "B-" + entity_type
        # Every later token that starts within the span continues the entity.
        for offset in range(label[0] + 1, label[1] + 1):
            token_position = offset2label_pos.get(offset)
            if token_position is not None:
                labels[token_position] = "I-" + entity_type
    return labels

In [None]:
def read_jsonl_file(file_name):
    """Read annotated texts from a JSON-lines file.

    Every non-blank line must be a JSON object with the keys "data" (the
    text), "label" (the annotations, see make_labels) and "id".

    Returns three parallel lists: the texts as lists of whitespace-separated
    tokens, the token labels, and the ids.
    """
    texts = []
    tags = []
    ids = []
    # The with-block closes the file even when a line cannot be parsed; the
    # explicit encoding keeps the result independent of the platform default.
    with open(file_name, "r", encoding="utf-8") as annotations_file:
        for line in annotations_file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            data = json.loads(line)
            texts.append(data["data"].split())
            tags.append(make_labels(data))
            ids.append(data["id"])
    return texts, tags, ids

In [None]:
def find_duplicates(annotated_texts, annotated_tags):
    """Return the positions of texts that repeat an earlier text.

    Texts are compared by their string representation. Every repeated text
    is printed. The positions are returned in descending order, so the
    items can be removed one by one without shifting later positions.
    annotated_tags is not used.
    """
    seen_keys = set()
    duplicate_positions = []
    for position, text in enumerate(annotated_texts):
        key = str(text)
        if key in seen_keys:
            print(text)
            duplicate_positions.append(position)
        else:
            seen_keys.add(key)
    duplicate_positions.reverse()
    return duplicate_positions

In [None]:
def remove_duplicates(annotated_texts, annotated_tags, annotated_ids):
    """Remove repeated texts, with their tags and ids, from the three lists.

    The lists are changed in place and also returned. A summary of the
    number of removed items is printed.
    """
    items_to_delete = find_duplicates(annotated_texts, annotated_tags)
    for position in items_to_delete:
        for collection in (annotated_texts, annotated_tags, annotated_ids):
            collection.pop(position)
    if not items_to_delete:
        print("no duplicates found")
    else:
        plural_suffix = "s" if len(items_to_delete) > 1 else ""
        print(f"removed {len(items_to_delete)} duplicate" + plural_suffix)
    return annotated_texts, annotated_tags, annotated_ids

In [None]:
# Load the annotated texts with their token labels and ids.
annotated_texts, annotated_tags, annotated_ids = read_jsonl_file(ANNOTATIONS_FILE)
# Cell output: the sizes of the three lists.
len(annotated_texts), len(annotated_tags), len(annotated_ids)

In [None]:
# Drop texts that occur more than once in the annotations.
annotated_texts, annotated_tags, annotated_ids = remove_duplicates(annotated_texts, annotated_tags, annotated_ids)

### 2.2 Check annotated data

In [None]:
def get_tags_for_token(target_token, annotated_texts, annotated_tags):
    """Print how often each tag was assigned to a given token.

    The counts are printed as a dict, most frequent tag first.
    """
    tag_counts = {}
    for tokens, tags in zip(annotated_texts, annotated_tags):
        for token, tag in zip(tokens, tags):
            if token == target_token:
                tag_counts[tag] = tag_counts.get(tag, 0) + 1
    ranked = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)
    print(dict(ranked))

In [None]:
def check_phrase(phrase, phrases_found, text_id, target_text=""):
    """Count a completed phrase in phrases_found; empty phrases are ignored.

    When the phrase equals target_text, the id of the text it occurs in is
    printed. phrases_found is updated in place.
    """
    if phrase == "":
        return
    if phrase == target_text:
        print(text_id)
    phrases_found[phrase] = phrases_found.get(phrase, 0) + 1

In [None]:
def get_phrases_for_entity(entity, annotated_texts, annotated_tags, annotated_ids, target_text=""):
    """Print how often each phrase was annotated with the given entity type.

    Parameters:
        entity: entity type without prefix, e.g. "TOPIC".
        annotated_texts: list of token lists.
        annotated_tags: list of tag lists, parallel to annotated_texts.
        annotated_ids: list of text ids, parallel to annotated_texts.
        target_text: when a phrase equals this text, the id of the text in
            which it occurs is printed.

    The counts are printed as a dict, most frequent phrase first.
    """
    begin_tag = "B-" + entity
    inside_tag = "I-" + entity
    phrases_found = {}
    for tokens, tags, text_id in zip(annotated_texts, annotated_tags, annotated_ids):
        phrase = ""
        for token, tag in zip(tokens, tags):
            if tag == begin_tag:
                check_phrase(phrase, phrases_found, text_id, target_text)
                phrase = token
            elif tag == inside_tag:
                phrase += " " + token
            else:
                check_phrase(phrase, phrases_found, text_id, target_text)
                phrase = ""
        # Count a phrase that runs up to the last token of the text; without
        # this flush such phrases would be dropped.
        check_phrase(phrase, phrases_found, text_id, target_text)
    print({ pair[0]: pair[1] for pair in sorted(phrases_found.items(), key=lambda x: x[1], reverse=True) })

In [None]:
# Check which tags the annotators gave to the token "van".
get_tags_for_token("van", annotated_texts, annotated_tags)

In [None]:
# List the phrases that were annotated as TOPIC, with their frequencies.
get_phrases_for_entity("TOPIC", annotated_texts, annotated_tags, annotated_ids)

### 2.3 Convert data to train set and validation set

A tokenizer needs to be defined for breaking up the texts into known tokens.

In [None]:
def load_model(num_labels, model_name="GroNLP/bert-base-dutch-cased"):
    """Load a BERT token-classification model and its tokenizer.

    model_name is passed to from_pretrained for both the model and the
    tokenizer; num_labels sets the number of output labels of the model.
    Returns the pair (model, tokenizer).
    """
    classifier = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
    return classifier, AutoTokenizer.from_pretrained(model_name)

In [None]:
def add_missing_I_tags(tags):
    """Return the tags as a list, extended with the I- tags that are missing.

    For every "B-<type>" tag whose "I-<type>" counterpart is not among the
    tags, the counterpart is appended after the original tags.
    """
    inside_variants = (regex.sub(r"^B-", "I-", tag) for tag in tags)
    missing_tags = [variant for variant in inside_variants if variant not in tags]
    return list(tags) + missing_tags

In [None]:
def convert_B_to_I_tag(tag):
    """Return the tag with a leading "B" replaced by "I"; other tags are unchanged."""
    inside_tag = regex.sub(r"^B", "I", tag)
    return inside_tag

In [None]:
def split_tags(tags_in, encodings):
    """Spread word-level tags over the word pieces of tokenized texts.

    encodings.offset_mapping holds, per text, one (start, end) pair per
    word piece. Pieces are tagged as follows:
      * end == 0: a special piece; the first such piece of a text is tagged
        "CLS", the second "SEP" and all later ones "PAD";
      * start == 0: the first piece of the next word; it gets that word's tag;
      * otherwise: a continuation piece; it gets the I- variant of the tag
        of the word it belongs to.

    Returns one list of piece tags per text.
    """
    offsets_per_doc = encodings.offset_mapping
    tags_out = [ [] for _ in range(len(offsets_per_doc)) ]
    for doc_offsets, doc_tags, doc_out in zip(offsets_per_doc, tags_in, tags_out):
        special_pieces_seen = 0
        next_word = 0
        for offsets in doc_offsets:
            if offsets[1] == 0:
                if special_pieces_seen < 2:
                    doc_out.append(("CLS", "SEP")[special_pieces_seen])
                else:
                    doc_out.append("PAD")
                special_pieces_seen += 1
            elif offsets[0] == 0:
                doc_out.append(doc_tags[next_word])
                next_word += 1
            else:
                doc_out.append(convert_B_to_I_tag(doc_tags[next_word - 1]))
    return tags_out

In [None]:
def tags_to_numbers(tags, tag2id):
    """Replace every tag in a list of tag lists by its number from tag2id."""
    numbered_docs = []
    for doc in tags:
        numbered_doc = []
        for tag in doc:
            numbered_doc.append(tag2id[tag])
        numbered_docs.append(numbered_doc)
    return numbered_docs

In [None]:
class WNUTDataset(torch.utils.data.Dataset):
    """Torch dataset that pairs tokenizer encodings with label id lists.

    encodings is a mapping from field name to a per-text sequence of values;
    labels holds one list of label ids per text.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of texts (the number of label lists)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return all encoding fields and the labels of text idx as tensors."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
def split_annotated_data(annotated_texts, annotated_tags):
    """Split texts and tags into a train part (80%) and a validation part (20%).

    The split uses a fixed random state (42). Returns the tuple
    (train_texts, val_texts, train_tags, val_tags).
    """
    train_texts, val_texts, train_tags, val_tags = train_test_split(
        annotated_texts, annotated_tags, test_size=.2, random_state=42
    )
    return train_texts, val_texts, train_tags, val_tags

In [None]:
def analyze_annotated_tags(annotated_tags):
    """Derive the tag inventory and the tag/number mappings from the data.

    Returns the tuple (unique_tags, unique_types, tag2id, id2tag):
      * unique_tags: sorted list of all tags, including I- tags that do not
        occur in the data but whose B- tag does;
      * unique_types: the tags without their B-/I- prefix, in no fixed order;
      * tag2id / id2tag: mappings between tags and their position in
        unique_tags.
    """
    observed_tags = { tag for doc in annotated_tags for tag in doc }
    unique_tags = sorted(add_missing_I_tags(observed_tags))
    unique_types = list({ regex.sub(r"^[BI]-", "", tag) for tag in unique_tags })
    tag2id = {}
    for number, tag in enumerate(unique_tags):
        tag2id[tag] = number
    id2tag = { number: tag for tag, number in tag2id.items() }
    return unique_tags, unique_types, tag2id, id2tag

In [None]:
def make_labels(train_tags, val_tags, extra_tags):
    """Convert the train and validation tags to per-word-piece label ids.

    The tags are spread over the word pieces of the global train_encodings
    and val_encodings and then numbered with the global tag2id, extended
    with extra_tags (the ids for the "CLS", "SEP" and "PAD" piece tags).
    Returns the pair (train_labels, val_labels).

    NOTE: this definition reuses the name of the make_labels function of
    section 2.1, which is no longer available once this cell has run.
    """
    def encode(tags, encodings):
        return tags_to_numbers(split_tags(tags, encodings), { **tag2id, **extra_tags})

    train_labels = encode(train_tags, train_encodings)
    val_labels = encode(val_tags, val_encodings)
    return train_labels, val_labels

In [None]:
# Label id given to word pieces that are not real text (see extra_tags).
# NOTE(review): -100 is presumably chosen because the training loss skips
# this label value — confirm against the library documentation.
IGNORE_TAG_ID = -100

# Ids for the piece tags that split_tags adds for special and padding pieces.
extra_tags = { 'CLS': IGNORE_TAG_ID, 'SEP': IGNORE_TAG_ID, 'PAD': IGNORE_TAG_ID }

In [None]:
# Split the annotated data in a train and a validation part.
train_texts, val_texts, train_tags, val_tags = split_annotated_data(annotated_texts, annotated_tags)
# The tag inventory is derived from all annotated data, not only from the train part.
unique_tags, unique_types, tag2id, id2tag = analyze_annotated_tags(annotated_tags)

In [None]:
# Load the model and tokenizer only when no variable `tokenizer` exists yet,
# so that re-running this cell does not load them again.
try:
    tokenizer
except NameError:
    print("initializing model and tokenizer...")
    model, tokenizer = load_model(num_labels=len(unique_tags), model_name="GroNLP/bert-base-dutch-cased")

In [None]:
# Tokenize the pre-split texts into word pieces. The offset mappings are
# requested because split_tags needs them to align the word tags with the pieces.
train_encodings = tokenizer(train_texts, 
                            is_split_into_words=True, 
                            return_offsets_mapping=True, 
                            padding=True, 
                            truncation=True)
val_encodings =   tokenizer(val_texts, 
                            is_split_into_words=True, 
                            return_offsets_mapping=True, 
                            padding=True, 
                            truncation=True)

In [None]:
# Convert the word tags to label ids per word piece (uses the encodings made above).
train_labels, val_labels = make_labels(train_tags, val_tags, extra_tags)

In [None]:
# Drop the offset mappings (only needed for aligning the tags) and wrap the
# encodings and labels in datasets for training and evaluation.
train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

### 2.4 Fine-tune model with data

Using Bertje as base model: https://huggingface.co/GroNLP/bert-base-dutch-cased

In [None]:
def train_model(model, nbr_of_epochs=1):
    """Fine-tune the model on the global train_dataset and return the trainer.

    The global val_dataset is used for evaluation during training. Results
    are written to ./results and logs to ./logs.
    """
    settings = {
        "output_dir": './results',              # where results are written
        "num_train_epochs": nbr_of_epochs,      # number of passes over the train data
        "per_device_train_batch_size": 16,      # batch size during training
        "per_device_eval_batch_size": 64,       # batch size during evaluation
        "warmup_steps": 500,                    # warmup steps of the learning rate scheduler
        "weight_decay": 0.01,                   # strength of the weight decay
        "logging_dir": './logs',                # where logs are written
        "logging_steps": 10,
        "evaluation_strategy": "steps",
    }
    trainer = Trainer(
        model=model,
        args=TrainingArguments(**settings),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    trainer.train()
    return trainer

In [None]:
def save_model(model_name):
    """Save the global model and tokenizer in the directory ./models/<model_name>."""
    print(f"saving model {model_name}...")
    for component in (model, tokenizer):
        component.save_pretrained(f"./models/{model_name}")

In [None]:
# Recorded scores of earlier training runs: run name -> step -> [value, value].
# The layout matches what plot_eval_data expects, which plots the first value
# as training loss and the second as validation loss.
# NOTE(review): the run names look like the names of saved models (e.g.
# "1600m", "2000a") — confirm.
eval_data = { 
"1600k": { 10: [8.605, 9.388613],   20: [8.1468, 8.581432],  30: [7.2292, 6.912892],  40: [5.2916, 4.459104],
           50: [3.0505, 2.291136],  60: [1.9392, 1.396627],  70: [1.3153, 0.974922],  80: [0.9103, 0.611082]},
"1600l": { 10: [2.7978, 2.72777],   20: [2.7011, 2.594252],  30: [2.5368, 2.377436],  40: [2.3084, 2.102149],
           50: [1.9682, 1.813048],  60: [1.8059, 1.541138],  70: [1.5286, 1.295988],  80: [1.3218, 1.071442],
           90: [1.1927, 1.059575], 100: [1.1952, 1.02582],  110: [1.0601, 0.972512], 120: [1.0243, 0.90479],
          130: [0.9487, 0.831679], 140: [0.9014, 0.74661],  150: [0.7793, 0.673899], 160: [0.7085, 0.587361],
          170: [0.6515, 0.582702], 180: [0.6932, 0.571471], 190: [0.5952, 0.55383],  200: [0.5968, 0.525524],
          210: [0.5394, 0.497014], 220: [0.5214, 0.456926], 230: [0.4567, 0.422593], 240: [0.431, 0.380497],
          250: [0.3958, 0.377924], 260: [0.4473, 0.372003], 270: [0.3626, 0.362811], 280: [0.3715, 0.345728],
          290: [0.3268, 0.331447], 300: [0.3099, 0.309965], 310: [0.2709, 0.291733], 320: [0.2764, 0.274869],
          330: [0.2474, 0.272652], 340: [0.2974, 0.268813], 350: [0.2275, 0.263912], 360: [0.2351, 0.251166],
          370: [0.2027, 0.244896], 380: [0.1863, 0.234641], 390: [0.1571, 0.228038], 400: [0.1837, 0.229123],
          410: [0.1653, 0.224482], 420: [0.2015, 0.21968],  430: [0.1479, 0.216111], 440: [0.1501, 0.211968],
          450: [0.1255, 0.206149], 460: [0.1102, 0.202833], 470: [0.0887, 0.202619], 480: [0.1239, 0.213482]},
"1600m": { 10: [3.0414, 3.001803],  20: [2.9454, 2.856211],  30: [2.7619, 2.614135],  40: [2.4844, 2.284899],
           50: [2.1267, 1.891226],  60: [1.8582, 1.584162],  70: [1.5569, 1.312878],  80: [1.3307, 1.080446],
           90: [1.2072, 1.06774],  100: [1.1987, 1.032933], 110: [1.067, 0.978802],  120: [1.0432, 0.911087],
          130: [0.9554, 0.834676], 140: [0.8992, 0.745508], 150: [0.7885, 0.660629], 160: [0.7031, 0.568473],
          170: [0.6488, 0.563569], 180: [0.6766, 0.55112],  190: [0.5832, 0.530326], 200: [0.5905, 0.500823],
          210: [0.526, 0.470368],  220: [0.4952, 0.431339], 230: [0.4296, 0.391956], 240: [0.4074, 0.349349],
          250: [0.3719, 0.347294], 260: [0.4158, 0.342064], 270: [0.3355, 0.333896], 280: [0.3472, 0.319556], 
          290: [0.3024, 0.306712], 300: [0.2787, 0.288847], 310: [0.2306, 0.271399], 320: [0.2517, 0.257995],
          330: [0.2279, 0.256285], 340: [0.2737, 0.252748], 350: [0.2027, 0.248475], 360: [0.2106, 0.239845],
          370: [0.1792, 0.23603],  380: [0.1602, 0.22973],  390: [0.1281, 0.222972], 400: [0.1683, 0.227971],
          410: [0.1554, 0.222322], 420: [0.1879, 0.216514], 430: [0.1279, 0.212594], 440: [0.1291, 0.209587],
          450: [0.1072, 0.206074], 460: [0.0913, 0.206748], 470: [0.0697, 0.205872], 480: [0.1197, 0.22059],
          490: [0.1455, 0.210196], 500: [0.1858, 0.202405], 510: [0.1059, 0.192074], 520: [0.135, 0.189939],
          530: [0.1383, 0.187261], 540: [0.1395, 0.179227], 550: [0.1186, 0.17271],  560: [0.1097, 0.162364],
          570: [0.0839, 0.169346], 580: [0.1037, 0.17176],  590: [0.0733, 0.167507], 600: [0.0823, 0.160418],
          610: [0.1072, 0.156252], 620: [0.0786, 0.155454], 630: [0.0787, 0.153239], 640: [0.0995, 0.157059]
         },
"2000a": { 10: [3.1072, 3.052986],  20: [3.0062, 2.904115],  30: [2.8227, 2.666382],  40: [2.5515, 2.343706],
           50: [2.2182, 1.966769],  60: [1.8726, 1.637894],  70: [1.5951, 1.37603],   80: [1.3811, 1.116854],
           90: [1.1845, 0.876394], 100: [0.9552, 0.707807], 110: [0.9001, 0.701295], 120: [0.8582, 0.680713],
          130: [0.766, 0.64848],   140: [0.7666, 0.610998], 150: [0.739, 0.567632],  160: [0.5975, 0.515508],
          170: [0.6255, 0.46969],  180: [0.527, 0.415647],  190: [0.4658, 0.365828], 200: [0.4232, 0.325514],
          210: [0.4591, 0.323955], 220: [0.4186, 0.315625], 230: [0.3616, 0.301866], 240: [0.3674, 0.291112],
          250: [0.3563, 0.276382], 260: [0.2481, 0.255775], 270: [0.2955, 0.243],    280: [0.2497, 0.228334],
          290: [0.1997, 0.207621], 300: [0.2092, 0.199684], 310: [0.2703, 0.199463], 320: [0.2252, 0.194289], 
          330: [0.1971, 0.185111], 340: [0.1894, 0.180386], 350: [0.187, 0.175801],  360: [0.1026, 0.167931],
          370: [0.1451, 0.168511], 380: [0.1144, 0.17272],  390: [0.0827, 0.158728], 400: [0.1173, 0.171397]
         }
}

In [None]:
# Paste whitespace-separated score triples (step and two scores) between the quotes.
string = """
"""

def convert_eval_scores_to_dict(string):
    """Parse whitespace-separated "step score score" triples into a dict.

    Returns a dict that maps each step (int) to the list of its two scores
    (floats). Tokens left over after the last complete triple are reported
    with a printed warning.
    """
    tokens = string.split()
    nbr_of_complete_tokens = len(tokens) - len(tokens) % 3
    eval_dict = {}
    for start in range(0, nbr_of_complete_tokens, 3):
        step, first_score, second_score = tokens[start:start + 3]
        eval_dict[int(step)] = [ float(first_score), float(second_score) ]
    token_list = tokens[nbr_of_complete_tokens:]
    if token_list:
        print(f"there were unprocessed tokens! ({token_list})")
    return eval_dict

convert_eval_scores_to_dict(string)

In [None]:
# Fine-tune the model for one epoch; the trainer keeps the loss history used for plotting.
trainer = train_model(model, nbr_of_epochs=1)

In [None]:
# Evaluate the fine-tuned model on the validation texts.
# NOTE: evaluate_texts is defined further down (section 2.5); that cell must
# have been run before this one.
evaluate_texts([ " ".join(text) for text in val_texts ], val_labels, model, tokenizer)

In [None]:
# Store the fine-tuned model and tokenizer under ./models/2000a.
save_model("2000a")

### 2.5 Evaluate fine-tuned model

In [None]:
def make_eval_data(trainer):
    """Collect the logged losses per training step from a trainer.

    Reads trainer.state.log_history and returns a dict that maps each step
    to [loss, eval_loss]; a value that was not logged for a step is 0.
    """
    eval_data = {}
    for log_entry in trainer.state.log_history:
        scores = eval_data.setdefault(log_entry["step"], [0 , 0])
        for position, key in enumerate(("loss", "eval_loss")):
            if key in log_entry:
                scores[position] = log_entry[key]
    return eval_data

In [None]:
def plot_eval_data(eval_data):
    """Plot training loss and validation loss against the training step.

    eval_data maps a step to a pair of values: the first is plotted as
    training loss, the second as validation loss.
    """
    plt.figure(figsize=(5, 3))
    steps = list(eval_data)
    for position, curve_name in enumerate(("training loss", "validation loss")):
        plt.plot(steps, [eval_data[step][position] for step in steps], label=curve_name)
    plt.legend()

In [None]:
# Plot the losses of the latest training run; the commented line plots a stored run instead.
plot_eval_data(make_eval_data(trainer))
#plot_eval_data(eval_data["1600m"])

In [None]:
def results_to_entities(tag_id_list, token_id_list):
    """Convert per-word-piece tags to a list of entity spans.

    Parameters:
        tag_id_list: one tag per word piece ("O", "B-<type>" or "I-<type>").
        token_id_list: the word pieces; a piece starting with "##" continues
            the previous word.

    Returns a list of [start, end, type] entries, where start and end are
    word positions (end exclusive). An entity ends at the first piece that
    starts a new word and is tagged "O", "B-..." or with a different type.
    An entity that is still open after the last piece is included as well,
    also when it starts on the last piece.
    """
    entities = []
    token_counter = 0
    current_tag = ("", -1)  # (type, start position) of the open entity
    for tag, token in zip(tag_id_list, token_id_list):
        tag_start = tag[0]
        tag_class = regex.sub(r"^[BI]-", "", tag)
        current_tag_class, current_tag_start = current_tag
        is_continuation = bool(regex.search(r"^##", token))
        if is_continuation:
            # A continuation piece belongs to the word that was counted last.
            token_counter -= 1
        if current_tag_class != "" and not is_continuation:
            if tag_class == "O" or tag_start == "B" or tag_class != current_tag_class:
                entities.append([current_tag_start, token_counter, current_tag_class])
                current_tag = ("", -1)
                current_tag_class = ""
        if tag_class != "O" and current_tag_class == "":
            current_tag = (tag_class, token_counter)
            # NOTE(review): entities[-1][2] is the entity type (a string), so
            # the comparison with token_counter is always unequal; the entity
            # end (index 1) may have been intended. Behaviour kept as it was.
            if is_continuation and (len(entities) == 0 or entities[-1][2] != token_counter):
                current_tag = (tag_class, token_counter - 1)
        token_counter += 1
    # Flush the entity that is still open. This reads current_tag itself: the
    # per-piece copies are out of date when the last piece opened the entity,
    # and they do not exist at all for empty input.
    if current_tag[0] != "":
        entities.append([current_tag[1], token_counter, current_tag[0]])
    return entities

In [None]:
def compute_precision_and_recall(correct_count, missed_count, wrong_count):
    """Print precision, recall and entity count for the total and per type.

    Parameters:
        correct_count: entity type -> number of correctly found entities.
        missed_count: entity type -> number of entities that were not found.
        wrong_count: entity type -> number of wrongly proposed entities.

    All three dicts must contain the key "total". The "total" line is
    printed first (and only once), followed by the types in sorted order.
    Percentages are truncated to whole numbers; a score without any
    entities to base it on is reported as 0.
    """
    entity_types = sorted(tag for tag in correct_count if tag != "total")
    for tag in [ "total" ] + entity_types:
        correct = correct_count[tag]
        wrong = wrong_count[tag]
        missed = missed_count[tag]
        precision = 0
        recall = 0
        if correct > 0 or wrong > 0:
            precision = correct/(correct + wrong)
        if correct > 0 or missed > 0:
            recall = correct/(correct + missed)
        print(f"precision: {int(100*precision):-3d}; recall: {int(100*recall):-3d}; count: {correct + missed:4d}; tag: {tag}")

In [None]:
def get_labels_from_ids(label_ids):
    """Translate label ids to tags with the global id2tag, skipping IGNORE_TAG_ID."""
    labels = []
    for label_id in label_ids:
        if label_id == IGNORE_TAG_ID:
            continue
        labels.append(id2tag[label_id])
    return labels

In [None]:
def get_labels_from_results(sentence_result):
    """Return the tags predicted for the word pieces of one text.

    Each item of sentence_result has an "entity" value of the form
    "LABEL_<number>"; the number is translated to a tag with
    get_labels_from_ids.
    """
    label_ids = []
    for token_result in sentence_result:
        label_ids.append(int(regex.sub("^LABEL_", "", token_result["entity"])))
    return get_labels_from_ids(label_ids)

In [None]:
def get_split_tokens_from_results(sentence_result):
    """Return the "word" value (the word piece) of every item of a result list."""
    split_tokens = []
    for token_result in sentence_result:
        split_tokens.append(token_result["word"])
    return split_tokens

In [None]:
def combine_split_tokens(split_tokens):
    """Join word pieces into words.

    A piece that starts with "##" is glued, without the "##", to the word
    before it; every other piece starts a new word.
    """
    words = []
    for piece in split_tokens:
        if regex.search(r"^##", piece):
            words[-1] += regex.sub(r"^##", "", piece)
        else:
            words.append(piece)
    return words

In [None]:
def retokenize(text):
    """Tokenize a text with NLTK and the global tokenizer and join it again.

    The text is split into words by NLTK, the words are split into word
    pieces by the tokenizer, and the pieces are joined with spaces after
    which every " ##" is removed.
    """
    words = " ".join(nltk.word_tokenize(text))
    pieces = " ".join(tokenizer.tokenize(words))
    return regex.sub(" ##", "", pieces)

In [None]:
def test_tokenization(texts):
    """Report how many texts are changed by tokenizing and re-joining them.

    Each text is passed through retokenize; the number of texts for which
    the result differs from the input is printed when it is larger than 0.
    """
    nbr_of_mismatches = 0
    for input_text in texts:
        if retokenize(input_text) != input_text:
            # Count the mismatch (adding 0 here would never report anything).
            nbr_of_mismatches += 1
    if nbr_of_mismatches > 0:
        print(f"tokenization mismatches: {nbr_of_mismatches}")

In [None]:
def render_results(text_entities, text_tokens, error_count):
    """Display a text with its entities highlighted.

    Parameters:
        text_entities: list of [start, end, type] entries with word
            positions (end exclusive), ordered by position.
        text_tokens: the words of the text.
        error_count: number shown in parentheses in front of the text.

    The displayed text is built from the words joined by spaces, including
    the words that follow the last entity.
    """
    text = f"({error_count})"
    tags = []
    token_counter = 0
    for entity_token_start, entity_token_end, entity_label in text_entities:
        for i in range(token_counter, entity_token_start):
            text += " " + text_tokens[i]
        # The entity starts after the space that separates it from the text so far.
        entity_char_start = len(text) + 1
        for i in range(entity_token_start, entity_token_end):
            text += " " + text_tokens[i]
        entity_char_end = len(text)
        tags.append( { "start": entity_char_start, "end": entity_char_end, "label": entity_label } )
        token_counter = entity_token_end
    # Add the words after the last entity, so the complete text is shown.
    for i in range(token_counter, len(text_tokens)):
        text += " " + text_tokens[i]
    render_text(text, tags)

In [None]:
def evaluate_results_per_entity(results, correct_label_ids, check_labels=False):
    """Compare predicted entities with the annotated entities, text by text.

    Parameters:
        results: per text, the list of per-word-piece predictions.
        correct_label_ids: per text, the annotated label ids.
        check_labels: when True, texts with errors are displayed twice:
            first with the predicted entities, then with the annotated ones.

    An entity counts as correct only when its start, end and type all match.
    Returns the tuple (correct_count, missed_count, wrong_count,
    errors_per_text): three dicts with a count per entity type (the types in
    the global unique_types except "O") and a list with the number of
    errors (missed plus wrong entities) per text.
    """
    entity_types = [ tag for tag in unique_types if tag != "O" ]
    correct_count = dict.fromkeys(entity_types, 0)
    missed_count = dict.fromkeys(entity_types, 0)
    wrong_count = dict.fromkeys(entity_types, 0)
    errors_per_text = []
    for sentence_result, correct_sentence_label_ids in zip(results, correct_label_ids):
        guessed_labels = get_labels_from_results(sentence_result)
        split_tokens = get_split_tokens_from_results(sentence_result)
        correct_labels = get_labels_from_ids(correct_sentence_label_ids)
        guessed_entities = results_to_entities(guessed_labels, split_tokens)
        correct_entities = results_to_entities(correct_labels, split_tokens)
        error_count = 0
        for entity in correct_entities:
            if entity not in guessed_entities:
                missed_count[entity[2]] += 1
                error_count += 1
                continue
            correct_count[entity[2]] += 1
        for entity in guessed_entities:
            if entity in correct_entities:
                continue
            wrong_count[entity[2]] += 1
            error_count += 1
        errors_per_text.append(error_count)
        if check_labels and error_count > 0:
            for shown_entities, shown_count in ((guessed_entities, error_count), (correct_entities, 0)):
                render_results(shown_entities, combine_split_tokens(split_tokens), shown_count)
            print("")
    return correct_count, missed_count, wrong_count, errors_per_text

In [None]:
def evaluate_texts(texts, labels, model, tokenizer, check_labels=False):
    """Run the model on the texts and print precision and recall per entity type.

    Parameters:
        texts: the texts, as strings.
        labels: per text, the annotated label ids.
        model, tokenizer: used to build the "ner" pipeline.
        check_labels: passed on to evaluate_results_per_entity.

    Nothing is returned; the predictions and the error counts per text are
    only available inside this function.
    """
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
    results = []
    for text in texts:
        results.append(ner_pipeline(text))
    test_tokenization(texts)
    correct_count, missed_count, wrong_count, _ = evaluate_results_per_entity(results, labels, check_labels)
    # Add the sum over all entity types to each of the three counters.
    for counter in (correct_count, wrong_count, missed_count):
        counter["total"] = sum(counter.values())
    compute_precision_and_recall(correct_count, missed_count, wrong_count)

In [None]:
# Load a previously saved model and tokenizer from the directory models/1600m.
model, tokenizer = load_model(num_labels=len(unique_tags), model_name="models/1600m")

Now regenerate train and val data?

In [None]:
# Evaluate the loaded model on the validation texts.
evaluate_texts([ " ".join(text) for text in val_texts ], val_labels, model, tokenizer)

In [None]:
# For each token, pick the tag whose score is the highest.
# NOTE(review): `results` is not assigned at notebook level in the visible
# code (it is a local variable of evaluate_texts), and results[0] is read
# here as a list of score lists per text — confirm where it comes from.
guessed_tags = [ [ id2tag[list(guesses_per_token).index(max(guesses_per_token))]
                   for guesses_per_token in guesses ] 
                   for guesses in results[0] ]

In [None]:
# Print and display the predicted entities of the first max_render texts.
# NOTE(review): `results` and `errors_per_text` are not assigned at notebook
# level in the visible code (they are local variables of evaluate_texts) —
# confirm where they come from.
max_render = 1

text_counter = 0
for sentence_result, error_count in zip(results, errors_per_text):
    guessed_labels = get_labels_from_results(sentence_result)
    split_tokens = get_split_tokens_from_results(sentence_result)
    guessed_entities = results_to_entities(guessed_labels, split_tokens)
    print(guessed_entities, combine_split_tokens(split_tokens), error_count)
    render_results(guessed_entities, combine_split_tokens(split_tokens), error_count)
    text_counter += 1
    if text_counter >= max_render:
        break

### 2.6 Select extra data for training

Training data selection process:

1. 100 most frequent data from each half and 100 randomly selected (total 400)
2. 50 with most of ENSLAVED|FREED|OWNER tags and 50 random with one of these tags (total 200)
3. 50 randomly selected data of each half with one of the tags ENSLAVED|FREED (total 100)
4. 150 randomly selected data of each half (total 300)

Total: 1000 (3 duplicates)

In [None]:
def token_id_entities_to_char_id_entities(token_id_entities, split_tokens):
    """Convert entity spans in word positions to spans in character offsets.

    The offsets refer to the text formed by joining the combined words with
    single spaces. Each [start word, end word, type] entry (end exclusive)
    becomes [start offset, end offset, type], where the end offset is the
    position just after the last character of the entity.
    """
    tokens = combine_split_tokens(split_tokens)
    char_id_entities = []
    for token_id_entity in token_id_entities:
        first_word, end_word, entity_type = token_id_entity[0], token_id_entity[1], token_id_entity[2]
        # Every word takes its own length plus one separating space.
        char_start = sum(len(tokens[i]) + 1 for i in range(0, first_word))
        char_end = char_start + sum(len(tokens[i]) + 1 for i in range(first_word, end_word))
        char_id_entities.append([char_start, char_end - 1, entity_type])
    return char_id_entities

In [None]:
def recognized_entities_to_annotation_labels(entities):
    """Convert the pipeline output for one text to character-span annotations.

    entities is the list of per-word-piece predictions of the "ner"
    pipeline. Returns a list of [start offset, end offset, type] entries.
    """
    split_tokens = get_split_tokens_from_results(entities)
    token_id_entities = results_to_entities(get_labels_from_results(entities), split_tokens)
    return token_id_entities_to_char_id_entities(token_id_entities, split_tokens)

# recognized_entities_to_annotation_labels(entities)

In [None]:
def show_sample_of_selected_extra_data(selected_extra_data, sample_size=10):
    """Display the first sample_size items with their entities highlighted.

    Each item holds, under "data", a "text" and a "label" list of
    [start offset, end offset, type] entries. When there are fewer than
    sample_size items, all of them are shown.
    """
    # Slicing instead of indexing avoids an IndexError for short lists.
    for item in selected_extra_data[:max(sample_size, 0)]:
        text = item["data"]["text"]
        labels = [{"start": data[0], "end": data[1], "label": data[2]} for data in item["data"]["label"] ]
        render_text(text, labels)

In [None]:
# Load the saved model and tokenizer from models/1600m for pre-annotating extra data.
# model = joblib.load('./model/model-1000h.joblib')
model = BertForTokenClassification.from_pretrained("models/1600m", num_labels=len(unique_tags))
tokenizer = AutoTokenizer.from_pretrained("models/1600m")

In [None]:
# Notebook-level pipeline that tags the word pieces of a text with the loaded model.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

In [None]:
# Choose the info column from which extra data is selected.
DATA_COLUMN = "EndEntryInfo"

# Rebuild the table for that column and draw 700 random texts from it.
# Note: make_data also writes the selection to outfile.json.
info_data_train = make_info_data_train(data_column=DATA_COLUMN)
extra_data = make_data(info_data_train, selected_frequent=0, selected_random=700)

In [None]:
# Pre-annotate the extra texts that have not been annotated yet with the
# entities recognized by the model.
selected_extra_data = []
for data in extra_data:
    # Skip texts whose tokens already occur in the annotated data.
    if nltk.word_tokenize(data["text"]) not in annotated_texts:
        # NOTE(review): tag_counter stays 0 for every item, so sorting on it
        # later does not change the order — confirm whether counting the
        # recognized tags was intended.
        tag_counter = 0
        entities = ner_pipeline(data["text"])
        data["label"] = recognized_entities_to_annotation_labels(entities)
        data["text"] = retokenize(data["text"])
        selected_extra_data.append({ "tag_counter": tag_counter, "data": data })
# Cell output: the number of selected texts.
len(selected_extra_data)

In [None]:
# Display the first ten pre-annotated texts as a visual check.
show_sample_of_selected_extra_data(selected_extra_data, sample_size=10)

In [None]:
# Write up to 400 pre-annotated texts, highest tag_counter first, to
# outfile.json: one JSON object per line. The with-block closes the file
# even when writing an item fails.
with open("outfile.json", "w") as out_file:
    for data in sorted(selected_extra_data, key=lambda data: data["tag_counter"], reverse=True)[:400]:
        print(json.dumps(data["data"]), file=out_file)

### 2.7 Process other data with fine-tuned model

In [None]:
def add_labels(selected_entities):
    """Add a readable "label" to every predicted word piece.

    selected_entities holds, per text, the list of pipeline predictions.
    The number in each "entity" value ("LABEL_<number>") is translated to a
    tag with the global id2tag and stored under "label". The predictions
    are changed in place and also returned.
    """
    for entity_list in selected_entities:
        for entity in entity_list:
            label_id = int(regex.sub(r"^LABEL_", "", entity["entity"]))
            entity["label"] = id2tag[label_id]
    return selected_entities

In [None]:
# Cell output: the word piece ids of the first selected text.
# NOTE(review): selected_data is assigned in the next cell (the earlier
# assignment in section 1.2 is commented out), so that cell has to run first.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(selected_data[0]["text"]))

In [None]:
# Draw ten random texts (this also rewrites outfile.json), tag them with the
# pipeline and add readable labels to the predictions.
selected_data = make_data(info_data_train, selected_frequent=0, selected_random=10)
selected_entities = [ ner_pipeline(data["text"]) for data in selected_data ]
selected_entities = add_labels(selected_entities)

In [None]:
# NOTE(review): this call does not match how render_results is used elsewhere
# in this file: there it receives the [start, end, type] entities and the
# words of ONE text plus a single error count, while here it receives the
# raw predictions and word piece ids of all texts and a list of zeros.
# `entities` is the variable left over from the loop in section 2.6.
# Confirm the intended call before relying on this cell.
render_results(selected_entities, 
               [ tokenizer.convert_tokens_to_ids(tokenizer.tokenize(data["text"])) for data in selected_data],
               len(entities) * [0])

In [None]:
# Cell output: the labelled predictions for the first selected text.
selected_entities[0]