# Info fields via machine learning

Extract persons from the info fields StartEntryInfo and EndEntryInfo of the [slave registers of Suriname](https://datasets.iisg.amsterdam/dataset.xhtml?persistentId=hdl:10622/CSPBHO) via machine learning

See: https://www.freecodecamp.org/news/getting-started-with-ner-models-using-huggingface/

## 1. Annotating info fields

In [None]:
import nltk
import pandas as pd
import random
import regex

### 1.1 Read relevant data

In [None]:
# Path of the registers CSV and the info column that make_train() and
# save_annotated_data() read from it.
DATA_FILE = "../../data/suriname/Dataset Suriname Slave and Emancipation Registers Version 1.1.csv"
DATA_COLUMN = "EndEntryInfo"

# Load the complete registers file into one DataFrame.
data = pd.read_csv(DATA_FILE, low_memory=False)

In [None]:
def add_column_tokens(train):
    """Add a "tokens" column with the NLTK word tokenization of each "text" value.

    The frame is modified in place and returned.
    """
    tokenized_texts = []
    for text in train["text"]:
        tokenized_texts.append(nltk.word_tokenize(text))
    train["tokens"] = tokenized_texts
    return train

In [None]:
def add_column_labels(train):
    """Add a "labels" column holding one "O" label per token of every row.

    The frame is modified in place and returned.
    """
    outside_labels = []
    for row_tokens in train["tokens"]:
        outside_labels.append(["O"] * len(row_tokens))
    train["labels"] = outside_labels
    return train

In [None]:
def add_column_numeric_labels(train, numeric_labels):
    """Add a "numeric_labels" column by looking up every label in numeric_labels.

    numeric_labels maps label strings to numbers; an unknown label raises
    KeyError. The frame is modified in place and returned.
    """
    converted_rows = []
    for row_labels in train["labels"]:
        converted_rows.append([numeric_labels[label] for label in row_labels])
    train["numeric_labels"] = converted_rows
    return train

In [None]:
def is_date(day, month, year):
    """Tell whether three consecutive tokens look like a "day month year" date.

    Only day and year are inspected: day must consist of one or two digits and
    year must start with four digits followed by a word boundary. The month
    token is accepted as-is (it is not validated).

    Returns True or False (never None).
    """
    day_matches = regex.search(r"^\d\d?$", day) is not None
    year_matches = regex.search(r"^\d\d\d\d\b", year) is not None
    return day_matches and year_matches

In [None]:
def add_date_tags_to_labels(labels, index):
    """Mark the three labels ending at position index as one DATE entity.

    labels is changed in place (B-DATE, I-DATE, I-DATE) and returned.
    """
    labels[index - 2] = "B-DATE"
    labels[index - 1] = "I-DATE"
    labels[index] = "I-DATE"
    return labels

In [None]:
def label_dates(train):
    """Tag every "day month year" token triple in each row as a DATE entity.

    The "labels" lists of the rows are updated in place; the frame is returned.
    """
    for _, row in train.iterrows():
        tokens = row["tokens"]
        for last in range(2, len(tokens)):
            day, month, year = tokens[last - 2], tokens[last - 1], tokens[last]
            if is_date(day, month, year):
                add_date_tags_to_labels(row["labels"], last)
    return train

In [None]:
def show_annotations(train):
    """Print every row as "token" or "token/LABEL" items, one row per line.

    Each item is followed by a single space; the "O" label is not shown.
    """
    for index in range(len(train)):
        row_labels = train["labels"][index]
        row_tokens = train["tokens"][index]
        pieces = []
        for i, label in enumerate(row_labels):
            piece = f"{row_tokens[i]}"
            if label != "O":
                piece += "/" + label
            pieces.append(piece + " ")
        print("".join(pieces))

In [None]:
def make_train(data, nbr_of_lines=100, column=None):
    """Build a frequency table of the distinct values of one info column.

    Parameters:
        data: DataFrame holding the register data.
        nbr_of_lines: number of most frequent values to keep; 0 or a negative
            number keeps all of them.
        column: name of the column to count; defaults to DATA_COLUMN.

    Returns a DataFrame with the columns "frequency" and "text", sorted by
    descending frequency, with an integer index named "index" (0..n-1).
    """
    if column is None:
        column = DATA_COLUMN
    counts = data[column].value_counts()
    if nbr_of_lines > 0:
        counts = counts.iloc[:nbr_of_lines]
    # Build the columns explicitly: the name pandas gives to the counts differs
    # between versions (the column name before 2.0, "count" since), so renaming
    # the value_counts() output does not reliably produce "frequency".
    train = pd.DataFrame({"frequency": counts.to_numpy(), "text": counts.index.to_numpy()})
    train.index.name = "index"
    return train

In [None]:
# Frequency table of all distinct info values (nbr_of_lines=0 keeps every
# value), extended with a "tokens" column and an all-"O" "labels" column.
info_data_train = add_column_labels(
    add_column_tokens(
        make_train(data, nbr_of_lines=0)
    )
)

### 1.2 Make data for annotation

In [None]:
# Default sizes of the annotation sample: how many of the most frequent texts
# and how many randomly drawn other texts make_data() selects.
SELECTED_FREQUENT = 100
SELECTED_RANDOM = 100

In [None]:
def make_selected_data_ids(info_data_train, selected_frequent=SELECTED_FREQUENT, selected_random=SELECTED_RANDOM):
    """Select row positions: the first selected_frequent rows plus random others.

    Parameters:
        info_data_train: frequency-sorted table; only its length is used.
        selected_frequent: number of leading (most frequent) rows to take.
        selected_random: number of distinct rows to draw at random from the
            remaining rows.

    Returns a list with the frequent positions first, followed by the random
    positions in the order in which they were drawn.

    Raises ValueError when fewer than selected_random rows remain after the
    frequent ones; without this check the drawing loop would never end.
    """
    total_rows = len(info_data_train)
    if selected_random > max(total_rows - selected_frequent, 0):
        raise ValueError(
            f"cannot draw {selected_random} random rows: only "
            f"{max(total_rows - selected_frequent, 0)} rows remain after the "
            f"{selected_frequent} most frequent ones"
        )
    selected_data_ids = list(range(0, selected_frequent))
    seen_ids = set(selected_data_ids)  # constant-time duplicate check
    wanted = selected_frequent + selected_random
    # Same sequence of random.randint() draws as before, so a seeded run still
    # yields the same selection.
    while len(selected_data_ids) < wanted:
        selected_data_id = random.randint(selected_frequent, total_rows - 1)
        if selected_data_id not in seen_ids:
            seen_ids.add(selected_data_id)
            selected_data_ids.append(selected_data_id)
    return selected_data_ids

In [None]:
def make_selected_data_flags(info_data_train, selected_data_ids):
    """Turn a list of row positions into a boolean mask over info_data_train.

    Returns a list with one flag per row: True for the selected positions.
    """
    flags = [False] * len(info_data_train)
    for position in selected_data_ids:
        flags[position] = True
    return flags

In [None]:
def save_annotated_data(info_data_train, selected_data_flags):
    """Write the selected rows to "outfile.json" for annotation and return them.

    Each selected row becomes a record with the keys "eid" (first letter of
    DATA_COLUMN plus the row index), "text" (the tokens joined by spaces) and
    an empty "label" list. The file gets one JSON object per line.

    Parameters:
        info_data_train: table with a "tokens" column.
        selected_data_flags: boolean mask selecting the rows to export.

    Returns the list of records.
    """
    import json  # local import: the notebook imports json only in a later cell

    selected_data = []
    for index, row in info_data_train[selected_data_flags].iterrows():
        text = " ".join(row["tokens"])
        selected_data.append({ "eid": DATA_COLUMN[0] + str(index), "text": text, "label": [] })
    # The with-block closes the file even when writing fails. json.dumps()
    # writes real JSON; printing the dict wrote a Python repr with single quotes.
    with open("outfile.json", "w") as out_file:
        for record in selected_data:
            print(json.dumps(record), file=out_file)
    return selected_data

In [None]:
def make_data(info_data_train, selected_frequent=SELECTED_FREQUENT, selected_random=SELECTED_RANDOM):
    """Select frequent and random rows, write them to disk and return them.

    The random generator is re-seeded with 42 on every call, so equal
    arguments always give the same selection.
    """
    random.seed(42)
    ids = make_selected_data_ids(info_data_train, selected_frequent, selected_random)
    flags = make_selected_data_flags(info_data_train, ids)
    return save_annotated_data(info_data_train, flags)

In [None]:
# Create the first annotation sample; make_data() also writes it to outfile.json.
selected_data = make_data(info_data_train, selected_frequent=SELECTED_FREQUENT, selected_random=SELECTED_RANDOM)

## 2. Machine learning

Based on tutorial https://huggingface.co/transformers/v3.2.0/custom_datasets.html#token-classification-with-w-nut-emerging-entities

In [None]:
import joblib
import json
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import regex
from sklearn.model_selection import train_test_split
from spacy import displacy
import torch
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModel
from transformers import BertForTokenClassification
from transformers import pipeline

In [None]:
def render_text(text, entities):
    """Show text with highlighted entity spans using displaCy's manual mode.

    Parameters:
        text: the text to show; newlines are replaced by spaces first.
        entities: list of dicts with "start", "end" and "label" keys.
    """
    single_line_text = regex.sub("\\n", " ", text)
    document = { "text": single_line_text, "ents": entities }
    display_options = { "colors": { "fuzzy_match": "yellow"} }
    displacy.render(document, options=display_options, style="ent", manual=True)

### 2.1 Read annotated data

In [None]:
# JSON-lines file with the manually annotated texts, read by read_jsonl_file().
ANNOTATIONS_FILE = "../../data/annotated/700.jsonl"

In [None]:
def make_offset2label_pos(text):
    """Map the character offset of every token start to the token's position.

    Tokens are the whitespace-separated parts of text (text.split()). The
    offsets are looked up in the text itself, so they stay correct when the
    text starts with whitespace or has several whitespace characters between
    tokens.

    Returns a dict {character offset: token index}.
    """
    offset2label_pos = {}
    cursor = 0
    for token_counter, token in enumerate(text.split()):
        # Between cursor and the token there is only whitespace, so the first
        # match from cursor on is the start of this token.
        offset = text.index(token, cursor)
        offset2label_pos[offset] = token_counter
        cursor = offset + len(token)
    return offset2label_pos

In [None]:
def fix_label_start_not_token_initial(text, label_start):
    """Move a label start offset to the first character of a token.

    A start on a space moves right to the next non-space character; a start
    inside a token moves left to the beginning of that token.

    Parameters:
        text: the annotated text.
        label_start: character offset where the annotation starts.

    Returns the corrected offset. When only spaces follow the start, the result
    is len(text), which is not a token start; offsets beyond the end of the
    text are treated as len(text) instead of raising IndexError.
    """
    text_length = len(text)
    label_start = min(label_start, text_length)
    while label_start < text_length and text[label_start] == " ":
        label_start += 1
    while label_start > 0 and text[label_start - 1] != " ":
        label_start -= 1
    return label_start

In [None]:
def make_labels(data):
    """Convert the character-span annotations of one record to per-token BIO tags.

    Parameters:
        data: dict with the text under "data" and a list of
            [start, end, type] annotations under "label". The start offsets in
            this list are overwritten with their token-initial positions.

    Returns one tag per whitespace-separated token: "B-<type>" on the first
    token of an annotation, "I-<type>" on later tokens that start inside the
    annotated range, and "O" elsewhere.

    Raises Exception when a corrected start offset is not the start of a token.
    """
    text = data["data"]
    labels = ["O"] * len(text.split())
    offset2label_pos = make_offset2label_pos(text)
    for label in data["label"]:
        label[0] = fix_label_start_not_token_initial(text, label[0])
        if label[0] not in offset2label_pos:
            raise Exception(f"{label[0]} not found in labels {offset2label_pos} of text {text}")
        labels[offset2label_pos[label[0]]] = "B-" + label[2]
        # Every token that starts after the first one, up to and including the
        # end offset, continues the entity.
        for offset in range(label[0] + 1, label[1] + 1):
            if offset in offset2label_pos:
                labels[offset2label_pos[offset]] = "I-" + label[2]
    return labels

In [None]:
def read_jsonl_file(file_name):
    """Read annotated texts from a JSON-lines file.

    Every non-empty line must be a JSON object with the text under "data" and
    the annotations under "label" (see make_labels()).

    Returns (texts, tags): per record the whitespace-split tokens and the
    BIO tags of those tokens.
    """
    texts = []
    tags = []
    # JSON text is UTF-8; stating it avoids depending on the platform default.
    # The with-block closes the file even when a line cannot be parsed.
    with open(file_name, "r", encoding="utf-8") as annotations_file:
        for line in annotations_file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            data = json.loads(line)
            texts.append(data["data"].split())
            tags.append(make_labels(data))
    return texts, tags

In [None]:
def find_duplicates(annotated_texts, annotated_tags):
    """Find the positions of texts that repeat an earlier text.

    Two texts are equal when their str() forms are equal. Every duplicate is
    printed. annotated_tags is not used.

    Returns the duplicate positions in descending order, so that they can be
    removed one after another without shifting the remaining positions.
    """
    seen_keys = set()
    duplicate_positions = []
    for position, text in enumerate(annotated_texts):
        key = str(text)
        if key in seen_keys:
            print(text)
            duplicate_positions.append(position)
        else:
            seen_keys.add(key)
    duplicate_positions.reverse()
    return duplicate_positions

In [None]:
def remove_duplicates(annotated_texts, annotated_tags):
    """Remove repeated texts, and their tags, from both lists in place.

    The first occurrence of every text is kept. Returns the two lists.
    """
    # The positions arrive highest first, so deleting does not shift the rest.
    for position in find_duplicates(annotated_texts, annotated_tags):
        del annotated_texts[position]
        del annotated_tags[position]
    return annotated_texts, annotated_tags

In [None]:
# Load the annotated texts (as token lists) and their per-token BIO tags.
annotated_texts, annotated_tags = read_jsonl_file(ANNOTATIONS_FILE)

In [None]:
# Drop texts that occur more than once; the duplicates are printed.
annotated_texts, annotated_tags = remove_duplicates(annotated_texts, annotated_tags)

### 2.2 Convert data to train set and validation set

In [None]:
def add_missing_I_tags(tags):
    """Return the tag set extended with an "I-" tag for every "B-" tag.

    Parameters:
        tags: iterable of tag strings such as "O", "B-DATE", "I-DATE".

    Returns a sorted list of the distinct tags. Sorting keeps the result, and
    any tag-to-id numbering derived from it, identical from one Python session
    to the next; the iteration order of a set of strings is not.
    """
    complete_tags = set(tags)
    for tag in tags:
        if tag.startswith("B-"):
            complete_tags.add("I-" + tag[2:])
    return sorted(complete_tags)

In [None]:
# Keep 20% of the annotated texts apart for validation; the fixed random_state
# makes the split reproducible.
train_texts, val_texts, train_tags, val_tags = train_test_split(annotated_texts, 
                                                                annotated_tags, 
                                                                test_size=.2, 
                                                                random_state=42)

In [None]:
# Tag vocabulary of the annotated data. Both lists are sorted so that the
# tag <-> id numbering is the same in every session; with the iteration order
# of a set it could change after a kernel restart, and a model saved earlier
# would then be read with the wrong labels.
unique_tags = {tag for doc in annotated_tags for tag in doc}
unique_tags = sorted(add_missing_I_tags(unique_tags))
unique_types = sorted({regex.sub(r"^[BI]-", "", tag) for tag in unique_tags})
tag2id = {tag: tag_id for tag_id, tag in enumerate(unique_tags)}
id2tag = {tag_id: tag for tag, tag_id in tag2id.items()}

In [None]:
# Load the tokenizer of the GroNLP/bert-base-dutch-cased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

In [None]:
# Both sets are encoded with the same settings: the texts are already split
# into words, and the offsets are requested because split_tags() needs them.
encoding_options = {
    "is_split_into_words": True,
    "return_offsets_mapping": True,
    "padding": True,
    "truncation": True,
}
train_encodings = tokenizer(train_texts, **encoding_options)
val_encodings = tokenizer(val_texts, **encoding_options)

In [None]:
def convert_B_to_I_tag(tag):
    """Return tag with a leading "B" replaced by "I"; other tags are unchanged."""
    if tag.startswith("B"):
        return "I" + tag[1:]
    return tag

In [None]:
def split_tags(tags_in, encodings):
    """Spread word-level tags over the sub-word tokens of the encodings.

    Parameters:
        tags_in: per text the list of tags of its words.
        encodings: object with an offset_mapping attribute holding, per text,
            one (start, end) pair per token.

    Returns per text one tag per token:
    * tokens whose end offset is 0 get "CLS" (first), "SEP" (second) or "PAD";
    * tokens whose start offset is 0 get the tag of the next word;
    * all other tokens get the tag of the current word, with a leading "B"
      changed to "I".
    """
    tags_out = [ [] for _ in encodings.offset_mapping ]
    special_names = ("CLS", "SEP")
    for doc_offsets, doc_tags, doc_tags_out in zip(encodings.offset_mapping, tags_in, tags_out):
        specials_seen = 0
        next_word = 0
        for offsets in doc_offsets:
            if offsets[1] == 0:
                if specials_seen < len(special_names):
                    doc_tags_out.append(special_names[specials_seen])
                    specials_seen += 1
                else:
                    doc_tags_out.append("PAD")
            elif offsets[0] == 0:
                doc_tags_out.append(doc_tags[next_word])
                next_word += 1
            else:
                doc_tags_out.append(convert_B_to_I_tag(doc_tags[next_word - 1]))
    return tags_out

In [None]:
def tags_to_numbers(tags, tag2id):
    """Replace every tag by its number in tag2id, keeping the nesting per text."""
    numbered_docs = []
    for doc in tags:
        numbered_docs.append([tag2id[tag] for tag in doc])
    return numbered_docs

In [None]:
# Label id given to the CLS, SEP and PAD positions; get_labels_from_ids()
# leaves positions with this id out.
# NOTE(review): -100 is presumably chosen because the loss ignores it — confirm.
IGNORE_TAG_ID = -100

extra_tags = { 'CLS': IGNORE_TAG_ID, 'SEP': IGNORE_TAG_ID, 'PAD': IGNORE_TAG_ID }

In [None]:
# Token-level label ids of both sets; the special tokens get IGNORE_TAG_ID.
label_lookup = { **tag2id, **extra_tags }
train_labels = tags_to_numbers(split_tags(train_tags, train_encodings), label_lookup)
val_labels = tags_to_numbers(split_tags(val_tags, val_encodings), label_lookup)

In [None]:
class WNUTDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with per-token label ids."""

    def __init__(self, encodings, labels):
        # encodings: mapping from field name to a per-text sequence of values
        # labels: per text the list of label ids
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of texts."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the fields of text idx as tensors, with its labels under 'labels'."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
# The offsets were only needed by split_tags(); we don't want to pass them to
# the model. pop(..., None) keeps this cell re-runnable: without the default a
# second run raises KeyError because the key is already gone.
train_encodings.pop("offset_mapping", None)
val_encodings.pop("offset_mapping", None)
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

### 2.3 Fine-tune model with data

Using Bertje as base model: https://huggingface.co/GroNLP/bert-base-dutch-cased

In [None]:
# Reload a model stored earlier with joblib.dump().
# NOTE(review): joblib.load() unpickles the file, so only load files you trust.
# The next cell rebinds `model` to a freshly loaded pretrained model.
model = joblib.load('./model/cstom-setfit-model.joblib')

In [None]:
# Start from the pretrained Dutch BERT checkpoint with a token-classification
# head that has one output per tag in unique_tags.
model = BertForTokenClassification.from_pretrained("GroNLP/bert-base-dutch-cased", num_labels=len(unique_tags))

In [None]:
# model = BertForTokenClassification.from_pretrained("./model")
# tokenizer = AutoTokenizer.from_pretrained("./model")

In [None]:
# Training configuration: losses are logged every 10 steps and, with
# evaluation_strategy="steps", the validation set is evaluated along the way.
# NOTE(review): warmup_steps=500 may exceed the total number of training steps
# for a data set of this size — confirm that the learning rate schedule is as
# intended.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=7,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="steps"
)

trainer = Trainer(
    model=model,                     # the instantiated 🤗 Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
    eval_dataset=val_dataset         # evaluation dataset
)

In [None]:
eval_data = { "400a": { 10: [ 2.973100 , 2.855659], 20: [2.874400 ,2.633519], 30 : [2.598700 ,2.267503], 40 : [2.211500 ,1.768432], 50 : [1.778300 ,1.324655],
60:[ 1.550900, 1.093353], 70: [ 1.220000, 0.903201], 80: [ 1.011200, 0.697839], 90 : [0.778800 ,0.554128], 100 : [0.623100 ,0.440024],
110: [0.488200, 0.349560], 120: [ 0.343000, 0.288387], 130: [ 0.300600, 0.250258], 140: [ 0.226900, 0.229932], 150: [ 0.170800, 0.200462],
160: [ 0.173200, 0.177333], 170: [ 0.101600, 0.182475], 180: [ 0.108400, 0.170770], 190: [ 0.074900, 0.172628], 200: [ 0.075700, 0.177106],
210: [ 0.054100, 0.168911], 220: [ 0.047600, 0.179788], 230: [ 0.038200, 0.168373], 240: [ 0.039000, 0.165006], 250: [ 0.030100, 0.163445],
260: [ 0.027400, 0.174015], 270: [ 0.018400, 0.171961], 280: [ 0.024600, 0.178531], 290: [ 0.015000, 0.189336], 300: [ 0.019300, 0.185390],
310: [ 0.014100, 0.179444], 320: [ 0.009900, 0.196208], 330: [ 0.017200, 0.177975], 340: [ 0.007100, 0.188561], 350: [ 0.011200, 0.175972],
360: [ 0.006600, 0.179566], 370: [ 0.007400, 0.180530], 380: [ 0.012900, 0.199687], 390: [ 0.005000, 0.191614], 400: [ 0.003700, 0.178989] },
"600a": { 10: [ 2.853400, 2.788824], 20: [ 	2.739800, 	2.618280], 30: [ 2.518100, 2.352505], 40: [ 2.215600, 2.010691], 50: [ 1.876800, 1.643065],
60: [ 1.569300, 1.363279], 70: [ 1.279300, 1.098443], 80: [ 1.041300, 0.855244], 90: [ 0.854200, 0.671153], 100: [ 0.616700, 0.533351],
110: [ 0.562700, 0.431195], 120: [ 0.449400, 0.350870], 130: [ 0.334900, 0.301453], 140: [ 0.261000, 0.252131], 150: [ 0.290400, 0.220683],
160: [ 0.190300, 0.203238], 170: [ 0.154600, 0.190547], 180: [ 0.185800, 0.167484], 190: [ 0.108400, 0.160793], 200: [ 0.101400, 0.145170],
210: [ 0.088100, 0.133172], 220: [ 0.081000, 0.135661], 230: [ 0.068500, 0.159878], 240: [ 0.052700, 0.146953], 250: [ 0.038300, 0.169547],
260: [ 0.037700, 0.152970], 270: [ 0.065600, 0.140576], 280: [ 0.027200, 0.173946], 290: [ 0.022900, 0.149843], 300: [ 0.023700, 0.155466] },
"600b": {10: [2.97, 2.932976007461548], 20: [2.8959, 2.7588181495666504], 30: [2.6404, 2.4754464626312256], 40: [2.3043, 2.089912176132202],
50: [1.9301, 1.6922649145126343], 60: [1.6257, 1.433002233505249], 70: [1.3531, 1.1809186935424805], 80: [1.1305, 0.9395545125007629],
90: [0.9414, 0.735508143901825], 100: [0.673, 0.5727341771125793], 110: [0.6081, 0.4573941230773926], 120: [0.4872, 0.3719801604747772],
130: [0.357, 0.3091298043727875], 140: [0.2731, 0.2592621445655823], 150: [0.3059, 0.23325742781162262], 160: [0.2005, 0.20716261863708496],
170: [0.1691, 0.21426241099834442], 180: [0.1963, 0.17854894697666168], 190: [0.1302, 0.17267706990242004], 200: [0.1132, 0.1582951843738556],
210: [0.0941, 0.1499401330947876]},
"600c": {10: [3.0284, 2.9702017307281494], 20: [2.9019, 2.7836408615112305], 30: [2.6567, 2.4800596237182617], 40: [2.3059, 2.0758652687072754],
 50: [1.9075, 1.65205717086792], 60: [1.583, 1.3605468273162842], 70: [1.285, 1.108488917350769], 80: [1.0662, 0.8703526258468628],
 90: [0.8719, 0.6816078424453735], 100: [0.6273, 0.5537048578262329], 110: [0.5864, 0.4511786103248596], 120: [0.4745, 0.3668432831764221],
130: [0.3503, 0.3144031763076782], 140: [0.2816, 0.264466255903244], 150: [0.3042, 0.2330472618341446], 160: [0.1985, 0.21090537309646606],
170: [0.1735, 0.20016834139823914], 180: [0.1975, 0.17414356768131256], 190: [0.1271, 0.173195019364357], 200: [0.1204, 0.16945882141590118],
210: [0.099, 0.1487908661365509], 220: [0.0798, 0.14753003418445587], 230: [0.0831, 0.14581818878650665], 240: [0.0679, 0.14583609998226166]},
"700a": { 10:[ 	2.938300, 	2.900105], 20:[ 	2.838300, 	2.743521], 30:[ 	2.649400, 	2.491823], 40:[ 	2.343200, 	2.163887],
50:[ 	2.032500, 	1.807033], 60:[ 	1.645100, 	1.522994], 70:[ 	1.423600, 	1.299871], 80:[ 	1.233000, 	1.048253],
90:[ 	0.964700, 	0.846890], 100:[ 	0.770100, 	0.695937], 110:[ 	0.573500, 	0.582498], 120:[ 	0.458100, 	0.476454],
130:[ 	0.457500, 	0.393108], 140:[ 	0.359800, 	0.332941], 150:[ 	0.248000, 	0.309698], 160:[ 	0.223600, 	0.275177],
170:[ 	0.189700, 	0.240412], 180:[ 	0.171500, 	0.225659], 190:[ 	0.115100, 	0.214786], 200:[ 	0.094900, 	0.210268],
210:[ 	0.094800, 	0.218391], 220:[ 	0.059200, 	0.219438], 230:[ 	0.052300, 	0.225003], 240:[ 	0.071400, 	0.240677],
250:[ 	0.049300, 	0.207180], 260:[ 	0.040400, 	0.221386], 270:[ 	0.048800, 	0.221953], 280:[ 	0.028200, 	0.244117]},
"700b": {10: [2.8077, 2.74599], 20: [2.7203, 2.598261], 30: [2.5386, 2.361184], 40: [2.2356, 2.056],
 50: [1.9691, 1.759227], 60: [1.657, 1.536577], 70: [1.4585, 1.312596], 80: [1.2722, 1.060864],
90: [0.9803, 0.850179], 100: [0.7778, 0.701402], 110: [0.5651, 0.578009], 120: [0.4599, 0.459326],
130: [0.438, 0.379107], 140: [0.3474, 0.32722], 150: [0.235, 0.294267], 160: [0.2042, 0.260987],
170: [0.1766, 0.228773], 180: [0.1504, 0.205617], 190: [0.1093, 0.202869], 200: [0.0851, 0.195545],
210: [0.0764, 0.197944], 220: [0.0493, 0.202711], 230: [0.0456, 0.204082], 240: [0.0673, 0.206159],
250: [0.0525, 0.193714], 260: [0.0417, 0.188306], 270: [0.042, 0.206414], 280: [0.0277, 0.206935]}
}

In [None]:
# Paste the whitespace-separated "step training-loss validation-loss" values
# here to convert them.
string = """
"""

def convert_eval_scores_to_dict(string):
    """Convert "step loss eval_loss" triples in a string to a dict.

    Parameters:
        string: whitespace-separated values; every three consecutive values are
            read as an integer step and two float losses.

    Returns {step: [loss, eval_loss]}. Values left over after the last complete
    triple are reported with a printed message and otherwise ignored.
    """
    eval_dict = {}
    token_list = []
    for token in string.split():
        token_list.append(token)
        if len(token_list) >= 3:
            eval_dict[int(token_list[0])] = [ float(token_list[1]), float(token_list[2]) ]
            token_list = []
    if len(token_list) > 0:
        # The variable name in this message was misspelled, which raised
        # NameError instead of printing the warning.
        print(f"there were unprocessed tokens! ({token_list})")
    return eval_dict

convert_eval_scores_to_dict(string)

In [None]:
# Run the fine-tuning configured above.
trainer.train()

In [None]:
#model.save_pretrained(save_directory="model")

In [None]:
#tokenizer.save_pretrained(save_directory="model")

In [None]:
# Store the fine-tuned model with joblib.
# NOTE(review): the commented-out save_pretrained() calls above are the
# alternative that also covers the tokenizer — decide which one to keep.
joblib.dump(model, './model/cstom-setfit-model.joblib')

In [None]:
def make_eval_data(trainer):
    """Collect the training and validation loss per step from the trainer log.

    Reads trainer.state.log_history and returns {step: [loss, eval_loss]};
    a loss that was not logged for a step stays 0.
    """
    eval_data = {}
    for entry in trainer.state.log_history:
        losses = eval_data.setdefault(entry["step"], [0 , 0])
        if "loss" in entry:
            losses[0] = entry["loss"]
        if "eval_loss" in entry:
            losses[1] = entry["eval_loss"]
    return eval_data

In [None]:
def plot_eval_data(eval_data):
    """Plot training and validation loss against the step number.

    eval_data maps a step to [training loss, validation loss].
    """
    plt.figure(figsize=(5, 3))
    steps = list(eval_data)
    training_losses = [eval_data[step][0] for step in steps]
    plt.plot(steps, training_losses, label="training loss")
    validation_losses = [eval_data[step][1] for step in steps]
    plt.plot(steps, validation_losses, label="validation loss")
    plt.legend()

In [None]:
# Plot the losses of the run that just finished; the commented line plots one
# of the runs recorded in eval_data instead.
plot_eval_data(make_eval_data(trainer))
#plot_eval_data(eval_data["600c"])

In [None]:
# make_eval_data(trainer)

In [None]:
# trainer.evaluate()

In [None]:
# Wrap the model and the tokenizer in a "ner" pipeline for prediction.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

In [None]:
# Predict the tags of every validation text, passed as one space-joined string.
results = [ ner_pipeline(" ".join(val_text)) for val_text in val_texts ]

In [None]:
def results_to_entities(tag_id_list, token_id_list):
    """Group per-token BIO tags into entities counted in whole words.

    Parameters:
        tag_id_list: one tag per token ("O", "B-<type>" or "I-<type>").
        token_id_list: the tokens; a token starting with "##" continues the
            previous word and does not advance the word counter.

    Returns a list of (type, start, end) tuples, where start is the word
    counter at which the entity was opened and end the word counter at which
    it was closed.
    """
    entities = []
    token_counter = 0
    current_tag = ("", -1)
    for tag, token in zip(tag_id_list, token_id_list):
        tag_start = tag[0]
        tag_class = tag[2:] if tag.startswith(("B-", "I-")) else tag
        current_tag_class, current_tag_start = current_tag
        is_continuation = token.startswith("##")
        if is_continuation:
            token_counter -= 1
        # An open entity can only be closed at the start of a new word.
        if current_tag_class != "" and not is_continuation:
            if tag_class == "O" or tag_start == "B" or tag_class != current_tag_class:
                entities.append((current_tag_class, current_tag_start, token_counter))
                current_tag = ("", -1)
                current_tag_class = ""
        if tag_class != "O" and current_tag_class == "":
            current_tag = (tag_class, token_counter)
            if is_continuation and (len(entities) == 0 or entities[-1][2] != token_counter):
                current_tag = (tag_class, token_counter - 1)
        token_counter += 1
    # Close the entity that is still open. This must look at current_tag itself:
    # the per-iteration copies are stale when the entity was opened on the last
    # token (it was dropped before) and do not exist for empty input.
    if current_tag[0] != "":
        entities.append((current_tag[0], current_tag[1], token_counter))
    return entities

In [None]:
def compute_precision_and_recall(correct_count, missed_count, wrong_count):
    """Print precision, recall and entity count per tag, as whole percentages.

    Parameters:
        correct_count: per tag the number of entities found correctly.
        missed_count: per tag the number of annotated entities not found.
        wrong_count: per tag the number of guessed entities that are wrong.

    Tags without any correct, missed or wrong entity are skipped. A score whose
    denominator is zero is reported as 0 instead of raising ZeroDivisionError.
    """
    for tag in sorted(correct_count):
        correct, missed, wrong = correct_count[tag], missed_count[tag], wrong_count[tag]
        if correct > 0 or missed or wrong > 0:
            guessed_total = correct + wrong
            annotated_total = correct + missed
            precision = correct / guessed_total if guessed_total else 0.0
            recall = correct / annotated_total if annotated_total else 0.0
            print(f"precision: {int(100*precision):-3d}; recall: {int(100*recall):-3d}; count: {correct + missed:4d}; tag: {tag}")

In [None]:
def get_labels_from_ids(label_ids):
    """Translate label ids to tag names, leaving out IGNORE_TAG_ID positions."""
    labels = []
    for label_id in label_ids:
        if label_id == IGNORE_TAG_ID:
            continue
        labels.append(id2tag[label_id])
    return labels

In [None]:
def get_labels_from_results(sentence_result):
    """Return the tag names of the pipeline output of one text.

    The "entity" value of every token has the form "LABEL_<id>".
    """
    label_ids = []
    for token_result in sentence_result:
        label_ids.append(int(regex.sub("^LABEL_", "", token_result["entity"])))
    return get_labels_from_ids(label_ids)

In [None]:
def get_split_tokens_from_results(sentence_result):
    """Return the "word" value of every token in the pipeline output of one text."""
    split_tokens = []
    for token_result in sentence_result:
        split_tokens.append(token_result["word"])
    return split_tokens

In [None]:
def combine_split_tokens(split_tokens):
    """Join sub-word tokens into whole words.

    A token starting with "##" is appended, without the "##", to the word
    before it. A "##" token at the very start has no word to attach to and
    becomes a word of its own instead of raising IndexError.

    Returns the list of words.
    """
    combined_tokens = []
    for token in split_tokens:
        if token.startswith("##") and combined_tokens:
            combined_tokens[-1] += token[2:]
        elif token.startswith("##"):
            combined_tokens.append(token[2:])
        else:
            combined_tokens.append(token)
    return combined_tokens

In [None]:
def evaluate_results_per_entity(results, correct_label_ids):
    """Compare guessed entities with annotated entities, text by text.

    Parameters:
        results: per text the pipeline output (one dict per token).
        correct_label_ids: per text the annotated label ids.

    An entity counts as correct only when type, start and end are all equal.

    Returns (correct_count, missed_count, wrong_count, errors_per_text): three
    dicts with a count per entity type, and per text the number of missed plus
    wrong entities.
    """
    entity_types = [ tag for tag in unique_types if tag != "O" ]
    correct_count = dict.fromkeys(entity_types, 0)
    missed_count = dict.fromkeys(entity_types, 0)
    wrong_count = dict.fromkeys(entity_types, 0)
    errors_per_text = []
    for sentence_result, correct_sentence_label_ids in zip(results, correct_label_ids):
        guessed_labels = get_labels_from_results(sentence_result)
        split_tokens = get_split_tokens_from_results(sentence_result)
        correct_labels = get_labels_from_ids(correct_sentence_label_ids)
        guessed_entities = results_to_entities(guessed_labels, split_tokens)
        correct_entities = results_to_entities(correct_labels, split_tokens)
        error_count = 0
        for entity in correct_entities:
            if entity in guessed_entities:
                correct_count[entity[0]] += 1
                continue
            missed_count[entity[0]] += 1
            error_count += 1
        for entity in guessed_entities:
            if entity in correct_entities:
                continue
            wrong_count[entity[0]] += 1
            error_count += 1
        errors_per_text.append(error_count)
    return correct_count, missed_count, wrong_count, errors_per_text

In [None]:
# Score the predictions on the validation set against the annotated labels.
correct_count, missed_count, wrong_count , errors_per_text = evaluate_results_per_entity(results, val_labels)

In [None]:
# Print precision and recall per entity type.
compute_precision_and_recall(correct_count, missed_count, wrong_count)

In [None]:
def render_results_1(results, encodings, errors_per_text, max_counter=0):
    """Render guessed entities per text from raw per-token scores.

    Parameters:
        results: indexed as a pair; results[0] holds per text the scores of
            every token and results[1] the annotated label ids.
            NOTE(review): presumably the output of trainer.predict() — confirm.
        encodings: per text the token ids, converted back to tokens with the
            global tokenizer.
        errors_per_text: per text the error count shown in front of the text.
        max_counter: render at most this many texts; 0 renders all of them.
    """
    counter = 0
    for guess_data, correct_data, token_data, error_count in zip(results[0], results[1], encodings, errors_per_text):
        text = f"{error_count} "
        tags = []
        token_counter = 0
        in_tag = False
        for guess_values, correct_id, token in zip(guess_data, correct_data, tokenizer.convert_ids_to_tokens(token_data)):
            # the guessed label is the one with the highest score
            guess_id = list(guess_values).index(max(guess_values))
            # positions annotated with IGNORE_TAG_ID are not shown
            if correct_id != IGNORE_TAG_ID:
                if guess_id in [ IGNORE_TAG_ID, tag2id['O'] ]:
                    in_tag = False
                else:
                    start = len(text)
                    end = len(text) + len(token)
                    # the display label is the first two characters of the entity type
                    label = regex.sub(r"^[BI]-", "", id2tag[guess_id])[0:2]
                    # extend the previous span while the label stays the same
                    if in_tag and tags[-1]["label"] == label:
                        tags[-1]["end"] = end
                    else:
                        tags.append({ "start": start, "end": end, "label": label })
                        in_tag = True
                # a leading "##" is replaced by two spaces, so len(token) used above stays valid
                text =  text + regex.sub(r"^##", "  ", token) + " "
                token_counter += 1
        render_text(text, tags)
        counter += 1
        if max_counter > 0 and counter >= max_counter:
            break

In [None]:
def render_results(text_entities, text_tokens, error_count):
    """Render one text with its entities highlighted.

    Parameters:
        text_entities: (label, start, end) tuples in text order; start and end
            are positions in text_tokens, end is exclusive.
        text_tokens: the words of the text.
        error_count: number shown in parentheses in front of the text.
    """
    text = f"({error_count})"
    tags = []
    token_counter = 0
    for entity_label, entity_token_start, entity_token_end in text_entities:
        for i in range(token_counter, entity_token_start):
            text += " " + text_tokens[i]
        # + 1 skips the space that precedes the first word of the entity
        entity_char_start = len(text) + 1
        for i in range(entity_token_start, entity_token_end):
            text += " " + text_tokens[i]
        entity_char_end = len(text)
        tags.append( { "start": entity_char_start, "end": entity_char_end, "label": entity_label } )
        token_counter = entity_token_end
    # Also show the words after the last entity; they used to be left out, so
    # a text without entities was rendered as the error count only.
    for i in range(token_counter, len(text_tokens)):
        text += " " + text_tokens[i]
    render_text(text, tags)

In [None]:
# Take, per token, the tag with the highest score.
# NOTE(review): this indexes results[0] as a list of per-token score vectors,
# which matches the input of render_results_1() but not the pipeline output
# stored in `results` above — this cell looks like a leftover; confirm.
guessed_tags = [ [ id2tag[list(guesses_per_token).index(max(guesses_per_token))]
                   for guesses_per_token in guesses ] 
                   for guesses in results[0] ]

In [None]:
# 20230723 
# * standardize arguments of render_results so that it can be used for inspecting processed data
# * check saving and loading fine-tuned model, perhaps use save_pretrained? what about saving the tokenizer?

In [None]:
max_render = 10

# Render the guessed entities of the first max_render validation texts.
text_counter = 0
for sentence_result, error_count in zip(results, errors_per_text):
    guessed_labels = get_labels_from_results(sentence_result)
    # This used to call get_tokens_from_results(), which is not defined in this
    # notebook; the helper is named get_split_tokens_from_results().
    split_tokens = get_split_tokens_from_results(sentence_result)
    guessed_entities = results_to_entities(guessed_labels, split_tokens)
    render_results(guessed_entities, combine_split_tokens(split_tokens), error_count)
    text_counter += 1
    if text_counter >= max_render:
        break

### 2.4 Select extra data for training

Training data selection process:

1. 100 most frequent data from each half and 100 randomly selected (total 400)
2. 50 with most of ENSLAVED|FREED|OWNER tags and 50 random with one of these tags (total 200)
3. 50 randomly selected data of each half with one of the tags ENSLAVED|FREED (total 100)

Total: 700 (1 duplicate)

In [None]:
import json
from transformers import pipeline

In [None]:
# (Re)create the prediction pipeline from the current model and tokenizer.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

In [None]:
# Draw 1100 random candidate texts and no frequent ones; make_data() also
# overwrites outfile.json with them.
extra_data = make_data(info_data_train, selected_frequent=0, selected_random=1100)

In [None]:
# Keep the candidates that are not annotated yet and for which the model finds
# at least one ENSLAVED or FREED tag, together with the number of such tags.
# annotated_texts holds whitespace-split word lists, so the candidates must be
# split the same way; comparing the tokenizer's sub-word tokens with them
# fails to recognise texts that are already annotated.
annotated_text_keys = { tuple(text) for text in annotated_texts }
selected_extra_data = []
for data in extra_data:
    if tuple(data["text"].split()) in annotated_text_keys:
        continue
    tag_counter = 0
    for entity in ner_pipeline(data["text"]):
        label = id2tag[int(regex.sub("LABEL_", "", entity["entity"]))]
        if regex.search("(ENSLAVED|FREED)", label):
            tag_counter += 1
    if tag_counter > 0:
        selected_extra_data.append({ "tag_counter": tag_counter, "data": data })
len(selected_extra_data)

In [None]:
# Write the 50 candidates with the most ENSLAVED/FREED tags as JSON lines.
# The with-block closes the file even when writing fails.
top_extra_data = sorted(selected_extra_data, key=lambda data: data["tag_counter"], reverse=True)[:50]
with open("outfile.json", "w") as out_file:
    for data in top_extra_data:
        #data["data"].pop("eid", None)
        print(json.dumps(data["data"]), file=out_file)

### 2.5 Process other data with fine-tuned model

In [None]:
def add_labels(selected_entities):
    """Add a "label" key with the tag name to every token dict, in place.

    The tag name is looked up from the id in the "LABEL_<id>" value stored
    under "entity". Returns selected_entities.
    """
    for entity_list in selected_entities:
        for entity in entity_list:
            label_id = int(regex.sub(r"^LABEL_", "", entity["entity"]))
            entity["label"] = id2tag[label_id]
    return selected_entities

In [None]:
# Inspect the token ids of the first selected text.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(selected_data[0]["text"]))

In [None]:
# Tag 10 randomly drawn texts with the fine-tuned model and add readable tag
# names to the output. make_data() also overwrites outfile.json.
selected_data = make_data(info_data_train, selected_frequent=0, selected_random=10)
selected_entities = [ ner_pipeline(data["text"]) for data in selected_data ]
selected_entities = add_labels(selected_entities)

In [None]:
# Render the tagged texts one by one. render_results() handles a single text
# and expects (label, start, end) tuples plus the words of that text, so the
# pipeline output is converted first. The earlier call passed the lists for
# all texts at once and used the undefined name `entities`.
for entity_list in selected_entities:
    split_tokens = get_split_tokens_from_results(entity_list)
    text_entities = results_to_entities([ entity["label"] for entity in entity_list ], split_tokens)
    # no annotations exist for these texts, so the error count is shown as 0
    render_results(text_entities, combine_split_tokens(split_tokens), 0)

In [None]:
# Show the raw pipeline output, with the added labels, of the first text.
selected_entities[0]