# Info fields via machine learning

Extract persons from the info fields `StartEntryInfo` and `EndEntryInfo` of the [slave registers of Suriname](https://datasets.iisg.amsterdam/dataset.xhtml?persistentId=hdl:10622/CSPBHO) via machine learning.

See: https://www.freecodecamp.org/news/getting-started-with-ner-models-using-huggingface/

## 1. Annotating info fields

In [None]:
import json
import nltk
import pandas as pd
import random
import regex
import transformers

### 1.1 Read relevant data

In [None]:
# Path to the CSV export of the Suriname slave and emancipation registers (version 1.1).
DATA_FILE = "../../data/suriname/Dataset Suriname Slave and Emancipation Registers Version 1.1.csv"
# Name of the info column whose free-text values are processed in this notebook.
DATA_COLUMN = "StartEntryInfo"

# low_memory=False: parse the file in one pass so column types are inferred consistently.
data = pd.read_csv(DATA_FILE, low_memory=False)

In [None]:
def add_column_tokens(train):
    """Add a "tokens" column with the NLTK word tokens of every "text" value.

    The data frame is extended in place and returned for chaining.
    """
    tokenized_texts = []
    for text in train["text"]:
        tokenized_texts.append(nltk.word_tokenize(text))
    train["tokens"] = tokenized_texts
    return train

In [None]:
def add_column_labels(train):
    """Add a "labels" column holding one "O" (outside) label per token.

    The data frame is extended in place and returned for chaining.
    """
    outside_labels = []
    for tokens in train["tokens"]:
        outside_labels.append(["O"] * len(tokens))
    train["labels"] = outside_labels
    return train

In [None]:
def add_column_numeric_labels(train, numeric_labels):
    """Add a "numeric_labels" column: every label mapped through numeric_labels.

    numeric_labels is indexed with each label string; a label that is not a
    key raises KeyError. The data frame is extended in place and returned.
    """
    converted = []
    for labels in train["labels"]:
        converted.append([numeric_labels[label] for label in labels])
    train["numeric_labels"] = converted
    return train

In [None]:
def is_date(day, month, year):
    """Tell whether three consecutive tokens look like "day month year".

    Only the day (one or two digits) and the year (starts with four digits
    followed by a word boundary) are checked; month is accepted unchecked.
    Returns True on success and None otherwise.
    """
    year_match = regex.search(r"^\d\d\d\d\b", year)
    if not year_match:
        return year_match
    day_match = regex.search(r"^\d\d?$", day)
    if not day_match:
        return day_match
    return True

In [None]:
def add_date_tags_to_labels(labels, index):
    """Mark the three labels ending at position index as one DATE entity.

    The list is changed in place (B-DATE, I-DATE, I-DATE) and returned.
    """
    date_tags = ("B-DATE", "I-DATE", "I-DATE")
    for distance, tag in zip((2, 1, 0), date_tags):
        labels[index - distance] = tag
    return labels

In [None]:
def label_dates(train):
    """Tag every token triple accepted by is_date as a DATE entity.

    A window of three consecutive tokens slides over each row's "tokens";
    on a hit the row's "labels" list is updated through
    add_date_tags_to_labels. Returns the data frame that was passed in.
    """
    for _, row in train.iterrows():
        tokens = row["tokens"]
        for last in range(2, len(tokens)):
            day, month, year = tokens[last - 2], tokens[last - 1], tokens[last]
            if is_date(day, month, year):
                add_date_tags_to_labels(row["labels"], last)
    return train

In [None]:
def show_annotations(train):
    """Print each row as "token/LABEL" pairs, one row per line.

    Tokens labelled "O" are printed bare; every token is followed by a space.
    Rows are looked up by the labels 0 .. len(train) - 1.
    """
    for row_id in range(len(train)):
        row_labels = train["labels"][row_id]
        row_tokens = train["tokens"][row_id]
        rendered = []
        for position, label in enumerate(row_labels):
            suffix = "" if label == "O" else "/" + label
            rendered.append(str(row_tokens[position]) + suffix + " ")
        print("".join(rendered))

In [None]:
def make_train(data, nbr_of_lines=100):
    """Build a frame of distinct DATA_COLUMN texts, most frequent first.

    Args:
        data: data frame containing the column named by DATA_COLUMN.
        nbr_of_lines: keep only this many of the most frequent texts;
            a value of 0 or less keeps all of them.

    Returns:
        A data frame indexed 0..n-1 (index name "index") with the columns
        "frequency" (number of occurrences) and "text".
    """
    counts = data[DATA_COLUMN].value_counts()
    if nbr_of_lines > 0:
        counts = counts.iloc[:nbr_of_lines]
    # Name the counts explicitly: pandas >= 2.0 calls the value_counts()
    # result "count" instead of the column name, so renaming the column
    # DATA_COLUMN afterwards would silently leave it unrenamed.
    train = counts.rename("frequency").to_frame()
    train["text"] = train.index
    train["index"] = range(len(train))
    return train.set_index("index")

In [None]:
# nbr_of_lines=0 keeps every distinct info text instead of only the most frequent ones.
info_data_train = make_train(data, nbr_of_lines=0)
# Tokenize the texts and give every token the initial label "O".
info_data_train = add_column_tokens(info_data_train)
info_data_train = add_column_labels(info_data_train)

### 1.2 Make data for annotation

In [None]:
# Number of most frequent texts that are always selected for annotation.
SELECTED_FREQUENT = 100
# Number of additional texts drawn at random from the remaining ones.
SELECTED_RANDOM = 100

In [None]:
def make_selected_data_ids(info_data_train, selected_frequent=SELECTED_FREQUENT, selected_random=SELECTED_RANDOM):
    """Choose the row ids to annotate: the top rows plus a random sample.

    Args:
        info_data_train: frame ordered by descending frequency; only its
            length is used here.
        selected_frequent: the first selected_frequent ids are always taken.
        selected_random: number of further distinct ids drawn at random
            from the remaining rows.

    Returns:
        List of ids: 0..selected_frequent-1 followed by the random ids in
        the order they were drawn.

    Raises:
        ValueError: if fewer than selected_random rows remain after the
            frequent ones (the sampling loop could never finish).
    """
    selected_data_ids = list(range(0, selected_frequent))
    remaining = len(info_data_train) - selected_frequent
    if selected_random > 0 and remaining < selected_random:
        raise ValueError(
            f"cannot draw {selected_random} random ids from {remaining} remaining rows"
        )
    # Rejection sampling with randint is kept on purpose: for a given seed it
    # yields the same ids as before, so earlier selections stay reproducible.
    already_selected = set(selected_data_ids)
    target_size = selected_frequent + selected_random
    while len(selected_data_ids) < target_size:
        selected_data_id = random.randint(selected_frequent, len(info_data_train) - 1)
        if selected_data_id not in already_selected:
            already_selected.add(selected_data_id)
            selected_data_ids.append(selected_data_id)
    return selected_data_ids

In [None]:
def make_selected_data_flags(info_data_train, selected_data_ids):
    """Turn a list of row positions into a boolean mask over all rows.

    Returns a list with one flag per row of info_data_train; the flag is
    True exactly at the positions listed in selected_data_ids.
    """
    flags = [False for _ in range(len(info_data_train))]
    for position in selected_data_ids:
        flags[position] = True
    return flags

In [None]:
def save_annotated_data(info_data_train, selected_data_flags):
    """Write the selected texts to outfile.json, one JSON object per line.

    Args:
        info_data_train: frame with a "tokens" column.
        selected_data_flags: boolean mask choosing the rows to export.

    Returns:
        The exported records as a list of dicts with the keys "eid" (first
        letter of DATA_COLUMN plus the row index), "text" (tokens joined by
        single spaces) and "label" (empty list, to be filled by annotation).
    """
    selected_data = []
    # The context manager closes the file even if a row fails to serialize.
    with open("outfile.json", "w") as out_file:
        for index, row in info_data_train[selected_data_flags].iterrows():
            record = {
                "eid": DATA_COLUMN[0] + str(index),
                "text": " ".join(row["tokens"]),
                "label": [],
            }
            selected_data.append(record)
            # json.dumps, not the dict repr: single-quoted reprs are not valid JSON.
            print(json.dumps(record), file=out_file)
    return selected_data

In [None]:
def make_data(info_data_train, selected_frequent=SELECTED_FREQUENT, selected_random=SELECTED_RANDOM):
    """Select texts for annotation, save them and return the saved records.

    The random generator is reseeded with 42 on every call, so the same
    arguments give the same selection.
    """
    random.seed(42)
    chosen_ids = make_selected_data_ids(info_data_train, selected_frequent, selected_random)
    chosen_flags = make_selected_data_flags(info_data_train, chosen_ids)
    return save_annotated_data(info_data_train, chosen_flags)

In [None]:
# Select the texts to annotate; make_data also writes them to outfile.json.
selected_data = make_data(info_data_train, selected_frequent=SELECTED_FREQUENT, selected_random=SELECTED_RANDOM)

### 1.3 Read annotated data

In [None]:
# JSON-lines file with the manually annotated texts (read by read_jsonl_file).
ANNOTATIONS_FILE = "../../data/annotated/600.jsonl"

In [None]:
def make_offset2label_pos(text):
    """Map the character offset of each token start to the token's position.

    Tokens are the whitespace-separated pieces of text (text.split()).
    Offsets are looked up in the text itself, so they stay correct when
    tokens are separated by more than one space or the text starts with
    whitespace.

    Returns:
        Dict from start offset to token position (0-based).
    """
    offset2label_pos = {}
    offset = 0
    for token_counter, token in enumerate(text.split()):
        # Everything between offset and the next token is whitespace, so the
        # first match from offset onwards is that token's real start.
        offset = text.index(token, offset)
        offset2label_pos[offset] = token_counter
        offset += len(token)
    return offset2label_pos

In [None]:
def fix_label_start_not_token_initial(text, label_start):
    """Move an annotation start offset to the first character of its token.

    A start that sits on spaces is moved forward to the next token; a start
    inside a token is moved back to that token's first character.

    Args:
        text: the annotated text, tokens separated by spaces.
        label_start: character offset where the annotation begins.

    Returns:
        The corrected offset. If only spaces follow label_start, or it lies
        beyond the text, the offset at or past the end of the text is
        returned unchanged by the backward step, so the caller can report it
        as not matching any token instead of hitting an IndexError here.
    """
    text_length = len(text)
    while label_start < text_length and text[label_start] == " ":
        label_start += 1
    if label_start >= text_length:
        return label_start
    while label_start > 0 and text[label_start - 1] != " ":
        label_start -= 1
    return label_start

In [None]:
def make_labels(data):
    """Convert the character-span annotations of one record to BIO labels.

    data["data"] is the text and data["label"] a list of
    [start, end, type] spans. The token at the (corrected) start gets
    "B-type"; every token starting at an offset in start+1 .. end gets
    "I-type"; all other tokens keep "O". The start offset stored in each
    span is overwritten with its corrected value.

    Raises:
        Exception: if a corrected start offset is not the start of a token.
    """
    text = data["data"]
    labels = ["O"] * len(text.split())
    offset2label_pos = make_offset2label_pos(text)
    for label in data["label"]:
        label[0] = fix_label_start_not_token_initial(text, label[0])
        if label[0] not in offset2label_pos:
            raise Exception(f"{label[0]} not found in labels {offset2label_pos} of text {text}")
        entity_type = label[2]
        labels[offset2label_pos[label[0]]] = "B-" + entity_type
        for offset in range(label[0] + 1, label[1] + 1):
            position = offset2label_pos.get(offset)
            if position is not None:
                labels[position] = "I-" + entity_type
    return labels

In [None]:
def read_jsonl_file(file_name):
    """Read annotated records from a JSON-lines file.

    Every non-blank line must be a JSON object with the keys "data" (the
    text) and "label" (the annotated spans, see make_labels).

    Returns:
        A pair (texts, tags): texts holds the whitespace-split tokens of
        each record, tags the matching BIO label lists.
    """
    texts = []
    tags = []
    # The context manager closes the file even when a line fails to parse.
    with open(file_name, "r") as annotations_file:
        for line in annotations_file:
            if not line.strip():
                # A blank (e.g. trailing) line is not a record; skip it.
                continue
            data = json.loads(line)
            texts.append(data["data"].split())
            tags.append(make_labels(data))
    return texts, tags

In [None]:
# Load the annotated texts as token lists with one BIO tag list per text.
texts, tags = read_jsonl_file(ANNOTATIONS_FILE)

## 2. Machine learning

Based on tutorial https://huggingface.co/transformers/v3.2.0/custom_datasets.html#token-classification-with-w-nut-emerging-entities

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import regex
from sklearn.model_selection import train_test_split
from spacy import displacy
import torch
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModel
from transformers import BertForTokenClassification

In [None]:
def render_text(text, entities):
    """Show text with highlighted entity spans using displaCy.

    Newlines in text are replaced by spaces first. entities is a list of
    dicts with "start", "end" and "label", passed to displaCy in manual mode.
    """
    document = {"text": regex.sub("\\n", " ", text), "ents": entities}
    display_options = {"colors": {"fuzzy_match": "yellow"}}
    displacy.render(document, options=display_options, style="ent", manual=True)

In [None]:
# Quick check of the renderer: characters 2-5 ("bee") are highlighted with the label "test".
render_text("a bee sees the eee", [ { "start": 2, "end": 5, "label": "test" } ])

### 2.1 Preprocess data

In [None]:
def add_missing_I_tags(tags):
    """Return the tags as a list, extended with any missing I- counterparts.

    For every "B-X" tag whose "I-X" tag is absent, "I-X" is appended once.
    This is needed because a B- tag can be turned into its I- form later
    (see convert_B_to_I_tag), even if no I- tag was ever annotated.

    Args:
        tags: iterable of tag strings such as "O", "B-OWNER", "I-OWNER".

    Returns:
        A new list: the input tags in iteration order, then the added I- tags.
    """
    known_tags = list(tags)  # iterate the input only once
    seen = set(known_tags)
    missing_tags = []
    for tag in known_tags:
        if not tag.startswith("B-"):
            continue
        i_tag = "I-" + tag[2:]
        if i_tag not in seen:
            seen.add(i_tag)  # prevents appending the same I- tag twice
            missing_tags.append(i_tag)
    return known_tags + missing_tags

In [None]:
# Hold out 20% of the annotated texts for validation; random_state fixes the split.
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2, random_state=42)

In [None]:
# Tag vocabulary of the annotated data. The tags are sorted so that every run
# assigns the same numeric id to the same tag; iterating a bare set would make
# the ids depend on string hashing, which differs between interpreter runs.
unique_tags = sorted({tag for doc in tags for tag in doc})
unique_tags = add_missing_I_tags(unique_tags)
# Entity types are the tags without their B-/I- prefix ("O" stays "O").
unique_types = sorted({regex.sub(r"^[BI]-", "", tag) for tag in unique_tags})
tag2id = {tag: tag_id for tag_id, tag in enumerate(unique_tags)}
id2tag = {tag_id: tag for tag, tag_id in tag2id.items()}

In [None]:
# Tokenizer of the Dutch BERT checkpoint that is fine-tuned in section 2.2.
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

In [None]:
# The texts are already lists of tokens, hence is_split_into_words=True.
# The offset mappings are requested because split_tags uses them to tell
# first subwords, continuation subwords and special tokens apart.
train_encodings = tokenizer(train_texts, 
                            is_split_into_words=True, 
                            return_offsets_mapping=True, 
                            padding=True, 
                            truncation=True)
val_encodings =   tokenizer(val_texts, 
                            is_split_into_words=True, 
                            return_offsets_mapping=True, 
                            padding=True, 
                            truncation=True)

In [None]:
def convert_B_to_I_tag(tag):
    """Return the inside ("I-") variant of a begin ("B-") tag.

    Only tags carrying the "B-" prefix are changed, matching the prefix
    handling used for the tag vocabulary; any other tag ("O", "I-X", or a
    label that merely starts with the letter B) is returned unchanged.
    """
    if tag.startswith("B-"):
        return "I-" + tag[2:]
    return tag

In [None]:
def split_tags(tags_in, encodings):
    """Expand word-level tags to one tag per tokenizer output position.

    Each document's offset pairs (start, end) are read in order:
      * end == 0: a special position - the first one becomes "CLS", the
        second "SEP" and all later ones "PAD";
      * start == 0: first piece of the next word, which receives that
        word's tag;
      * otherwise: a continuation piece, which receives the I- form of the
        current word's tag.

    Returns one tag list per entry of encodings.offset_mapping.
    """
    offsets_per_doc = encodings.offset_mapping
    tags_out = [[] for _ in offsets_per_doc]
    for doc_offsets, doc_tags, expanded in zip(offsets_per_doc, tags_in, tags_out):
        special_names = iter(("CLS", "SEP"))
        next_word = 0
        for offsets in doc_offsets:
            start, end = offsets[0], offsets[1]
            if end == 0:
                # After CLS and SEP have been handed out, only padding remains.
                expanded.append(next(special_names, "PAD"))
            elif start == 0:
                expanded.append(doc_tags[next_word])
                next_word += 1
            else:
                expanded.append(convert_B_to_I_tag(doc_tags[next_word - 1]))
    return tags_out

In [None]:
def tags_to_numbers(tags, tag2id):
    """Replace every tag in every document by its numeric id from tag2id.

    A tag that is not a key of tag2id raises KeyError.
    """
    numbered_docs = []
    for doc in tags:
        numbered_docs.append([tag2id[tag] for tag in doc])
    return numbered_docs

In [None]:
# Label id given to positions that must not count as real tags.
# NOTE(review): -100 is the default ignore_index of torch.nn.CrossEntropyLoss,
# presumably the reason for this value - confirm against the model's loss.
IGNORE_TAG = -100

# The pseudo tags produced by split_tags for special and padding positions.
extra_tags = { 'CLS': IGNORE_TAG, 'SEP': IGNORE_TAG, 'PAD': IGNORE_TAG }

In [None]:
# Expand the word-level tags to tokenizer positions and convert them to ids;
# the mapping is tag2id extended with the ignore id for CLS/SEP/PAD.
train_labels = tags_to_numbers( split_tags(train_tags, train_encodings),
                                { **tag2id, **extra_tags})
val_labels =   tags_to_numbers( split_tags(val_tags, val_encodings),
                                { **tag2id, **extra_tags})

In [None]:
class WNUTDataset(torch.utils.data.Dataset):
    """Torch dataset that pairs tokenizer encodings with per-token label ids."""

    def __init__(self, encodings, labels):
        """Store the encodings (mapping of field name to per-example values) and the labels."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example idx as a dict of tensors, with its labels under 'labels'."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [None]:
# The offset mappings were only needed by split_tags; remove them before
# wrapping the encodings in datasets.
train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

### 2.2 Fine-tune model with data

Using BERTje as the base model: https://huggingface.co/GroNLP/bert-base-dutch-cased

In [None]:
# Token-classification head with one output class per tag in unique_tags.
model = BertForTokenClassification.from_pretrained("GroNLP/bert-base-dutch-cased", num_labels=len(unique_tags))

In [None]:
# Hyperparameters for fine-tuning.
# NOTE(review): the runs recorded in eval_data stop at step 300/400, below
# warmup_steps=500, so the warm-up presumably never completes - confirm this
# is intended.
# NOTE(review): newer transformers releases rename evaluation_strategy to
# eval_strategy - verify against the installed version.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=7,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="steps"
)

In [None]:
# Trainer that fine-tunes the model on train_dataset and evaluates on val_dataset.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

In [None]:
# Recorded loss curves, hard-coded here for plotting with plot_eval_data.
# Each entry is [step, training loss, validation loss].
# NOTE(review): the keys 400 and 600 presumably are the number of annotated
# texts used for the run - confirm.
eval_data = { 400: [[ 10 , 2.973100 , 2.855659], [20 ,2.874400 ,2.633519], [30 ,2.598700 ,2.267503], [40 ,2.211500 ,1.768432], [50 ,1.778300 ,1.324655],
[60, 1.550900, 1.093353], [70, 1.220000, 0.903201], [80, 1.011200, 0.697839], [90 ,0.778800 ,0.554128], [100 ,0.623100 ,0.440024],
[110, 0.488200, 0.349560], [120, 0.343000, 0.288387], [130, 0.300600, 0.250258], [140, 0.226900, 0.229932], [150, 0.170800, 0.200462],
[160, 0.173200, 0.177333], [170, 0.101600, 0.182475], [180, 0.108400, 0.170770], [190, 0.074900, 0.172628], [200, 0.075700, 0.177106],
[210, 0.054100, 0.168911], [220, 0.047600, 0.179788], [230, 0.038200, 0.168373], [240, 0.039000, 0.165006], [250, 0.030100, 0.163445],
[260, 0.027400, 0.174015], [270, 0.018400, 0.171961], [280, 0.024600, 0.178531], [290, 0.015000, 0.189336], [300, 0.019300, 0.185390],
[310, 0.014100, 0.179444], [320, 0.009900, 0.196208], [330, 0.017200, 0.177975], [340, 0.007100, 0.188561], [350, 0.011200, 0.175972],
[360, 0.006600, 0.179566], [370, 0.007400, 0.180530], [380, 0.012900, 0.199687], [390, 0.005000, 0.191614], [400, 0.003700, 0.178989]],
600: [ [10, 2.853400, 2.788824], [20, 	2.739800, 	2.618280], [30, 2.518100, 2.352505], [40, 2.215600, 2.010691], [50, 1.876800, 1.643065],
[60, 1.569300, 1.363279], [70, 1.279300, 1.098443], [80, 1.041300, 0.855244], [90, 0.854200, 0.671153], [100, 0.616700, 0.533351],
[110, 0.562700, 0.431195], [120, 0.449400, 0.350870], [130, 0.334900, 0.301453], [140, 0.261000, 0.252131], [150, 0.290400, 0.220683],
[160, 0.190300, 0.203238], [170, 0.154600, 0.190547], [180, 0.185800, 0.167484], [190, 0.108400, 0.160793], [200, 0.101400, 0.145170],
[210, 0.088100, 0.133172], [220, 0.081000, 0.135661], [230, 0.068500, 0.159878], [240, 0.052700, 0.146953], [250, 0.038300, 0.169547],
[260, 0.037700, 0.152970], [270, 0.065600, 0.140576], [280, 0.027200, 0.173946], [290, 0.022900, 0.149843], [300, 0.023700, 0.155466]] }

In [None]:
# Fine-tune the model with the settings in training_args.
trainer.train()

In [None]:
def plot_eval_data(eval_data):
    """Plot training and validation loss against the training step.

    eval_data is a sequence of [step, training loss, validation loss] rows.
    """
    plt.figure(figsize=(5, 3))
    curves = ((1, "training loss"), (2, "validation loss"))
    for column, legend_label in curves:
        steps = [row[0] for row in eval_data]
        losses = [row[column] for row in eval_data]
        plt.plot(steps, losses, label=legend_label)
    plt.legend()

In [None]:
# Plot the recorded loss curves stored under the key 600.
plot_eval_data(eval_data[600])

In [None]:
# Evaluate the fine-tuned model on the trainer's eval_dataset (val_dataset).
trainer.evaluate()

In [None]:
# Raw predictions for the validation set; evaluate_results reads results[0]
# as the per-token scores and results[1] as the expected label ids.
results = trainer.predict(val_dataset)

In [None]:
def compute_precision_and_recall(correct_count, missed_count, wrong_count):
    """Print precision, recall and support for every tag that occurred.

    Args:
        correct_count: per tag, tokens predicted with the right tag.
        missed_count: per tag, tokens of that tag that were not predicted.
        wrong_count: per tag, tokens wrongly predicted as that tag.

    Tags whose three counts are all zero are skipped. Precision is reported
    as 0.00 when the tag was never predicted and recall as 0.00 when the tag
    never occurred, instead of failing with a division by zero.
    """
    for tag in sorted(correct_count):
        correct = correct_count[tag]
        missed = missed_count[tag]
        wrong = wrong_count[tag]
        if correct > 0 or missed > 0 or wrong > 0:
            predicted = correct + wrong
            expected = correct + missed
            precision = correct / predicted if predicted else 0.0
            recall = correct / expected if expected else 0.0
            print(f"precision: {precision:.2f}; recall: {recall:.2f}; count: {expected:4d}; tag: {tag}")

In [None]:
def evaluate_results(results):
    """Count correct, missed and wrong token predictions.

    results[0] holds the per-token score vectors and results[1] the expected
    label ids; positions whose expected id is IGNORE_TAG are skipped. Every
    remaining token is counted twice:
      * under the key "" by comparing the exact tag ids (B-/I- included);
      * under its entity type by comparing the tags without B-/I- prefix.
    The outside tag 'O' is never counted as correct, missed or wrong.

    Returns:
        The tuple (correct_count, missed_count, wrong_count) of dicts keyed
        by entity type plus "".
    """
    counter_keys = unique_types + [ "" ]
    correct_count = dict.fromkeys(counter_keys, 0)
    missed_count = dict.fromkeys(counter_keys, 0)
    wrong_count = dict.fromkeys(counter_keys, 0)

    def tally(expected, guessed, outside, expected_key, guessed_key):
        # One comparison: a hit, or a miss for the expected tag and/or a
        # false alarm for the guessed tag.
        if expected != outside and guessed == expected:
            correct_count[expected_key] += 1
            return
        if expected != outside:
            missed_count[expected_key] += 1
        if guessed != outside:
            wrong_count[guessed_key] += 1

    for guesses, corrects in zip(results[0], results[1]):
        for guess_values, correct_id in zip(guesses, corrects):
            if correct_id == IGNORE_TAG:
                continue
            # Index of the highest score (first one on ties).
            guess_id = list(guess_values).index(max(guess_values))
            tally(correct_id, guess_id, tag2id['O'], "", "")
            correct_tag = regex.sub(r"^[BI]-", "", id2tag[int(correct_id)])
            guess_tag = regex.sub(r"^[BI]-", "", id2tag[int(guess_id)])
            tally(correct_tag, guess_tag, 'O', correct_tag, guess_tag)
    return correct_count, missed_count, wrong_count

In [None]:
# Tally the validation predictions per entity type (and overall under "").
correct_count, missed_count, wrong_count = evaluate_results(results)

In [None]:
# Print precision and recall for every tag that occurred.
compute_precision_and_recall(correct_count, missed_count, wrong_count)

In [None]:
def render_results(results, encodings, max_counter=0):
    """Render predicted entities for each document with render_text.

    results[0] holds the per-token scores, results[1] the expected label ids
    and encodings the token ids per document. Positions whose expected id is
    IGNORE_TAG are left out of the text. Each remaining token is followed by
    a space, and tokens predicted as anything but 'O' are highlighted with
    the first letter of their entity type. With max_counter > 0 only that
    many documents are rendered.
    """
    documents = zip(results[0], results[1], encodings)
    for counter, (guess_data, correct_data, token_data) in enumerate(documents, start=1):
        pieces = []
        entities = []
        cursor = 0  # length of the text assembled so far
        tokens = tokenizer.convert_ids_to_tokens(token_data)
        for guess_values, correct_id, token in zip(guess_data, correct_data, tokens):
            guess_id = list(guess_values).index(max(guess_values))
            if correct_id == IGNORE_TAG:
                continue
            if guess_id != IGNORE_TAG and guess_id != tag2id['O']:
                entities.append({ "start": cursor,
                                  "end": cursor + len(token),
                                  "label": regex.sub(r"^[BI]-", "", id2tag[guess_id])[0] })
            pieces.append(token + " ")
            cursor += len(token) + 1
        render_text("".join(pieces), entities)
        if max_counter > 0 and counter >= max_counter:
            break

In [None]:
# Show the predicted entities of the first validation text only.
render_results(results, val_encodings.input_ids, max_counter=1)

### 2.3 Process text with trained model

In [None]:
import json
from transformers import pipeline

In [None]:
# Inference pipeline around the fine-tuned model and its tokenizer.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

In [None]:
# make_data reseeds the random generator with 42 on every call and rewrites
# outfile.json each time.
# NOTE(review): because of the shared seed, extra_data presumably contains
# all of train_data plus 1000 further random texts - confirm.
train_data = make_data(info_data_train, selected_frequent=100, selected_random=100)
extra_data = make_data(info_data_train, selected_frequent=100, selected_random=1100)

In [None]:
# Run the trained model over the extra texts that were not part of train_data
# and keep those in which it finds at least one ENSLAVED, FREED or OWNER tag.
# Records are matched on their "eid", which save_annotated_data derives from
# the row index. The loop variable is deliberately not called "data", so the
# data frame loaded at the top of the notebook is not overwritten.
train_eids = {record["eid"] for record in train_data}
selected_extra_data = []
for candidate in extra_data:
    if candidate["eid"] in train_eids:
        continue
    tag_counter = 0
    for entity in ner_pipeline(candidate["text"]):
        # The pipeline reports classes as "LABEL_<id>"; map the id back to its tag.
        label = id2tag[int(regex.sub("LABEL_", "", entity["entity"]))]
        if regex.search("(ENSLAVED|FREED|OWNER)", label):
            tag_counter += 1
    if tag_counter > 0:
        selected_extra_data.append({ "tag_counter": tag_counter, "data": candidate })
len(selected_extra_data)

In [None]:
# Write the 100 texts with the most ENSLAVED/FREED/OWNER tags to outfile.json,
# one JSON object per line. The context manager closes the file even if
# serializing a record fails.
ranked_extra_data = sorted(selected_extra_data, key=lambda item: item["tag_counter"], reverse=True)
with open("outfile.json", "w") as out_file:
    for item in ranked_extra_data[:100]:
        # item["data"].pop("eid", None) would drop the id before export (left disabled).
        print(json.dumps(item["data"]), file=out_file)