# Process margin texts of birth certificates of Suriname 1828-1921

In [None]:
import nltk
import os
import pandas as pd
import regex
import sys
sys.path.append(os.getcwd() + '/..')
from IPython.display import clear_output
from scripts import ner_analysis

In [None]:
def squeal(text=None):
    """Replace the current cell output with an optional status message.

    The output is cleared with ``wait=True``, so the old output is only
    removed once new output is available to replace it.

    Args:
        text: message to print after clearing; nothing is printed when it
            is None (an empty string is still printed).
    """
    clear_output(wait=True)
    if text is not None:
        print(text)

## 1. Read data

In [None]:
# Path of the input CSV file, relative to the working directory.
data_file = "../../data/kantmeldingen.csv"

In [None]:
# Load the CSV into a DataFrame. low_memory=False makes pandas parse the file
# in one piece instead of in chunks, which avoids mixed-type inference
# within a column.
data = pd.read_csv(data_file, low_memory=False)

## 2. Check data

In [None]:
# Number of rows in the loaded table.
len(data)

In [None]:
# Show all column names on one line (iterating a DataFrame yields its
# column labels).
for key in data:
    print(key, end=" ")

In [None]:
# Collect the year of every parsable birth date and show the ten smallest and
# ten largest years as a sanity check on the date range.
birth_date_list = list(data["birth_date"])
birth_year_list = []
for birth_date in birth_date_list:
    # Empty CSV cells are read as NaN (a float) and cannot be sliced.
    if not isinstance(birth_date, str):
        continue
    # Characters 6-9 hold the year; presumably the dates look like
    # DD-MM-YYYY -- TODO confirm against the data.
    birth_year = birth_date[6:10]
    if len(birth_year) != 4:
        continue
    try:
        birth_year_list.append(int(birth_year))
    except ValueError:
        # Skip dates whose year part is not numeric.
        continue
# Sort once and reuse the result for both ends of the range.
sorted_birth_years = sorted(birth_year_list)
print(sorted_birth_years[:10], sorted_birth_years[-10:])

In [None]:
def find_rows(column, value):
    """Return the rows of `data` whose `column` matches the pattern `value`.

    The match is a case-insensitive regular-expression search, so `value`
    may occur anywhere in the cell.

    Args:
        column: name of the column of the module-level DataFrame `data`.
        value: regular-expression pattern to search for.

    Returns:
        DataFrame with the matching rows of `data`.
    """
    def matches(cell):
        # Empty CSV cells are read as NaN, which cannot be searched.
        if pd.isna(cell):
            return False
        # str() lets the search work on numeric columns as well.
        return bool(regex.search(value, str(cell), regex.IGNORECASE))

    return data[data[column].apply(matches)]

In [None]:
# List the birth dates containing "1988" -- presumably to inspect dates that
# fall outside the 1828-1921 period named in the title.
find_rows("birth_date", "1988")["birth_date"]

In [None]:
# Frequency of each note type; dropna=False also counts missing values.
data["note_type"].value_counts(dropna=False)

In [None]:
# First 15 entries of the value counts of note_type_other (value_counts
# sorts by descending frequency by default); missing values are counted too.
data["note_type_other"].value_counts(dropna=False)[:15]

In [None]:
# Peek at the first 20 note texts.
data["note_txt"][:20]

## 3. Find entities with standard Dutch NER

In [None]:
# Create the NER helper from the project's scripts.ner_analysis module;
# presumably it wraps a standard Dutch NER model (see the section heading)
# -- TODO confirm.
ner_analysis_process = ner_analysis.NerAnalysis()

In [None]:
# Run the NER helper on one sample note and render the entities it found.
text = data["note_txt"][9]
# Reuse `text` so the processed text and the rendered text cannot diverge.
entity_tokens = ner_analysis_process.process(text)
ner_analysis_process.render_text(text, entity_tokens)

In [None]:
# Inspect the raw value returned by ner_analysis_process.process().
entity_tokens

## 4. Find entities with slave register NER

Code copied from notebook info_fields_ml.ipynb

In [None]:
from transformers import AutoTokenizer
from transformers import BertForTokenClassification
from transformers import pipeline

In [None]:
# Label id that marks positions to skip: get_labels_from_ids drops every id
# equal to this value. Presumably the conventional ignore index of
# token-classification training -- TODO confirm.
IGNORE_TAG_ID = -100

In [None]:
# Mapping from label id to BIO tag; the id of a tag is its position in the
# list below.
id2tag = dict(enumerate([
    'B-DATE',
    'B-ENSLAVED',
    'B-FOLIO',
    'B-FREED',
    'B-OWNER',
    'B-PLANT',
    'B-RESNR',
    'B-TOPIC',
    'I-DATE',
    'I-ENSLAVED',
    'I-FOLIO',
    'I-FREED',
    'I-OWNER',
    'I-PLANT',
    'I-RESNR',
    'I-TOPIC',
    'O',
]))

In [None]:
def make_tags(id2tag):
    """Derive the tag list, the entity types and the reverse map of `id2tag`.

    Args:
        id2tag: dict mapping label ids to tag names such as "B-DATE" or "O".

    Returns:
        Tuple `(unique_tags, unique_types, tag2id)`:
        `unique_tags` lists the tag names in the order of `id2tag`,
        `unique_types` lists the tag names without their one-character
        prefix (e.g. "B-"), each once and in order of first appearance,
        `tag2id` maps tag names back to their ids.
    """
    tag2id = {tag: tag_id for tag_id, tag in id2tag.items()}
    unique_tags = list(tag2id)
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (a set has no stable order for strings).
    unique_types = list(dict.fromkeys(regex.sub(r"^.-", "", tag) for tag in unique_tags))
    return unique_tags, unique_types, tag2id

In [None]:
def load_model(num_labels, model_name="GroNLP/bert-base-dutch-cased"):
    """Load a BERT token-classification model together with its tokenizer.

    Args:
        num_labels: number of labels passed to the classification model.
        model_name: name or path handed to `from_pretrained` for both the
            model and the tokenizer.

    Returns:
        Tuple `(model, tokenizer)`.
    """
    bert_model = BertForTokenClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )
    bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
    return bert_model, bert_tokenizer

In [None]:
def retokenize(text):
    """Return `text` as space-separated tokens, or "" when that fails.

    The text is split into words with NLTK, passed through the module-level
    `tokenizer`, and the resulting word pieces are glued back together by
    removing every " ##" separator.

    Args:
        text: text to tokenize; non-string values (e.g. NaN from an empty
            CSV cell) yield "".

    Returns:
        The retokenized text, or "" if the input is not a string or
        tokenization raises an error.
    """
    if not isinstance(text, str):
        return ""
    try:
        # Replace the single-character ellipsis by three dots before tokenizing.
        words = nltk.word_tokenize(regex.sub("…", "...", text))
        word_pieces = tokenizer.tokenize(" ".join(words))
        return regex.sub(" ##", "", " ".join(word_pieces))
    except Exception:
        # Deliberate best effort: a text that cannot be tokenized becomes "".
        # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
        return ""

In [None]:
def process_texts(texts, model, tokenizer):
    """Run the NER pipeline on every text and return the per-text results.

    Progress is shown via `squeal` as "<number of texts>:<number done>".
    A text whose analysis fails gets an empty result, so `results` stays
    aligned with `texts`; the failures are printed at the end.

    NOTE(review): `retokenize` reads the module-level `tokenizer`, not the
    `tokenizer` argument given here -- confirm both are the same object.

    Args:
        texts: sequence of texts to analyse (its length is used for the
            progress message).
        model: model handed to the "ner" pipeline.
        tokenizer: tokenizer handed to the "ner" pipeline.

    Returns:
        List with one pipeline result (or `[]` on failure) per text.
    """
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
    results = []
    failed_texts = []
    for text in texts:
        try:
            results.append(ner_pipeline(retokenize(text)))
        except Exception:
            # Best effort: one bad text must not abort the whole run.
            # Exception (not a bare except) keeps Ctrl-C able to stop the loop.
            results.append([])
            failed_texts.append(f"analysis failed for text: {text}")
        if len(results) % 10 == 0:
            squeal(f"{len(texts)}:{len(results)}")
    squeal(f"{len(texts)}:{len(results)}")
    for failed_text in failed_texts:
        print(failed_text)
    return results

In [None]:
def get_labels_from_ids(label_ids):
    """Translate label ids to tag names, dropping ignored positions.

    Ids equal to `IGNORE_TAG_ID` are skipped; all others are looked up in
    the module-level `id2tag`.
    """
    labels = []
    for label_id in label_ids:
        if label_id == IGNORE_TAG_ID:
            continue
        labels.append(id2tag[label_id])
    return labels

In [None]:
def get_split_tokens_from_results(sentence_result):
    """Return the "word" entry of every token result, in order."""
    split_tokens = []
    for token_result in sentence_result:
        split_tokens.append(token_result["word"])
    return split_tokens

In [None]:
def get_labels_from_results(sentence_result):
    """Return the tag name of every token result.

    The "entity" entry of each token result is expected to look like
    "LABEL_<id>"; the numeric id is converted with `get_labels_from_ids`.
    """
    label_ids = []
    for token_result in sentence_result:
        label_id_text = regex.sub("^LABEL_", "", token_result["entity"])
        label_ids.append(int(label_id_text))
    return get_labels_from_ids(label_ids)

In [None]:
def results_to_entities(tag_id_list, token_id_list):
    """Group per-token BIO tags into entity spans counted in whole words.

    Args:
        tag_id_list: one tag string per token (e.g. "B-DATE", "I-DATE",
            "O"); despite the name these are tag names, not numeric ids.
        token_id_list: the token strings belonging to the tags; a token
            starting with "##" continues the previous word.

    Returns:
        List of `[start, end, tag_class]` lists, where `start` and `end`
        are values of the word counter (`end` is the counter value at the
        moment the entity was closed) and `tag_class` is the tag without
        its "B-"/"I-" prefix.
    """
    entities = []
    token_counter = 0
    current_tag_class = ""
    current_tag_start = -1
    for tag, token in zip(tag_id_list, token_id_list):
        tag_start = tag[0]
        tag_class = regex.sub(r"^[BI]-", "", tag)
        # A continuation piece does not start a new word: cancel the
        # increment at the end of this iteration so whole words are counted.
        if regex.search(r"^##", token):
            token_counter -= 1
        # An open entity can only be closed at the start of a new word.
        if current_tag_class != "" and not regex.search(r"^##", token):
            # Close on an "O" tag, on any "B-" tag, or on a change of class.
            if tag_class == "O" or tag_start == "B" or tag_class != current_tag_class:
                entities.append([current_tag_start, token_counter, current_tag_class])
                current_tag_class = ""
                current_tag_start = -1
        # Open a new entity on any non-"O" tag while none is open.
        if tag_class != "O" and current_tag_class == "":
            current_tag_class = tag_class
            current_tag_start = token_counter
            # NOTE(review): entities[-1][2] is the class string of the previous
            # entity and token_counter is an int, so this comparison is always
            # True; index 1 (the end position) may have been intended -- confirm.
            # NOTE(review): as written, an entity that starts on a continuation
            # piece gets start = token_counter - 1, which is -1 when the counter
            # is 0 -- confirm this is intended.
            if regex.search(r"^##", token) and (len(entities) == 0 or entities[-1][2] != token_counter):
                # Re-assigning current_tag_class is redundant (same value as above).
                current_tag_class = tag_class
                current_tag_start = token_counter - 1
        token_counter += 1
    # Close an entity that is still open when the tokens run out.
    if current_tag_class != "":
        entities.append([current_tag_start, token_counter, current_tag_class])
    return entities

In [None]:
def combine_split_tokens(split_tokens):
    """Merge "##" continuation pieces into the word they belong to.

    Args:
        split_tokens: token strings; a token starting with "##" is appended
            (without the "##") to the preceding word.

    Returns:
        List of whole words.

    Raises:
        IndexError: if the first token is a continuation piece, because
            there is no preceding word to extend.
    """
    words = []
    for piece in split_tokens:
        if piece.startswith("##"):
            # Glue the continuation onto the word built so far.
            words[-1] += piece[2:]
        else:
            words.append(piece)
    return words

In [None]:
def token_id_entities_to_char_id_entities(token_id_entities, split_tokens):
    """Convert word-index entity spans to character spans.

    The character positions refer to the whole words (continuation pieces
    merged) joined by single spaces: every word counts for its length plus
    one separator.

    Args:
        token_id_entities: lists `[start, end, tag_class]` with word indices.
        split_tokens: the token strings the indices were derived from.

    Returns:
        List of `[char_start, char_end, tag_class]` lists; `char_end` leaves
        out the separator after the last word of the entity.
    """
    tokens = combine_split_tokens(split_tokens)
    char_id_entities = []
    for entity in token_id_entities:
        first_token, end_token = entity[0], entity[1]
        # Width of everything in front of the entity, separators included.
        char_start = sum(len(tokens[i]) + 1 for i in range(first_token))
        # Width of the entity itself, including one trailing separator.
        entity_width = sum(len(tokens[i]) + 1 for i in range(first_token, end_token))
        char_id_entities.append([char_start, char_start + entity_width - 1, entity[2]])
    return char_id_entities

In [None]:
def recognized_entities_to_annotation_labels(entities):
    """Turn the pipeline result of one text into character-level entity spans.

    Args:
        entities: token results of one text, each with a "word" and an
            "entity" entry.

    Returns:
        List of `[char_start, char_end, tag_class]` lists.
    """
    word_pieces = get_split_tokens_from_results(entities)
    tags = get_labels_from_results(entities)
    return token_id_entities_to_char_id_entities(
        results_to_entities(tags, word_pieces),
        word_pieces,
    )

In [None]:
def process_results(results, data):
    """Print the retokenized text and entity spans of the first item only.

    Args:
        results: pipeline results, aligned with the values of `data`.
        data: object whose `items()` yields `(index, text)` pairs; this
            parameter shadows the module-level `data`.
    """
    for (_index, raw_text), result in zip(data.items(), results):
        print(retokenize(raw_text), recognized_entities_to_annotation_labels(result))
        # Only the first pair is shown.
        break

In [None]:
# Derive the tag list, the entity types and the tag -> id map from id2tag.
unique_tags, unique_types, tag2id = make_tags(id2tag)

In [None]:
# Load model and tokenizer from "models/2000b" (presumably a local directory
# with the trained slave register model -- TODO confirm); the number of
# labels equals the number of tags.
model, tokenizer = load_model(num_labels=len(unique_tags), model_name="models/2000b")

In [None]:
# Run the model on the first 10 note texts.
data_sample = data["note_txt"].values[:10]
results = process_texts(data_sample, model, tokenizer)

In [None]:
# Render the entities found in each sampled note.
for text, result in zip(data_sample, results):
    # Render on the retokenized text: the character offsets are computed from
    # space-joined tokens, which presumably match this text -- TODO confirm.
    text = retokenize(text)
    labels = recognized_entities_to_annotation_labels(result)
    entities = [
        {"start": start, "end": end, "entity": f"B-{label}"}
        for start, end, label in labels
    ]
    ner_analysis_process.render_text(text, entities)