# Creating the entry classifier
Creating automatically annotated training data and training the model.

Most of the functions used are located in `entry_classifier_utils.py`.

This notebook was largely adapted from a lab in the course EDAN20 – Language Technology at Lund University.

In [1]:
import sys
sys.path.append('../../')  # Assuming the parent directory

import joblib
import regex as re
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from utils.scraping_and_segmenting_helpers import *
from utils.paths import *
from utils.entry_classifier_utils import *

# Folder that receives the pickled vectorizer, the pickled classifier and the stats file.
# The value ends with "/" so that a file name can be appended to it directly.
entry_classifier_models_folder = "entry_classifier_models/"

In [None]:
# File name for the training data.
training_data_file = "training_data.json"

# [tag, replacement] pairs handed to clean_html_markup: both the opening and the
# closing bold tag are replaced by the empty string.
classifier_remove_tags = [[bold_tag, ""] for bold_tag in ("<b>", "</b>")]

## Creating (automatically) annotated training and test data

In [None]:
volumes = edition2_volume_start_end.keys()

# Line openings that are never used as negative examples, even though they
# begin with a letter.
NON_ENTRY_PREFIXES = ("Fig. ", "Ord, som saknas under K")

# One dict per labelled line: {"class": 0 or 1, "text": <line>}.
labeled_data = []

# The current letter range is deliberately initialised once, outside the volume
# loop: it is only replaced when a page marker crosses a boundary.
first_letter_list: list[str] = []
bold_prefixes: tuple[str, ...] = ()
for volume in tqdm(volumes):
    first_letter_boundary = 0
    volume_letters_index = -1
    page_nbr = 0
    with open(f"{E2_TXT_FOLDER}/{volume}.txt", "r", encoding='utf-8') as volume_file:
        for raw_line in volume_file:
            page_marker = re.search(r'page_number=(\d+)', raw_line)
            if page_marker:
                # Page marker line: once the page passes the current boundary,
                # advance to the next (letters, boundary) pair of this volume.
                page_nbr = int(page_marker.group(1))
                if page_nbr > first_letter_boundary:
                    volume_letters_index += 1
                    # NOTE(review): this lookup presumably raises IndexError when a page
                    # number exceeds the last boundary listed for the volume - confirm
                    # that edition2_volume_letters covers every page.
                    letter_range = edition2_volume_letters[volume][volume_letters_index]
                    first_letter_list = letter_range[0]
                    bold_prefixes = tuple(f"<b>{letter}" for letter in first_letter_list)
                    first_letter_boundary = letter_range[1]
                continue

            # Text line: strip trailing whitespace and cap the length.
            line = raw_line.rstrip()[:MAX_ENTRY_LENGTH]
            if not line:
                continue

            if line.startswith(bold_prefixes):
                # Positive example: the line opens with a bold tag followed by one of
                # the letters expected on this page. The bold tags are passed to
                # clean_html_markup for removal before the text is stored.
                labeled_data.append(
                    {"class": 1, "text": clean_html_markup(line, classifier_remove_tags)}
                )
            elif (
                line[0] in ALPHABET
                and line[0] not in first_letter_list
                and not line.startswith(NON_ENTRY_PREFIXES)
            ):
                # Negative example: starts with an alphabet character that is not one
                # of the expected first letters and is not an excluded prefix.
                labeled_data.append({"class": 0, "text": line})
            # Any other line is left out of the training data.

# Flatten the dicts into [class, text] rows.
dataset = [[item['class'], item['text']] for item in labeled_data]

dataset[:6]

## Building ${X}$ and ${y}$
We can now enrich the dataset with a numerical representation of the sentence. We use the utility functions and we call this new version: `dataset_num`

In [None]:
# Extend every [class, text] row with the feature dict that build_freq_dict
# returns for its text, giving [class, text, features].
dataset_num = [
    [*datapoint, build_freq_dict(datapoint[1])]
    for datapoint in tqdm(dataset)
]

In [None]:
# Rows of dataset_num are [class, text, features]; split them into the feature
# dicts (X_cat) and the class labels (y_cat).
X_cat = [features for _label, _text, features in dataset_num]
y_cat = [label for label, _text, _features in dataset_num]

# NOTE(review): DictVectorizer is not imported explicitly in this notebook;
# presumably it is provided by one of the star imports - confirm.
# Fit the vectorizer on the feature dicts and turn them into the matrix X.
v = DictVectorizer(sparse=False)
X = v.fit_transform(X_cat)


In [None]:
# Persist the fitted DictVectorizer next to the classifier.
# entry_classifier_models_folder already ends with "/", so the file name is appended
# directly (as for the stats file and model.pkl) instead of producing a "//" path.
joblib.dump(v, f'{entry_classifier_models_folder}dict_vectorizer_model.pkl')

## Building the Model

In [None]:
# Hold out 20 % of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.2, random_state=42)

# One hidden layer of 50 units, trained for at most 50 iterations.
# random_state seeds the weight initialisation and shuffling, so that, together with
# the seeded split above, a rerun reproduces the same model and evaluation numbers.
clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=50, verbose=True, random_state=42)

# fit() returns the estimator itself, so `model` and `clf` name the same object.
model = clf.fit(X_train, y_train)

## Predicting

In [None]:
# Predict a class label for every sample of the held-out test split.
y_test_pred = clf.predict(X_test)

In [None]:
# Show the first 20 predicted labels.
y_test_pred[:20]

In [None]:
# Show the first 20 true labels, for comparison with the predictions.
y_test[:20]

## Evaluation

In [None]:
# Confusion matrix and per-class report for the test split.
conf_matrix = confusion_matrix(y_test, y_test_pred)
report = classification_report(y_test, y_test_pred, target_names=["Not Article", "Article"])

stats_path = f"{entry_classifier_models_folder}entry_classifier_stats.txt"
section_rule = "------------\n"

# The file is opened in append mode, so every run adds a new section to it.
with open(stats_path, "a", encoding='utf-8') as stats_file:
    stats_file.writelines(
        [
            "Confusion matrix for test data: \n",
            f"{np.array2string(conf_matrix, separator=', ')}\n",
            section_rule,
            f"{report}\n",
            section_rule,
            f"Micro F1: {f1_score(y_test, y_test_pred, average='micro')}\n",
            f"Macro F1: {f1_score(y_test, y_test_pred, average='macro')}\n",
        ]
    )

## Save the model to a file

In [None]:
# Serialize the trained classifier to model.pkl inside the models folder.
joblib.dump(clf, f'{entry_classifier_models_folder}model.pkl')