# Creating the entry classifier
Creating automatically annotated training data and training the model

In [1]:
import hashlib
import joblib
import regex as re
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import confusion_matrix
from tqdm.notebook import tqdm
from collections import Counter
from scraping_and_segmenting_helpers import *

In [2]:
# File name for the training data (not referenced in the visible cells).
training_data_file = 'training_data.json'

# Pairs passed to clean_html_markup() when a headword line is cleaned;
# presumably each pair is [markup, replacement] - confirm against the helper.
classifier_remove_tags = [[tag, ""] for tag in ("<b>", "</b>")]

## Creating (automatically) annotated training and test data

In [3]:
# Build the automatically labeled dataset: read every volume file line by line
# and store the lines that can be labeled as {"class": 0 or 1, "text": line}.
volumes = edition2_volume_start_end.keys()

labeled_data = []

# Set when the current line received a label; reset once the item is stored.
is_entry = False
# Letters a headword may start with in the current page range.
first_letter_list: list[str] = []
for volume in tqdm(volumes):
    # Page number up to which first_letter_list applies; starting at 0 triggers
    # a lookup at the first page marker with a positive page number.
    first_letter_boundary = 0
    # Position in edition2_volume_letters[volume]; -1 so the first increment gives 0.
    volume_letters_index = -1
    page_nbr = 0
    with open(folder_edition2 + f"{volume}.txt", "r", encoding='utf-8') as f:
        for line in f:
            # Lines containing a page marker only update the page state; they are never labeled.
            pagenbr_matches = re.search(r'page_number=(\d+)', line)
            if pagenbr_matches:
                page_nbr = int(pagenbr_matches.group(1))
                if page_nbr > first_letter_boundary:
                    # Past the current boundary: advance to the next
                    # (letters, boundary) pair of this volume.
                    # NOTE(review): raises IndexError if a page number exceeds
                    # the last boundary listed for the volume - confirm the
                    # table covers every page.
                    volume_letters_index += 1
                    first_letter_list = edition2_volume_letters[volume][volume_letters_index][0]
                    first_letter_boundary = edition2_volume_letters[volume][volume_letters_index][1]
            else:
                # Drop trailing whitespace and cap the line at MAX_ENTRY_LENGTH characters.
                line = line.rstrip()[:MAX_ENTRY_LENGTH]
                if line:  # empty lines are skipped (earlier length-based filters are disabled)
                    item = {}
                    # Ground truth, positive case: the line opens with a bold tag
                    # directly followed by one of the current headword letters.
                    if line.startswith(tuple([f"<b>{l}" for l in first_letter_list])):
                        # Only positive lines go through clean_html_markup
                        # (presumably stripping the <b>/</b> tags - confirm).
                        line = clean_html_markup(line, classifier_remove_tags)
                        item["class"] = 1
                        is_entry = True


                    # Negative case: the line starts with a character of ALPHABET
                    # that is not a current headword letter, and is neither a
                    # "Fig. " line nor an "Ord, som saknas under K" line.
                    # Lines matching neither case are left out of the dataset.
                    elif line and (not line.startswith("Fig. ")) and (not line.startswith("Ord, som saknas under K")) and (not (line[0] in first_letter_list)) and line[0] in ALPHABET:
                        item["class"] = 0
                        is_entry = True

                    if is_entry:
                        item["text"] = line  # added last, so "class" precedes "text" in the item

                        labeled_data.append(item)
                        is_entry = False

  0%|          | 0/34 [00:00<?, ?it/s]

['A']
['A']
['B']
['B']
['B']
['C']
['C']
['D']
['D']
['E']
['E']
['F']
['F']
['F']
['G']
['G']
['H']
['H']
['H']
['I']
['J']
['J']
['K']
['K']
['K']
['L']
['L']
['L']
['M']
['M']
['M']
['N']
['N']
['O']
['P']
['P']
['P']
['Q']
['R']
['R']
['R']
['S']
['S']
['S']
['S']
['S']
['T']
['T']
['T']
['U']
['U']
['V']
['V', 'W']
['V', 'W']
['X']
['Y']
['Z']
['Å']
['Ä']
['Ö']


Run the code to convert each labeled item into a `[class, text]` pair

In [None]:
# Flatten every labeled item into a [class, text] pair.
dataset = [[item['class'], item['text']] for item in labeled_data]

dataset[:6]

## Utilities

Before you can use the dataset to train a model, you need to convert it into numbers. You will carry this out with the following steps, writing a corresponding function for each.
1. You will extract the $n$-grams up to trigrams (`all_ngrams()`);
2. Trigrams can create many symbols that most students' machines cannot process. You will reduce their numbers using hash codes (`hash_ngrams()`);
3. You will compute the relative frequencies of the $n$-grams, replaced here by the hash codes (`calc_rel_freq()`).
4. The results will be stored in three dictionaries, for characters, bigrams, and trigrams. You will merge these dictionaries into one (`shift_keys()`).

You will then apply the functions to vectorize the dataset.

### Extracting $n$-grams
The goal of this section is that you extract the $n$-grams from a text. By default, you will lowercase the text. The result will have the form: `[chars, bigrams, trigrams]`

Write a function to extract the $n$-grams of a sentence: `ngrams(sentence, n=1, lc=True)`, where `n` is a parameter. You can use list slices for this.

In [None]:
# Write your code here
def ngrams(sentence, n=1, lc=True):
    """Return the character n-grams of ``sentence`` in order of appearance.

    Arguments:
       sentence: the string to slice
       n: length of each n-gram
       lc: when true, the text is lowercased before slicing

    Returns:
       list with the ``len(sentence) - n + 1`` substrings of length ``n``;
       empty when ``sentence`` is shorter than ``n``
    """
    if lc:
        # str.lower() returns a new string; the result has to be rebound,
        # otherwise the lowercasing is silently lost.
        sentence = sentence.lower()
    return [sentence[i:i + n] for i in range(len(sentence) - n + 1)]

In [None]:
ngrams('try something.')

In [None]:
ngrams('try something.', n=2)

We now use this function to extract all the $n$-grams

In [None]:
def all_ngrams(sentence, max_ngram=3, lc=True):
    """Collect the n-grams of ``sentence`` for every size from 1 to ``max_ngram``.

    The result is a list of lists ordered by n-gram size; with the default
    ``max_ngram=3`` it has the form ``[chars, bigrams, trigrams]``.
    """
    return [ngrams(sentence, n=size, lc=lc) for size in range(1, max_ngram + 1)]

In [None]:
all_ngrams('try something.')

### Hashing

We consider languages with many characters that will make the number of bigrams and trigrams impossible to process. We will use the _hashing trick_ to reduce them, where we will gather $n$-grams into subsets using hash codes.

Each item will have this format:
`[char_hcodes, bigram_hcodes, trigram_hcodes]`.

#### Description

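Python has a built-in hashing function that returns a numerical signature (a hash code) for a given string. Distinct strings may occasionally share the same code, and for strings the value changes from one Python session to the next unless `PYTHONHASHSEED` is fixed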

In [None]:
hash('a'), hash('ab'), hash('abc')

If we take the remainder (modulo) of a division by 5, we reduce the possible codes to: 0, 1, 2, 3, or 4

In [None]:
list(map(lambda x: x % 5, (hash('a'), hash('ab'), hash('abc'))))

#### Implementation

We set maximal numbers for our $n$-grams using these divisors

In [None]:
# Divisors that bound the hash codes of each n-gram size.
MAX_CHARS = 521
MAX_BIGRAMS = MAX_TRIGRAMS = 1031

Here strings have integer codes within the range [0, `MAX_CHARS`[

In [None]:
list(map(lambda x: x % MAX_CHARS, (hash('a'), hash('ab'), hash('abc'))))

Hash codes may vary across machines and Marcus Klang wrote this function to have reproducible codes

In [None]:
def reproducible_hash(string):
    """
    Hash a string to the same integer on every run and machine

    Arguments:
       string: python string object, encoded as UTF-8 before hashing

    Returns:
       signed 64-bit integer read big-endian from the first eight bytes
       of the MD5 digest
    """

    # MD5 is picked for speed, not for security.
    digest = hashlib.md5(string.encode("utf-8"), usedforsecurity=False).digest()
    leading_bytes = digest[:8]
    return int.from_bytes(leading_bytes, byteorder='big', signed=True)

In [None]:
reproducible_hash('a')

In [None]:
reproducible_hash('a') % MAX_CHARS

### Converting $n$-grams to hash codes
You will now convert the $n$-grams to hash codes


In [None]:
# Divisors in n-gram order: index 0 for characters, 1 for bigrams, 2 for trigrams.
MAXES = [MAX_CHARS, MAX_BIGRAMS, MAX_TRIGRAMS]

Create a `hash_ngrams` function that creates a list of hash codes from a list of $n$-grams. As arguments, you will have the list of $n$-grams `[chars, bigrams, trigrams]` as well as the list of dividers (`MAXES`).

The output format will be a list of three lists:

`[char_hcodes, bigram_hcodes, trigram_hcodes]`.

In [None]:
# Write your code
def hash_ngrams(ngrams, modulos):
    """Map every n-gram to a reproducible hash code reduced by its divisor.

    Arguments:
       ngrams: list of n-gram lists, e.g. ``[chars, bigrams, trigrams]``
           (the name shadows the ``ngrams()`` function inside this body)
       modulos: list of divisors, one per n-gram list, in the same order

    Returns:
       list of lists of hash codes in the same order as the input, e.g.
       ``[char_hcodes, bigram_hcodes, trigram_hcodes]``; with a positive
       divisor every code lies in ``[0, divisor[``
    """
    # Pair each n-gram list with its own divisor rather than reading the
    # module-level MAX_* constants, so that the ``modulos`` argument is
    # honored and any number of n-gram sizes is supported.
    return [
        [reproducible_hash(gram) % modulo for gram in grams]
        for grams, modulo in zip(ngrams, modulos)
    ]

In [None]:
all_ngrams('try something.')

In [None]:
hash_ngrams(all_ngrams('try something.'), MAXES)

### Functions to Count Hash Codes

Write a function `calc_rel_freq(codes)` to count the codes. As in CLD3, you will return the relative frequencies.

This is just an application of `Counter` to a list of codes and then a division by the length.

The input is a list of codes and the output is a `Counter` object of relative frequencies.

In [None]:
# Write your code
def calc_rel_freq(codes):
    """Return the relative frequency of every code in ``codes``.

    Arguments:
       codes: list of hash codes

    Returns:
       ``Counter`` mapping each distinct code to its number of occurrences
       divided by the total number of codes; empty for an empty input
    """
    counts = Counter(codes)
    total = sum(counts.values())
    rel_freqs = Counter()
    for code, occurrences in counts.items():
        rel_freqs[code] = occurrences / total
    return rel_freqs

In [None]:
hash_ngrams(all_ngrams('try something.'), MAXES)

In [None]:
list(map(calc_rel_freq, hash_ngrams(all_ngrams('try something.'), MAXES)))


In [None]:
list(map(calc_rel_freq, hash_ngrams(all_ngrams('try something.'), MAXES)))

### Merge the Dictionaries

In the results above, we have three counter objects with numerical keys (the hash codes). You will build one dictionary of them.

There is a key overlap and we must take care that a same hash code for the unigrams is not the same as in the bigrams. We will then shift the keys.

The keys range from:
1. Unigrams from 0 to 520, [0, MAX_CHARS[
2. Bigrams from 0 to 1030, [0, MAX_BIGRAMS[
3. Trigrams from 0 to 1030, [0, MAX_TRIGRAMS[

You will leave the unigrams keys as they are. You will shift the bigram keys by MAX_CHARS, and the trigram keys by MAX_CHARS + MAX_BIGRAMS. You can reuse the code below

In [None]:
# Offset of each n-gram size in the merged key space: the sum of the
# divisors of all the sizes that come before it.
MAX_SHIFT = []
offset = 0
for size in MAXES:
    MAX_SHIFT.append(offset)
    offset += size

In [None]:
MAX_SHIFT

Write a `shift_keys(dicts, MAX_SHIFT)` function that takes a list of dictionaries and the list of shifts as input and that returns a single new dictionary, where the numerical keys have been shifted by the numbers in `MAX_SHIFT`

In [None]:
# Write your code here
def shift_keys(dicts, MAX_SHIFT):
    """Merge dictionaries with numerical keys into one, shifting the keys.

    The keys of the i-th dictionary are increased by ``MAX_SHIFT[i]`` so that
    equal keys coming from different dictionaries do not collide.

    Arguments:
       dicts: iterable of dictionaries (a list, or a lazy iterable such as
           ``map``), e.g. one per n-gram size
       MAX_SHIFT: list of offsets, one per dictionary, in the same order;
           the first offset is 0 when the first dictionary keeps its keys

    Returns:
       a new dictionary holding all the shifted keys; the inputs are not
       modified
    """
    merged = {}
    # zip() walks both sequences in lockstep, so any number of dictionaries
    # is accepted instead of exactly three.
    for freqs, shift in zip(dicts, MAX_SHIFT):
        for key, value in freqs.items():
            merged[key + shift] = value
    return merged

In [None]:
list(map(calc_rel_freq, hash_ngrams(all_ngrams('try something.'), MAXES)))

In [None]:
shift_keys(map(calc_rel_freq, hash_ngrams(all_ngrams('try something.'), MAXES)), MAX_SHIFT)


In [None]:
shift_keys(map(calc_rel_freq, hash_ngrams(all_ngrams('try something.'), MAXES)), MAX_SHIFT)

Finally, we assemble all these utilities in a function

In [None]:
def build_freq_dict(sentence, MAXES=MAXES, MAX_SHIFT=MAX_SHIFT):
    """Turn ``sentence`` into one dictionary of hashed n-gram frequencies.

    Chains the utilities: n-gram extraction, hashing with the divisors in
    ``MAXES``, relative frequencies per n-gram size, and merging of the
    per-size dictionaries with the key offsets in ``MAX_SHIFT``.
    """
    hashed = hash_ngrams(all_ngrams(sentence), MAXES)
    return shift_keys([calc_rel_freq(codes) for codes in hashed], MAX_SHIFT)

In [None]:
build_freq_dict('try something.')

## Converting the Dataset
We can now enrich the dataset with a numerical representation of the sentence. We use the utility functions and we call this new version: `dataset_num`

In [None]:
dataset[:2]

In [None]:
# Extend every [class, text] pair with the frequency dictionary of its text.
dataset_num = [
    [*datapoint, build_freq_dict(datapoint[1])] for datapoint in tqdm(dataset)
]

In [None]:
dataset_num[:2]

## Programming: Building ${X}$

You will now build the ${X}$ matrix.

### Vectorizing the features

The CLD3 architecture uses embeddings. In this lab, we will simplify it and we will use a feature vector instead consisting of the character frequencies. For example, you will represent the text:

`"Let's try something."`

with:

`{'l': 0.05, 'e': 0.1, 't': 0.15, "'": 0.05, 's': 0.1, ' ': 0.1, 
 'r': 0.05, 'y': 0.05, 'o': 0.05, 'm': 0.05, 'h': 0.05, 'i': 0.05, 
 'n': 0.05, 'g': 0.05, '.': 0.05}`

Note that we used characters and not codes to make it more legible.

To create the ${X}$ matrix, we need to transform the dictionaries of `dataset_num` into numerical vectors. The `DictVectorizer` class from the scikit-learn library, see here [https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html], has two methods, `fit()` and `transform()`, and a combination of both `fit_transform()` to convert dictionaries into such vectors.

You will now write the code to:

1. Extract the hash code frequency dictionaries from `dataset_num`, stored as the third element (index 2) of each item;
2. Convert the list of dictionaries into an ${X}$ matrix using `DictVectorizer`.

#### Extracting the character frequencies

Produce a new list of datapoints with the $n$-grams. Each item in this list will be a dictionary. You will call it `X_cat`

In [None]:
# Keep only the frequency dictionary, the third field of every
# [class, text, frequencies] datapoint.
X_cat = [freqs for _, _, freqs in dataset_num]
dataset_num[0]

In [None]:
X_cat[0]

#### Vectorize `X_cat`

Convert your `X_cat` list into a numerical representation using `DictVectorizer`: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html. You will set the `sparse` argument to False. Call the result `X`.

In [None]:
# Fit the vectorizer on the frequency dictionaries and build the feature
# matrix X; sparse=False requests a dense array instead of a sparse matrix.
v = DictVectorizer(sparse=False)
X = v.fit_transform(X_cat)
# Save the fitted vectorizer to disk with joblib.
joblib.dump(v, 'dict_vectorizer_model.pkl')

In [None]:
X.shape

In [None]:
X[:6]

## Programming: Building $\mathbf{y}$

You will now convert the list of class labels into a $\mathbf{y}$ vector

Extract the class labels (the first field of each item) from `dataset_num` and call the resulting list `y_cat`

In [None]:
# The class label is the first field of every [class, text, frequencies] datapoint.
y_cat = [label for label, _, _ in dataset_num]

In [None]:
y_cat[:5]

## Programming: Building the Model

Create a neural network using sklearn with a hidden layer of 50 nodes and a relu activation layer: https://scikit-learn.org/stable/modules/neural_networks_supervised.html. Set the maximal number of iterations to 5, in the beginning, and verbose to True. Use the default values for the rest. You will call your classifier `clf`

In [None]:
# One hidden layer of 50 units, at most 50 training iterations, progress
# printed during training; all other hyperparameters keep their defaults.
clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=50, verbose=True)

In [None]:
clf

### Training and Validation Sets

You will now split the dataset into a training and validation sets

#### We split the dataset
We use a training set of 80% and a validation set of 20%

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the datapoints for evaluation; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.2, random_state=42)

### Fitting the model

Fit the model on the training set

In [None]:
# Train the classifier on the training split. The later cells keep using
# `clf`; presumably `model` is the same object (scikit-learn's fit()
# conventionally returns the estimator) - confirm.
model = clf.fit(X_train, y_train)

## Predicting

Predict the classes of `X_test`. You will call the result `y_test_pred`

In [None]:
# Predict the class of every held-out datapoint.
y_test_pred = clf.predict(X_test)

In [None]:
y_test_pred[:20]

In [None]:
y_test[:20]

#### Evaluating

Use the `accuracy_score()` function to evaluate your model on the validation set

In [None]:
# evaluate the model
accuracy_score(y_test, y_test_pred)

In [None]:
# Per-class precision/recall/F1. Class 0 is meant to be shown as "Not Article"
# and class 1 as "Article" (assumes target_names follow the sorted label
# order - confirm).
print(classification_report(y_test, y_test_pred, target_names=["Not Article", "Article"]))
# F1 averaged over all decisions (micro) and over the two classes (macro).
print('Micro F1:', f1_score(y_test, y_test_pred, average='micro'))
print('Macro F1', f1_score(y_test, y_test_pred, average='macro'))

### Confusion Matrix

In [None]:
confusion_matrix(y_test, y_test_pred)

You may try to increase the number of iterations to improve the score. You may also try changing the parameters of the multilayer perceptron.

## Predict text as article or not article

In [None]:
docs = [
    'Syrus, Publilius, författare af latinska folkskådespel (mimer), född i Antiokia, var slaf, men frigafs af sin husbonde. Sedan han skaffat sig tillräcklig bildning, började han skrifva och uppfö',
    'Thalictrum L., Ängsrutsläktet, bot., ett släkte, hörande till fam. Ranunculaceæ, af oftast högväxta, glatta örter med spiralställda, 2-flerdubbelt pardelade blad, hvilkas gemensam', 
    'Från landshöfdingplatsen, på hvilken han utvecklade en erkännansvärd drift, tog T. af privata skäl afsked 7 aug. 1896 och antog befattningen som verkställande direktör i Trafikaktiebolaget Grängesberg', 
    'Nietzscheöfversättare. 1907 donerade han en stor summa till Nietzschearkivet i Weimar.',
    ]

Create feature vectors from this list. Call this matrix `X_test`

In [None]:
# Turn every example text into its hashed n-gram frequency dictionary.
docs_cat = list(map(build_freq_dict, docs))
docs_cat[0]


In [None]:
# Vectorize the example texts with the already fitted vectorizer (transform
# only, no refit).
# NOTE(review): this rebinds X_test and thereby replaces the held-out split
# created by train_test_split; re-running the evaluation cells afterwards
# would pair these rows with the old y_test.
X_test = v.transform(docs_cat)

In [None]:
X_test[:5]

In [None]:
X_test.shape

And run the prediction that you will store in a variable called `pred_languages`

In [None]:
# Predicted class of each example text: 1 for a line labeled as starting with
# a bold headword, 0 for any other labeled line (the variable keeps its
# historical "languages" name).
pred_languages = clf.predict(X_test)
pred_languages

In [None]:
# Save the trained classifier to disk with joblib.
joblib.dump(clf, 'mlp_model.pkl')