# Model testing

Lets take a look at the performance of the model

In [1]:
from google.colab import drive
import os
import pickle
import numpy as np
import tensorflow as tf
from sklearn.metrics import f1_score, classification_report

In [2]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
def load_validation_data(data_path):
    X_val, y_val = [], []
    for f in sorted(os.listdir(data_path)):
        if f.endswith('.pkl'):
            with open(os.path.join(data_path, f), 'rb') as file:
                X, y = pickle.load(file)
                X_val.extend(X)
                y_val.extend(y)
    return np.array(X_val), np.array(y_val)

In [4]:
model_path = '/content/gdrive/MyDrive/opj/data/checkpoints/model_epoch_10.hdf5'
validation_data_path = '/content/gdrive/MyDrive/opj/data/validation'

model = tf.keras.models.load_model(model_path)

X_val, y_val = load_validation_data(validation_data_path)

predictions = model.predict(X_val)



Lets also prepare the data needed to better understand the results.

In [8]:
int_to_tag = {
    0: 'Noun',
    1: 'Verb',
    2: 'Adjective',
    3: 'Pronoun',
    4: 'Adverb',
    5: 'Adposition',
    6: 'Conjunction',
    7: 'Numeral',
    8: 'Particle',
    9: 'Interjection',
    10: 'Abbreviation',
    11: 'Residual'
    #Class 12 (Punctuation) has been removed, and -1 (Padding) is not included.
}

In [18]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

# Converting predictions to class labels
y_pred = np.argmax(predictions, axis=-1)

y_true_flat = y_val.flatten()
y_pred_flat = y_pred.flatten()

# Mask for non-padded tokens
non_padding_indices = y_true_flat != -1
y_true_filtered = y_true_flat[non_padding_indices]
y_pred_filtered = y_pred_flat[non_padding_indices]

f1_scores = f1_score(y_true_filtered, y_pred_filtered, average=None, labels=np.unique(y_pred_filtered))

f1_score_macro = f1_score(y_true_filtered, y_pred_filtered, average='macro', labels=np.unique(y_pred_filtered))
f1_score_weighted = f1_score(y_true_filtered, y_pred_filtered, average='weighted', labels=np.unique(y_pred_filtered))

precision_scores = precision_score(y_true_filtered, y_pred_filtered, average=None, labels=np.unique(y_pred_filtered))
recall_scores = recall_score(y_true_filtered, y_pred_filtered, average=None, labels=np.unique(y_pred_filtered))

print("Metrics for each class (excluding padding):")
for i, (f1, precision, recall) in enumerate(zip(f1_scores, precision_scores, recall_scores)):
    class_name = int_to_tag.get(i, "Unknown Class")
    if class_name != "Unknown Class":
        print(f"{class_name} - Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Overall scores
print("\nOverall Metrics:")
print(f"Macro F1 Score: {f1_score_macro:.4f}")
print(f"Weighted F1 Score: {f1_score_weighted:.4f}")

Metrics for each class (excluding padding):
Noun - Precision: 0.9655, Recall: 0.9744, F1 Score: 0.9699
Verb - Precision: 0.9814, Recall: 0.9774, F1 Score: 0.9794
Adjective - Precision: 0.9457, Recall: 0.9468, F1 Score: 0.9463
Pronoun - Precision: 0.9785, Recall: 0.9836, F1 Score: 0.9810
Adverb - Precision: 0.9385, Recall: 0.9317, F1 Score: 0.9351
Adposition - Precision: 0.9974, Recall: 0.9964, F1 Score: 0.9969
Conjunction - Precision: 0.9723, Recall: 0.9736, F1 Score: 0.9729
Numeral - Precision: 0.9777, Recall: 0.9608, F1 Score: 0.9691
Particle - Precision: 0.8875, Recall: 0.8799, F1 Score: 0.8837
Interjection - Precision: 0.6931, Recall: 0.6667, F1 Score: 0.6796
Abbreviation - Precision: 0.9408, Recall: 0.8903, F1 Score: 0.9149
Residual - Precision: 0.6607, Recall: 0.5608, F1 Score: 0.6067

Overall Metrics:
Macro F1 Score: 0.8335
Weighted F1 Score: 0.9643


## Inference

In [19]:
!git clone https://github.com/facebookresearch/fastText.git
!pip install fastText/.

Cloning into 'fastText'...
remote: Enumerating objects: 3995, done.[K
remote: Counting objects: 100% (1023/1023), done.[K
remote: Compressing objects: 100% (183/183), done.[K
remote: Total 3995 (delta 893), reused 863 (delta 835), pack-reused 2972[K
Receiving objects: 100% (3995/3995), 8.29 MiB | 15.80 MiB/s, done.
Resolving deltas: 100% (2531/2531), done.
Processing ./fastText
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext==0.9.2)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4214524 sha256=fbc9080c6b6ab8f4fed60e458dcbffb0d208fb3b3b86d3819464e02cc601832d
  Stored in directory: /tmp/pip-ephem-wheel-cache-d2aq

In [20]:
import fasttext
import fasttext.util

In [21]:
!curl https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.bin.gz --output cc.hr.300.bin.gz
!gunzip cc.hr.300.bin.gz -d cc.hr.300.bin
ft = fasttext.load_model('cc.hr.300.bin')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4299M  100 4299M    0     0  96.7M      0  0:00:44  0:00:44 --:--:-- 86.0M
gzip: cc.hr.300.bin: unknown suffix -- ignored


In [48]:
int_to_tag = {
    0: 'Imenica',
    1: 'Glagol',
    2: 'Pridjev',
    3: 'Zamjenica',
    4: 'Prilog',
    5: 'Prijedlog',
    6: 'Veznik',
    7: 'Broj',
    8: 'Čestica',
    9: 'Usklik',
    10: 'Kratica',
    11: 'Posuđenica'
}


def predict_tags(sentence, max_sequence_length=30, embedding_dim=300):
    words = sentence.split()

    embeddings = np.zeros((max_sequence_length, embedding_dim))

    for i, word in enumerate(words[:max_sequence_length]):
        embeddings[i] = ft.get_word_vector(word)

    embeddings = np.expand_dims(embeddings, axis=0)

    # Predict tags for each word
    predictions = model.predict(embeddings)
    predicted_classes = np.argmax(predictions, axis=-1)[0]

    # Converting to human-readable tags
    predicted_tags = [int_to_tag.get(cls, "Unknown") for cls in predicted_classes]

    return predicted_tags[:len(words)]

In [37]:
def infer(sentence):
  predicted_tags = predict_tags(sentence)
  words = sentence.split()

  for word, tag in zip(words, predicted_tags):
      print(f"{word} - {tag}")

In [50]:
infer("Sutra je prvi dan jeseni i idem u grad na šetnju.")

Sutra - Prilog
je - Glagol
prvi - Broj
dan - Imenica
jeseni - Imenica
i - Veznik
idem - Glagol
u - Prijedlog
grad - Imenica
na - Prijedlog
šetnju. - Imenica
