In [1]:
import sys

sys.path.append("..")

In [2]:
import random
from collections import defaultdict

from ingredient_parser.en import PreProcessor
from sklearn.model_selection import train_test_split

from train.training_utils import load_datasets, evaluate
from ap.averaged_perceptron import AveragedPerceptron

In [3]:
def prepare_features(
    features: dict[str, str | bool], prev_label: str, prev_label2: str
) -> set[str]:
    """Convert features dict to set of strings.

    The model weight use the features as keys, so they need to be a string rather
    than a key: value pair.
    For string features, the string is prepared by joining the key and value by "=".
    For boolean features, the string is prepared just using the key.

    Additional features are added based on the labels of the previous two tokens.

    Parameters
    ----------
    features : dict[str, str | bool]
        Dictionary of features for token, obtained from PreProcessor.sentence_features()
    prev_label : str
        Label of previous token
    prev_label2 : str
        Label of token before previous token

    Returns
    -------
    set
        Set of features as strings
    """
    prepared_features = set()
    for key, value in features.items():
        if isinstance(value, bool) and value:
            prepared_features.add(key)
        elif isinstance(value, str):
            prepared_features.add(key + "=" + value)

    # Add extra features based on labels of previous tokens.
    prepared_features.add("prev_label=" + prev_label)
    prepared_features.add("prev_label2=" + prev_label2)
    prepared_features.add("prev_label+prev_label2=" + prev_label + "+" + prev_label2)
    prepared_features.add("prev_label+stem=" + prev_label + "+" + features["stem"])

    return prepared_features

In [4]:
def tag(model: AveragedPerceptron, features: list[dict[str, str | bool]]) -> list[str]:
    """Label every token of a sentence using an Averaged Perceptron model.

    Tokens are labelled left to right. The labels predicted for the two
    preceding tokens are fed into the features of the current token; the first
    token of the sentence sees the sentinels "-START-" and "-START2-" instead.

    Parameters
    ----------
    model : AveragedPerceptron
        Model to use to tag the list of features.
    features : list[dict[str, str | bool]]
        List of dicts of features for all tokens in sentence,
        obtained from PreProcessor.sentence_features()

    Returns
    -------
    list[str]
        List of labels for input tokens, in token order.
    """
    # Sentinel history for the start of the sentence.
    prev_label, prev_label2 = "-START-", "-START2-"

    predicted = []
    for token_features in features:
        guess = model.predict(
            prepare_features(token_features, prev_label, prev_label2)
        )
        predicted.append(guess)

        # Slide the two-label history window forward by one token.
        prev_label2, prev_label = prev_label, guess

    return predicted

# Load data
Load the training data from the database and split it into train and test sets.
80% of the data is used for training.
20% of the data is used for testing.

In [5]:
# Fixed seed for the split so that the same sentences land in the train and
# test sets on every run, keeping the reported scores comparable between runs.
SPLIT_SEED = 42

# NOTE(review): the arguments are presumably the database path, the language
# and the dataset sources to load -- confirm against train.training_utils.
vectors = load_datasets(
    "../../ingredient-parser/train/data/training.sqlite3",
    "en",
    ["bbc", "cookstr", "nyt"],
)

# Hold out 20% of the sentences for testing. Stratifying on the source keeps
# the proportion of each source the same in both partitions.
(
    sentences_train,
    sentences_test,
    features_train,
    features_test,
    truth_train,
    truth_test,
    source_train,
    source_test,
) = train_test_split(
    vectors.sentences,
    vectors.features,
    vectors.labels,
    vectors.source,
    test_size=0.2,
    stratify=vectors.source,
    random_state=SPLIT_SEED,
)

[INFO] Loading and transforming training data.
[INFO] 59,933 usable vectors.
[INFO] 67 discarded due to OTHER labels.


# Train model

In [6]:
%%time
# Number of passes over the training data.
N_ITERATIONS = 10
# Seed for the between-iteration shuffle, so that re-running this cell on the
# same training data visits the sentences in the same order.
SHUFFLE_SEED = 0
random.seed(SHUFFLE_SEED)

model = AveragedPerceptron()
model.labels = {"QTY", "UNIT", "NAME", "PREP", "COMMENT", "PURPOSE", "PUNC", "SIZE"}

training = list(zip(features_train, truth_train))

for iter_ in range(N_ITERATIONS):
    c = 0  # number of correctly labelled tokens this iteration
    n = 0  # number of total tokens this iteration
    for sentence_features, sentence_labels in training:
        # Every sentence starts from the same sentinel label history.
        prev_label, prev_label2 = "-START-", "-START2-"

        for features, label in zip(sentence_features, sentence_labels):
            feats = prepare_features(features, prev_label, prev_label2)
            guess = model.predict(feats)
            model.update(label, guess, feats)

            # Carry the guess forward (not the true label) to avoid the model
            # becoming over-reliant on the historical labels being correct.
            prev_label2, prev_label = prev_label, guess

            c += guess == label
            n += 1

    print(f"Iter {iter_}: {c}/{n}={100*c/n:.1f}%")

    # Present the sentences in a different order on the next iteration.
    random.shuffle(training)
model.average_weights()

model.save("ap.pickle")

Iter 0: 327796/341927=95.9%
Iter 1: 331643/341927=97.0%
Iter 2: 332912/341927=97.4%
Iter 3: 333370/341927=97.5%
Iter 4: 334084/341927=97.7%
Iter 5: 334513/341927=97.8%
Iter 6: 334821/341927=97.9%
Iter 7: 335257/341927=98.0%
Iter 8: 335424/341927=98.1%
Iter 9: 335680/341927=98.2%
CPU times: user 1min 1s, sys: 44.5 ms, total: 1min 1s
Wall time: 1min 1s


# Evaluate
Evaluate model performance using test data. This is data the model was not trained on, so it is representative of how the model will perform when used in the wild.

In [7]:
# Tag every held-out sentence. Both lists are in test-set order, so entry i of
# predicted_labels lines up with entry i of true_labels.
true_labels = list(truth_test)
predicted_labels = [
    tag(model, sentence_features) for sentence_features in features_test
]

stats = evaluate(true_labels, predicted_labels)
print("Sentence-level results:")
print(f"\tAccuracy: {100*stats.sentence.accuracy:.2f}%")

# The precision/recall/F1 figures are read from `weighted_avg`, so they are
# labelled as weighted averages rather than micro averages.
print()
print("Word-level results:")
print(f"\tAccuracy {100*stats.token.accuracy:.2f}%")
print(f"\tPrecision (weighted) {100*stats.token.weighted_avg.precision:.2f}%")
print(f"\tRecall (weighted) {100*stats.token.weighted_avg.recall:.2f}%")
print(f"\tF1 score (weighted) {100*stats.token.weighted_avg.f1_score:.2f}%")

Sentence-level results:
	Accuracy: 93.55%

Word-level results:
	Accuracy 97.72%
	Precision (micro) 97.76%
	Recall (micro) 97.72%
	F1 score (micro) 97.73%


In [8]:
# Build a fresh AveragedPerceptron and load "ap.pickle" into it -- the same
# path the training cell passed to model.save().
# NOTE(review): the file name suggests a pickle-based format; if so, only load
# files from a trusted source -- confirm against AveragedPerceptron.load.
loaded_model = AveragedPerceptron()
loaded_model.load("ap.pickle")

In [9]:
# Tag one example sentence with the reloaded model. The cell's last expression
# is the list of labels returned by tag(), which the notebook displays.
p = PreProcessor("600 g pork tenderloin, trimmed and cut into 10 cm pieces")
tag(loaded_model, p.sentence_features())

['QTY',
 'UNIT',
 'NAME',
 'NAME',
 'PUNC',
 'PREP',
 'PREP',
 'PREP',
 'PREP',
 'PREP',
 'PREP',
 'PREP']