In [1]:
import sys
# Extend the module search path with the parent directory and a sibling
# "ingredient-parser" checkout (both relative to the notebook's working
# directory). Presumably this is what lets the `train` and `ap` packages
# imported in the next cell resolve -- confirm against the repository layout.
sys.path.append("..")
sys.path.append("../../ingredient-parser")

In [7]:
import pickle
import random
from collections import defaultdict

from sklearn.model_selection import train_test_split

from train.training_utils import load_datasets, evaluate
from ap.averaged_perceptron import AveragedPerceptron

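**Observation:** the best results appear to be obtained when the prediction is based only on the current token's features, i.e. without any previous-label (history) features. This is surprising — TODO: confirm and investigate why.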

In [37]:
def prepare_features(features, prev_label, prev_label2):
    """Convert one token's feature dict into a set of feature strings.

    Parameters
    ----------
    features : dict
        Mapping of feature name to feature value for a single token.
    prev_label : str
        Label assigned to the previous token. Currently unused.
    prev_label2 : str
        Label assigned to the token before the previous one. Currently unused.

    Returns
    -------
    set of str
        * ``name`` for every feature whose value is the boolean ``True``;
        * ``name=value`` for every feature whose value is a string.

        ``False`` flags and values of any other type are skipped.

    Notes
    -----
    Label-history features (``prev_label``, ``prev_label2``, their combination
    and ``prev_label`` combined with the token stem) have been tried and are
    deliberately disabled here; only the current token's own features are
    emitted. The two label arguments are kept so callers do not need to change
    if those features are reinstated.
    """
    # Boolean flags contribute their bare name, and only when set.
    active_flags = {
        name
        for name, val in features.items()
        if isinstance(val, bool) and val
    }
    # String-valued features contribute a "name=value" token.
    string_valued = {
        name + "=" + val
        for name, val in features.items()
        if isinstance(val, str)
    }
    return active_flags | string_valued

In [9]:
def tag(features):
    """Predict a label for every token of one sentence.

    Uses the module-level ``model`` and ``prepare_features``; ``model`` must
    therefore be defined (and trained) before this function is called.

    Parameters
    ----------
    features : iterable of dict
        One feature dict per token, in sentence order. Each element is passed
        to ``prepare_features``.

    Returns
    -------
    list
        The value returned by ``model.predict`` for each token, in order.
        An empty input yields an empty list.
    """
    labels = []
    # Sentinel history for the first token of the sentence.
    prev_label, prev_label2 = "-START-", "-START2-"
    for feat in features:
        feats = prepare_features(feat, prev_label, prev_label2)
        guess = model.predict(feats)
        labels.append(guess)

        # Advance the label history with the model's own prediction, mirroring
        # what the training loop does. Previously the history stayed at the
        # start sentinels for every token, which would silently diverge from
        # training as soon as prepare_features uses the previous labels.
        prev_label2, prev_label = prev_label, guess

    return labels

In [12]:
# Load the training vectors for the bbc, cookstr and nyt datasets from the
# ingredient-parser SQLite training database ("en" is presumably the language
# selector -- confirm against load_datasets).
vectors = load_datasets(
    "../../ingredient-parser/train/data/training.sqlite3",
    "en",
    ["bbc", "cookstr", "nyt"],
)

[INFO] Loading and transforming training data.
[INFO] 59,933 usable vectors.
[INFO] 67 discarded due to OTHER labels.


In [13]:
# Hold out 20% of the sentences for evaluation. The four parallel sequences
# are split together so that sentence, features, labels and source stay
# aligned. Stratifying on the source keeps each dataset's share the same in
# the train and test partitions.
# random_state is fixed so that the split -- and therefore every metric
# reported below -- is reproducible under Restart Kernel -> Run All.
(
    sentences_train,
    sentences_test,
    features_train,
    features_test,
    truth_train,
    truth_test,
    source_train,
    source_test,
) = train_test_split(
    vectors.sentences,
    vectors.features,
    vectors.labels,
    vectors.source,
    test_size=0.2,
    stratify=vectors.source,
    random_state=42,
)

In [38]:
%%time
# Train an averaged perceptron token classifier on the training split.
N_EPOCHS = 7  # number of passes over the training data
SHUFFLE_SEED = 0  # fixed so the sentence order in each epoch is reproducible

model = AveragedPerceptron()
model.labels = {"QTY", "UNIT", "NAME", "PREP", "COMMENT", "PURPOSE", "PUNC", "SIZE"}

training = list(zip(features_train, truth_train))

# A dedicated, seeded generator makes the shuffling repeatable without
# touching the global `random` state.
shuffle_rng = random.Random(SHUFFLE_SEED)

for iter_ in range(N_EPOCHS):
    c = 0  # number of correctly labelled tokens this iteration
    n = 0  # number of total tokens this iteration
    for sentence_features, sentence_labels in training:
        # Every sentence starts with the sentinel label history.
        prev_label, prev_label2 = "-START-", "-START2-"
        for features, label in zip(sentence_features, sentence_labels):
            feats = prepare_features(features, prev_label, prev_label2)
            guess = model.predict(feats)
            model.update(label, guess, feats)

            # Use the guess (not the true label) as history, to avoid the
            # model becoming over-reliant on the historical labels being
            # correct.
            prev_label2, prev_label = prev_label, guess

            c += guess == label
            n += 1

    # Guard against an empty training set, which would otherwise raise
    # ZeroDivisionError.
    accuracy = 100 * c / n if n else 0.0
    print(f"Iter {iter_}: {c}/{n}={accuracy:.1f}%")

    # Present the sentences in a different order on the next pass.
    shuffle_rng.shuffle(training)
model.average_weights()

Iter 0: 326317/342542=95.3%
Iter 1: 329913/342542=96.3%
Iter 2: 330990/342542=96.6%
Iter 3: 331635/342542=96.8%
Iter 4: 332208/342542=97.0%
Iter 5: 332506/342542=97.1%
Iter 6: 332750/342542=97.1%
CPU times: user 36.8 s, sys: 3.97 ms, total: 36.8 s
Wall time: 36.8 s


In [39]:
# Tag every held-out sentence and score the predictions against the truth.
# features_test and truth_test are parallel splits of equal length, so the
# two lists below stay aligned sentence by sentence.
true_labels = list(truth_test)
predicted_labels = [tag(sentence_features) for sentence_features in features_test]

stats = evaluate(true_labels, predicted_labels)
print("Sentence-level results:")
print(f"\tAccuracy: {100*stats.sentence.accuracy:.2f}%")

print()
print("Word-level results:")
print(f"\tAccuracy {100*stats.token.accuracy:.2f}%")
# These three values are read from stats.token.weighted_avg, so they are
# labelled as weighted averages (they were previously mislabelled "micro").
print(f"\tPrecision (weighted) {100*stats.token.weighted_avg.precision:.2f}%")
print(f"\tRecall (weighted) {100*stats.token.weighted_avg.recall:.2f}%")
print(f"\tF1 score (weighted) {100*stats.token.weighted_avg.f1_score:.2f}%")

Sentence-level results:
	Accuracy: 91.37%

Word-level results:
	Accuracy 97.30%
	Precision (micro) 97.32%
	Recall (micro) 97.30%
	F1 score (micro) 97.31%


In [31]:
# Persist what is needed to rebuild the tagger: the model's weights and its
# label set, stored together as a single pickled tuple in ap.pickle.
# NOTE: only unpickle this file from a trusted location -- pickle.load can
# execute arbitrary code.
model_artefacts = (model.weights, model.labels)
with open("ap.pickle", "wb") as pickle_file:
    pickle.dump(model_artefacts, pickle_file)