In [1]:
import sys
sys.path.append('..')
from data.preparer import load_amazon_dataset

from babble import Explanation
from babble import BabbleStream
from babble.Candidate import Candidate 

from metal.analysis import lf_summary
from metal.analysis import label_coverage
from metal import LabelModel
from metal.tuners import RandomSearchTuner
from babble.utils import ExplanationIO
from snorkel.labeling import filter_unlabeled_dataframe

import nltk
nltk.download("punkt")

import pandas as pd
from datetime import datetime
stat_history = pd.DataFrame()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nofarcarmeli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
DELIMITER = "#"
df_train, df_dev, df_valid, df_test = load_amazon_dataset(delimiter=DELIMITER)
print("{} training examples".format(len(df_train)))
print("{} development examples".format(len(df_dev)))
print("{} validation examples".format(len(df_valid)))
print("{} test examples".format(len(df_test)))

2000 training examples
500 development examples
500 validation examples
1000 test examples


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Transform the data into a format compatible with Babble Labble:

In [59]:
dfs = [df_train.copy(), df_dev.copy()]
dfs[0]['label'] = -1

for df in dfs:
    df["id"] = range(len(df))
    df["label"] += 1

Cs = [df.apply(lambda x: Candidate(x), axis=1) for df in dfs]

# babble labble uses 1 and 2 for labels, while our data uses 0 and 1
# add 1 to convert
Ys = [df.label.values for df in dfs]
Ys[0] -= 1 # no label (training set) should be set to -1

# Amazon Customer Reviews Classification with Babble

### For this task, you will work with Amazon Customer Reviews, writing explanations about how to classify them as positive or negative sentiment.

Only 1 star and 5 star reviews are included.

In [4]:
# start timer!
stat_history.append({"time": datetime.now(), "num_lfs": 0}, ignore_index=True)

Unnamed: 0,num_lfs,time
0,0.0,2020-02-03 16:16:09.843386


In [5]:
#define labels
ABSTAIN = 0
NEGATIVE = 1
POSITIVE = 2

Recall that aliases are a way to refer to a set of words in a rule. 

For example, with
`aliases = {"couples": ["girlfriend", "boyfriend", "wife", "husband"]}` 

--> now you can refer to "couples" in a rule, and the parser will know you mean any of these terms.


In [60]:
aliases = {"positive_adj": ["good", "great", "nice", "awesome", "cool", "amazing", "funny", "enjoy", "enjoyable", "fun", "like", "touching"],
           "negative_adj": ["bad", "worse", "worst", "terrible", "ugly", "boring", "dislike", "hate", "dissapointed", "however"],
           "negation": ["not", "wasn", "weren", "isn", "aren", "don", "doesn", "didn", "no"]} 
babbler = BabbleStream(Cs, Ys, balanced=True, shuffled=True, seed=456, aliases=aliases)

Grammar construction complete.


In [61]:
def prettyprint(candidate):
    # just a helper function to print the candidate nicely
    print(candidate.text)

Let's see an example candidate!

In [62]:
candidate = babbler.next()
prettyprint(candidate)

After reading Shadow Watch i was dissapointed. Most Clancy books are great but this one however was just, well....not good. The beginning was ok with a space shuttle that blows up at its launch date and leaves everybody in the book (including you the reader) in shock. the book from there manages to keep up with an all out assault on a factory in SA. But from there the book dies. None of the characters were described well in this book and it was hard to finish the next 200 some pages. The only real reason this book was published was because Tom Clancy had his name on it. To make it short, spare yourself of this book and get a good Tom Clancy book such as Patriot Games or Red Storm Rising


## Labeling Instructions

All reviews were submitted with either 1 star (negative) or 5 star (positive) ratings. 

Your task is to __create labeling functions__ by writing natural language explanations of labeling rules. Try to write them as quickly and accurately as possible.

You may consult the internet at any time.

## Create Explanations

Creating explanations generally happens in five steps:
1. View candidates
2. Write explanations
3. Get feedback
4. Update explanations 
5. Apply label aggregator

Steps 3-5 are optional; explanations may be submitted without any feedback on their quality. However, in our experience, observing how well explanations are being parsed and what their accuracy/coverage on a dev set are (if available) can quickly lead to simple improvements that yield significantly more useful labeling functions. Once a few labeling functions have been collected, you can use the label aggregator to identify candidates that are being mislabeled and write additional explanations targeting those failure modes.

### Collection

Use `babbler` to show candidates

In [63]:
candidate = babbler.next()
prettyprint(candidate)
print(candidate)
print(candidate.mention_id)

I really enjoy Blackfield's second CD and hope they continue to record in the future. I think Blackfield II is better than the band's first disc (and that one was great). As Steven Wilson's other band (Porcupine Tree) continues to play harder rock, Blackfield seems to be his outlet for good pop rock and akin to Porcupine Tree's "Stupid Dream" and "Lightbulb Sun" era. Now I'm trying to decide if I prefer Porcupine Tree's brand new "Fear of a Blank Planet" disc or Blackfield II. I think I'll just enjoy them both!
{'key': 185974, 'text': 'I really enjoy Blackfield\'s second CD and hope they continue to record in the future. I think Blackfield II is better than the band\'s first disc (and that one was great). As Steven Wilson\'s other band (Porcupine Tree) continues to play harder rock, Blackfield seems to be his outlet for good pop rock and akin to Porcupine Tree\'s "Stupid Dream" and "Lightbulb Sun" era. Now I\'m trying to decide if I prefer Porcupine Tree\'s brand new "Fear of a Blank P

If you don't know whether it's positive or negative, it's okay to make your best guess or skip an example.
For a candidate you decide to label, write an explanation of why you chose that label.

You can consult the internet or refer to the babble tutorial notebook.

In [64]:
e0 = Explanation(
    # feel free to change the name to something that describes your rule better.
    name = "e0", 
    label = NEGATIVE, 
    condition = 'because the word "terrible" occurs', 
    # remember that is argument (candidate) is optional. 
    # You can use it to make sure the explanation applies to the candidate you pass as an argument.
    candidate = 478
)

In [65]:
e1 = Explanation(
    name = "e1", 
    label = POSITIVE, 
    condition = 'a word in the sentence is a positive_adj', 
    candidate = 486
)

In [66]:
e2 = Explanation(
    name = "positive_adj", 
    label = POSITIVE, 
    condition = "a word in the sentence is a positive_adj", 
    candidate = 131
)

In [67]:
e3 = Explanation(
    name = "negative_adj", 
    label = NEGATIVE, 
    condition = "a word in the sentence is a negative_adj", 
    candidate = 479
)

In [68]:
e4 = Explanation(
    name = "e4", 
    label = NEGATIVE, 
    condition = "a word in the sentence is a positive_adj after negation" 
)

In [69]:
e5 = Explanation(
    name = "e5", 
    label = POSITIVE, 
    condition = "a word in the sentence is a negative_adj after negation"
)

In [70]:
e6 = Explanation(
    name = "e6", 
    label = POSITIVE, 
    condition = "because synonyms of 'good' occurs", 
    candidate = candidate.mention_id 
)

In [71]:
e7 = Explanation(
    name = "e7", 
    label = ABSTAIN, 
    condition = "", 
    candidate = candidate.mention_id 
)

In [72]:
e8 = Explanation(
    name = "e8", 
    label = ABSTAIN, 
    condition = "", 
    candidate = candidate.mention_id 
)

In [73]:
e9 = Explanation(
    name = "e9", 
    label = ABSTAIN, 
    condition = "", 
    candidate = candidate.mention_id 
)

In [74]:
e10 = Explanation(
    name = "e10", 
    label = ABSTAIN, 
    condition = "", 
    candidate = candidate.mention_id 
)

Babble will parse your explanations into functions, then filter out functions that are duplicates, incorrectly label their given candidate, or assign the same label to all examples.

In [None]:
# Add any explanations that you haven't committed yet
explanations = [e0, e1, e2, e3, e4, e5, e6, e7, e8, e9]

parses, filtered = babbler.apply(explanations)

Building list of target candidate ids...
Collected 4 unique target candidate ids from 10 explanations.
Gathering desired candidates...
Found 4/4 desired candidates
Linking explanations to candidates...
Linked 8/10 explanations
5 explanation(s) out of 10 were parseable.
14 parse(s) generated from 10 explanation(s).
12 parse(s) remain (2 parse(s) removed by DuplicateSemanticsFilter).
Note: 12 LFs did not have candidates and therefore could not be filtered.
12 parse(s) remain (0 parse(s) removed by ConsistencyFilter).
Applying labeling functions to investigate labeling signature.

### Analysis
See how your explanations were parsed and filtered

In [None]:
try: 
    babbler.analyze(parses)
except ValueError as e:
    print("It seems as though none of your labeling functions were parsed. See the cells above and below for more information.")
    print("ERROR:")
    print(e)

In [None]:
babbler.filtered_analysis(filtered)

In [None]:
babbler.commit()

### Evaluation
Get feedback on the performance of your explanations

In [None]:
Ls = [babbler.get_label_matrix(split) for split in [0,1,2]]
lf_names = [lf.__name__ for lf in babbler.get_lfs()]
lf_summary(Ls[1], Ys[1], lf_names=lf_names)

In [None]:
search_space = {
    'n_epochs': [50, 100, 500],
    'lr': {'range': [0.01, 0.001], 'scale': 'log'},
    'show_plots': False,
}

tuner = RandomSearchTuner(LabelModel, seed=123)

label_aggregator = tuner.search(
    search_space, 
    train_args=[Ls[0]], 
    X_dev=Ls[1], Y_dev=Ys[1], 
    max_search=20, verbose=False, metric='f1')

# record statistics over time
pr, re, f1 = label_aggregator.score(Ls[1], Ys[1], metric=['precision', 'recall', 'f1'])
stats = {
    "precision": pr,
    "recall": re,
    "f1": f1,
    "time": datetime.now(),
    "training_label_coverage": label_coverage(Ls[0]),
    "training_label_size": label_coverage(Ls[0])*len(dfs[0])
}
stat_history = stat_history.append(stats, ignore_index=True)

In [None]:
# view some incorrectly labeled examples for a given LF
j = 0
print(lf_names[j])
# set j to match the value of the LF you're interested in
L_dev = Ls[1].todense()
display(df_dev[L_dev[:,j].A1==abs(df_dev["label"]-3)])

## Train Model
We can train a simple bag of words model on these labels, and see test accuracy.

(This step may take a while).

In [None]:
L_train = Ls[0].todense()
probs_train = label_aggregator.predict_proba(L=L_train)
mask = (L_train != 0).any(axis=1).A1
df_train_filtered = df_train.iloc[mask]
probs_train_filtered = probs_train[mask]
print("{} out of {} examples used for training data".format(len(df_train_filtered), len(df_train)))

from analyzer import train_model_from_probs

for df in df_valid, df_test:
    vc = df["label"].value_counts()
    assert len(vc) == 2
    vc.iloc[0]
    vc.iloc[1]

train_model_from_probs(df_train_filtered, probs_train_filtered, df_valid, df_test)

In [50]:
df_test["label"]

386128    0
297944    1
164196    0
76962     0
52185     1
387208    0
142735    1
41715     0
258897    1
125129    0
313407    0
134859    0
44102     0
321308    0
308179    1
342248    0
139644    1
324172    0
261967    0
4530      0
166087    0
397161    0
111380    1
15494     0
203610    0
368848    1
375145    1
137134    0
354772    0
11561     1
         ..
77463     0
299037    1
282521    1
49066     0
190281    1
292641    1
220224    0
93251     1
94037     0
184303    1
318517    0
63371     1
199418    1
312380    1
266667    1
9380      0
251940    0
136400    0
172166    0
101050    0
283483    1
287443    0
203332    1
176165    1
248590    1
166147    1
195886    0
93541     1
269067    1
160020    1
Name: label, Length: 1000, dtype: int64

In [35]:
def get_keras_logreg(input_dim, output_dim=2):
    model = tf.keras.Sequential()
    if output_dim == 1:
        loss = "binary_crossentropy"
        activation = tf.nn.sigmoid
    else:
        loss = "categorical_crossentropy"
        activation = tf.nn.softmax
    dense = tf.keras.layers.Dense(
        units=output_dim,
        input_dim=input_dim,
        activation=activation,
        kernel_regularizer=tf.keras.regularizers.l2(0.001),
    )
    model.add(dense)
    opt = tf.keras.optimizers.Adam(lr=0.01)
    model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])
    return model

def get_keras_early_stopping(patience=10, monitor="val_accuracy"):
    """Stops training if monitor value doesn't exceed the current max value after patience num of epochs"""
    return tf.keras.callbacks.EarlyStopping(
        monitor=monitor, patience=patience, verbose=1, restore_best_weights=True
    )

In [40]:
    from sklearn.feature_extraction.text import CountVectorizer
    from snorkel.analysis import metric_score
    from snorkel.labeling import filter_unlabeled_dataframe
    from snorkel.utils import preds_to_probs
    import tensorflow as tf
    vectorizer = CountVectorizer(ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())

    X_valid = vectorizer.transform(df_valid["text"].tolist())
    X_test = vectorizer.transform(df_test["text"].tolist())

    Y_valid = (df_valid["label"] - 1).values
    Y_test = (df_test["label"]-1).values

    # Define a vanilla logistic regression model with Keras
    keras_model = get_keras_logreg(input_dim=X_train.shape[1])

    keras_model.fit(
        x=X_train,
        y=probs_train_filtered,
        validation_data=(X_valid, preds_to_probs(Y_valid, 2)),
        callbacks=[get_keras_early_stopping()],
        epochs=50,
        verbose=0,
    )

    preds_test = keras_model.predict(x=X_test).argmax(axis=1)
    test_acc = metric_score(golds=Y_test, preds=preds_test, metric="accuracy")
    print(f"Test Accuracy: {test_acc * 100:.1f}%")

ValueError: Could not convert abstained vote to probability

In [38]:
metric_score(golds=Y_test, preds=preds_test, metric="f1")

0.193

## Save
When your time is up, please save your explanations and model!

In [29]:
from types import SimpleNamespace

stat_history.to_csv("babbler_amazon_statistics_history.csv")
FILE = "babbler_amazon_explanations.tsv"
exp_io = ExplanationIO()
for exp in explanations:
    if exp.candidate is None:
        exp.candidate = SimpleNamespace(mention_id = None)
exp_io.write(explanations, FILE)
explanations = exp_io.read(FILE)
label_aggregator.save("babble_amazon_lfmodel.pkl")

Wrote 10 explanations to babbler_amazon_explanations.tsv
Read 10 explanations from babbler_amazon_explanations.tsv
