In [None]:
import sys
sys.path.append("..")
from data.preparer import load_news_dataset
from babble import Explanation
from babble import BabbleStream
from babble.Candidate import Candidate 

from metal.analysis import lf_summary
from metal.analysis import label_coverage
from metal import LabelModel
from metal.tuners import RandomSearchTuner
from babble.utils import ExplanationIO

import pandas as pd
from datetime import datetime
from snorkel.labeling import filter_unlabeled_dataframe

# Running log of timing, labeling-function and model statistics; later cells add rows to it.
stat_history = pd.DataFrame()
import nltk
# Fetch the "punkt" resource through nltk (needs network access the first time).
nltk.download("punkt")
# Show the full text of every example instead of truncating long cells.
# pandas >= 1.0 spells "no limit" as None (-1 is deprecated there and rejected by
# pandas 2.x), while older releases only accept an int, so fall back to -1 for those.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

## The Data

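In the raw data, these texts are labeled as either gun politics (1) or computer electronics (0). The conversion cell below shifts these to the labels used in the rest of this notebook: 1 for electronics and 2 for guns.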

If you're not sure about the correct label, that's fine -- either make your best guess or just skip the example.

In [None]:
# Unzip the data. (Don't worry about this, it should be already unzipped.)
# Replace PASSWORD with the password to unzip the data, or download it directly from Kaggle.

#!unzip -P PASSWORD data/data.zip

Load the dataset into training, validation, development, and test sets

In [None]:
# Unpack the four splits; the fifth return value is not used in this notebook.
df_train, df_dev, df_valid, df_test, _ = load_news_dataset()

# Report the size of each split, one line per split.
split_reports = [
    ("{} training examples", df_train),
    ("{} development examples", df_dev),
    ("{} validation examples", df_valid),
    ("{} test examples", df_test),
]
for template, split_df in split_reports:
    print(template.format(len(split_df)))

Convert the data and labels into a Babble-friendly format

In [None]:
# Only the training and development splits are handed to Babble.
dfs = [df_train, df_dev]
# Overwrite the training labels with a placeholder; the +1 shift below turns it into 0.
dfs[0]['label'] = -1

# babble labble uses 1 and 2 for labels, while our data uses 0 and 1
# add 1 to convert
# NOTE(review): this loop mutates df_train/df_dev in place, so running the cell a
# second time shifts the dev labels again -- re-run the loading cell first.
for df in dfs:
    df["id"] = range(len(df))
    df["label"] += 1

# Wrap every row of each split in a Babble Candidate.
Cs = [df.apply(lambda x: Candidate(x), axis=1) for df in dfs]

# One label array per split, taken from the (already shifted) label columns.
# NOTE(review): `.values` may return a view of the column, in which case the in-place
# subtraction below also changes df_train["label"] -- confirm for your pandas version.
Ys = [df.label.values for df in dfs]
Ys[0] -= 1 # no label (training set) should be set to -1

Define the labels for this task.

In [None]:
# Label values used in explanations: 0 = no label, 1 = electronics, 2 = guns.
ABSTAIN, ELECTRONICS, GUNS = 0, 1, 2

# Babble Tutorial
## News forum classification

### You will work with a subset of the 20 Newsgroups dataset.
The texts shown are from one of two forums:
 1. Computer Electronics (Label 1)
 2. Gun Politics Forum (Label 2)
Your job is to create a training data set to classify texts as belonging to one of these two forums.

__You will do this by writing natural language explanations of why you would label an example a certain way (1 (ELECTRONICS), 2 (GUNS), or 0 (ABSTAIN or no label)).__
These explanations will be parsed into functions which will be aggregated by Snorkel to create training data from unlabeled examples.

You can evaluate your progress based on the coverage and f1 score of your label model, or by training a logistic regression classifier on the data and evaluating the test result.

In [None]:
# Start the timer!
# Baseline row: no labeling functions yet, all metrics zero, stamped with the start time.
baseline_row = {
    "time": datetime.now(), 
    "num_lfs": 0,
    "f1": 0.0,
    "precision": 0.0,
    "recall": 0.0,
    "training_label_coverage": 0.0,
    "training_label_size": 0.0
}
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
stat_history = pd.concat([stat_history, pd.DataFrame([baseline_row])], ignore_index=True)

Load the data into a *BabbleStream*: an object that iteratively displays candidates, collects and parses explanations.

In [None]:
# Cs and Ys each hold one entry per split (train first, then dev).
# NOTE(review): balanced/shuffled/seed are passed straight through to BabbleStream;
# presumably they control which candidates are shown and in what order -- confirm in babble.
babbler = BabbleStream(Cs, Ys, balanced=True, shuffled=True, seed=456)

Here, you can define aliases (a concise way to refer to a set of terms). 
In a little bit you'll see an example of how to use aliases.

In [None]:
# An alias gives a short name to a set of words so a rule can refer to all of them at once.
unit_terms = ["joules", "volts", "ohms", "MHz"]
aliases = {"unit": unit_terms}
babbler.add_aliases(aliases)

In [None]:
def prettyprint(candidate):
    """Print a candidate's mention ID, then a blank line, then its text."""
    header_lines = ["MENTION ID {}".format(candidate.mention_id), ""]
    # Joining with a newline yields the ID line followed by one empty line.
    print("\n".join(header_lines))
    print(candidate.text)

Let's look at an example candidate!

In [None]:
# Rerun this cell to get a new example
# Pull the next candidate from the stream and print its mention ID and text.
candidate = babbler.next()
prettyprint(candidate)

__Next, we'll learn how to write a labeling function from a natural language explanation of why you chose a label for a given candidate.__

## Create Explanations

Creating explanations generally happens in five steps:
1. View candidates
2. Write explanations
3. Get feedback
4. Update explanations 
5. Apply label aggregator

Steps 3-5 are optional; explanations may be submitted without any feedback on their quality. However, in our experience, observing how well explanations are being parsed and what their accuracy/coverage on a dev set are (if available) can quickly lead to simple improvements that yield significantly more useful labeling functions. 

Once a few labeling functions have been collected, you can use the label aggregator to identify candidates that are being mislabeled and write additional explanations targeting those failure modes.

Feel free to consult the internet or ask your experiment leader.

*For the real task, you will be asked to write labeling functions as quickly and accurately as possible. You will still be allowed to use the internet in this phase, but you may not ask your experiment leader for help. You may refer to this tutorial as needed.*

### Collection

Use `babbler` to show candidates

In [None]:
# Pull the next candidate from the stream and print its mention ID and text.
# Rerun this cell whenever you want to look at another example.
candidate = babbler.next()
prettyprint(candidate)

Is it about guns or electronics? What makes you think that? (If you don't know, it's okay to make your best guess or skip an example.)

Run the example explanations given below, then parse and analyze them.
Then, you can try editing them and writing your own functions!

In [None]:
# Example explanation: label a text ELECTRONICS when any word starts with "electr".
e0 = Explanation(
    # name of this rule, for your reference
    name='electr...', 
    
    # label to assign
    label=ELECTRONICS, 
    
    # natural language description of why you label the candidate this way
    condition='A word in the sentence starts with "electr"', 
    
    # candidate is an optional argument; it should be the mention ID of an example labeled by this rule.
    # This is a fail-safe: if the rule doesn't apply to the candidate you provide, the rule will be filtered out!
    candidate = 5
)

In [None]:
# Example explanation: label a text GUNS when it contains any of the listed politics words.
e1 = Explanation(
    name = 'politics', 
    label = GUNS, 
    condition = 'Any of the words "election", "senator", "democrat", "candidate", or "republican" are in the text', 
    candidate = 33 # the candidate's mention ID, optional argument
)

In [None]:
# Example explanation: label a text GUNS when "self" occurs before "defense".
# No candidate is given here, since that argument is optional.
e2 = Explanation(
    name = 'selfdefense', 
    label = GUNS, 
    condition = 'because the word "self" occurs before "defense"'
)

Below is an example of an explanation that uses an alias: "unit".

You can define more aliases in the `aliases` cell near the top of the notebook, just after the BabbleStream is initialized.

In [None]:
# Example explanation that relies on the "unit" alias registered with babbler.add_aliases.
e3 = Explanation(
    name = "units", 
    label = ELECTRONICS, 
    condition = 'A word in the sentence is a unit' 
)

In [None]:
# Placeholder explanation with an empty condition.
# NOTE(review): e4 is not in the `explanations` list applied below; presumably it is a
# template to fill in (name, label, condition) and then add to that list -- confirm.
e4 = Explanation(
    name = "e4", 
    label = ABSTAIN, 
    condition = ""
)

Babble will parse your explanations into functions, then filter out functions that are duplicates, incorrectly label their given candidate, or assign the same label to all examples.

In [None]:
# Add any explanations that you haven't committed yet
explanations = [e0, e1, e2, e3]

# Parse the explanations; babbler.apply returns the parses and the ones it filtered out.
parses, filtered = babbler.apply(explanations)

# Log how many explanations were submitted, parsed and filtered at this point in time.
apply_row = {
    "time": datetime.now(), 
    "num_lfs": len(parses),
    "num_explanations": len(explanations),
    "num_filtered": len(filtered)
}
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
stat_history = pd.concat([stat_history, pd.DataFrame([apply_row])], ignore_index=True)

### Analysis
See how your parsed explanations performed

In [None]:
try: 
    # Per-parse analysis of the current parses; shown first, then tagged and logged.
    dev_analysis = babbler.analyze(parses)
    display(dev_analysis)
    dev_analysis['time'] = datetime.now()
    dev_analysis['eval'] = "dev"
    # Keep the analysis index as a column, because ignore_index below discards it.
    dev_analysis["lf_id"] = dev_analysis.index
    # pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    stat_history = pd.concat([stat_history, dev_analysis], sort=False, ignore_index=True)
except ValueError as e:
    print("It seems as though none of your labeling functions were parsed. See the cells above and below for more information.")
    print("ERROR:")
    print(e)

See which explanations were filtered and why

In [None]:
# Report on the explanations/parses that babbler.apply filtered out.
babbler.filtered_analysis(filtered)

In [None]:
# Commit the parses from the last babbler.apply call.
babbler.commit()

# The evaluation and training cells below index `Ls` and `lf_names`, but nothing else in
# the notebook defines them, so a fresh Restart & Run All would stop with a NameError.
# Ls: one label matrix per split (0 = train, 1 = dev), matching the order of Cs and Ys.
Ls = [babbler.get_label_matrix(split) for split in (0, 1)]
# lf_names: names of the committed labeling functions.
# NOTE(review): assumed to line up with the columns of the label matrices -- confirm.
lf_names = [lf.__name__ for lf in babbler.get_lfs()]

### Evaluation
Get feedback on the performance of your explanations

In [None]:
# Hyperparameter search space handed to the tuner.
search_space = {
    'n_epochs': [50, 100, 500],
    'lr': {'range': [0.01, 0.001], 'scale': 'log'},
    'show_plots': False,
}

tuner = RandomSearchTuner(LabelModel, seed=123)

# Train on the training label matrix and select the best configuration by dev-set F1.
label_aggregator = tuner.search(
    search_space, 
    train_args=[Ls[0]], 
    X_dev=Ls[1], Y_dev=Ys[1], 
    max_search=20, verbose=False, metric='f1')

# record statistics over time
pr, re, f1, acc = label_aggregator.score(Ls[1], Ys[1], metric=['precision', 'recall', 'f1', 'accuracy'])
# Compute the coverage once and reuse it for both the coverage and size entries.
train_coverage = label_coverage(Ls[0])
stats = {
    "precision": pr,
    "recall": re,
    "f1": f1,
    "accuracy": acc,
    "eval": "dev",
    "model": "label_aggregator",
    "time": datetime.now(),
    "training_label_coverage": train_coverage,
    "training_label_size": train_coverage*len(dfs[0])
}
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
stat_history = pd.concat([stat_history, pd.DataFrame([stats])], ignore_index=True)

In [None]:
# view some incorrectly labeled examples for a given LF
# j selects the column of the dev label matrix, i.e. which labeling function to inspect.
j = 0
print(lf_names[j])
# set j to match the value of the LF you're interested in
L_dev = Ls[1].todense()
# abs(label - 3) maps 1 -> 2 and 2 -> 1, so this keeps the dev rows where LF j
# voted for the opposite class (rows where it output 0 are not shown).
display(df_dev[L_dev[:,j].A1==abs(df_dev["label"]-3)])

## Train Model
We can train a simple bag of words model on these labels, and see test accuracy.

(This step may take a while).

In [None]:
# Dense training label matrix and the label model's probabilistic labels for it.
L_train = Ls[0].todense()
probs_train = label_aggregator.predict_proba(L=L_train)

# Keep only the rows that have at least one non-zero entry in the label matrix.
mask = (L_train != 0).any(axis=1).A1
df_train_filtered, probs_train_filtered = df_train.iloc[mask], probs_train[mask]

summary_template = "{} out of {} examples used for training data"
print(summary_template.format(len(df_train_filtered), len(df_train)))

In [None]:
from analyzer import train_model_from_probs
# Train on the filtered training rows and their probabilistic labels; the validation
# and test frames are passed along for evaluation.
stats = train_model_from_probs(df_train_filtered, probs_train_filtered, df_valid, df_test)
stats["time"] = datetime.now()
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# append accepted a dict, Series or DataFrame, so wrap anything that is not already a frame.
stats_rows = stats if isinstance(stats, pd.DataFrame) else pd.DataFrame([stats])
stat_history = pd.concat([stat_history, stats_rows], ignore_index=True)

## Save
When your time is up, please save your explanations and model!

In [None]:
# save statistics history 
stat_history.to_csv("babble_tutorial_statistics_history.csv")

# save explanations
FILE = "babble_tutorial_explanations.tsv"
from types import SimpleNamespace
exp_io = ExplanationIO()
# Explanations created without a candidate get a stand-in object whose mention_id is None.
# NOTE(review): presumably ExplanationIO.write reads exp.candidate.mention_id -- confirm.
for exp in explanations:
    if exp.candidate is None:
        exp.candidate = SimpleNamespace(mention_id = None)
exp_io.write(explanations, FILE)
# Read the file straight back, replacing the in-memory list with what was saved.
explanations = exp_io.read(FILE)

# save label model
label_aggregator.save("babble_tutorial_lfmodel.pkl")

In [None]:
# Display the full statistics log collected during this session.
stat_history