Load the necessary data and libraries:

In [None]:
# Unzip the data
# Replace PASSWORD with the password to unzip
# (-P passes the archive password on the command line; -d ../ extracts into the parent directory)

!unzip -P PASSWORD ../data.zip -d ../

In [None]:
import nltk
import pandas as pd
import pickle
import re
import sys
sys.path.append("..")

from analyzer import load_dataset, update_stats, train_model, save_model
from datetime import datetime
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import LabelingFunction
from snorkel.labeling import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier

# Show full cell contents when displaying DataFrames (no truncation of long texts).
# pandas >= 1.0 expects None for "no limit" and pandas >= 2.0 rejects -1 outright,
# while older releases only accept the legacy -1 sentinel, so try None first and
# fall back to -1 when the installed pandas refuses it.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Download the NLTK "punkt" and "vader_lexicon" resources, in that order.
for nltk_resource in ("punkt", 'vader_lexicon'):
    nltk.download(nltk_resource)

Load the data

In [None]:
# Load the four splits of the "News" dataset and report the size of each one.
df_train, df_dev, df_valid, df_test = load_dataset("News")
for size_template, split_frame in (
    ("{} training examples", df_train),
    ("{} development examples", df_dev),
    ("{} validation examples", df_valid),
    ("{} test examples", df_test),
):
    print(size_template.format(len(split_frame)))

# Snorkel Tutorial

## News forum classification

### You will work with a subset of the 20 Newsgroups dataset.
The texts shown are from one of two forums:

 1. Computer Electronics (Label 0)
 2. Gun Politics Forum (Label 1)

Your job is to create a training data set to classify texts as belonging to one of these two forums.

You will do this by writing labeling functions mapping text to 0 (ELECTRONICS), 1 (GUNS), or -1 (ABSTAIN or no label).
These functions will be aggregated by Snorkel to create training data from unlabeled examples.

You can evaluate your progress based on the coverage and F1 score of your label model, or by training a logistic regression classifier on the data and evaluating the test result.


Define the labels for this task:

In [None]:
# Integer codes that labeling functions return.
ABSTAIN = -1  # no label for this example
ELECTRONICS = 0  # Computer Electronics forum
GUNS = 1  # Gun Politics forum

# See Some Examples
Let's look at some examples from each class (ELECTRONICS and GUNS).

In [None]:
# rerun this cell to get a new sample
# For each class in turn: print its name, draw 5 random development examples
# carrying that ground-truth label, display them, and log what was shown.
for class_heading, class_label in (("ELECTRONICS", ELECTRONICS), ("GUNS", GUNS)):
    print(class_heading)
    sample = df_dev[df_dev.label == class_label].sample(5)
    display(sample)
    update_stats({"examples": sample, "class": class_label}, "show_examples")


# Write Labeling Functions

Feel free to consult the internet or ask your experiment leader.

*(For the real task, you will be given 30 minutes. You will still be allowed to use the internet in this phase, but not to ask your experiment leader.)*

Your function should take `x` (a single example whose text is available as `x.text`) as input and return `ELECTRONICS`, `GUNS`, or `ABSTAIN`.

In [None]:
def lf0(x):
    """Vote GUNS when the lowercased text mentions "firearm"; otherwise abstain."""
    if "firearm" in x.text.lower():
        return GUNS
    return ABSTAIN

If you want, you can write helper functions to reuse. For example:

In [None]:
# example helper function
def ratio_all_caps(text):
    """Return the fraction of non-whitespace characters in `text` that are uppercase.

    Whitespace is ignored entirely. A text with no non-whitespace characters
    yields 0.
    """
    squeezed = ''.join(text.split())  # drop every whitespace character
    total_chars = len(squeezed)
    if not total_chars:
        return 0
    upper_chars = 0
    for ch in squeezed:
        if ch.isupper():
            upper_chars += 1
    return upper_chars / total_chars

In [None]:
def lf1(x):
    """Vote GUNS when more than 80% of the non-whitespace characters are uppercase; otherwise abstain."""
    return GUNS if ratio_all_caps(x.text) > 0.8 else ABSTAIN

In [None]:
def lf2(x):
    """Vote ELECTRONICS when "power" occurs at least twice in the lowercased text, else GUNS.

    Note that this function never abstains: every example receives a vote.
    """
    occurrences = x.text.lower().count("power")
    return ELECTRONICS if occurrences >= 2 else GUNS

In [None]:
def lf3(x):
    """Vote ELECTRONICS when the lowercased text mentions "tesla"; otherwise abstain."""
    return ELECTRONICS if "tesla" in x.text.lower() else ABSTAIN

In [None]:
def lf4(x):
    """Placeholder labeling function: always abstains, whatever the input."""
    return ABSTAIN

### Test your function (optional)

In [None]:
from types import SimpleNamespace

def test_func(lf, example_text):
    """Run labeling function `lf` on a single piece of text and return its vote.

    The text is wrapped in a lightweight object exposing it as `.text`, which is
    the only attribute the labeling functions in this notebook read. The call is
    logged via `update_stats` before the labeling function runs.
    """
    update_stats({"function": lf.__name__, "text": example_text}, "test_function")
    return lf(SimpleNamespace(text=example_text))

In [None]:
# "powerful powerful" contains "power" twice, so lf2 should return ELECTRONICS (0).
test_func(lf2, "I've got a powerful powerful tesla coil")

# Apply Functions
This is how we obtain training labels, by training a model to combine the outputs of the noisy labeling functions.
`L_train` and `L_dev` are matrices representing the label returned by each labeling function for each example in the training and development sets.

You need to re-run this cell every time you change your labeling functions; otherwise the results below will not reflect your changes!

In [None]:
# Make sure all your functions are in this list!
my_lfs = [lf0, lf1, lf2, lf3, lf4]


# Wrap each plain function in a Snorkel LabelingFunction named "lf<i>__",
# where <i> is the function's position in my_lfs (not its Python name).
lfs = [LabelingFunction(name="lf{}__".format(i), f = lf ) for i, lf in enumerate(my_lfs)]

# Apply the LFs to the unlabeled training data, and the development data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
L_dev = applier.apply(df_dev)

# Log the submitted set of labeling functions (update_stats is a project helper from analyzer).
update_stats({"num_lfs": len(lfs), "lfs": lfs}, "submit_lfs", applier=applier)

Train the snorkel model to combine these noisy labels.

In [None]:
# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
# NOTE: despite the column name, "pred_label" stores a probability: column 0 of
# predict_proba. Presumably that is the probability of class 0 (ELECTRONICS) --
# TODO confirm against the Snorkel documentation.
df_train["pred_label"] = label_model.predict_proba(L=L_train)[:,0]
df_dev["pred_label"] = label_model.predict_proba(L=L_dev)[:,0]
probs_train = df_train["pred_label"]

# record intermediate results
# Don't worry about this code block, we just store some metrics to keep track of your progress.
Y_dev = df_dev.label.values
stats = label_model.score(L=L_dev, Y=Y_dev, metrics=["f1", "precision", "recall"])
# df_train_filtered is reused by the later "unlabeled examples" cell.
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
# Coverage = share of training examples that remain after filtering.
stats["training_label_coverage"] = len(probs_train_filtered)/len(probs_train)
stats["training_label_size"] = len(probs_train_filtered)
stats["num_lfs"] = len(lfs)
stats["data"] = "dev"

display(stats)
update_stats(stats, "stats", label_model=label_model)

The stats from the output above tell you how good your training data is, but you will probably have more questions as you work out how to improve it. Below are some tools at your disposal.

# More Analysis Tools

### View Probabilistic Label Output

These are the labels created by your aggregated label model, which will be used as training data.

In [None]:
# let's see some examples of aggregated (probabilistic) labels!
# re-run this cell for new examples
# NOTE: the "pred_label" column holds column 0 of the label model's predict_proba
# output (set in the training cell), i.e. a probability rather than a hard label.

sample = df_train.sample(5)
update_stats({"examples": sample, "label": "label_model_predictions"}, "show_examples")
display(sample)

### See Performance of Each LF

Evaluate the accuracy of the estimated training labels and development set labels (based on ground truth).

`Polarity` describes the set of outputs of each function, not including `ABSTAIN (-1)`.
For example, a function that returns `ABSTAIN` or `GUNS` has polarity `[1]`

In [None]:
# Summarize each labeling function on the training set (no ground truth is passed here).
train_analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
display("Training set results:", train_analysis)
# These two columns are added after the table has been displayed, so they do not appear in the output above.
train_analysis['data'] = "train"
train_analysis["lf_id"] = train_analysis.index

# Log which LFs were analyzed; the summary's "j" column is sent as the list of LF ids.
update_stats({"num_lfs": len(lfs), "lf_ids": train_analysis.j.tolist(), "data": "train"}, "lf_analysis_train")

In [None]:
# Summarize each labeling function on the development set, passing the ground-truth labels.
Y_dev = df_dev.label.values
dev_analysis = LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)
display("Dev set results:", dev_analysis)
# These two columns are added after the table has been displayed, so they do not appear in the output above.
dev_analysis['data'] = "dev"
dev_analysis["lf_id"] = dev_analysis.index
# Log which LFs were analyzed; the summary's "j" column is sent as the list of LF ids.
update_stats({"num_lfs": len(lfs), "lf_ids": dev_analysis.j.tolist(), "data": "dev"}, "lf_analysis_dev")

### View Unlabeled/Least Labeled Examples
You can use these to brainstorm new labeling functions. You may try filtering or sorting them in other ways.

If you see "All examples are labeled", this means all your training examples are labeled by at least one LF.

In [None]:
# You can filter for unlabeled data
# Unlabeled = training rows that did not survive filter_unlabeled_dataframe.
df_unlabeled = df_train[~df_train.index.isin(df_train_filtered.index)]
if len(df_unlabeled) > 0:
    # Cap the sample size: with fewer than 5 unlabeled examples, sample(5) would
    # raise, and those examples would wrongly be reported as "all labeled".
    sample = df_unlabeled.sample(min(5, len(df_unlabeled)))
else:
    print("All examples are labeled. Showing Lowest Coverage examples.")
    # Number of non-abstaining labeling functions per training example.
    label_sums = (L_train != -1).sum(axis=1)
    lowest_coverage = label_sums.min()
    print("\nExamples with lowest coverage: ({})".format(lowest_coverage))
    sample = df_train[label_sums == lowest_coverage].head()
update_stats({"examples": sample, "label": "unlabeled"}, "show_examples")
display(sample[["text", "pred_label"]])

### View Incorrectly Labeled Examples

__FOR ONE (1) GIVEN LABELING FUNCTION__ see some examples where it underperforms.

In [None]:
# view some incorrectly labeled examples for a given LF
# set j to match the value of the LF you're interested in
j = 2
lf_votes = L_dev[:, j]
df_dev["label_from_this_LF"] = lf_votes
# Flipping the ground-truth label (0 <-> 1) gives the "wrong" class; rows where the
# LF voted for that class are its mistakes. Abstains (-1) never match, so they are excluded.
flipped_truth = abs(df_dev["label"] - 1)
sample = df_dev[lf_votes == flipped_truth]
display(sample)
update_stats({"examples": sample, "label": "incorrect", "lf_id": j}, "show_examples")

# Train Model

The size and accuracy of the training data aren't really enough to know how "good" the training data is.

For this, we can train a simple bag of words model on these labels, and see test accuracy.

(This step may take a while).

In [None]:
# train_model is a project helper from the analyzer module; its internals are not
# visible in this notebook. It receives the fitted label model and the training
# label matrix, and whatever it returns is displayed below.
# NOTE: this rebinds `stats`, replacing the label-model metrics computed earlier.
stats = train_model(label_model, L_train)
display(stats)





## FINISHED?

### It's time to save.

When your time is up, please save your explanations and model!

In [None]:
# Enter your name (for file naming)
# Must be non-empty: the final save cell asserts on it before saving.
YOUR_NAME = ""

In [None]:
!mkdir snorkel_tutorial

In [None]:
# Persist the fitted label model into the output directory.
label_model.save("snorkel_tutorial/lfmodel.pkl")
# Write the session's input history to a log file (-p: with prompts, -o: with outputs, -f: to file).
%history -p -o -f snorkel_tutorial/history.log
# Copy this notebook next to the saved model; assumes it is saved as snorkel_tutorial.ipynb in the current directory.
!cp snorkel_tutorial.ipynb snorkel_tutorial/notebook.ipynb

In [None]:
# Refuse to save under an empty or whitespace-only name, and say why.
assert YOUR_NAME.strip(), "Please set YOUR_NAME (earlier in this notebook) before saving."
save_model(YOUR_NAME, "Snorkel", "News")

...And you're done with the tutorial! 

## THANK YOU :]