In [None]:
# Unzip the data
# Replace PASSWORD with the password to unzip
# (-P passes the archive password; -d ../ extracts into the parent directory)

!unzip -P PASSWORD ../data.zip -d ../

Import necessary libraries:

In [None]:
import nltk
import pandas as pd
import pickle
import re
import sys
sys.path.append("..")

from analyzer import load_dataset, update_stats, train_model, save_model
from datetime import datetime
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import LabelingFunction
from snorkel.labeling import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier

# Show the full text of each cell in displayed DataFrames instead of truncating it.
# `None` is the "no limit" value on pandas >= 1.0, where -1 is deprecated and later
# rejected. Older pandas only accepts an integer here, so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Download the NLTK resources used by this tutorial.
# NOTE(review): "punkt" is not referenced directly in this notebook; presumably the
# `analyzer` helpers need it for tokenization - confirm.
nltk.download("punkt")
# Lexicon for the VADER SentimentIntensityAnalyzer used in the example helper function below.
nltk.download('vader_lexicon')

In [None]:
# Load the four splits of the "News" task and report how many examples each one holds.
df_train, df_dev, df_valid, df_test = load_dataset("News")
print(
    "{} training examples".format(len(df_train)),
    "{} development examples".format(len(df_dev)),
    "{} validation examples".format(len(df_valid)),
    "{} test examples".format(len(df_test)),
    sep="\n",
)

# Snorkel Tutorial

## News forum classification

### You will work with a subset of the 20 Newsgroups dataset.
The texts shown are from one of two forums:

 1. Computer Electronics (Label 0)
 2. Gun Politics Forum (Label 1)

Your job is to create a training data set to classify texts as belonging to one of these two forums.

You will do this by writing labeling functions mapping text to 0 (ELECTRONICS), 1 (GUNS), or -1 (ABSTAIN or no label).
These functions will be aggregated by Snorkel to create training data from unlabeled examples.

You can evaluate your progress based on the coverage and F1 score of your label model, or by training a logistic regression classifier on the data and evaluating the test result.


Define the labels for this task:

In [None]:
# Class labels; these are the values compared against the dataset's `label` column.
ELECTRONICS = 0
GUNS = 1
# Returned by a labeling function that does not want to vote on an example.
ABSTAIN = -1

# Ready to Roll
Let's look at some examples from each of the two classes (ELECTRONICS and GUNS).

In [None]:
# rerun this cell to get a new sample
# For each class: print its name, draw five random development examples with that
# label, display them, and pass them to update_stats as a "show_examples" event.
for class_name, class_label in (("ELECTRONICS", ELECTRONICS), ("GUNS", GUNS)):
    print(class_name)
    sample = df_dev[df_dev.label == class_label].sample(5)
    display(sample)
    update_stats({"examples": sample, "class": class_label}, "show_examples")


## Writing Labeling Functions

__Your task for this tutorial is to write 5 labeling functions.__

Feel free to consult the internet or ask your experiment leader.

*(For the real task, you will be given 30 minutes. You will still be allowed to use the internet in this phase, but not ask your experiment leader.)*

Your function should take `x` (a single example, whose text is available as `x.text`) as input and return ELECTRONICS, GUNS, or ABSTAIN.

In [None]:
def lf0(x):
    """Vote GUNS when the example text contains "firearm" (case-insensitive); otherwise abstain."""
    if "firearm" in x.text.lower():
        return GUNS
    return ABSTAIN

Your turn! Try writing a function or editing the one above.

If you want, you can write helper functions to reuse. For example:

In [None]:
# example helper function
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One analyzer instance, created once and shared by every call to sentiment().
sia = SentimentIntensityAnalyzer()


def sentiment(text):
    """Return the "compound" entry of the polarity scores that `sia` computes for `text`."""
    scores = sia.polarity_scores(text)
    return scores["compound"]

In [None]:
def lf1(x):
    """Vote ELECTRONICS when the example text contains "tesla" (case-insensitive); otherwise abstain."""
    if "tesla" in x.text.lower():
        return ELECTRONICS
    return ABSTAIN

In [None]:
def lf2(x):
    """Vote ELECTRONICS when "power" occurs more than twice in the lowercased text; otherwise vote GUNS.

    NOTE(review): this function never returns ABSTAIN, so it labels every example -
    confirm that this is the intended demonstration.
    """
    matches = re.findall("power", x.text.lower())
    if len(matches) > 2:
        return ELECTRONICS
    return GUNS

In [None]:
def lf3(x):
    """Placeholder labeling function: abstains on every example until you replace its body."""
    return ABSTAIN

In [None]:
def lf4(x):
    """Placeholder labeling function: abstains on every example until you replace its body."""
    return ABSTAIN

Test your function (optional)

In [None]:
from types import SimpleNamespace

def test_func(lf, example_text):
    """Run labeling function `lf` on a single piece of text and return its vote.

    The call is first reported to update_stats as a "test_function" event (function
    name plus the text). The text is then wrapped in an object exposing it as
    `.text`, which is the attribute the labeling functions in this notebook read.
    """
    update_stats({"function": lf.__name__, "text": example_text}, "test_function")
    return lf(SimpleNamespace(text=example_text))

In [None]:
# "power" occurs only twice in this text, so lf2's `count > 2` check fails and it returns GUNS (1).
test_func(lf2, "I've got a powerful powerful tesla coil")

## Applying Functions
We obtain training labels by training a model that combines the outputs of the noisy labeling functions.
`L_train` and `L_dev` are matrices representing the label returned by each labeling function for each example in the training and development sets.

In [None]:
# Make sure all your functions are in this list!
my_lfs = [lf0]

# Wrap every function in a LabelingFunction. The names come from the position in
# my_lfs ("lf0__", "lf1__", ...), not from the Python function's own name.
lfs = [
    LabelingFunction(name="lf{}__".format(position), f=func)
    for position, func in enumerate(my_lfs)
]

# Apply the LFs to the unlabeled training data, and the development data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
L_dev = applier.apply(df_dev)

# Report the submitted labeling functions (and the applier) to update_stats.
update_stats({"num_lfs": len(lfs), "lfs": lfs}, "submit_lfs", applier=applier)

Train the Snorkel label model to combine these noisy labels.

In [None]:
# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
# Keep only column 0 of the predicted probabilities and store it on df_train
# (this adds or overwrites the "pred_label" column in place).
# NOTE(review): column 0 is presumably P(ELECTRONICS), since ELECTRONICS = 0 - confirm.
df_train["pred_label"] = label_model.predict_proba(L=L_train)[:,0]
probs_train = df_train["pred_label"]




# record intermediate results
# Don't worry about this code block, we just store some metrics to keep track of your progress.
Y_dev = df_dev.label.values
# Score the label model against the ground-truth development labels.
stats = label_model.score(L=L_dev, Y=Y_dev, metrics=["f1", "precision", "recall"])
# Keep only the training rows that received a label; df_train_filtered is reused by
# the "View Unlabeled Examples" cell to find the rows that were left out.
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
            X=df_train, y=probs_train, L=L_train)
# Fraction and count of training examples that remain after filtering.
stats["training_label_coverage"] = len(probs_train_filtered)/len(probs_train)
stats["training_label_size"] = len(probs_train_filtered)
stats["num_lfs"] = len(lfs)
stats["data"] = "dev"

update_stats(stats, "stats", label_model=label_model)

In [None]:
# let's see some examples of aggregated (probabilistic) labels!
# re-run this cell for new examples
# (the "pred_label" column was added to df_train by the label-model training cell)

sample = df_train.sample(5)
update_stats({"examples": sample, "label": "label_model_predictions"}, "show_examples")
display(sample)

## View Unlabeled Examples
You can use these to brainstorm new labeling functions. You may try filtering or sorting them in other ways.

If you get a `ValueError: a must be greater than 0 unless no samples are taken`, this means all your training examples are labeled by at least one LF.

In [None]:
# You can filter for unlabeled data
try:
    # Unlabeled rows are the rows of df_train that are missing from df_train_filtered.
    labeled_index = df_train_filtered.index
    is_unlabeled = ~df_train.index.isin(labeled_index)
    df_unlabeled = df_train[is_unlabeled]
    sample = df_unlabeled.sample(5)
    update_stats({"examples": sample, "label": "unlabeled"}, "show_examples")
    display(sample)
except ValueError as e:
    # Report the error, then fall back to showing the least-covered training examples.
    print("ValueError: ")
    print(e)
    # Per example: how many labeling functions returned something other than -1.
    label_sums = (L_train != -1).sum(axis=1)
    lowest_coverage = min(label_sums)
    print("\nExamples with lowest coverage: ({})".format(lowest_coverage))
    display(df_train[label_sums == lowest_coverage].head())

## Analyze Results
Evaluate the accuracy of the estimated training labels and development set labels (based on ground truth).

`Polarity` describes the set of outputs of each function, not including `ABSTAIN (-1)`.
For example, a function that returns `ABSTAIN` or `GUNS` has polarity `[1]`

In [None]:
# Summarize the labeling functions on the training label matrix (no ground truth is passed here).
train_analysis = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
display("Training set results:", train_analysis)
# These two columns are added after the table has been displayed, so they do not appear above.
train_analysis['data'] = "train"
train_analysis["lf_id"] = train_analysis.index

# Report the number of LFs and the values of the summary's `j` column to update_stats.
update_stats({"num_lfs": len(lfs), "lf_ids": train_analysis.j.tolist(), "data": "train"}, "lf_analysis_train")

In [None]:
# Ground-truth labels of the development set, passed to lf_summary alongside the label matrix.
Y_dev = df_dev.label.values
dev_analysis = LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)
display("Dev set results:", dev_analysis)
# These two columns are added after the table has been displayed, so they do not appear above.
dev_analysis['data'] = "dev"
dev_analysis["lf_id"] = dev_analysis.index
# Report the number of LFs and the values of the summary's `j` column to update_stats.
update_stats({"num_lfs": len(lfs), "lf_ids": dev_analysis.j.tolist(), "data": "dev"}, "lf_analysis_dev")

### View Incorrectly Labeled Examples

__FOR ONE (1) GIVEN LABELING FUNCTION__

In [None]:
# view some incorrectly labeled examples for a given LF
j = 0
# set j to match the value of the LF you're interested in

# Labels are binary (0/1), so abs(label - 1) is the opposite class: a row is selected
# when LF j voted for the wrong class. Abstentions (-1) never match and are left out.
incorrect_examples = df_dev[L_dev[:,j]==abs(df_dev["label"]-1)]
display(incorrect_examples)
# Log exactly the rows displayed above, not a `sample` left over from an earlier cell.
update_stats({"examples": incorrect_examples, "label": "incorrect", "lf_id": j}, "show_examples")

## Train Model
We can train a simple bag of words model on these labels, and see test accuracy.

(This step may take a while).

In [None]:
# train_model comes from the project's `analyzer` module; its result is displayed below.
# Note: this rebinds `stats`, replacing the metrics dict built in the label-model cell.
stats = train_model(label_model, L_train)
display(stats)





## FINISHED?

### It's time to save.

When your time is up, please save your labeling functions and model!

In [None]:
# Enter your name (for file naming)
# It must not be empty: the final save cell asserts len(YOUR_NAME) != 0 before saving.
YOUR_NAME = ""

In [None]:
!mkdir snorkel_tutorial

In [None]:
# Save the trained label model into the output directory.
label_model.save("snorkel_tutorial/lfmodel.pkl")
# Write this session's input history to a log file (-p: with prompts, -o: with outputs).
%history -p -o -f snorkel_tutorial/history.log
# Copies the notebook file as it currently exists on disk - save the notebook before running this cell.
!cp snorkel_tutorial.ipynb snorkel_tutorial/notebook.ipynb

In [None]:
# Refuse to save under an empty name, and say why instead of raising a bare AssertionError.
assert len(YOUR_NAME) != 0, "Please set YOUR_NAME in the 'Enter your name' cell and re-run it before saving."
save_model(YOUR_NAME, "Snorkel", "News")

...And you're done with the tutorial! 

## THANK YOU :]