In [None]:
# Let's see if you need to unzip the data again.
# Lists the contents of ../data; if the data is not there, run the unzip cell below.
!ls ../data

In [None]:
# Unzip the data if needed
# Replace PASSWORD with the password to unzip
# (-P supplies the archive password on the command line; -d ../ extracts into the parent directory)

!unzip -P PASSWORD ../data.zip -d ../

In [None]:
import nltk
import pandas as pd
import pickle
import re
import sys
sys.path.append("..")

from analyzer import load_dataset, update_stats, train_model, save_model
from datetime import datetime
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import LabelingFunction
from snorkel.labeling import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier

# Show the full review text in DataFrame output instead of truncating long cells.
# A negative width was deprecated in pandas 1.0 in favour of None and is rejected
# by later releases; older releases only accept an integer, so fall back to the
# legacy -1 when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Download the NLTK resources used in this notebook (punkt tokenizer, VADER lexicon).
nltk.download("punkt")
nltk.download('vader_lexicon')

In [None]:
# Load the four splits of the Amazon dataset and report the size of each one.
df_train, df_dev, df_valid, df_test = load_dataset("Amazon")
split_reports = [
    ("{} training examples", df_train),
    ("{} development examples", df_dev),
    ("{} validation examples", df_valid),
    ("{} test examples", df_test),
]
for template, split_df in split_reports:
    print(template.format(len(split_df)))

Define the labels for this task:

In [None]:
# Label values a labeling function may return:
# ABSTAIN means "no vote"; NEGATIVE / POSITIVE are the two sentiment classes.
ABSTAIN, NEGATIVE, POSITIVE = -1, 0, 1

# Amazon Customer Reviews Task

### You will work with Amazon Customer Reviews, writing labeling functions that classify them as positive (1) or negative (0) sentiment. 

The reviews are from [Amazon](https://s3.amazonaws.com/amazon-reviews-pds/readme.html).
All reviews were submitted with either 1 star (negative) or 5 star (positive) ratings. 

Your task is to __create labeling functions__ that take the text of a review as input and output a NEGATIVE, POSITIVE, or ABSTAIN label. Try to write them as quickly and accurately as possible.

You can evaluate your progress based on the coverage and f1 score of your label model, or by training a logistic regression classifier on the data and evaluating the test result.

You may consult the internet at any time.

Let's look at some examples:

In [None]:
# Show five random development-set reviews for each class. Every sample is
# also passed to update_stats under the "show_examples" event.
print("NEGATIVE")
is_negative = df_dev["label"] == NEGATIVE
negative_sample = df_dev[is_negative].sample(n=5)
update_stats({"examples": negative_sample, "class": NEGATIVE}, "show_examples")
display(negative_sample)

print("POSITIVE")
is_positive = df_dev["label"] == POSITIVE
positive_sample = df_dev[is_positive].sample(n=5)
update_stats({"examples": positive_sample, "class": POSITIVE}, "show_examples")
display(positive_sample)

## Writing Labeling Functions
Time to write some labeling functions!

In [None]:
# Add more imports or helper functions here


In [None]:
def lf0(x):
    """Example labeling function: vote POSITIVE when the review contains 'good'.

    The check is a case-sensitive substring match on ``x.text``; any review
    that does not contain the substring gets ABSTAIN.
    """
    return POSITIVE if 'good' in x.text else ABSTAIN

In [None]:
def lf1(x):
    """Placeholder labeling function: abstains on every example until edited."""
    return ABSTAIN

In [None]:
def lf2(x):
    """Placeholder labeling function: abstains on every example until edited."""
    return ABSTAIN

## (Optional) Test your function

In [None]:
from types import SimpleNamespace

def test_func(lf, example_text):
    """Run a labeling function on a single piece of text and return its label.

    The text is wrapped in an object exposing a ``text`` attribute, which is
    the only field the example labeling function reads. The call is passed to
    update_stats under the "test_function" event before ``lf`` is invoked.
    """
    update_stats({"function": lf.__name__, "text": example_text}, "test_function")
    return lf(SimpleNamespace(text=example_text))


In [None]:
# Replace the text (and lf0) to try any labeling function; the returned label is the cell's output.
test_func(lf0, "your text here")

## Applying Functions
We obtain training labels by training a model that combines the outputs of the noisy labeling functions.


`L_train` and `L_dev` are matrices representing the label returned by each labeling function for each example in the training and development sets.

In [None]:
# Make sure all your functions are in this list!
my_lfs = [lf0]

# Wrap each plain function in a Snorkel LabelingFunction, named after its
# position in my_lfs.
lfs = [
    LabelingFunction(name="lf{}__".format(position), f=func)
    for position, func in enumerate(my_lfs)
]

# Apply the LFs to the unlabeled training data, and the development data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
L_dev = applier.apply(df_dev)

update_stats({"num_lfs": len(lfs), "lfs": lfs}, "submit_lfs", applier=applier)


Train the snorkel model to combine these noisy labels.

In [None]:
# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
train_probabilities = label_model.predict_proba(L=L_train)
# NOTE(review): only column 0 is kept -- presumably the probability of class 0
# (NEGATIVE). Confirm this is the column meant to be shown as "pred_label".
df_train["pred_label"] = train_probabilities[:, 0]
probs_train = df_train["pred_label"]

# record intermediate results
# Don't worry about this code block, we just store some metrics to keep track of your progress.
Y_dev = df_dev.label.values
stats = label_model.score(L=L_dev, Y=Y_dev, metrics=["f1", "precision", "recall"])
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)
n_labeled = len(probs_train_filtered)
stats.update({
    "training_label_coverage": n_labeled / len(probs_train),
    "training_label_size": n_labeled,
    "num_lfs": len(lfs),
    "data": "dev",
})

update_stats(stats, "stats", label_model=label_model)

In [None]:
# Let's see some examples of aggregated (probabilistic) labels!
# Re-run this cell for new examples.

sample = df_train.sample(n=5)
update_stats(
    {"examples": sample, "label": "label_model_predictions"},
    "show_examples",
)
display(sample)

## View Unlabeled Examples
You can use these to brainstorm new labeling functions. You may try filtering or sorting them in other ways.

If you get a `ValueError: a must be greater than 0 unless no samples are taken`, this means all your training examples are labeled by at least one LF.

In [None]:
# You can filter for unlabeled data
try:
    # Rows of df_train that are absent from the filtered (labeled) frame
    is_unlabeled = ~df_train.index.isin(df_train_filtered.index)
    df_unlabeled = df_train[is_unlabeled]
    sample = df_unlabeled.sample(n=5)
    update_stats({"examples": sample, "label": "unlabeled"}, "show_examples")
    display(sample)
except ValueError as err:
    # Nothing could be sampled: report the error, then show the examples
    # that received the fewest non-abstain (-1) votes instead.
    print("ValueError: ")
    print(err)
    label_sums = (L_train != -1).sum(axis=1)
    lowest_coverage = min(label_sums)
    print("\nExamples with lowest coverage: ({})".format(lowest_coverage))
    display(df_train[label_sums == lowest_coverage].head())

## Analyze Results
Evaluate the accuracy of the estimated training labels and development set labels (based on ground truth).

`Polarity` describes the set of outputs of each function, not including `ABSTAIN (-1)`.
For example, a function that returns `ABSTAIN` or `POSITIVE` has polarity `[1]`.

In [None]:
# Summarize each labeling function on the training set (no ground truth is passed here).
train_lf_analyzer = LFAnalysis(L=L_train, lfs=lfs)
train_analysis = train_lf_analyzer.lf_summary()
display("Training set results:", train_analysis)

# Tag the summary with its split and keep the LF name (the index) as a column.
train_analysis = train_analysis.assign(data="train", lf_id=train_analysis.index)

update_stats(
    {"num_lfs": len(lfs), "lf_ids": train_analysis["j"].tolist(), "data": "train"},
    "lf_analysis_train",
)

In [None]:
# Summarize each labeling function on the development set, passing the true labels as Y.
Y_dev = df_dev["label"].values
dev_lf_analyzer = LFAnalysis(L=L_dev, lfs=lfs)
dev_analysis = dev_lf_analyzer.lf_summary(Y=Y_dev)
display("Dev set results:", dev_analysis)

# Tag the summary with its split and keep the LF name (the index) as a column.
dev_analysis = dev_analysis.assign(data="dev", lf_id=dev_analysis.index)

update_stats(
    {"num_lfs": len(lfs), "lf_ids": dev_analysis["j"].tolist(), "data": "dev"},
    "lf_analysis_dev",
)

### View Incorrectly Labeled Examples

__FOR ONE (1) GIVEN LABELING FUNCTION__

In [None]:
# view some incorrectly labeled examples for a given LF
j = 0
# set j to match the value of the LF you're interested in

# Rows where LF j voted for the opposite of the true label: abs(label - 1)
# maps 0 <-> 1, so an ABSTAIN (-1) vote never matches.
incorrect = df_dev[L_dev[:, j] == abs(df_dev["label"] - 1)]
display(incorrect)

# Log the rows actually shown here rather than a `sample` left over from an
# earlier cell (which may be unrelated, or undefined on a fresh kernel).
update_stats({"examples": incorrect, "label": "incorrect", "lf_id": j}, "show_examples")

## Train Model
We can train a simple bag of words model on these labels to see the test accuracy.

(This step may take a while).

In [None]:
# Call analyzer.train_model with the label model and the training label matrix,
# then show what it returns. Note that this rebinds `stats` from the earlier cell.
stats = train_model(label_model, L_train)
display(stats)

## Save the Model
Run the cells below when you have finished writing your labeling functions.

In [None]:
# Enter your name (for file naming)
# Must be non-empty: the final cell asserts this before calling save_model.
YOUR_NAME = ""

In [None]:
!mkdir snorkel_amazon

In [None]:
# Save the trained label model, the IPython session history, and a copy of this notebook
# into the snorkel_amazon directory.
label_model.save("snorkel_amazon/lfmodel.pkl")
%history -p -o -f snorkel_amazon/history.log
# NOTE(review): assumes this notebook is saved as snorkel_amazon_task.ipynb in the
# current working directory -- confirm before relying on the copy.
!cp snorkel_amazon_task.ipynb snorkel_amazon/notebook.ipynb

In [None]:
# Refuse to save until a name has been entered, and say why instead of
# failing with a bare AssertionError.
assert len(YOUR_NAME) != 0, "Set YOUR_NAME in the 'Enter your name' cell before saving."
save_model(YOUR_NAME, "Snorkel", "Amazon")