In [None]:
# Check whether the dataset has already been extracted by listing ../data.
# If the listing fails or is empty, run the unzip cell that follows.
!ls ../data

In [None]:
# Unzip the data if needed.
# Replace PASSWORD with the archive password before running this cell.
# -P passes the password on the command line; -d ../ extracts into the parent directory.
# NOTE(review): avoid saving or committing the notebook with the real password filled in.

!unzip -P PASSWORD ../data.zip -d ../

In [None]:
import nltk
import pandas as pd
import pickle
import re
import sys
sys.path.append("..")

from analyzer import load_dataset, update_stats, train_model, save_model
from datetime import datetime
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import LabelingFunction
from snorkel.labeling import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import PandasLFApplier

# Show the full text of each DataFrame cell instead of truncating long comments.
# `None` is the supported "no limit" value; -1 is deprecated since pandas 1.0
# and newer releases reject it. Fall back to -1 only for old pandas versions
# whose option validator does not accept None.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Download the NLTK data packages "punkt" and "vader_lexicon".
# NOTE(review): nothing in the visible cells uses them; presumably they are
# provided so labeling functions can use NLTK tokenization / sentiment — confirm.
nltk.download("punkt")
nltk.download('vader_lexicon')

In [None]:
# Load the four splits of the "Youtube" task and report the size of each one.
df_train, df_dev, df_valid, df_test = load_dataset("Youtube")
size_report = (
    ("{} training examples", df_train),
    ("{} development examples", df_dev),
    ("{} validation examples", df_valid),
    ("{} test examples", df_test),
)
for template, split in size_report:
    print(template.format(len(split)))

# Youtube Spam Classification Task

### For this task, you will work with comments from 5 different YouTube videos, and classify comments as either spam (1) or legitimate comments (0) by writing labeling functions.

Spam can be defined as irrelevant or unsolicited messages sent over the Internet.

The data is obtained [from Kaggle](https://www.kaggle.com/goneee/youtube-spam-classifiedcomments).

Define variable names for the labels in this task:

In [None]:
# Label vocabulary for this task: -1 means "no vote"; 0 and 1 are the two classes.
ABSTAIN, NOT_SPAM, SPAM = -1, 0, 1

# See Some Examples
Let's see some examples from each class

In [None]:
# For each class, draw five random development-set rows, display them,
# and log which rows were shown.
print("Some labeled examples: ")
for heading, class_label in (("NOT SPAM", NOT_SPAM), ("SPAM", SPAM)):
    print(heading)
    sample = df_dev[df_dev.label == class_label].sample(5)
    display(sample)
    update_stats({"examples": sample, "class": class_label}, "show_examples")

# Write Labeling Functions

Time to write some labeling functions! 

Your task is to __create labeling functions__ that take the text of a comment as input and output one of three labels: `SPAM`, `NOT_SPAM`, or `ABSTAIN`. Try to write them as quickly and accurately as possible.

You may consult the internet at any time.

In [None]:
def lf0(x):
    """Placeholder labeling function: abstains on every input until you replace the body.

    `x` is the data point to label; `test_func` passes an object exposing `.text`.
    Return SPAM, NOT_SPAM, or ABSTAIN.
    """
    return ABSTAIN

In [None]:
def lf1(x):
    """Placeholder labeling function: abstains on every input until you replace the body.

    `x` is the data point to label; `test_func` passes an object exposing `.text`.
    Return SPAM, NOT_SPAM, or ABSTAIN. Add this function to `my_lfs` for it to be applied.
    """
    return ABSTAIN

In [None]:
def lf2(x):
    """Placeholder labeling function: abstains on every input until you replace the body.

    `x` is the data point to label; `test_func` passes an object exposing `.text`.
    Return SPAM, NOT_SPAM, or ABSTAIN. Add this function to `my_lfs` for it to be applied.
    """
    return ABSTAIN

### (Optional) Test your function

In [None]:
from types import SimpleNamespace

def test_func(lf, example_text):
    x = SimpleNamespace(text=example_text)
    update_stats({"function": lf.__name__, "text": example_text}, "test_function")
    return lf(x)

In [None]:
# Try a labeling function on one hand-written example; the returned label is the cell output.
test_func(lf0, "your text here")

# Apply Functions
This is how we obtain training labels, by training a model to combine the outputs of the noisy labeling functions.
`L_train` and `L_dev` are matrices representing the label returned by each labeling function for each example in the training and development sets.

Re-run the cell below every time you change a labeling function; otherwise the results will not reflect your changes!

In [None]:
# Make sure all your functions are in this list!
my_lfs = [lf0]

# Wrap each function in a Snorkel LabelingFunction named after its position in
# `my_lfs` ("lf0__", "lf1__", ...), i.e. by list order, not by Python function name.
lf_names = ["lf{}__".format(position) for position in range(len(my_lfs))]
lfs = [LabelingFunction(name=lf_name, f=func) for lf_name, func in zip(lf_names, my_lfs)]

# Build the label matrices for the unlabeled training data and the development data.
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
L_dev = applier.apply(df_dev)

# Log which labeling functions were submitted.
update_stats({"num_lfs": len(lfs), "lfs": lfs}, "submit_lfs", applier=applier)

In [None]:
# Fit the label model on the training label matrix.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)

# Store column 0 of the predicted class probabilities on both frames.
# NOTE(review): column 0 should be the probability of class 0 (NOT_SPAM), even
# though the column is named "pred_label" — confirm this is the intended column.
for frame, label_matrix in ((df_train, L_train), (df_dev, L_dev)):
    frame["pred_label"] = label_model.predict_proba(L=label_matrix)[:, 0]

probs_train = df_train["pred_label"]

# Record intermediate results.
# Don't worry about this part, we just store some metrics to keep track of your progress.
Y_dev = df_dev.label.values
stats = label_model.score(L=L_dev, Y=Y_dev, metrics=["f1", "precision", "recall"])
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)
# Coverage = share of training rows kept by filter_unlabeled_dataframe.
stats.update(
    {
        "training_label_coverage": len(probs_train_filtered) / len(probs_train),
        "training_label_size": len(probs_train_filtered),
        "num_lfs": len(lfs),
        "data": "dev",
    }
)

display(stats)
update_stats(stats, "stats", label_model=label_model)

# More Analysis Tools

### View Probabilistic Label Output

These are the labels produced by the label model, which aggregates the outputs of your labeling functions. They will be used as training data.

In [None]:
# Draw five random training rows to inspect their aggregated (probabilistic) labels.
# Re-run this cell to get a different draw.

sample = df_train.sample(n=5)
update_stats(
    {"examples": sample, "label": "label_model_predictions"},
    "show_examples",
)
display(sample)

### See Performance of Each LF
Evaluate the accuracy of the estimated training labels and development set labels (based on ground truth).

`Polarity` describes the set of outputs of each function, not including `ABSTAIN (-1)`.
For example, a function that returns `ABSTAIN` or `SPAM` has polarity `[1]`

In [None]:
# Summarise each labeling function on the training matrix (no ground truth is passed).
train_summary = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
display("Training set results:", train_summary)

# Tag the summary with its split name and a copy of its index, then log the LF indices.
train_analysis = train_summary.assign(data="train", lf_id=train_summary.index)
update_stats(
    {"num_lfs": len(lfs), "lf_ids": train_analysis.j.tolist(), "data": "train"},
    "lf_analysis_train",
)

In [None]:
# Summarise each labeling function on the development matrix, scored against the dev labels.
Y_dev = df_dev.label.values
dev_summary = LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)
display("Dev set results:", dev_summary)

# Tag the summary with its split name and a copy of its index, then log the LF indices.
dev_analysis = dev_summary.assign(data="dev", lf_id=dev_summary.index)
update_stats(
    {"num_lfs": len(lfs), "lf_ids": dev_analysis.j.tolist(), "data": "dev"},
    "lf_analysis_dev",
)

### View Unlabeled/Least Labeled Examples
You can use these to brainstorm new labeling functions. You may try filtering or sorting them in other ways.

If you see "All Examples are Labeled", this means all your training examples are labeled by at least one LF.

In [None]:
# Show training rows that no labeling function voted on, i.e. rows that were
# dropped by filter_unlabeled_dataframe when the label model cell was run.
df_unlabeled = df_train[~df_train.index.isin(df_train_filtered.index)]

if len(df_unlabeled) > 0:
    # Cap the sample size at the number of rows available: asking for more rows
    # than exist raises ValueError, which previously hid the last few unlabeled rows.
    sample = df_unlabeled.sample(min(5, len(df_unlabeled)))
    update_stats({"examples": sample, "label": "unlabeled"}, "show_examples")
    display(sample)
else:
    # Every row has at least one vote, so show the rows with the fewest votes instead.
    print("All Examples are Labeled")
    label_sums = (L_train != ABSTAIN).sum(axis=1)
    lowest_coverage = label_sums.min()
    print("\nExamples with lowest coverage: ({})".format(lowest_coverage))
    display(df_train[label_sums == lowest_coverage].head())

### View Incorrectly Labeled Examples

__FOR ONE (1) GIVEN LABELING FUNCTION__

In [None]:
# View the development examples that one labeling function got wrong.
# Set j to the index of the LF you're interested in (the `j` column of the LF summary tables).
# The default is 0 so the cell works even when only one labeling function is registered.
j = 0
if not 0 <= j < L_dev.shape[1]:
    raise IndexError(
        "j={} is out of range: only {} labeling function(s) were applied".format(j, L_dev.shape[1])
    )

lf_votes = L_dev[:, j]
df_dev["label_from_this_LF"] = lf_votes
# An example counts as incorrect when the LF voted (did not abstain) and its
# vote differs from the ground-truth label.
incorrect_mask = (lf_votes != ABSTAIN) & (lf_votes != df_dev["label"].values)
sample = df_dev[incorrect_mask]
display(sample)
update_stats({"examples": sample, "label": "incorrect", "lf_id": j}, "show_examples")

## Train Model
We can train a simple bag of words model on these labels to see the test accuracy.

(This step may take a while).

In [None]:
# Train the downstream model from the label model and the training label matrix, then show its metrics.
# Note: this rebinds `stats`, replacing the dev-set metrics dictionary created by the label model cell.
stats = train_model(label_model, L_train)
display(stats)

## Save the Model
Run the cells below once you have finished writing your labeling functions.

In [None]:
# Enter your name (for file naming).
# It must be non-empty: the final cell checks this before calling save_model.
YOUR_NAME = ""

In [None]:
!mkdir snorkel_youtube

In [None]:
# Persist the fitted label model into the output directory.
label_model.save("snorkel_youtube/lfmodel.pkl")
# Write the session history to a log file: -p prints prompts, -o includes outputs, -f names the file.
%history -p -o -f snorkel_youtube/history.log
# Copy this notebook next to the saved model.
# NOTE(review): assumes the notebook is saved as snorkel_youtube_task.ipynb in the working directory — confirm.
!cp snorkel_youtube_task.ipynb snorkel_youtube/notebook.ipynb

In [None]:
# Refuse to save under an empty or whitespace-only name, and tell the user what to fix.
assert YOUR_NAME.strip(), "Set YOUR_NAME to a non-empty value before saving the model."
save_model(YOUR_NAME, "Snorkel", "Youtube")