In [1]:
from data.preparer import load_youtube_dataset, load_amazon_dataset
from snorkel.labeling import LabelModel
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import labeling_function
from analyzer import train_model
import re
import pandas as pd
# Show full comment text in displayed DataFrames; None means "no truncation"
# (the older -1 sentinel for this option is deprecated).
pd.set_option('display.max_colwidth', None)

## The Data
For this task you will work with comments from five different YouTube videos and classify each comment as either spam (1) or legitimate (0) by writing labeling functions.

Data is freely available (Alberto, Lochter, and Almeida (2015)).

You must replace `PASSWORD` with the password to unzip the data.

In [None]:
# Extract the password-protected dataset archive; substitute the real password for PASSWORD.
!unzip -P PASSWORD data/data.zip

Archive:  data/data.zip
[data/data.zip] data/Youtube03-LMFAO.csv password: 

In [3]:
# Label vocabulary used by the labeling functions
SPAM = 1        # comment is spam
NOT_SPAM = 0    # comment is legitimate
ABSTAIN = -1    # returned when a labeling function does not vote

In [4]:
# Load the four data splits through the project loader, passing "#" as its delimiter
DELIMITER = "#"
youtube_splits = load_youtube_dataset(delimiter=DELIMITER)
df_train, df_dev, df_valid, df_test = youtube_splits

## Writing Labeling Functions
Time to write some labeling functions! Below is an example. Be sure to add each function you write to the list `lfs`, as the example does.

In [5]:
# Registry of labeling functions; starts out empty
lfs = list()

In [6]:
@labeling_function()
def my_first_labeling_function(x):
    """Vote SPAM when the comment text contains "my" (case-insensitive), else abstain.

    This is a plain substring test, so any word that merely contains "my"
    also triggers a SPAM vote.

    Args:
        x: a data point exposing the comment text as ``x.text``
           (presumably one row of the comments DataFrame; confirm against the applier).

    Returns:
        SPAM if "my" occurs in the lower-cased text, otherwise ABSTAIN.
    """
    return SPAM if "my" in x.text.lower() else ABSTAIN


# Replace any earlier registration that has the same name before appending, so
# re-running this cell does not leave two copies of this LF in `lfs`.
lfs[:] = [
    lf for lf in lfs
    if getattr(lf, "name", None) != my_first_labeling_function.name
]
lfs.append(my_first_labeling_function)

## Applying Functions
We obtain training labels by training a label model that combines the outputs of the noisy labeling functions.

In [7]:
# Run every registered LF over the unlabeled training split and the development split
applier = PandasLFApplier(lfs)
L_train, L_dev = (applier.apply(split) for split in (df_train, df_dev))

100%|██████████| 800/800 [00:00<00:00, 22353.97it/s]
100%|██████████| 100/100 [00:00<00:00, 9352.68it/s]


In [8]:
# Fit the label model on the training LF outputs, then use it to label the training set
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)

# Ties are resolved by abstaining, so some rows receive the ABSTAIN label
train_labels = label_model.predict(L=L_train, tie_break_policy="abstain")
df_train["label"] = train_labels
display(df_train.sample(5))

Unnamed: 0,author,date,text,label,video
178,George Smith,2015-02-05T01:47:19.970000,Please become my first subscriber. Thank you.﻿,1,3
49,Kikilaronk “Kiki” Loz,2014-08-31T16:39:30.472000,Subscribe to me if u think &quot;swag&quot; is fucking stupid﻿,-1,3
375,Chris Bieber,2015-05-21T15:55:45.911000,Love the way you lie II is nicer in my opinion. :D﻿,1,4
263,Walker's.Enchanted.Music.Channel,2014-09-11T02:15:22,"i think they were drunk when they shot the first half of the video and then the sec on half comes in, and her boobs are magically bigger and she's more beautiful suddenly, and the dude practically vanishes ﻿",-1,2
122,Champagne Pedro,2015-03-21T14:18:39.798000,Check out this playlist on YouTube:<br /><br />﻿,-1,3


## View Unlabeled Examples
You can use these to brainstorm new labeling functions. You may try filtering or sorting them in other ways.

In [9]:
# Keep only the rows whose predicted label is ABSTAIN, i.e. still unlabeled
df_unlabeled = df_train[df_train.label == ABSTAIN]
# Cap the sample size: as more LFs are added, fewer than five rows may remain
# unlabeled, and DataFrame.sample(5) would then raise ValueError.
display(df_unlabeled.sample(min(5, len(df_unlabeled))))

Unnamed: 0,author,date,text,label,video
37,Harrys Edits,2014-11-08T10:05:40,if i reach 100 subscribers i will go round in public pouring a bucket of ice water over people and then running away acting like it wasn't me! like so people can see!!﻿,-1,1
49,Kikilaronk “Kiki” Loz,2014-08-31T16:39:30.472000,Subscribe to me if u think &quot;swag&quot; is fucking stupid﻿,-1,3
366,Ferhad Babayev,2015-05-23T08:55:42.007000,Hello. İ am from Azerbaijan<br />﻿,-1,4
238,Eminem King of Rap !,,Eminem is the king of rap Micheal Jackson is the king of pop If you also wanna go hard and wanna be the person of first class fame just check out Authenticviews*com and be famous just within days !! yO ~,-1,4
307,Ramen Noodles,2014-09-22T05:28:44,"She's got it all. Incredible voice, extremely hot, nice tits﻿",-1,2


## Analyze Results
Summarize each labeling function's coverage, overlaps, and conflicts on the training set, then evaluate its empirical accuracy on the development set, which has ground-truth labels.

In [10]:
# Summarise each labeling function on the training split
from snorkel.labeling import LFAnalysis

train_lf_analysis = LFAnalysis(L=L_train, lfs=lfs)
train_analysis = train_lf_analysis.lf_summary()
display("Training set results:", train_analysis)

'Training set results:'

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
my_first_labeling_function,0,[1],0.20625,0.0,0.0


In [11]:
# The dev split has ground-truth labels, which are passed to the summary as Y
Y_dev = df_dev["label"].values
dev_lf_analysis = LFAnalysis(L=L_dev, lfs=lfs)
dev_analysis = dev_lf_analysis.lf_summary(Y=Y_dev)
display("Dev set results:", dev_analysis)

'Dev set results:'

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
my_first_labeling_function,0,[1],0.2,0.0,0.0,19,1,0.95


## Train Model
Train a simple bag of words model on these labels, and report test accuracy.

In [12]:
# Train the downstream model through the project helper imported from `analyzer`.
# NOTE(review): how train_model uses each argument is not visible in this notebook; confirm in analyzer.py.
train_model(label_model, df_train, df_valid, df_test, L_train)





Restoring model weights from the end of the best epoch.
Epoch 00014: early stopping




Test Accuracy: 46.8%
