In [14]:
import pandas as pd

# Read the data
df = pd.read_csv('data/Youtube01-Psy.csv', usecols=['CONTENT', 'CLASS'])

df.rename(columns={'CONTENT': 'text', 'CLASS': 'label'}, inplace=True)

df.head()



Unnamed: 0,text,label
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [15]:
from sklearn.model_selection import train_test_split

# Split the data into train and test
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

print('Train size: ', len(df_train))
print('Test size: ', len(df_test))

Train size:  280
Test size:  70


In [16]:
ABSTAIN = -1
HAM = 0
SPAM = 1

In [17]:
from snorkel.labeling import labeling_function
import re

@labeling_function()
def lf_keyword_my(x):
    """Mark any comment mentioning 'my' as spam (My chanel, My video, My song, etc.)"""
    return SPAM if "my" in x.text.lower() else ABSTAIN

@labeling_function()
def lf_regex_check_out(x):
    """Spam comments say 'check out my video', 'check it out', etc."""
    return SPAM if re.search(r"check.*out", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def lf_short_comment(x):
    """Non-spam comments are often short, such as 'cool video!'."""
    return HAM if len(x.text.split()) < 5 else ABSTAIN



In [18]:
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

# Define the set of labeling functions (LFs)
lfs = [lf_keyword_my, lf_regex_check_out, lf_short_comment]

# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df_train["snorkel_label"] = label_model.predict(L=L_train, tie_break_policy="abstain")


100%|██████████| 280/280 [00:00<00:00, 11207.76it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.029]
INFO:root:[50 epochs]: TRAIN:[loss=0.003]
INFO:root:[100 epochs]: TRAIN:[loss=0.001]
 20%|██        | 101/500 [00:00<00:00, 990.30epoch/s]INFO:root:[150 epochs]: TRAIN:[loss=0.000]
INFO:root:[200 epochs]: TRAIN:[loss=0.000]
 44%|████▍     | 219/500 [00:00<00:00, 1099.95epoch/s]INFO:root:[250 epochs]: TRAIN:[loss=0.000]
INFO:root:[300 epochs]: TRAIN:[loss=0.000]
INFO:root:[350 epochs]: TRAIN:[loss=0.000]
 70%|███████   | 351/500 [00:00<00:00, 1195.20epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
INFO:root:[450 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 1155.76epoch/s]
INFO:root:Finished Training


In [23]:
coverage_my, coverage_check_out, coverage_short = (L_train != ABSTAIN).mean(axis=0)

print(f"Keyword 'my' coverage: {coverage_my * 100:.1f}%")
print(f"Regex 'check out' coverage: {coverage_check_out * 100:.1f}%")
print(f"Short comment coverage: {coverage_short * 100:.1f}%")

Keyword 'my' coverage: 18.9%
Regex 'check out' coverage: 10.0%
Short comment coverage: 20.0%


In [24]:
L_train

array([[-1, -1, -1],
       [-1, -1,  0],
       [ 1, -1, -1],
       [-1, -1, -1],
       [-1, -1,  0],
       [-1, -1, -1],
       [-1, -1, -1],
       [ 1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1,  1, -1],
       [-1, -1,  0],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [ 1, -1, -1],
       [-1, -1,  0],
       [-1, -1,  0],
       [-1, -1,  0],
       [ 1,  1,  0],
       [ 1, -1, -1],
       [-1,  1, -1],
       [ 1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1,  0],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [ 1,  1, -1],
       [-1, -1,  0],
       [-1, -1,  0],
       [ 1, -1, -1],
       [-1, -1,  0],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1],
       [ 1, -