In [22]:
from pathlib import Path

files = [str(x) for x in Path('data').glob('*.csv')]
files

['data/Youtube02-KatyPerry.csv',
 'data/Youtube04-Eminem.csv',
 'data/Youtube01-Psy.csv',
 'data/Youtube05-Shakira.csv',
 'data/Youtube03-LMFAO.csv']

In [23]:
import pandas as pd

dfs = []
for csv in files:
    df = pd.read_csv(csv)
    df['filename'] = csv
    dfs.append(df)

df = pd.concat(dfs)

In [24]:
import pandas as pd

df.rename(columns={'CONTENT': 'text', 'CLASS': 'label'}, inplace=True)

df.head()


Unnamed: 0,COMMENT_ID,AUTHOR,DATE,text,label,filename
0,z12pgdhovmrktzm3i23es5d5junftft3f,lekanaVEVO1,2014-07-22T15:27:50,i love this so much. AND also I Generate Free ...,1,data/Youtube02-KatyPerry.csv
1,z13yx345uxepetggz04ci5rjcxeohzlrtf4,Pyunghee,2014-07-27T01:57:16,http://www.billboard.com/articles/columns/pop-...,1,data/Youtube02-KatyPerry.csv
2,z12lsjvi3wa5x1vwh04cibeaqnzrevxajw00k,Erica Ross,2014-07-27T02:51:43,Hey guys! Please join me in my fight to help a...,1,data/Youtube02-KatyPerry.csv
3,z13jcjuovxbwfr0ge04cev2ipsjdfdurwck,Aviel Haimov,2014-08-01T12:27:48,http://psnboss.com/?ref=2tGgp3pV6L this is the...,1,data/Youtube02-KatyPerry.csv
4,z13qybua2yfydzxzj04cgfpqdt2syfx53ms0k,John Bello,2014-08-01T21:04:03,Hey everyone. Watch this trailer!!!!!!!! http...,1,data/Youtube02-KatyPerry.csv


In [25]:
df["label"].value_counts()

label
1    1005
0     951
Name: count, dtype: int64

In [26]:
from sklearn.model_selection import train_test_split

# Split the data into train and test
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

print('Train size: ', len(df_train))
print('Test size: ', len(df_test))

Train size:  1564
Test size:  392


In [27]:
ABSTAIN = -1
HAM = 0
SPAM = 1

In [28]:
from snorkel.labeling import labeling_function
import re

@labeling_function()
def lf_keyword_my(x):
    """Mark any comment mentioning 'my' as spam (My chanel, My video, My song, etc.)"""
    return SPAM if "my" in x.text.lower() else ABSTAIN

@labeling_function()
def lf_regex_check_out(x):
    """Spam comments say 'check out my video', 'check it out', etc."""
    return SPAM if re.search(r"check.*out", x.text, flags=re.I) else ABSTAIN

@labeling_function()
def lf_short_comment(x):
    """Non-spam comments are often short, such as 'cool video!'."""
    return HAM if len(x.text.split()) < 5 else ABSTAIN


@labeling_function()
def lf_contains_link(x):
    """Spam comments often contain a link."""
    return SPAM if "http" in x.text.lower() else ABSTAIN


In [29]:
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

# Define the set of labeling functions (LFs)
lfs = [lf_keyword_my, lf_regex_check_out, lf_short_comment, lf_contains_link]

# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
label_train = applier.apply(df_train)


100%|██████████| 1564/1564 [00:00<00:00, 5320.89it/s]


In [30]:

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(label_train, n_epochs=500, log_freq=50, seed=123)
df_train["snorkel_label"] = label_model.predict(L=label_train, tie_break_policy="abstain")

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.053]
INFO:root:[50 epochs]: TRAIN:[loss=0.003]
 13%|█▎        | 65/500 [00:00<00:00, 640.06epoch/s]INFO:root:[100 epochs]: TRAIN:[loss=0.000]
 27%|██▋       | 134/500 [00:00<00:00, 668.09epoch/s]INFO:root:[150 epochs]: TRAIN:[loss=0.000]
INFO:root:[200 epochs]: TRAIN:[loss=0.000]
 44%|████▍     | 220/500 [00:00<00:00, 754.82epoch/s]INFO:root:[250 epochs]: TRAIN:[loss=0.000]
 59%|█████▉    | 296/500 [00:00<00:00, 756.38epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.000]
INFO:root:[350 epochs]: TRAIN:[loss=0.000]
 74%|███████▍  | 372/500 [00:00<00:00, 704.76epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
INFO:root:[450 epochs]: TRAIN:[loss=0.000]
100%|██████████| 500/500 [00:00<00:00, 751.08epoch/s]
INFO:root:Finished Training


In [31]:
coverage_my, coverage_check_out, coverage_short, *_ = (label_train != ABSTAIN).mean(axis=0)

print(f"Coverage of keyword_my: {coverage_my * 100:.1f}%")
print(f"Coverage of regex_check_out: {coverage_check_out * 100:.1f}%")
print(f"Coverage of short_comment: {coverage_short * 100:.1f}%")


Coverage of keyword_my: 19.5%
Coverage of regex_check_out: 22.6%
Coverage of short_comment: 24.7%


In [35]:
from snorkel.labeling import LFAnalysis

LFAnalysis(L=label_train, lfs=lfs).lf_summary()


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_keyword_my,0,[1],0.195013,0.108056,0.014706
lf_regex_check_out,1,[1],0.225703,0.088235,0.001918
lf_short_comment,2,[0],0.246803,0.05243,0.05243
lf_contains_link,3,[1],0.097826,0.058184,0.038363
