In [None]:
from datasets import load_dataset
import pandas as pd

# Load the IMDB dataset and convert its "train" and "test" splits to DataFrames.
dataset = load_dataset("imdb")

# Dataset.to_pandas() is the library's own conversion; it avoids building the
# frame by iterating over the dataset one example at a time in Python.
df_train = dataset["train"].to_pandas()
df_test = dataset["test"].to_pandas()

# Preview the first rows of the training split.
df_train.head()


In [None]:
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from textblob import TextBlob
import re

# Integer label codes returned by the labeling functions below:
# ABSTAIN means "no vote"; NEGATIVE and POSITIVE are the two sentiment classes.
ABSTAIN, NEGATIVE, POSITIVE = -1, 0, 1

@labeling_function()
def lf_positive_keywords(x):
    """Vote POSITIVE if the lowercased text contains one of the listed positive words as a whole word; otherwise abstain."""
    lowered = x.text.lower()
    if re.search(r"\b(great|amazing|love|wonderful|best)\b", lowered):
        return POSITIVE
    return ABSTAIN

@labeling_function()
def lf_negative_keywords(x):
    """Vote NEGATIVE if the lowercased text contains one of the listed negative words as a whole word; otherwise abstain."""
    lowered = x.text.lower()
    if re.search(r"\b(bad|worst|boring|awful|waste)\b", lowered):
        return NEGATIVE
    return ABSTAIN

@labeling_function()
def lf_textblob_sentiment(x):
    """Vote based on the polarity score TextBlob reports for the text.

    Returns POSITIVE when the polarity is above 0.3, NEGATIVE when it is
    below -0.3, and ABSTAIN for anything in between.
    """
    score = TextBlob(x.text).sentiment.polarity
    # Guard clauses: only confident scores produce a vote.
    if score > 0.3:
        return POSITIVE
    if score < -0.3:
        return NEGATIVE
    return ABSTAIN

# Labeling functions handed to the LF applier, in this order.
lfs = [
    lf_positive_keywords,
    lf_negative_keywords,
    lf_textblob_sentiment,
]


In [None]:
from snorkel.labeling import PandasLFApplier

try:
    # Import path for LabelModel in current Snorkel releases.
    from snorkel.labeling.model import LabelModel
except ImportError:
    # Fall back to the import path this notebook originally used.
    from snorkel.labeling import LabelModel

N_LABEL_MODEL_ROWS = 2000  # use a subset for speed
LABEL_SEED = 42

# Draw the subset once, with a fixed seed, and keep a handle on it so the
# label matrix and the predictions refer to exactly the same rows.
# (Previously the sample was anonymous and the 2000 predictions were assigned
# to the full frame, which fails with a length mismatch.)
df_train_subset = df_train.sample(
    n=min(N_LABEL_MODEL_ROWS, len(df_train)), random_state=LABEL_SEED
)

applier = PandasLFApplier(lfs)
L_train = applier.apply(df=df_train_subset)

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=LABEL_SEED)

df_train_subset = df_train_subset.assign(
    label_snorkel=label_model.predict(L=L_train)
)

# The label model may emit ABSTAIN for rows it cannot decide; such rows carry
# no usable training label, so drop them.
# NOTE: from here on `df_train` is the weakly labeled subset, not the full
# training split; downstream cells read df_train["text"] / ["label_snorkel"].
df_train = df_train_subset[df_train_subset["label_snorkel"] != ABSTAIN]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Train only on rows that received a real label: ABSTAIN (-1) is "no vote",
# not a class, and leaving it in would turn this into a three-class problem
# and break the binary F1 computed below.
train_labeled = df_train[df_train["label_snorkel"] != ABSTAIN]

# Bag-of-words features; the vocabulary is fitted on the training text only.
vectorizer = CountVectorizer(max_features=5000, stop_words="english")
X_train = vectorizer.fit_transform(train_labeled["text"])
y_train = train_labeled["label_snorkel"]

# The test split is transformed with the already-fitted vocabulary and scored
# against its gold labels.
X_test = vectorizer.transform(df_test["text"])
y_test = df_test["label"]

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
print("Test F1:", f1_score(y_test, preds))


In [None]:
from snorkel.slicing import slicing_function

@slicing_function()
def short_review(x):
    """Select examples whose text has fewer than 10 whitespace-separated tokens."""
    word_count = len(x.text.split())
    return word_count < 10

@slicing_function()
def has_exclamation(x):
    """Select examples whose text contains at least one exclamation mark."""
    return x.text.find("!") != -1

@slicing_function()
def contains_movie_name(x):
    """Select examples whose lowercased text contains "movie", "film" or "story" as a whole word."""
    lowered = x.text.lower()
    return re.search(r"\b(movie|film|story)\b", lowered) is not None

# Slicing functions handed to the SF applier, in this order.
sfs = [
    short_review,
    has_exclamation,
    contains_movie_name,
]


In [None]:
from snorkel.slicing import PandasSFApplier
from snorkel.analysis import Scorer
import numpy as np

# NOTE: this rebinds `applier`, which earlier held the labeling-function applier.
applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

preds_test = clf.predict(X_test)
# Use the classifier's actual class probabilities instead of fabricating
# them from the hard predictions (1 - preds is only meaningful when preds
# is exactly 0/1).
probs_test = clf.predict_proba(X_test)

scorer = Scorer(metrics=["f1"])
results = scorer.score_slices(
    S=S_test,
    golds=np.asarray(y_test),  # plain array so slice masks index positionally
    preds=preds_test,
    probs=probs_test,
    as_dataframe=True,
)
# Bare last expression: the notebook renders the DataFrame as a table.
results
