In [1]:
# ============================================================
# 03_sentiment_slicing_tutorial.ipynb
# Snorkel + IMDb: Data Slicing (SFs)
# ============================================================

!pip install snorkel datasets scikit-learn -q


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/103.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# STEP 1: Imports

import re
import numpy as np
import pandas as pd
from datasets import load_dataset

from snorkel.slicing import slicing_function, PandasSFApplier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [3]:
# STEP 2: Load IMDb data

dataset = load_dataset("imdb")

# Draw a seeded, fixed-size subsample of each split and renumber rows from 0.
df_train, df_test = (
    pd.DataFrame(dataset[split]).sample(n=size, random_state=42).reset_index(drop=True)
    for split, size in (("train", 3000), ("test", 2000))
)

df_train.head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Unnamed: 0,text,label
0,"Dumb is as dumb does, in this thoroughly unint...",0
1,I dug out from my garage some old musicals and...,1
2,After watching this movie I was honestly disap...,0
3,This movie was nominated for best picture but ...,1
4,Just like Al Gore shook us up with his painful...,1


In [4]:
# STEP 3: Train a simple baseline model (to later check per-slice performance)

# Labels for both splits.
y_train = df_train.label
y_test = df_test.label

# TF-IDF over unigrams and bigrams; the vocabulary is fit on the training text only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train = vectorizer.fit_transform(df_train.text)
X_test = vectorizer.transform(df_test.text)

# Baseline classifier whose per-slice accuracy is inspected later.
clf = LogisticRegression(max_iter=300).fit(X_train, y_train)

baseline_acc = accuracy_score(y_test, clf.predict(X_test))
print("Overall test accuracy:", baseline_acc)


Overall test accuracy: 0.848


In [5]:
# STEP 4: Define slicing functions (SFs)
# These mark whether a review belongs to a special "slice" of data.

@slicing_function()
def sf_short_review(x):
    """Flag reviews with fewer than 5 whitespace-separated tokens (very short reviews)."""
    token_count = len(x.text.split())
    return token_count < 5

@slicing_function()
def sf_contains_emoji(x):
    """Flag reviews containing at least one emoji from a small fixed set."""
    for emoji in ("😊", "😂", "😡", "😭", "😍"):
        if emoji in x.text:
            return True
    return False

@slicing_function()
def sf_all_caps(x):
    """Flag reviews of more than 3 tokens whose stripped text is entirely upper case."""
    stripped = x.text.strip()
    if len(stripped.split()) <= 3:
        return False
    return stripped.isupper()

@slicing_function()
def sf_mixed_sentiment(x):
    """Flag reviews that mention both a positive and a negative cue word.

    Cue words are matched case-insensitively as whole words, so incidental
    substrings no longer count: "badge" does not trigger "bad" and "glove"
    does not trigger "love". As a consequence, inflected forms such as
    "loved" are not matched either.

    Args:
        x: A data point exposing the review text as ``x.text``.

    Returns:
        bool: True when at least one positive and at least one negative cue
        word occur in the review.
    """
    pos_words = ["good", "great", "amazing", "love", "fantastic"]
    neg_words = ["bad", "boring", "worst", "terrible", "awful"]

    def mentions_any(words, text):
        # \b anchors keep each alternative from matching inside a longer word.
        pattern = r"\b(?:" + "|".join(map(re.escape, words)) + r")\b"
        return re.search(pattern, text) is not None

    text = x.text.lower()
    return mentions_any(pos_words, text) and mentions_any(neg_words, text)

@slicing_function()
def sf_question_review(x):
    """Flag reviews that contain at least one question mark."""
    return x.text.find("?") >= 0


In [8]:
from snorkel.slicing import slicing_function, PandasSFApplier

@slicing_function()
def sf_short(x):
    """Flag reviews with fewer than 8 whitespace-separated tokens."""
    n_tokens = len(x.text.split())
    return n_tokens < 8

@slicing_function()
def sf_long(x):
    """Flag reviews with more than 40 whitespace-separated tokens."""
    n_tokens = len(x.text.split())
    return n_tokens > 40

@slicing_function()
def sf_caps(x):
    """Flag reviews whose text is entirely upper case (per ``str.isupper``)."""
    review = x.text
    return review.isupper()

# Slicing functions to evaluate; the applier output has one named field per
# SF, in this order.
sfs = [sf_short, sf_long, sf_caps]

# The SF list is passed as a positional argument. Original author's note:
# Snorkel 0.9.x only accepts it positionally -- TODO confirm against the
# installed Snorkel version.
sfa = PandasSFApplier(sfs)

# Run every SF over each row of the test frame.
S_test = sfa.apply(df_test)

# Preview the slice memberships of the first ten test rows.
S_test[:10]


100%|██████████| 2000/2000 [00:00<00:00, 19301.15it/s]


rec.array([(0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0),
           (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0)],
          dtype=[('sf_short', '<i8'), ('sf_long', '<i8'), ('sf_caps', '<i8')])

In [9]:
# STEP 6: Helper to compute slice-wise accuracy

def slice_accuracy(S_matrix, slice_idx, y_true, y_pred):
    """Compute accuracy restricted to the members of one slice.

    Args:
        S_matrix: 2-D array-like indexed as ``[example, slice]``; a nonzero
            entry marks the example as a member of that slice.
        slice_idx: Column index of the slice to evaluate.
        y_true: Ground-truth labels, one per example (array-like).
        y_pred: Predicted labels, one per example (array-like).

    Returns:
        tuple: ``(accuracy, count)`` where ``accuracy`` is the accuracy over
        the slice members and ``count`` is the number of members as a plain
        ``int``. Returns ``(None, 0)`` when the slice has no members.
    """
    membership = np.asarray(S_matrix)[:, slice_idx].astype(bool)

    # Count once and normalise to a Python int so both return paths agree on type.
    n_members = int(membership.sum())
    if n_members == 0:
        return None, 0  # no examples in this slice

    # Plain arrays make the boolean mask purely positional, so lists and
    # pandas objects are handled the same way as NumPy arrays.
    truth = np.asarray(y_true)
    preds = np.asarray(y_pred)
    return accuracy_score(truth[membership], preds[membership]), n_members





In [11]:

# Convert the applier's record array (one named field per SF) into a plain 2-D matrix
# Column order of the slice matrix follows the order of `sfs`.
slice_names = [sf.name for sf in sfs]

# The applier returns a NumPy record array with one named field per SF.
# Stacking the fields column-wise gives a plain (n_examples, n_slices) matrix
# without looping over rows in Python, and keeps a (0, n_slices) shape when
# the input frame is empty.
S_test_raw = sfa.apply(df_test)
S_test = np.column_stack([S_test_raw[name] for name in slice_names])

100%|██████████| 2000/2000 [00:00<00:00, 2264.47it/s]


In [12]:
y_pred = clf.predict(X_test)

# One (accuracy, count) pair per slice column, in `slice_names` order.
per_slice = [
    slice_accuracy(S_test, col, y_test.values, y_pred)
    for col in range(len(slice_names))
]
results = [(name, count, acc) for name, (acc, count) in zip(slice_names, per_slice)]

results_df = pd.DataFrame(results, columns=["slice_name", "num_examples", "slice_accuracy"])
results_df


Unnamed: 0,slice_name,num_examples,slice_accuracy
0,sf_short,0,
1,sf_long,1983,0.847705
2,sf_caps,0,
