In [10]:
!pip install snorkel==0.9.9
!pip install textblob
!pip install scikit-learn
!pip install torch torchvision torchaudio
!pip install transformers
!pip install datasets
!pip install pandas
!pip install numpy
!pip install matplotlib
!python -m textblob.download_corpora


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [11]:
# --- Imports and setup ---
import os
import random
import numpy as np
import torch
import pandas as pd
import logging

# Reproducibility: one shared seed for every RNG used in this notebook.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here is expected to matter only for child processes - confirm.
os.environ["PYTHONHASHSEED"] = "0"
SEED = 123
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

# Root logger: only warnings and above are emitted.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

# Show up to 100 characters per cell when a DataFrame is printed.
pd.set_option("display.max_colwidth", 100)


In [12]:
from datasets import load_dataset

# Load the "imdb" dataset and materialize its train and test splits as
# pandas DataFrames (train first, then test).
dataset = load_dataset("imdb")
df_train, df_test = (pd.DataFrame(dataset[split]) for split in ("train", "test"))

# Preview the first training rows.
print(df_train.head())


                                                                                                  text  \
0  I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded ...   
1  "I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn't matter what one's ...   
2  If only to avoid making this type of film in the future. This film is interesting as an experime...   
3  This film was probably inspired by Godard's Masculin, féminin and I urge you to see that film in...   
4  Oh, brother...after hearing about this ridiculous film for umpteen years all I can think of is t...   

   label  
0      0  
1      0  
2      0  
3      0  
4      0  


In [13]:
from snorkel.slicing import slicing_function
from textblob import TextBlob

@slicing_function()
def short_review(x):
    """Slice membership: True when the review has fewer than 30 whitespace-separated tokens."""
    word_count = len(x.text.split())
    return word_count < 30

@slicing_function()
def very_positive(x):
    """Slice membership: True when TextBlob's sentiment polarity for the review text exceeds 0.8."""
    sentiment = TextBlob(x.text).sentiment
    return sentiment.polarity > 0.8

# All slicing functions, in the order they are handed to the SF appliers and
# used to name the slices of the slice-aware model.
sfs = [short_review, very_positive]


In [14]:
from snorkel.slicing import slice_dataframe

# Materialize each slice of the test set as its own DataFrame.
short_df = slice_dataframe(df_test, short_review)
pos_df = slice_dataframe(df_test, very_positive)

# Show the first three rows of each slice under its heading.
for heading, sliced in (
    ("Short reviews:", short_df),
    ("\nVery positive reviews:", pos_df),
):
    print(heading)
    print(sliced[["text", "label"]].head(3))


100%|██████████| 25000/25000 [00:00<00:00, 42902.93it/s]
100%|██████████| 25000/25000 [00:33<00:00, 753.81it/s]


Short reviews:
                                                                                                     text  \
41    Widow hires a psychopath as a handyman. Sloppy film noir thriller which doesn't make much of its...   
318                                                     I hope this group of film-makers never re-unites.   
1013  You may like Tim Burton's fantasies, but not in a commercial-like show off lasting 8 minutes. It...   

      label  
41        0  
318       0  
1013      0  

Very positive reviews:
                                                                                                      text  \
12610                                  Brilliant and moving performances by Tom Courtenay and Peter Finch.   
16198  Captain Corelli's Mandolin is a beautiful film with a lovely cast including the wonderful Nicola...   
16712  This was one of my favorites as a child. My family had the 8-track tape soundtrack!! It took us ...   

       label  
12610      1

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from snorkel.utils import preds_to_probs

# Bag-of-words features: unigram + bigram counts with the vocabulary capped at
# 10,000 entries. The vocabulary is fitted on the training text only and then
# reused to transform the test text.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)
X_train = vectorizer.fit_transform(df_train["text"])
X_test = vectorizer.transform(df_test["text"])

# Gold labels as NumPy arrays.
Y_train = df_train["label"].to_numpy()
Y_test = df_test["label"].to_numpy()

# Baseline classifier: logistic regression (C=0.01, liblinear solver).
model = LogisticRegression(C=0.01, solver="liblinear").fit(X_train, Y_train)

# Hard test predictions, plus their 2-class form produced by `preds_to_probs`
# for use by the slice scorer.
preds_test = model.predict(X_test)
probs_test = preds_to_probs(preds_test, 2)

print(f"Overall F1 score: {f1_score(Y_test, preds_test):.3f}")


Overall F1 score: 0.891


In [16]:
from snorkel.slicing import PandasSFApplier
from snorkel.analysis import Scorer

# F1 is the only metric reported, overall and per slice.
scorer = Scorer(metrics=["f1"])

# Slice membership matrix for the test set (one field per slicing function).
applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

# Score the baseline predictions overall and on each slice.
scores = scorer.score_slices(
    S=S_test,
    golds=Y_test,
    preds=preds_test,
    probs=probs_test,
    as_dataframe=True,
)
print(scores)


100%|██████████| 25000/25000 [00:33<00:00, 736.87it/s]

                     f1
overall        0.890865
short_review   0.956522
very_positive  1.000000





In [22]:
# --- Fixed MLP definition ---
def get_pytorch_mlp(input_dim, hidden_dim=256, num_layers=2):
    """Build a feed-forward binary classifier as an ``nn.Sequential``.

    The network consists of ``num_layers - 1`` blocks of ``Linear -> ReLU``
    (the first maps ``input_dim -> hidden_dim``, the rest
    ``hidden_dim -> hidden_dim``) followed by a final ``Linear`` with 2
    outputs. When ``num_layers`` is 1 or less there are no hidden blocks and
    the final layer maps ``input_dim`` directly to 2.

    Args:
        input_dim: Width of the input feature vector.
        hidden_dim: Width of every hidden layer.
        num_layers: Total number of ``Linear`` layers, including the output.

    Returns:
        The assembled ``nn.Sequential`` module.
    """
    from torch import nn

    hidden_count = max(num_layers - 1, 0)
    widths = [input_dim] + [hidden_dim] * hidden_count

    modules = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        modules += [nn.Linear(fan_in, fan_out), nn.ReLU()]
    # Output layer: two logits (negative / positive).
    modules.append(nn.Linear(widths[-1], 2))
    return nn.Sequential(*modules)

# Build a slice-aware classifier on top of the MLP's hidden representation.
# The import lives in this cell so it runs on a fresh kernel; without it the
# name `SliceAwareClassifier` is not yet defined at this point in the notebook.
from snorkel.slicing import SliceAwareClassifier

input_dim = X_train.shape[1]   # number of bag-of-words features
hidden_dim = 256
mlp = get_pytorch_mlp(input_dim=input_dim, hidden_dim=hidden_dim)

# `mlp` ends in a 2-way Linear layer, but the base architecture must emit a
# `head_dim`-wide representation. Dropping the last layer leaves the
# Linear -> ReLU stack whose output width is `hidden_dim`.
encoder = mlp[:-1]

slice_model = SliceAwareClassifier(
    base_architecture=encoder,
    head_dim=hidden_dim,                 # must equal the encoder's output width
    slice_names=[sf.name for sf in sfs],
    scorer=scorer,
)


In [25]:
from snorkel.classification.data import DictDataset
from snorkel.slicing import SliceAwareClassifier, PandasSFApplier
from snorkel.classification import Trainer
import torch
from torch import nn


def make_snorkel_dataset(X, Y, split):
    """Wrap a feature matrix and label vector in a Snorkel ``DictDataset``.

    Args:
        X: Feature matrix. Anything exposing ``toarray()`` is densified with
            it first; other inputs are passed to ``torch.tensor`` as-is.
        Y: Labels, converted to a ``torch.long`` tensor.
        split: Used as both the dataset name and its split.

    Returns:
        A ``DictDataset`` with features under ``"input_data"`` (float32) and
        labels under ``"task"``.
    """
    dense = X.toarray() if hasattr(X, "toarray") else X
    features = torch.tensor(dense, dtype=torch.float32)
    labels = torch.tensor(Y, dtype=torch.long)
    return DictDataset(
        name=split,
        split=split,
        X_dict={"input_data": features},
        Y_dict={"task": labels},
    )

# Wrap the bag-of-words features and labels of each split for Snorkel.
train_dataset = make_snorkel_dataset(X=X_train, Y=Y_train, split="train")
test_dataset = make_snorkel_dataset(X=X_test, Y=Y_test, split="test")


# Apply every slicing function to both splits to get slice membership.
applier = PandasSFApplier(sfs)
S_train = applier.apply(df_train)
S_test = applier.apply(df_test)


class BoWEncoder(nn.Module):
    """One-hidden-layer encoder: ``Linear(input_dim, hidden_dim)`` followed by ReLU.

    The output is a ``hidden_dim``-wide feature vector, not class logits.
    """

    def __init__(self, input_dim, hidden_dim=256):
        """Create the projection layer and its activation."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Project ``x`` through ``fc1`` and apply ReLU."""
        projected = self.fc1(x)
        return self.relu(projected)

# Encoder sizes: one input per bag-of-words feature, 256 hidden units.
input_dim, hidden_dim = X_train.shape[1], 256
mlp_encoder = BoWEncoder(input_dim=input_dim, hidden_dim=hidden_dim)


# The base architecture is the encoder (features, not logits), so `head_dim`
# is the encoder's output width.
slice_model = SliceAwareClassifier(
    base_architecture=mlp_encoder,
    head_dim=hidden_dim,
    slice_names=[sf.name for sf in sfs],
    scorer=scorer,
)


# Training loader: shuffled, batches of 64, built from the train slice matrix.
train_dl_slice = slice_model.make_slice_dataloader(
    train_dataset,
    S_train,
    shuffle=True,
    batch_size=64,
)
# Evaluation loader: same batch size, original order preserved.
test_dl_slice = slice_model.make_slice_dataloader(
    test_dataset,
    S_test,
    shuffle=False,
    batch_size=64,
)


# Train the slice-aware model for two epochs at a learning rate of 1e-4.
trainer = Trainer(
    n_epochs=2,
    lr=1e-4,
    progress_bar=True,
)
trainer.fit(slice_model, [train_dl_slice])

# Report the scorer's metrics on the test loader as a DataFrame.
results = slice_model.score_slices([test_dl_slice], as_dataframe=True)
print(results)


100%|██████████| 25000/25000 [00:34<00:00, 725.49it/s]
100%|██████████| 25000/25000 [00:33<00:00, 737.19it/s]
Epoch 0:: 100%|██████████| 391/391 [00:06<00:00, 57.33it/s, model/all/train/loss=0.199, model/all/train/lr=0.0001]
Epoch 1:: 100%|██████████| 391/391 [00:06<00:00, 63.03it/s, model/all/train/loss=0.0969, model/all/train/lr=0.0001]


                           label dataset split metric     score
0                           task    test  test     f1  0.895232
1   task_slice:short_review_pred    test  test     f1  0.956522
2  task_slice:very_positive_pred    test  test     f1  1.000000
3           task_slice:base_pred    test  test     f1  0.895232
