In [1]:
!pip install snorkel datasets vaderSentiment scikit-learn matplotlib


Collecting snorkel
  Downloading snorkel-0.10.0-py3-none-any.whl.metadata (9.5 kB)
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Downloading snorkel-0.10.0-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, vaderSentiment, snorkel
Successfully installed munkres-1.1.4 snorkel-0.10.0 vaderSentiment-3.3.2


In [2]:
# STEP 1: Import libraries

import pandas as pd
from datasets import load_dataset

from snorkel.labeling import labeling_function, PandasLFApplier
from snorkel.labeling.model import LabelModel

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# STEP 2: Load IMDb dataset
# Think of this like opening a giant book of movie reviews.

dataset = load_dataset("imdb")

# We'll use a smaller subset so Colab is fast
df_train = pd.DataFrame(dataset["train"]).sample(3000, random_state=42).reset_index(drop=True)
df_test  = pd.DataFrame(dataset["test"]).sample(2000, random_state=42).reset_index(drop=True)

df_train.head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Unnamed: 0,text,label
0,"Dumb is as dumb does, in this thoroughly unint...",0
1,I dug out from my garage some old musicals and...,1
2,After watching this movie I was honestly disap...,0
3,This movie was nominated for best picture but ...,1
4,Just like Al Gore shook us up with his painful...,1


In [4]:
# STEP 3: Define label constants
# -1 = ABSTAIN means "I don't know"
ABSTAIN = -1
NEGATIVE = 0
POSITIVE = 1


In [5]:
# STEP 4: Create some word lists for simple rules
positive_words = ["amazing", "wonderful", "great", "loved", "fantastic", "awesome", "excellent"]
negative_words = ["boring", "awful", "terrible", "worst", "waste", "bad", "horrible", "dull"]


In [6]:
# STEP 5: Define labeling functions (LFs)
# These are small "if this, then that" rules.
# Each function either:
#   - returns POSITIVE (1)
#   - returns NEGATIVE (0)
#   - or ABSTAIN (-1) if unsure

@labeling_function()
def lf_positive_keywords(x):
    text = x.text.lower()
    if any(w in text for w in positive_words):
        return POSITIVE
    return ABSTAIN

@labeling_function()
def lf_negative_keywords(x):
    text = x.text.lower()
    if any(w in text for w in negative_words):
        return NEGATIVE
    return ABSTAIN

@labeling_function()
def lf_exclamation_positive(x):
    # If review has "!" and a positive word
    text = x.text.lower()
    if "!" in text and any(w in text for w in positive_words):
        return POSITIVE
    return ABSTAIN

@labeling_function()
def lf_question_negative(x):
    # If review looks like complaining in question form
    text = x.text.lower()
    if "?" in text and any(w in text for w in ["why", "how", "what"]) and "good" not in text:
        return NEGATIVE
    return ABSTAIN

analyzer = SentimentIntensityAnalyzer()

@labeling_function()
def lf_vader_sentiment(x):
    # Use VADER sentiment as a soft judge
    score = analyzer.polarity_scores(x.text)["compound"]
    if score > 0.4:
        return POSITIVE
    elif score < -0.4:
        return NEGATIVE
    return ABSTAIN

@labeling_function()
def lf_all_caps_negative(x):
    # If someone is SHOUTING ALL CAPS, often angry
    text = x.text
    if len(text.split()) > 3 and text.isupper():
        return NEGATIVE
    return ABSTAIN


In [7]:
# STEP 6: Apply labeling functions to the training data

lfs = [
    lf_positive_keywords,
    lf_negative_keywords,
    lf_exclamation_positive,
    lf_question_negative,
    lf_vader_sentiment,
    lf_all_caps_negative,
]

applier = PandasLFApplier(lfs=lfs)

# This creates a matrix: rows = reviews, columns = LFs
L_train = applier.apply(df_train)
L_train[:10]


100%|██████████| 3000/3000 [00:17<00:00, 173.57it/s]


array([[-1, -1, -1, -1,  0, -1],
       [ 1, -1, -1, -1,  1, -1],
       [-1,  0, -1, -1,  0, -1],
       [-1, -1, -1, -1,  1, -1],
       [ 1, -1,  1, -1,  1, -1],
       [-1,  0, -1, -1,  1, -1],
       [ 1,  0, -1, -1, -1, -1],
       [-1, -1, -1, -1,  1, -1],
       [-1,  0, -1,  0,  0, -1],
       [-1, -1, -1, -1,  1, -1]])

In [8]:
# STEP 7: Train Snorkel LabelModel to combine noisy labels
# This is like a smart teacher who looks at all noisy rules and decides the final label.

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=300, log_freq=50, seed=42)


100%|██████████| 300/300 [00:00<00:00, 536.97epoch/s]


In [9]:
# STEP 8: Get weak labels from LabelModel

df_train["weak_label"] = label_model.predict(L_train)
df_train[["text", "weak_label"]].head()


Unnamed: 0,text,weak_label
0,"Dumb is as dumb does, in this thoroughly unint...",0
1,I dug out from my garage some old musicals and...,1
2,After watching this movie I was honestly disap...,0
3,This movie was nominated for best picture but ...,1
4,Just like Al Gore shook us up with his painful...,1


In [10]:
# STEP 9: Filter out rows where all LFs abstained (no signal)

mask_has_label = df_train["weak_label"] != ABSTAIN
df_train_filtered = df_train[mask_has_label].reset_index(drop=True)

print("Original train size:", len(df_train))
print("After filtering (Snorkel labeled):", len(df_train_filtered))


Original train size: 3000
After filtering (Snorkel labeled): 2919


In [11]:
# STEP 10: Train a simple classifier (Logistic Regression) on weak labels

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train = vectorizer.fit_transform(df_train_filtered.text)
y_train = df_train_filtered.weak_label

clf = LogisticRegression(max_iter=300)
clf.fit(X_train, y_train)


In [12]:
# STEP 11: Evaluate on REAL IMDb test labels (ground truth)

X_test = vectorizer.transform(df_test.text)
y_test = df_test.label  # true labels from IMDb

y_pred = clf.predict(X_test)

print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n")
print(classification_report(y_test, y_pred, target_names=["NEGATIVE", "POSITIVE"]))


Accuracy on test set: 0.726

Classification report:

              precision    recall  f1-score   support

    NEGATIVE       0.94      0.51      0.66      1040
    POSITIVE       0.64      0.96      0.77       960

    accuracy                           0.73      2000
   macro avg       0.79      0.74      0.71      2000
weighted avg       0.80      0.73      0.71      2000

