In [10]:
# ==========================================================
# Generalized Expectation (GE-FL) with Weak Supervision
# Dataset: Rotten Tomatoes (real movie review polarity data)
# ==========================================================

import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.special import softmax

# ------------------------------
# 1. Load and prepare dataset
# ------------------------------
print("ðŸ“¦ Loading dataset...")
dataset = load_dataset("rotten_tomatoes")

# Convert Hugging Face Dataset columns to normal lists/arrays
texts = list(dataset['train']['text'])
labels = np.array(dataset['train']['label'])
test_texts = list(dataset['test']['text'])
test_labels = np.array(dataset['test']['label'])

# Split into small labeled subset + large unlabeled subset
X_labeled_texts, X_unlabeled_texts, y_labeled, _ = train_test_split(
    texts, labels, test_size=0.95, stratify=labels, random_state=42
)
print(f"âœ… Labeled instances: {len(X_labeled_texts)}, Unlabeled: {len(X_unlabeled_texts)}")

# ------------------------------
# 2. Vectorize (shared vocabulary)
# ------------------------------
vectorizer = CountVectorizer(max_features=5000, stop_words='english', binary=True)
X_all = vectorizer.fit_transform(texts + test_texts).toarray()
vocab = np.array(vectorizer.get_feature_names_out())

X_labeled = vectorizer.transform(X_labeled_texts).toarray()
X_unlabeled = vectorizer.transform(X_unlabeled_texts).toarray()
X_test = vectorizer.transform(test_texts).toarray()

num_classes = 2  # positive / negative

# ------------------------------
# 3. Define labeled features (domain knowledge)
# ------------------------------
labeled_features = {
    "excellent": [1, 0],
    "amazing": [1, 0],
    "wonderful": [1, 0],
    "great": [1, 0],
    "fantastic": [1, 0],
    "boring": [0, 1],
    "awful": [0, 1],
    "terrible": [0, 1],
    "bad": [0, 1],
    "poor": [0, 1],
    "worst": [0, 1],
}

# ------------------------------
# 4. Initialize model parameters
# ------------------------------
theta = np.zeros((num_classes, X_all.shape[1]))
sigma = 1.0
lambda_sup = 0.5  # weight for supervised term

def predict_proba(X, theta):
    return softmax(X @ theta.T, axis=1)

# ------------------------------
# 5. Combined GE + weak supervision loss and gradient
# ------------------------------
def combined_loss_and_grad(theta, X_labeled, y_labeled, X_unlabeled, vocab, labeled_features, sigma, lambda_sup):
    probs_unlabeled = predict_proba(X_unlabeled, theta)
    loss = 0.0
    grad = np.zeros_like(theta)

    # --- GE feature constraints ---
    for f, target in labeled_features.items():
        if f not in vocab:
            continue
        idx = np.where(vocab == f)[0][0]
        mask = X_unlabeled[:, idx] > 0
        if not np.any(mask):
            continue
        expected = probs_unlabeled[mask].mean(axis=0)
        target = np.array(target)
        # GE term (KL divergence)
        loss += np.sum(target * np.log((target + 1e-10) / (expected + 1e-10)))
        diff = expected - target
        grad += diff[:, None] * X_unlabeled[mask].mean(axis=0)

    # --- Weak supervision term ---
    if len(X_labeled) > 0:
        probs_labeled = predict_proba(X_labeled, theta)
        y_onehot = np.zeros_like(probs_labeled)
        y_onehot[np.arange(len(y_labeled)), y_labeled] = 1
        sup_loss = -np.sum(y_onehot * np.log(probs_labeled + 1e-10))
        loss += lambda_sup * sup_loss / len(X_labeled)
        grad += lambda_sup * ((probs_labeled - y_onehot).T @ X_labeled) / len(X_labeled)

    # --- Regularization ---
    loss += 0.5 / sigma**2 * np.sum(theta**2)
    grad += theta / sigma**2

    return loss, grad

# ------------------------------
# 6. Optimize via gradient descent
# ------------------------------
print("ðŸš€ Training GE-FL (weak supervision) model...")
lr = 0.3
for step in range(300):
    loss, grad = combined_loss_and_grad(theta, X_labeled, y_labeled, X_unlabeled, vocab, labeled_features, sigma, lambda_sup)
    theta -= lr * grad
    if step % 30 == 0:
        print(f"Step {step:3d} | Loss = {loss:.4f}")

# ------------------------------
# 7. Evaluate GE-FL (weak supervision)
# ------------------------------
probs_test = predict_proba(X_test, theta)
y_pred = np.argmax(probs_test, axis=1)
acc = accuracy_score(test_labels, y_pred)
print(f"\nâœ… GE-FL (weak supervision) Test Accuracy: {acc*100:.2f}%")

# ------------------------------
# 8. Supervised baseline (full data)
# ------------------------------
print("\nðŸ’ª Training fully supervised Logistic Regression baseline...")
supervised = LogisticRegression(max_iter=1000)
supervised.fit(vectorizer.transform(texts), labels)
acc_sup = supervised.score(X_test, test_labels)
print(f"Supervised baseline accuracy: {acc_sup*100:.2f}%")

# ------------------------------
# 9. Inference function
# ------------------------------
def predict(text):
    X_new = vectorizer.transform([text]).toarray()
    probs = predict_proba(X_new, theta)
    label = np.argmax(probs)
    return "Positive" if label == 0 else "Negative"

# ------------------------------
# 10. Example predictions
# ------------------------------
print("\nðŸ”® Example Predictions:")
examples = [
    "An excellent and touching movie.",
    "Terrible acting and awful script.",
    "The plot was boring and predictable.",
    "A great film with amazing visuals!",
    "It was just okay, not too bad."
]
for ex in examples:
    print(f"'{ex}' => {predict(ex)}")


ðŸ“¦ Loading dataset...
âœ… Labeled instances: 426, Unlabeled: 8104
ðŸš€ Training GE-FL (weak supervision) model...
Step   0 | Loss = 7.9712
Step  30 | Loss = 5.5995
Step  60 | Loss = 5.5995
Step  90 | Loss = 5.5995
Step 120 | Loss = 5.5995
Step 150 | Loss = 5.5995
Step 180 | Loss = 5.5995
Step 210 | Loss = 5.5995
Step 240 | Loss = 5.5995
Step 270 | Loss = 5.5995

âœ… GE-FL (weak supervision) Test Accuracy: 44.56%

ðŸ’ª Training fully supervised Logistic Regression baseline...
Supervised baseline accuracy: 76.45%

ðŸ”® Example Predictions:
'An excellent and touching movie.' => Positive
'Terrible acting and awful script.' => Negative
'The plot was boring and predictable.' => Negative
'A great film with amazing visuals!' => Positive
'It was just okay, not too bad.' => Negative


In [1]:
# ============================================================
#  Learning from Labeled Features using Generalized Expectation
#  (GE-FL) â€“ Weak Supervision Implementation
#  Dataset: Rotten Tomatoes (Movie Review Polarity)
# ============================================================

import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.special import softmax

# ------------------------------------------------------------
# 1. Load Real Dataset
# ------------------------------------------------------------
print("ðŸ“¦ Loading dataset...")
dataset = load_dataset("rotten_tomatoes")

# Convert to plain Python types
texts = list(dataset["train"]["text"])
labels = np.array(dataset["train"]["label"])
test_texts = list(dataset["test"]["text"])
test_labels = np.array(dataset["test"]["label"])

# Weak supervision split: small labeled + large unlabeled
X_labeled_texts, X_unlabeled_texts, y_labeled, _ = train_test_split(
    texts, labels, test_size=0.90, stratify=labels, random_state=42
)
print(f"âœ… Labeled instances (weak supervision): {len(X_labeled_texts)}, Unlabeled: {len(X_unlabeled_texts)}")

# ------------------------------------------------------------
# 2. Vectorization
# ------------------------------------------------------------
vectorizer = CountVectorizer(max_features=5000, stop_words="english", binary=True)
X_all = vectorizer.fit_transform(texts).toarray()
X_labeled = vectorizer.transform(X_labeled_texts).toarray()
X_unlabeled = vectorizer.transform(X_unlabeled_texts).toarray()
X_test = vectorizer.transform(test_texts).toarray()

vocab = vectorizer.get_feature_names_out()

# ------------------------------------------------------------
# 3. Define labeled features (domain knowledge)
# ------------------------------------------------------------
labeled_features = {
    # Positive features
    "excellent": [1, 0],
    "amazing": [1, 0],
    "wonderful": [1, 0],
    "great": [1, 0],
    "fantastic": [1, 0],
    "love": [1, 0],
    "enjoyed": [1, 0],
    "brilliant": [1, 0],
    "favorite": [1, 0],
    "best": [1, 0],
    # Negative features
    "boring": [0, 1],
    "awful": [0, 1],
    "terrible": [0, 1],
    "bad": [0, 1],
    "poor": [0, 1],
    "worst": [0, 1],
    "waste": [0, 1],
    "disappointing": [0, 1],
    "stupid": [0, 1],
    "slow": [0, 1],
}

# ------------------------------------------------------------
# 4. Helper Functions
# ------------------------------------------------------------
def predict_proba(X, theta):
    logits = X @ theta.T
    return softmax(logits, axis=1)

def compute_GE_loss(X_unlabeled, theta, labeled_features, vocab, lambda_sup, X_lab, y_lab):
    # Gaussian prior regularization
    reg_loss = 0.5 * np.sum(theta ** 2)

    # Supervised loss (on small labeled set)
    if len(X_lab) > 0:
        probs = predict_proba(X_lab, theta)
        sup_loss = -np.mean(np.log(probs[np.arange(len(y_lab)), y_lab] + 1e-8))
    else:
        sup_loss = 0

    # Generalized Expectation (GE) term
    ge_loss = 0
    for word, target in labeled_features.items():
        if word not in vocab:
            continue
        idx = np.where(vocab == word)[0][0]
        has_word = X_unlabeled[:, idx] > 0
        if np.any(has_word):
            probs = predict_proba(X_unlabeled[has_word], theta)
            avg_pred = probs.mean(axis=0)
            target_dist = np.array(target) / np.sum(target)
            ge_loss += np.sum((avg_pred - target_dist) ** 2)

    total_loss = reg_loss + ge_loss + lambda_sup * sup_loss
    return total_loss

# ------------------------------------------------------------
# 5. GE-FL Optimization Loop
# ------------------------------------------------------------
np.random.seed(42)
theta = np.random.normal(0, 0.01, (2, X_all.shape[1]))  # binary classifier

lr = 0.1
lambda_sup = 5.0  # stronger supervised weight
steps = 300

print("\nðŸš€ Training GE-FL (weak supervision) model...")
for step in range(steps):
    # Gradient approximation by finite difference
    grad = np.zeros_like(theta)
    eps = 1e-4
    base_loss = compute_GE_loss(X_unlabeled, theta, labeled_features, vocab, lambda_sup, X_labeled, y_labeled)
    for i in range(theta.shape[0]):
        for j in np.random.choice(theta.shape[1], size=100, replace=False):
            theta_perturbed = theta.copy()
            theta_perturbed[i, j] += eps
            loss2 = compute_GE_loss(X_unlabeled, theta_perturbed, labeled_features, vocab, lambda_sup, X_labeled, y_labeled)
            grad[i, j] = (loss2 - base_loss) / eps

    theta -= lr * grad

    if step % 30 == 0:
        print(f"Step {step:3d} | Loss = {base_loss:.4f}")

# ------------------------------------------------------------
# 6. Evaluate Weakly Supervised GE-FL Model
# ------------------------------------------------------------
probs_test = predict_proba(X_test, theta)
preds_test = np.argmax(probs_test, axis=1)

# Weak supervision accuracy (on small labeled + test)
probs_labeled = predict_proba(X_labeled, theta)
preds_labeled = np.argmax(probs_labeled, axis=1)

acc_weak = accuracy_score(y_labeled, preds_labeled)
acc_weak_test = accuracy_score(test_labels, preds_test)

print("\nðŸŽ¯ Evaluation Results:")
print(f"  â€¢ Weak Supervision Accuracy (on small labeled set): {acc_weak * 100:.2f}%")
print(f"  â€¢ GE-FL Weak Supervision Test Accuracy: {acc_weak_test * 100:.2f}%")

# ------------------------------------------------------------
# 7. Supervised Baseline
# ------------------------------------------------------------
print("\nðŸ’ª Training fully supervised Logistic Regression baseline...")
supervised = LogisticRegression(max_iter=1000)
supervised.fit(vectorizer.transform(texts), labels)
acc_sup = supervised.score(X_test, test_labels)
print(f"  â€¢ Fully Supervised Test Accuracy: {acc_sup * 100:.2f}%")

# ------------------------------------------------------------
# 8. Example Predictions
# ------------------------------------------------------------
def predict(text):
    X_new = vectorizer.transform([text]).toarray()
    probs = predict_proba(X_new, theta)
    label = np.argmax(probs)
    return "Positive" if label == 0 else "Negative"

print("\nðŸ”® Example Predictions:")
examples = [
    "An excellent and touching movie.",
    "Terrible acting and awful script.",
    "The plot was boring and predictable.",
    "A great film with amazing visuals!",
    "It was just okay, not too bad."
]
for ex in examples:
    print(f"'{ex}' => {predict(ex)}")


ðŸ“¦ Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

âœ… Labeled instances (weak supervision): 853, Unlabeled: 7677

ðŸš€ Training GE-FL (weak supervision) model...
Step   0 | Loss = 13.9847
Step  30 | Loss = 13.1116
Step  60 | Loss = 12.5464
Step  90 | Loss = 11.8585
Step 120 | Loss = 11.4127
Step 150 | Loss = 11.0868
Step 180 | Loss = 10.8398
Step 210 | Loss = 10.6198
Step 240 | Loss = 10.4436
Step 270 | Loss = 10.3128

ðŸŽ¯ Evaluation Results:
  â€¢ Weak Supervision Accuracy (on small labeled set): 53.46%
  â€¢ GE-FL Weak Supervision Test Accuracy: 43.71%

ðŸ’ª Training fully supervised Logistic Regression baseline...
  â€¢ Fully Supervised Test Accuracy: 75.98%

ðŸ”® Example Predictions:
'An excellent and touching movie.' => Positive
'Terrible acting and awful script.' => Negative
'The plot was boring and predictable.' => Negative
'A great film with amazing visuals!' => Positive
'It was just okay, not too bad.' => Negative
