In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import accuracy_score

# =========================
# 1. Load 20 Newsgroups dataset
# =========================
categories = [
    'sci.space', 'comp.graphics', 'rec.sport.baseball', 'talk.politics.mideast'
]  # smaller subset for clarity and speed

data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))
X_text = data.data
y = data.target

# =========================
# 2. TF-IDF Vectorization
# =========================
# Use n-grams and increase max_features for better text representation
vectorizer = TfidfVectorizer(max_features=8000, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(X_text)

# =========================
# 3. Split labeled and unlabeled data
# =========================
# Use 20% labeled, 80% unlabeled
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
    X, y, test_size=0.80, random_state=42, stratify=y
)

# Create a separate test set from labeled portion
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled, y_labeled, test_size=0.25, random_state=42, stratify=y_labeled
)

# Combine labeled + unlabeled
y_train_full = np.concatenate([y_train, np.full(y_unlabeled.shape, -1, dtype=int)])
X_train_full = np.vstack([X_train.toarray(), X_unlabeled.toarray()])

print(f"Total docs: {X.shape[0]}")
print(f"Labeled docs: {X_train.shape[0]}")
print(f"Unlabeled docs: {X_unlabeled.shape[0]}")
print(f"Test docs: {X_test.shape[0]}")

# =========================
# 4. Dimensionality Reduction + Classifier Pipeline
# =========================
# Reduce high-dimensional TF-IDF using SVD (LSA)
svd = TruncatedSVD(n_components=300, random_state=42)

# Logistic regression (probability output helps SelfTraining)
base_clf = make_pipeline(svd, LogisticRegression(max_iter=2000, solver='lbfgs', n_jobs=-1))

# =========================
# 5. Semi-supervised training
# =========================
# Lower threshold â†’ more aggressive pseudo-labeling
semi_supervised_model = SelfTrainingClassifier(base_clf, threshold=0.6, max_iter=15, verbose=True)
semi_supervised_model.fit(X_train_full, y_train_full)

# =========================
# 6. Evaluation
# =========================
y_pred = semi_supervised_model.predict(X_test)
semi_acc = accuracy_score(y_test, y_pred)
print(f"\nâœ… Semi-supervised model accuracy: {semi_acc:.4f}")

# =========================
# 7. Baseline (supervised only)
# =========================
supervised_model = make_pipeline(svd, LogisticRegression(max_iter=2000, solver='lbfgs', n_jobs=-1))
supervised_model.fit(X_train, y_train)
y_pred_sup = supervised_model.predict(X_test)
sup_acc = accuracy_score(y_test, y_pred_sup)
print(f"ðŸ§  Supervised-only accuracy (small labeled set): {sup_acc:.4f}")

Total docs: 3894
Labeled docs: 583
Unlabeled docs: 3116
Test docs: 195
End of iteration 1, added 501 new labels.
End of iteration 2, added 591 new labels.
End of iteration 3, added 339 new labels.
End of iteration 4, added 148 new labels.
End of iteration 5, added 60 new labels.
End of iteration 6, added 23 new labels.
End of iteration 7, added 23 new labels.
End of iteration 8, added 12 new labels.
End of iteration 9, added 6 new labels.
End of iteration 10, added 5 new labels.
End of iteration 11, added 6 new labels.
End of iteration 12, added 6 new labels.
End of iteration 13, added 3 new labels.
End of iteration 14, added 1 new labels.
End of iteration 15, added 3 new labels.

âœ… Semi-supervised model accuracy: 0.7949
ðŸ§  Supervised-only accuracy (small labeled set): 0.8205


In [2]:
# ============================================================
#  Learning from Labeled Features using Generalized Expectation
#  (GE-FL) â€“ Semi-Supervised Implementation
#  Dataset: Rotten Tomatoes (Movie Review Polarity)
# ============================================================

import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from scipy.special import softmax

# ------------------------------------------------------------
# 1. Load Dataset
# ------------------------------------------------------------
print("ðŸ“¦ Loading dataset...")
dataset = load_dataset("rotten_tomatoes")

texts = list(dataset["train"]["text"])
labels = np.array(dataset["train"]["label"])
test_texts = list(dataset["test"]["text"])
test_labels = np.array(dataset["test"]["label"])

# Semi-supervised split: few labeled, many unlabeled
X_labeled_texts, X_unlabeled_texts, y_labeled, _ = train_test_split(
    texts, labels, test_size=0.9, stratify=labels, random_state=42
)
print(f"âœ… Semi-supervised setup: {len(X_labeled_texts)} labeled, {len(X_unlabeled_texts)} unlabeled")

# ------------------------------------------------------------
# 2. Vectorization
# ------------------------------------------------------------
vectorizer = CountVectorizer(max_features=5000, stop_words="english", binary=True)
X_all = vectorizer.fit_transform(texts).toarray()
X_labeled = vectorizer.transform(X_labeled_texts).toarray()
X_unlabeled = vectorizer.transform(X_unlabeled_texts).toarray()
X_test = vectorizer.transform(test_texts).toarray()
vocab = vectorizer.get_feature_names_out()

# ------------------------------------------------------------
# 3. Labeled Features (Domain Knowledge)
# ------------------------------------------------------------
labeled_features = {
    "excellent": [1, 0],
    "amazing": [1, 0],
    "wonderful": [1, 0],
    "great": [1, 0],
    "fantastic": [1, 0],
    "love": [1, 0],
    "enjoyed": [1, 0],
    "brilliant": [1, 0],
    "favorite": [1, 0],
    "best": [1, 0],
    "boring": [0, 1],
    "awful": [0, 1],
    "terrible": [0, 1],
    "bad": [0, 1],
    "poor": [0, 1],
    "worst": [0, 1],
    "waste": [0, 1],
    "disappointing": [0, 1],
    "stupid": [0, 1],
    "slow": [0, 1],
}

# ------------------------------------------------------------
# 4. Helper Functions
# ------------------------------------------------------------
def predict_proba(X, theta):
    logits = X @ theta.T
    return softmax(logits, axis=1)

def compute_loss(X_labeled, y_labeled, X_unlabeled, theta, labeled_features, vocab,
                 lambda_ge=1.0, lambda_sup=5.0, lambda_reg=0.1):
    # Regularization
    reg_loss = 0.5 * lambda_reg * np.sum(theta ** 2)

    # Supervised cross-entropy loss
    if len(X_labeled) > 0:
        probs = predict_proba(X_labeled, theta)
        sup_loss = -np.mean(np.log(probs[np.arange(len(y_labeled)), y_labeled] + 1e-8))
    else:
        sup_loss = 0.0

    # GE term on unlabeled data
    ge_loss = 0.0
    for word, target in labeled_features.items():
        if word not in vocab:
            continue
        idx = np.where(vocab == word)[0][0]
        has_word = X_unlabeled[:, idx] > 0
        if np.any(has_word):
            probs = predict_proba(X_unlabeled[has_word], theta)
            avg_pred = probs.mean(axis=0)
            target_dist = np.array(target) / np.sum(target)
            ge_loss += np.sum((avg_pred - target_dist) ** 2)
    return lambda_ge * ge_loss + lambda_sup * sup_loss + reg_loss

# ------------------------------------------------------------
# 5. Semi-supervised GE-FL Training
# ------------------------------------------------------------
np.random.seed(42)
theta = np.random.normal(0, 0.01, (2, X_all.shape[1]))

lr = 0.1
steps = 300
lambda_ge = 1.0
lambda_sup = 5.0
lambda_reg = 0.1

print("\nðŸš€ Training Semi-Supervised GE-FL Model...")
for step in range(steps):
    grad = np.zeros_like(theta)
    eps = 1e-4
    base_loss = compute_loss(X_labeled, y_labeled, X_unlabeled, theta, labeled_features, vocab,
                             lambda_ge, lambda_sup, lambda_reg)
    for i in range(theta.shape[0]):
        for j in np.random.choice(theta.shape[1], size=100, replace=False):
            theta_perturbed = theta.copy()
            theta_perturbed[i, j] += eps
            loss2 = compute_loss(X_labeled, y_labeled, X_unlabeled, theta_perturbed,
                                 labeled_features, vocab, lambda_ge, lambda_sup, lambda_reg)
            grad[i, j] = (loss2 - base_loss) / eps

    theta -= lr * grad

    if step % 30 == 0:
        print(f"Step {step:3d} | Loss = {base_loss:.4f}")

# ------------------------------------------------------------
# 6. Evaluation
# ------------------------------------------------------------
probs_test = predict_proba(X_test, theta)
preds_test = np.argmax(probs_test, axis=1)
acc_test = accuracy_score(test_labels, preds_test)

probs_labeled = predict_proba(X_labeled, theta)
preds_labeled = np.argmax(probs_labeled, axis=1)
acc_labeled = accuracy_score(y_labeled, preds_labeled)

print("\nðŸŽ¯ Semi-Supervised GE-FL Results:")
print(f"  â€¢ Labeled subset accuracy: {acc_labeled * 100:.2f}%")
print(f"  â€¢ Test accuracy: {acc_test * 100:.2f}%")

# ------------------------------------------------------------
# 7. Fully Supervised Baseline
# ------------------------------------------------------------
print("\nðŸ’ª Fully Supervised Logistic Regression baseline...")
supervised = LogisticRegression(max_iter=1000)
supervised.fit(vectorizer.transform(texts), labels)
acc_sup = supervised.score(X_test, test_labels)
print(f"  â€¢ Fully Supervised Test Accuracy: {acc_sup * 100:.2f}%")

# ------------------------------------------------------------
# 8. Example Predictions
# ------------------------------------------------------------
def predict(text):
    X_new = vectorizer.transform([text]).toarray()
    probs = predict_proba(X_new, theta)
    label = np.argmax(probs)
    return "Positive" if label == 0 else "Negative"

print("\nðŸ”® Example Predictions:")
examples = [
    "An excellent and touching movie.",
    "Terrible acting and awful script.",
    "The plot was boring and predictable.",
    "A great film with amazing visuals!",
    "It was just okay, not too bad."
]
for ex in examples:
    print(f"'{ex}' => {predict(ex)}")


ðŸ“¦ Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

âœ… Semi-supervised setup: 853 labeled, 7677 unlabeled

ðŸš€ Training Semi-Supervised GE-FL Model...
Step   0 | Loss = 13.5316
Step  30 | Loss = 12.6200
Step  60 | Loss = 12.0046
Step  90 | Loss = 11.1576
Step 120 | Loss = 10.5791
Step 150 | Loss = 10.0618
Step 180 | Loss = 9.6449
Step 210 | Loss = 9.2502
Step 240 | Loss = 8.9495
Step 270 | Loss = 8.7094

ðŸŽ¯ Semi-Supervised GE-FL Results:
  â€¢ Labeled subset accuracy: 53.69%
  â€¢ Test accuracy: 43.53%

ðŸ’ª Fully Supervised Logistic Regression baseline...
  â€¢ Fully Supervised Test Accuracy: 75.98%

ðŸ”® Example Predictions:
'An excellent and touching movie.' => Positive
'Terrible acting and awful script.' => Negative
'The plot was boring and predictable.' => Negative
'A great film with amazing visuals!' => Positive
'It was just okay, not too bad.' => Negative
