In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

print("--- Downloading and Preprocessing SRAA2 Dataset (from 20 Newsgroups) ---")

# Newsgroups we would like to use; any name that the corpus does not provide
# is filtered out below instead of causing a fetch error.
desired_categories = [
    'alt.autos',
    'rec.autos',
    'rec.aviation',
    'sci.space'
]

# Only the list of group names is needed at this point, so the costly
# header/footer/quote stripping is skipped for this call; `remove` only
# rewrites the document text and leaves target_names as they are.
all_available_categories = fetch_20newsgroups(subset='all').target_names

categories = [cat for cat in desired_categories if cat in all_available_categories]

if not categories:
    raise ValueError("None of the specified categories were found in the 20 Newsgroups dataset. Please check category names.")

print(f"Using categories: {categories}")

# Fetch the predefined train and test portions for the selected groups, with
# headers, footers and quoted replies removed from every document.
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories,
                                      remove=('headers', 'footers', 'quotes'), random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories,
                                     remove=('headers', 'footers', 'quotes'), random_state=42)

# Integer encoding of the group names, taken from the training portion.
target_names = newsgroups_train.target_names
target_map = {name: i for i, name in enumerate(target_names)}

# Labels are resolved by group name through target_map so that both frames
# share the encoding defined above.
df_train = pd.DataFrame({
    'text_column': newsgroups_train.data,
    'label_column': [target_map[newsgroups_train.target_names[t]] for t in newsgroups_train.target]
})
df_test = pd.DataFrame({
    'text_column': newsgroups_test.data,
    'label_column': [target_map[newsgroups_test.target_names[t]] for t in newsgroups_test.target]
})

# The two portions are pooled into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train, df_test], ignore_index=True)

print(f"Dataset loaded with {len(df)} samples from 20 Newsgroups ({len(df_train)} training, {len(df_test)} test).")
print("First 5 rows of the combined dataset:")
display(df.head())
print(f"Target names: {target_names}")


# Labels for every row of the pooled frame.
y = df['label_column'].values

# Partition row positions BEFORE fitting the vectorizer. The split arguments
# (test_size, random_state, stratify) are unchanged from the previous version
# of this step, so the rows landing in each side are expected to be the same.
all_row_positions = np.arange(len(df))
train_row_positions, test_row_positions = train_test_split(
    all_row_positions, test_size=0.3, random_state=42, stratify=y
)

# Learn the vocabulary and IDF weights from training rows only. Fitting on the
# full corpus would let held-out test documents influence the feature space,
# which makes the reported test metrics optimistic.
vectorizer = TfidfVectorizer(max_features=2000)
vectorizer.fit(df['text_column'].iloc[train_row_positions])

# Project every document with the train-fitted vectorizer; X keeps one row per
# row of df, in the same order.
X = vectorizer.transform(df['text_column'])
print(f"Text data vectorized into {X.shape[0]} samples and {X.shape[1]} features.")

print(f"Labels extracted for {len(y)} samples.")

# Slice features and labels with the positions chosen above.
X_train, X_test = X[train_row_positions], X[test_row_positions]
y_train, y_test = y[train_row_positions], y[test_row_positions]
print(f"Data split: {X_train.shape[0]} training samples, {X_test.shape[0]} test samples.")


# Work on a copy so the fully labeled y_train stays available for reference.
y_train_semi = y_train.copy()

# Fraction of training labels to hide, and a seeded generator so the same
# samples are hidden on every run.
unlabeled_percentage = 0.75
rng = np.random.RandomState(42)

# One uniform draw per training sample; samples whose draw falls below the
# threshold have their label replaced by the -1 "unlabeled" marker.
uniform_draws = rng.rand(y_train_semi.shape[0])
unlabeled_indices = uniform_draws < unlabeled_percentage
y_train_semi[unlabeled_indices] = -1

labeled_count = np.count_nonzero(y_train_semi != -1)
unlabeled_count = np.count_nonzero(y_train_semi == -1)

print("\n--- Semi-supervised Data Simulation Summary ---")
print(f"Original training samples: {len(y_train)}")
print(f"Number of initial labeled training samples: {labeled_count}")
print(f"Number of unlabeled training samples: {unlabeled_count}")
print(f"Number of test samples: {len(y_test)}")
print("---------------------------------------------")

--- Downloading and Preprocessing SRAA2 Dataset (from 20 Newsgroups) ---
Using categories: ['rec.autos', 'sci.space']
Dataset loaded with 1977 samples from 20 Newsgroups (1187 training, 790 test).
First 5 rows of the combined dataset:


Unnamed: 0,text_column,label_column
0,Well thank you dennis for your as usual highly...,1
1,\n\nPerhaps a nice used '88 Pontiac Fiero GT? ...,0
2,"I bought a car with a defunct engine, to use f...",0
3,\nI haven't seen any speculation about it. But...,1
4,I am in the process of looking for a half dece...,0


Target names: ['rec.autos', 'sci.space']
Text data vectorized into 1977 samples and 2000 features.
Labels extracted for 1977 samples.
Data split: 1383 training samples, 594 test samples.

--- Semi-supervised Data Simulation Summary ---
Original training samples: 1383
Number of initial labeled training samples: 352
Number of unlabeled training samples: 1031
Number of test samples: 594
---------------------------------------------


In [None]:
print("\n--- Defining Text-Based Weak Supervision Heuristic (20 Newsgroups) ---")

# Positions, within the training split, of the samples whose label was hidden.
unlabeled_train_indices_in_split = np.flatnonzero(y_train_semi == -1)

# Repeat the split with identical arguments while carrying the df row numbers
# along; the fifth returned piece is the training side of those row numbers.
split_pieces = train_test_split(
    df['text_column'], y, range(len(df)),
    test_size=0.3, random_state=42, stratify=y,
)
train_indices = split_pieces[4]

# Translate split-relative positions into df row numbers, then pull the raw text.
train_row_numbers = np.array(train_indices)
unlabeled_original_df_indices = train_row_numbers[unlabeled_train_indices_in_split]
original_unlabeled_texts = df['text_column'].iloc[unlabeled_original_df_indices]

print(f"Identified {len(original_unlabeled_texts)} original text messages for unlabeled training samples.")


# Topic vocabularies used by the weak-labeling rule below.
_AUTO_KEYWORDS = (
    'car', 'auto', 'engine', 'tire', 'vehicle', 'motor', 'ford', 'gm', 'honda',
    'toyota', 'dealership', 'sedan', 'truck', 'transmission', 'mechanic',
)
_SPACE_KEYWORDS = (
    'space', 'nasa', 'satellite', 'orbit', 'galaxy', 'astronomy', 'shuttle',
    'moon', 'planet', 'cosmic', 'universe', 'telescope', 'rocket', 'mission', 'hst',
)


def _compile_keyword_pattern(keywords):
    """Build a regex matching any keyword as a whole word, with an optional plural 's'."""
    alternation = "|".join(re.escape(keyword) for keyword in keywords)
    return re.compile(rf"\b(?:{alternation})s?\b")


# Compiled once at definition time instead of rebuilding the lists on every call.
_AUTO_PATTERN = _compile_keyword_pattern(_AUTO_KEYWORDS)
_SPACE_PATTERN = _compile_keyword_pattern(_SPACE_KEYWORDS)


def text_weak_label_heuristic(text_message):
    """Assign a weak topic label to one message from keyword evidence.

    The message is converted with ``str()`` and lower-cased, then searched for
    automotive and space keywords. Keywords must appear as whole words
    (optionally followed by "s"); plain substring matching would fire on
    unrelated words such as "care", "entire", "afford" or "paradigm".

    Parameters
    ----------
    text_message : object
        The message to label; any value is accepted and stringified.

    Returns
    -------
    int
        ``target_map['rec.autos']`` when only automotive keywords occur,
        ``target_map['sci.space']`` when only space keywords occur, and ``-1``
        (abstain) when neither or both topics are present.

    Raises
    ------
    KeyError
        If the matching group name is missing from ``target_map``.
    """
    text_message = str(text_message).lower()

    is_auto = _AUTO_PATTERN.search(text_message) is not None
    is_space = _SPACE_PATTERN.search(text_message) is not None

    # A label is emitted only when exactly one topic is evidenced.
    if is_auto and not is_space:
        return target_map['rec.autos']
    if is_space and not is_auto:
        return target_map['sci.space']
    return -1

# Run the heuristic over every unlabeled message, keeping the input order.
heuristic_outputs = list(map(text_weak_label_heuristic, original_unlabeled_texts))
weak_labels_text_heuristic = np.array(heuristic_outputs)

# -1 means the heuristic abstained, so only the other entries count as labels.
assigned_count = np.count_nonzero(weak_labels_text_heuristic != -1)

print(f"Generated {assigned_count} weak labels from the text heuristic.")
print(f"Weak labels generated (sample): {weak_labels_text_heuristic[:10]}")
print("--------------------------------------------------")


--- Defining Text-Based Weak Supervision Heuristic (20 Newsgroups) ---
Identified 1031 original text messages for unlabeled training samples.
Generated 551 weak labels from the text heuristic.
Weak labels generated (sample): [ 0 -1 -1 -1 -1  1  0  0  1 -1]
--------------------------------------------------


In [None]:
print("\n--- Combining Text Weak Labels with Initial Labeled Data ---")

# Start from the partially labeled vector; the original stays untouched.
y_train_combined_text_ws = y_train_semi.copy()

# Entry k of weak_labels_text_heuristic belongs to training position
# unlabeled_train_indices_in_split[k]. A slot is filled only when the
# heuristic produced a label AND the slot is still marked -1.
heuristic_has_label = weak_labels_text_heuristic != -1
slot_still_unlabeled = y_train_combined_text_ws[unlabeled_train_indices_in_split] == -1
fill_mask = heuristic_has_label & slot_still_unlabeled

positions_to_fill = unlabeled_train_indices_in_split[fill_mask]
y_train_combined_text_ws[positions_to_fill] = weak_labels_text_heuristic[fill_mask]
newly_weakly_labeled_from_text = int(np.count_nonzero(fill_mask))

initial_labeled_total = np.count_nonzero(y_train_semi != -1)
combined_labeled_total = np.count_nonzero(y_train_combined_text_ws != -1)

print(f"Total initial labeled samples: {initial_labeled_total}")
print(f"Weak labels added from text heuristic: {newly_weakly_labeled_from_text}")
print(f"Total labeled samples after text weak supervision: {combined_labeled_total}")

# Per-value counts of the merged label vector, including the -1 marker.
print("Labels after incorporating text-based weak supervision (-1 indicates still unlabeled):")
combined_label_counts = pd.Series(y_train_combined_text_ws).value_counts()
display(combined_label_counts)
print("----------------------------------------------------------")


--- Combining Text Weak Labels with Initial Labeled Data ---
Total initial labeled samples: 352
Weak labels added from text heuristic: 551
Total labeled samples after text weak supervision: 903
Labels after incorporating text-based weak supervision (-1 indicates still unlabeled):


Unnamed: 0,count
0,502
-1,480
1,401


----------------------------------------------------------


In [None]:
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.linear_model import LogisticRegression

print("\n--- Applying Self-Training with Updated Labels ---")

# Base learner wrapped by the self-training classifier below.
base_classifier_text_ws = LogisticRegression(random_state=42, max_iter=200)

# Fit on the merged label vector, in which -1 marks samples without a label;
# fit() returns the fitted estimator, so construction and fitting are chained.
self_training_model_text_ws = SelfTrainingClassifier(
    base_classifier_text_ws,
    threshold=0.8,
).fit(X_train, y_train_combined_text_ws)

print("Self-training completed with labels incorporating text-based weak supervision.")
print("----------------------------------------------------------")


--- Applying Self-Training with Updated Labels ---
Self-training completed with labels incorporating text-based weak supervision.
----------------------------------------------------------


In [None]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

print("\n--- Evaluating Self-Training Model with Text Weak Supervision ---")

# Compute both evaluation artifacts first, then report them.
y_pred_self_train = self_training_model_text_ws.predict(X_test)
accuracy_self_train = self_training_model_text_ws.score(X_test, y_test)

print(f"Self-Training Model (Text WS) Test Accuracy: {accuracy_self_train:.4f}")

# Per-class precision/recall/F1 on the held-out test split.
print("\nClassification Report for Self-Training Model (Text WS):")
self_train_report = classification_report(y_test, y_pred_self_train)
print(self_train_report)

print("----------------------------------------------------------------")

print("\n--- Evaluating Baseline Model (Only Initial Labeled Data) ---")

# Keep only the training samples that still carry a label after masking.
initially_labeled_mask = y_train_semi != -1
X_train_initial_labeled = X_train[initially_labeled_mask]
y_train_initial_labeled = y_train_semi[initially_labeled_mask]

if X_train_initial_labeled.shape[0] == 0:
    # Nothing to fit on, so the comparison is skipped.
    print("No initially labeled data available to train a baseline model for comparison.")
else:
    # Same estimator settings as the self-training base learner, trained on
    # the labeled subset alone.
    baseline_model = LogisticRegression(random_state=42, max_iter=200)
    baseline_model.fit(X_train_initial_labeled, y_train_initial_labeled)

    y_pred_baseline = baseline_model.predict(X_test)
    accuracy_baseline = baseline_model.score(X_test, y_test)
    print(f"Baseline Model (Initial Labeled Data) Test Accuracy: {accuracy_baseline:.4f}")

    print("\nClassification Report for Baseline Model (Initial Labeled Data):")
    baseline_report = classification_report(y_test, y_pred_baseline)
    print(baseline_report)

print("---------------------------------------------------------------")


--- Evaluating Self-Training Model with Text Weak Supervision ---
Self-Training Model (Text WS) Test Accuracy: 0.8939

Classification Report for Self-Training Model (Text WS):
              precision    recall  f1-score   support

           0       0.84      0.97      0.90       297
           1       0.96      0.82      0.89       297

    accuracy                           0.89       594
   macro avg       0.90      0.89      0.89       594
weighted avg       0.90      0.89      0.89       594

----------------------------------------------------------------

--- Evaluating Baseline Model (Only Initial Labeled Data) ---
Baseline Model (Initial Labeled Data) Test Accuracy: 0.8687

Classification Report for Baseline Model (Initial Labeled Data):
              precision    recall  f1-score   support

           0       0.89      0.84      0.86       297
           1       0.85      0.90      0.87       297

    accuracy                           0.87       594
   macro avg       0.87 