This notebook implements the full algorithm for semi-supervised learning based on logistic regression.

In [None]:
pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from datasets import load_dataset
import matplotlib.pyplot as plt

In [None]:
# Show every split the IMDB dataset provides, with its columns and row counts
print(load_dataset("imdb"))



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [None]:
# The split percentages below are tunable.
# Labeled training set: the first 2% plus the last 2% of `train` (1000 reviews in total).
# It is kept deliberately small to simulate a semi-supervised setting where labels are scarce.
# NOTE(review): both ends are presumably sampled so that both classes are present - confirm how `train` is ordered.
imdb_train_labeled = load_dataset("imdb", split='train[:2%]+train[-2%:]')
# Unlabeled pool: the middle 20% of the `unsupervised` split (10000 reviews).
# Alternative source for the pool: split='train[10%:90%]'.
imdb_train_unlabeled = load_dataset("imdb", split='unsupervised[40%:60%]')
# Test set: the first 5% plus the last 5% of `test` (2500 reviews in total).
imdb_test = load_dataset("imdb", split='test[:5%]+test[-5%:]')



In [None]:
# Sanity-check the columns and row count of each of the three subsets
print(imdb_train_labeled)
print(imdb_train_unlabeled)
print(imdb_test)

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})
Dataset({
    features: ['text', 'label'],
    num_rows: 10000
})
Dataset({
    features: ['text', 'label'],
    num_rows: 2500
})


In [None]:
# Split each subset into review texts and labels.
# list(...) guarantees plain, mutable Python lists: the self-training loop later appends to
# the labeled lists and removes items from the unlabeled one, which only works on real lists
# (column access in newer `datasets` releases may return a read-only column object instead).
train_data_l = list(imdb_train_labeled["text"])
train_labels_l = list(imdb_train_labeled["label"])

# Only the text of the unlabeled pool is used; its label column is deliberately never read.
train_data_u = list(imdb_train_unlabeled["text"])

test_data = list(imdb_test["text"])
test_labels = list(imdb_test["label"])

In [None]:
# scikit-learn works with array-like targets, so materialise both label lists as numpy arrays
y_train_l, y_test = np.array(train_labels_l), np.array(test_labels)

In [None]:
def sstc(train_data, test_data, train_labels, test_labels):
    '''
    Train a bag-of-words logistic regression classifier (scikit-learn LogisticRegression)
    and measure its accuracy on a test set.

    The vocabulary is learned from train_data only; test_data is encoded with that same
    vocabulary. Nothing is printed: the accuracy is returned to the caller.

        Parameters:
            train_data (list): list of strings, each string is a review from the training set
            test_data (list): list of strings, each string is a review from the test set
            train_labels (list): list of integers, each integer is the label (0 or 1) for the corresponding review in train_data
            test_labels (list): list of integers, each integer is the label (0 or 1) for the corresponding review in test_data

        Returns:
            tuple: (accuracy, model) where accuracy is the fraction of reviews in test_data
            whose predicted label equals the corresponding entry of test_labels, and model
            is the fitted LogisticRegression instance
    '''
    # create a vectorizer object to generate feature vectors, we will use word counts as features
    vectorization = CountVectorizer()
    X_train = vectorization.fit_transform(train_data)
    X_test = vectorization.transform(test_data)

    # train a logistic regression model on the training set
    model = LogisticRegression(random_state=0, max_iter=1000).fit(
        X_train, train_labels)

    # make predictions on the test data
    y_pred = model.predict(X_test)

    # np.asarray keeps the comparison element-wise whatever sequence type the labels arrive as
    return np.mean(y_pred == np.asarray(test_labels)), model

In [None]:
# initial step: fit the baseline model on the labeled data only;
# `model` is the starting point that the self-training loop below predicts with
accuracy, model = sstc(train_data_l, test_data, y_train_l, y_test)

In [None]:
# Baseline to compare against: test accuracy of the model trained on the labeled data alone
print('The initial model\'s test accuracy is', accuracy)

The initial model's test accuracy is 0.7568


In [None]:
# Size of the unlabeled pool before the self-training loop starts consuming it
print(len(train_data_u))

10000


In [None]:
# Self-training loop: pseudo-label the unlabeled reviews the current model is most confident
# about, move them into the labeled set, refit, and repeat until too few reviews qualify.
#
# NOTE: this cell grows train_data_l / train_labels_l, shrinks train_data_u (all in place) and
# replaces `model`. Re-run the data-preparation and initial-fit cells before re-running it,
# otherwise it continues from the state left by the previous run.

# track the number of iterations
loop = 0

# minimum predicted class probability for a pseudo-label to be accepted
prob_thrhd = 0.99
# the loop stops once an iteration accepts this many pseudo-labels or fewer
min_qualified_obs = 100

train_accuracy = []
test_accuracy = []
# size of the unlabeled pool at the START of each iteration
unlabeled_count = []

# Vectorizer matching the incoming `model`: that model was fitted on a CountVectorizer
# vocabulary learned from the current train_data_l, so fitting on the same texts reproduces it.
# Inside the loop the vectorizer fitted together with each new model is reused, instead of
# fitting an identical one a second time at the top of every iteration.
vectorization = CountVectorizer().fit(train_data_l)

# training loop
qualified_obs = 1000
# the second condition avoids calling predict_proba on zero samples once the pool is exhausted
while qualified_obs > min_qualified_obs and len(train_data_u) > 0:
    unlabeled_count.append(len(train_data_u))

    # predict probabilities for unlabeled data
    X_unl = vectorization.transform(train_data_u)
    predict_probs = model.predict_proba(X_unl)

    # a review qualifies when its most probable class reaches the threshold;
    # predict_proba columns follow model.classes_, which maps the column back to its label
    confident = predict_probs.max(axis=1) >= prob_thrhd
    pseudo_labels = model.classes_[predict_probs.argmax(axis=1)]

    # append high confidence data to labeled dataset (in original pool order)
    drop = np.flatnonzero(confident)
    for i in drop:
        train_data_l.append(train_data_u[i])
        train_labels_l.append(pseudo_labels[i].item())
    qualified_obs = len(drop)
    print('Iteration:', loop)
    print(qualified_obs, 'unlabeled data added to labeled dataset')

    # delete high confidence data from unlabeled dataset: one pass instead of repeated pop(),
    # slice assignment keeps mutating the same list object
    train_data_u[:] = [text for text, taken in zip(train_data_u, confident) if not taken]
    print(len(train_data_u), 'unlabeled data remaining')

    # fit the model using the new labeled data; the vocabulary must be re-learned because
    # the labeled texts changed, and this vectorizer is the one the next iteration reuses
    vectorization = CountVectorizer()
    X_train = vectorization.fit_transform(train_data_l)
    X_test = vectorization.transform(test_data)
    model = LogisticRegression(random_state=0, max_iter=1000).fit(
        X_train, train_labels_l)

    # make predictions on the test data
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    test_accu = np.mean(y_test_pred == np.asarray(test_labels))
    # train accuracy is measured against labels that include the model's own pseudo-labels
    train_accu = np.mean(y_train_pred == np.asarray(train_labels_l))
    print('The train accuracy is', train_accu)
    print('The test accuracy is', test_accu)
    train_accuracy.append(train_accu)
    test_accuracy.append(test_accu)

    loop += 1

In [None]:
# Test accuracy recorded at the end of each self-training iteration
fig, ax = plt.subplots()
ax.plot(range(loop), test_accuracy)
ax.set_ylabel('Test Accuracy')
ax.set_xlabel('Iterations')

In [None]:
# Size of the unlabeled pool per iteration (unlabeled_count is recorded at the start of each one)
fig, ax = plt.subplots()
ax.bar(range(loop), unlabeled_count)
ax.set_ylabel('Unlabeled data remaining')
ax.set_xlabel('Iterations')