In [1]:
from speech import *
from classify import evaluate

In [2]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def self_train(Xu, Xl, yl, C=0.7, confident_cutoff=0.8):
    """Self-train a logistic-regression classifier with pseudo-labels.

    Each round fits a fresh ``LogisticRegression`` on the current training
    set, scores the remaining unlabeled rows, and moves every row whose
    highest class probability is at least ``confident_cutoff`` (together with
    its predicted label) from the unlabeled pool into the training set. The
    loop stops when a round yields no confident rows or the pool is empty.

    Note: this function reads the notebook-level names ``speech`` (for
    ``speech.devX`` / ``speech.devy``) and ``evaluate`` to report dev-set
    performance after every fit; both must exist before it is called.

    Args:
        Xu: Unlabeled feature matrix (sparse matrix or array), one row per sample.
        Xl: Labeled feature matrix with the same number of columns as ``Xu``.
        yl: Labels for ``Xl``.
        C: Inverse regularization strength passed to ``LogisticRegression``.
        confident_cutoff: Minimum top-class probability for a pseudo-label
            to be accepted.

    Returns:
        ``(cls, Xhat, yhat)``: the last fitted classifier and the training
        data it was fitted on. ``Xhat`` is a dense array once at least one
        pseudo-labeled batch has been added; otherwise it is ``Xl`` itself.
    """
    def _as_dense(X):
        # Sparse matrices expose toarray(); anything else is treated as array-like.
        return X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    Xhat, yhat = Xl, yl
    num_iter = 0
    while True:
        # Train
        num_iter += 1
        print(f"{num_iter}th train")
        # Report the set actually being fitted, not the fixed labeled seed.
        print("Data size:", Xhat.shape, yhat.shape)
        cls = LogisticRegression(max_iter=10000, n_jobs=-1, C=C)
        cls.fit(Xhat, yhat)

        print("Evaluate Dev")
        evaluate(speech.devX, speech.devy, cls)

        # scikit-learn rejects 0-sample input, so stop once the pool is used up.
        if Xu.shape[0] == 0:
            print("No unlabeled data left")
            break

        # Predict
        print("Predicting unlabeled data with the previous model")
        # One scoring pass gives both the confidence and the predicted label.
        proba = cls.predict_proba(Xu)
        confidents = proba.max(axis=1)
        yu_hat = cls.classes_[proba.argmax(axis=1)]
        print(confidents)

        is_confident = confidents >= confident_cutoff
        if not is_confident.any():
            print("Data size has converged")
            break

        # Expand Confident samples
        confident_Xu = Xu[is_confident]
        confident_yu_hat = yu_hat[is_confident]
        Xu = Xu[~is_confident]

        # Accumulate onto the running training set. Rebuilding from Xl here
        # would drop the pseudo-labels gathered in earlier rounds, because
        # those rows have already been removed from Xu.
        # Kept dense so the returned Xhat has the same type as before.
        Xhat = np.concatenate((_as_dense(Xhat), _as_dense(confident_Xu)), axis=0)
        yhat = np.concatenate((yhat, confident_yu_hat), axis=0)

    return cls, Xhat, yhat

In [None]:
from nltk.stem import WordNetLemmatizer

class Lemmatizer:
    """Callable wrapper around an NLTK ``WordNetLemmatizer`` instance."""

    def __init__(self):
        # Build the lemmatizer once and reuse it for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, word):
        """Return ``self.wnl.lemmatize(word)``."""
        lemma = self.wnl.lemmatize(word)
        return lemma

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from speech import *
from classify import evaluate

def get_file_list(tsv_file, data_dir="data/speech"):
    """Collect the file paths listed in the first column of a TSV index.

    The first tab-separated field of every non-blank line is taken as a file
    name and prefixed with ``data_dir``. Any remaining fields are ignored.

    Args:
        tsv_file: Path of the tab-separated index file to read.
        data_dir: Directory prefix joined to each file name with ``/``.

    Returns:
        List of ``"{data_dir}/{fname}"`` strings, in file order.
    """
    print(tsv_file)
    fnames = []
    with open(tsv_file, 'r') as f:
        for line in f:
            row = line.strip()
            if not row:
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue
            # Only the first column is needed; splitting once tolerates rows
            # with an empty label or with extra tab-separated fields.
            fname = row.split('\t', 1)[0]
            fnames.append(f"{data_dir}/{fname}")
    return fnames

class BPETokenizer:
    """Callable BPE tokenizer that is trained when the object is constructed."""

    def __init__(self):
        # Whitespace pre-tokenized BPE model with an explicit unknown token.
        bpe = Tokenizer(BPE(unk_token="[UNK]"))
        bpe.pre_tokenizer = Whitespace()

        special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
        bpe_trainer = BpeTrainer(special_tokens=special_tokens)

        print("Training BPE with training data:")
        # The files to train on are the ones listed in the training index.
        corpus_files = get_file_list("data/speech/train.tsv")
        bpe.train(files=corpus_files, trainer=bpe_trainer)

        self.tokenizer = bpe

    def __call__(self, articles):
        """Encode ``articles`` and return the encoding's ``tokens``."""
        encoding = self.tokenizer.encode(articles)
        return encoding.tokens

# Constructing BPETokenizer trains the BPE model right away (its __init__
# calls tokenizer.train), so this line does the training work.
bpe_tokenizer = BPETokenizer()

Training BPE with training data:
data/speech/train.tsv


In [None]:
# Load the labeled corpus from the tarball, passing a Lemmatizer as the
# preprocessor and the trained BPE tokenizer as the tokenizer.
# NOTE(review): Lemmatizer.__call__ lemmatizes its argument as a single word;
# if read_files hands the preprocessor whole documents, this will not
# lemmatize individual tokens — verify against read_files.
print("Reading data")
tarfname = "data/speech.tar.gz"
speech = read_files(tarfname, preprocessor=Lemmatizer(), tokenizer=bpe_tokenizer)
print(speech.trainX.shape)

# read_unlabeled is given the loaded `speech` object as well as the tarball
# path (presumably so the unlabeled text is vectorized the same way — TODO confirm).
print("Reading unlabeled data")
unlabeled = read_unlabeled(tarfname, speech)
print(unlabeled.X.shape)

In [None]:
# Run self-training with the default C and confidence cutoff. Keeps the final
# classifier plus the training features/labels that self_train returns.
print("Training classifier")
cls, Xhat, yhat = self_train(unlabeled.X, speech.trainX, speech.trainy)

# Score the final classifier on the original labeled training split and on the dev split.
print("Evaluating")
evaluate(speech.trainX, speech.trainy, cls)
evaluate(speech.devX, speech.devy, cls)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Distribution of the labels in the training set returned by self_train.
bins = np.unique(yhat)
# N labels need N + 1 edges. With only `bins - .5` the last edge lies below
# the largest label, so that class is silently left out of the histogram.
bin_edges = np.append(bins, bins[-1] + 1) - .5
plt.hist(yhat, bins=bin_edges, edgecolor='black')
plt.xticks(bins)
plt.xlabel("Label")
plt.ylabel("Count")
plt.grid(axis='y')
plt.title("Histogram of the training set labels")
plt.show()

## Which labels to include in Dl_hat in every iteration?
https://towardsdatascience.com/self-training-for-natural-language-understanding-d5c369b5a7f6

Idea: in each iteration, select the top-K most confident samples from the unlabeled data for each category, ranked by the teacher model's prediction.