# Designing Better Features

## The Sparsity of the Vocabulary

- utilize the corpus of unlabeled text to learn something
- knowledge from unlabeled documents can allow us to spread the labels to the words that do not even appear in the training data
- e.g. co-occurrence statistics on the unlabeled set of speeches

### How to represent the word contexts?

- word-document matrix
- word-word matrix

### How to compute similarity between word representations?

- cosine similarity (i.e. 1 − cosine distance)
- PMI (pointwise mutual information)

### How to represent a document?

- sum of the word embeddings

In [3]:
from speech import *
from classify import evaluate

In [17]:
from nltk.stem import WordNetLemmatizer

class Lemmatizer(object):
    """Callable text preprocessor that lemmatizes a sentence word by word.

    The input is split on whitespace with ``str.split()``, each piece is
    passed to the wrapped lemmatizer's ``lemmatize`` method, and the results
    are joined back with single spaces (so runs of whitespace collapse).
    """

    def __init__(self):
        # A single lemmatizer instance is created once and reused on every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, sentence):
        """Return ``sentence`` with every whitespace-separated word lemmatized."""
        lemmatize = self.wnl.lemmatize
        lemmas = []
        for token in sentence.split():
            lemmas.append(lemmatize(token))
        return ' '.join(lemmas)
    
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from speech import *
from classify import evaluate
from tokenizers.processors import TemplateProcessing
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
from os import listdir

def get_file_list(tsv_file, base_dir="data/speech"):
    """Return the paths of the files listed in a tab-separated index file.

    The first tab-separated column of every line is taken as a file name and
    prefixed with ``base_dir``. Any further columns (e.g. the label) are
    ignored, and blank lines are skipped instead of raising ``ValueError``.

    Args:
        tsv_file: Path of the TSV index to read; it is also echoed to stdout.
        base_dir: Directory prefix joined to each file name with ``/``.

    Returns:
        A list of ``"{base_dir}/{fname}"`` strings in file order.
    """
    print(tsv_file)
    fnames = []
    with open(tsv_file, 'r') as f:
        for line in f:
            # Only the first column is needed; splitting once keeps lines with
            # extra or missing columns from breaking the loop.
            fname = line.split('\t', 1)[0].strip()
            if not fname:
                continue  # blank line, or a row with an empty file-name column
            fnames.append(f"{base_dir}/{fname}")
    return fnames

def get_unlabeled_file_list(dirname='data/speech/unlabeled'):
    """Return the paths of all ``.txt`` files directly inside ``dirname``.

    Only names that *end* with ``.txt`` are kept (a substring test would also
    match names such as ``notes.txt.bak``). The listing is sorted because
    ``os.listdir`` returns entries in arbitrary order, which would make the
    resulting file list differ between runs and machines.

    Args:
        dirname: Directory to scan; defaults to the unlabeled speech folder.

    Returns:
        A sorted list of ``"{dirname}/{fname}"`` strings.
    """
    return [
        f'{dirname}/{fname}'
        for fname in sorted(listdir(dirname))
        if fname.endswith(".txt")
    ]

class BPETokenizer(object):
    """Callable wrapper around a BPE ``Tokenizer`` that is trained on construction.

    Constructing an instance configures the pipeline (NFD + accent stripping,
    whitespace pre-tokenization, a ``[CLS] ... [SEP]`` template) and then trains
    the BPE model on the files listed in ``data/speech/train.tsv`` plus the
    files returned by ``get_unlabeled_file_list()``.
    """

    def __init__(self):
        bpe = Tokenizer(BPE(unk_token="[UNK]"))
        bpe.pre_tokenizer = Whitespace()
        bpe.normalizer = normalizers.Sequence([NFD(), StripAccents()])
        # NOTE(review): IDs 1 and 2 presumably correspond to the positions of
        # "[CLS]" and "[SEP]" in the trainer's special-token list below —
        # confirm against the tokenizers documentation.
        cls_and_sep = [("[CLS]", 1), ("[SEP]", 2)]
        bpe.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=cls_and_sep,
        )
        self.tokenizer = bpe

        bpe_trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
        print("Training BPE with training data:")
        # Labeled training files first, then the unlabeled speeches.
        labeled_files = get_file_list("data/speech/train.tsv")
        unlabeled_files = get_unlabeled_file_list()
        self.tokenizer.train(files=labeled_files + unlabeled_files, trainer=bpe_trainer)

    def __call__(self, articles):
        """Encode ``articles`` and return the ``tokens`` of the resulting encoding."""
        encoding = self.tokenizer.encode(articles)
        return encoding.tokens

# Build the tokenizer once at cell-execution time; constructing it trains the
# BPE model (see BPETokenizer.__init__), so this line reads the corpus files.
bpe_tokenizer = BPETokenizer()

Training BPE with training data:
data/speech/train.tsv


In [4]:
from sklearn.linear_model import LogisticRegression
import numpy as np
from scipy.sparse import vstack

def self_train(Xu, Xl, yl, devX, devy, C=0.7, confident_cutoff=0.8, max_stall=5):
    """Self-training: repeatedly fit a classifier and absorb confident pseudo-labels.

    Each round fits a fresh ``LogisticRegression`` on the current labeled set,
    scores it on the dev set with ``evaluate``, then predicts the remaining
    unlabeled rows. Rows whose highest class probability is at least
    ``confident_cutoff`` are moved, with their predicted labels, from the
    unlabeled pool into the training set.

    The loop stops when any of the following holds:
      * the dev score has failed to beat the best score ``max_stall`` times in a row;
      * the unlabeled pool is empty (predicting on zero rows would raise);
      * no remaining unlabeled row reaches the confidence cutoff.

    Args:
        Xu: Unlabeled feature matrix; must support ``.shape`` and boolean-mask
            row indexing.
        Xl: Labeled feature matrix used as the initial training set.
        yl: Labels for ``Xl``; must expose ``.shape`` (e.g. a NumPy array).
        devX, devy: Dev features/labels, passed unchanged to ``evaluate``.
        C: Value forwarded to ``LogisticRegression(C=...)``.
        confident_cutoff: Minimum max-class probability for a pseudo-label to
            be accepted.
        max_stall: Number of consecutive non-improving rounds that ends training.

    Returns:
        ``(clss, accs, data_sizes)``: per-round fitted models, the value
        ``evaluate`` returned for each, and the training-set row counts.
    """
    Xhat, yhat = Xl, yl
    num_iter = 0
    num_stall = 0
    curr_best = .0

    clss = []
    accs = []
    data_sizes = []

    while True:
        # Train
        num_iter += 1
        data_sizes.append(Xhat.shape[0])
        print(f"{num_iter}th train")
        print("Data size:", Xhat.shape, yhat.shape)
        model = LogisticRegression(max_iter=10000, n_jobs=-1, C=C)
        model.fit(Xhat, yhat)
        clss.append(model)

        print("Evaluate Dev")
        acc = evaluate(devX, devy, model)
        accs.append(acc)

        if acc > curr_best:
            print("new best score")
            curr_best = acc
            num_stall = 0
        else:
            num_stall += 1
            print(f"stall {num_stall} times")
            if num_stall >= max_stall:
                print(f"dev accuracy is not improving for {num_stall} iterations. Stop.")
                break

        # Every unlabeled row has already been absorbed: there is nothing left
        # to predict, and calling predict() on zero rows would fail.
        if Xu.shape[0] == 0:
            print("Unlabeled data is exhausted")
            break

        # Predict
        print("Predicting unlabeled data with the previous model")
        yu_hat = model.predict(Xu)
        confidents = model.predict_proba(Xu).max(axis=1)

        # Expand Confident samples (the mask is computed once and reused)
        is_confident = confidents >= confident_cutoff
        confident_Xu = Xu[is_confident]
        confident_yu_hat = yu_hat[is_confident]
        Xu = Xu[~is_confident]

        if confident_Xu.shape[0] == 0:
            print("Data size has converged")
            break

        print("Data added:", confident_Xu.shape)
        Xhat = vstack((Xhat, confident_Xu))
        yhat = np.concatenate((yhat, confident_yu_hat), axis=0)

    return clss, accs, data_sizes

In [24]:
from nltk import word_tokenize

# Load the speech corpus from the tarball, using the Lemmatizer as the
# preprocessor, NLTK's word_tokenize as the tokenizer and ngram_range=(1,2).
# NOTE(review): read_files comes from `from speech import *`; how it uses these
# arguments is not visible in this notebook — confirm in speech.py.
print("Reading data")
tarfname = "data/speech.tar.gz"
speech = read_files(tarfname, preprocessor=Lemmatizer(), tokenizer=word_tokenize, ngram_range=(1,2))
# Sanity check on the feature matrix; the recorded run printed (4370, 59393).
print(speech.trainX.shape)

Reading data
-- train data
train.tsv
4370
-- dev data
dev.tsv
414
-- transforming data and labels
(4370, 59393)


In [25]:
# Supervised baseline: logistic regression fitted on the labeled training set only.
print("Training classifier")
from sklearn.linear_model import LogisticRegression
cls = LogisticRegression(max_iter=1000, n_jobs=-1, C=.5)
cls.fit(speech.trainX, speech.trainy)

# Score on train first, then dev. In the recorded run the two accuracies were
# ~0.9995 vs ~0.4155, i.e. a large gap between train and dev performance.
print("Evaluating")
evaluate(speech.trainX, speech.trainy, cls)
evaluate(speech.devX, speech.devy, cls)

Training classifier
Evaluating
  Accuracy 0.9995423340961098
  Accuracy 0.41545893719806765


0.41545893719806765