In [1]:
import numpy as np
import pandas as pd
import pickle
import glob
from sklearn.model_selection import train_test_split
from sklearn.exceptions import NotFittedError
from sklearn.metrics import accuracy_score, f1_score

Build vocabulary.

In [2]:
neg_files = glob.glob("txt_sentoken/neg/*.txt")
pos_files = glob.glob("txt_sentoken/neg/*.txt")
files = neg_files + pos_files
vocab = {}
index = 0
for file in files:
    with open(file) as f:
        raw_txt = f.read()
    txt_arr = raw_txt.split()
    for token in txt_arr:
        if token not in vocab:
            vocab[token] = index
            index += 1
            
pickle.dump(vocab, open("vocab.pkl", "wb"))

Generate word count feature vectors for each document.

In [3]:
vocab_size = len(vocab)
n = len(files)
n_neg = len(neg_files)
n_pos = len(pos_files)

X = np.zeros((n, vocab_size), dtype=np.int8)
y = np.concatenate((np.repeat(0, n_neg), np.repeat(1, n_pos))).astype(np.int8)

for i, file in enumerate(files):
    with open(file) as f:
        raw_txt = f.read()
    txt_arr = raw_txt.split()
    for token in txt_arr:
        X[i, vocab[token]] += 1

In [5]:
feature_means = np.mean(X, axis = 0)
vocab_size = len(feature_means)

threshold = 0.05
above_idx = feature_means > threshold
num_above = np.sum(above_idx)
frac_above = num_above / vocab_size

print("%d words out of %d words have above %f instances in each document on average (%0.3f)." % (num_above, vocab_size, threshold, frac_above))

1282 words out of 34542 words have above 0.050000 instances in each document on average (0.037).


In [6]:
X = X[:, above_idx]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=400)

pickle.dump(X_train, open("X_train.pkl", "wb"))
pickle.dump(X_test, open("X_test.pkl", "wb"))
pickle.dump(y_train, open("y_train.pkl", "wb"))
pickle.dump(y_test, open("y_test.pkl", "wb"))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=400)

pickle.dump(X_train, open("X_train.pkl", "wb"))
pickle.dump(X_test, open("X_test.pkl", "wb"))
pickle.dump(y_train, open("y_train.pkl", "wb"))
pickle.dump(y_test, open("y_test.pkl", "wb"))

Load train and test data.

In [13]:
X_train = pickle.load(open("X_train.pkl", "rb"))
X_test = pickle.load(open("X_test.pkl", "rb"))
y_train = pickle.load(open("y_train.pkl", "rb"))
y_test = pickle.load(open("y_test.pkl", "rb"))

vocab = pickle.load(open("vocab.pkl", "rb"))

print("Train size:", len(X_train))
print("Test size:", len(X_test))
print("Vocabulary size:", len(vocab))
print()
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Train size: 1600
Test size: 400
Vocabulary size: 34542

X_train shape: (1600, 1282)
X_test shape: (400, 1282)
y_train shape: (1600,)
y_test shape: (400,)


Run Multinomial Logistic Regression algorithm.

In [9]:
class MultinomialLogisticRegressor():
    
    def __init__(self, lambda_, batch_size, epochs, lr, verbose):
        self.lambda_ = lambda_
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.verbose = verbose
    
    def fit(self, X, y):
        
        n, d = X.shape
        X_appended = np.concatenate((np.ones((n, 1)), X), axis=1)
        if not hasattr(self, 'weights'):
            self.weights = np.random.normal(size=d+1)
        checkpoint = self.epochs // 10
        
        for epoch in range(self.epochs):
            
            order = np.random.permutation(n)
            num_batch = n // batch_size
            
            for i in range(num_batch):
                indices = order[i:min(i + batch_size, n)]
                X_batch = X_appended[indices]
                y_batch = y[indices]
                
                self.gradient_step(X_batch, y_batch)
                
            # Compute training loss.
            if verbose and (epoch % checkpoint == 0):
                logits = np.dot(X_appended, self.weights)
                probs = self.sigmoid(logits)
                loss = (-(y * np.log(probs) + (1 - y) * np.log(1 - probs)).sum() + 0.5 * self.lambda_ * np.dot(self.weights, self.weights)) / n
                
                y_pred = (1 + np.sign(logits)) / 2
                train_error = np.mean(np.abs(y - y_pred))
                print("Epoch %d \t cross entropy loss: %0.4f train error %0.3f" % (epoch, loss, train_error))
            
                
        self.fitted = True
        return self
    
    def predict(self, X):
        
        if not hasattr(self, 'fitted'):
            raise NotFittedError("This MultinomialLogisticRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")
            
        n = len(X)
        X_appended = np.concatenate((np.ones((n, 1)), X), axis=1)
        
        return (1 + np.sign(np.dot(X_appended, self.weights))) / 2
    
    def gradient_step(self, X, y):
        
        n = len(X)
        probs = self.sigmoid(np.dot(X, self.weights))
        grad = (np.dot(X.T, (probs - y)) + self.lambda_ * self.weights) / n
        self.weights = self.weights - self.lr * grad
        
    def sigmoid(self, Z):
        Z_clipped = np.clip(Z, -10, 10)
        return 1 / (1 + np.exp(-Z_clipped))

Hyperparameter search.

In [None]:
num_cross_val = 100
batch_size = 32
epochs = 500
verbose = False

lambdas = np.power(10, np.random.uniform(low=-5, high=0.5, size=num_cross_val))
lrs = np.power(10, np.random.uniform(low=-5, high=0.5, size=num_cross_val))

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2)

best_acc = 0
best_lambda = 0
best_lr = 0

hyperparams = []

print('Iter \t lambda \t lr \t\t train acc \t val acc \t val f1 ')
for i in range(num_cross_val):
    lambda_ = lambdas[i]
    lr = lrs[i]
    
    mlr = MultinomialLogisticRegressor(lambda_=lambda_, batch_size=batch_size, epochs=epochs, lr=lr, verbose=verbose)
    mlr.fit(X_tr, y_tr)
    y_pred = mlr.predict(X_val)
    
    val_acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    
    train_acc = accuracy_score(y_tr, mlr.predict(X_tr))
        
    hyperparams.append(
        {
            'lambda' : lambda_,
            'lr' : lr,
            'train_acc' : train_acc,
            'val_acc' : val_acc,
            'f1' : f1
        }
    )    
    print('%d \t %0.5f \t %0.5f \t %0.2f \t\t %0.2f \t\t %0.2f' % (i, lambda_, lr, train_acc, val_acc, f1))
        
pickle.dump(hyperparams, open("hyperparams.pkl", "wb"))

In [None]:
hyperparams = pd.DataFrame(pickle.load(open("hyperparams.pkl", "rb")))

In [None]:
hyperparams.sort_values('val_acc', inplace=True, ascending=False)
print(hyperparams[0:20])

See if the code works on a toy example.

In [12]:
n = 1000
d = 10

mu = 3 * np.ones(d)
cov = np.eye(d)

X_neg = np.random.multivariate_normal(-mu, cov, size = n // 2)
y_neg = np.repeat(0, n // 2)
X_pos = np.random.multivariate_normal(mu, cov, size = n // 2)
y_pos = np.repeat(1, n // 2)

X = np.concatenate((X_neg, X_pos))
y = np.concatenate((y_neg, y_pos))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

batch_size = 32
epochs = 100
verbose = True
lr = 0.03
lambda_ = 0.001

mlr = MultinomialLogisticRegressor(lambda_=lambda_, batch_size=batch_size, epochs=epochs, lr=lr, verbose=verbose)
mlr.fit(X_train, y_train)
y_pred = mlr.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
print(test_acc)

Epoch 0 	 cross entropy loss: 0.0340 train error 0.012
Epoch 10 	 cross entropy loss: 0.0025 train error 0.000
Epoch 20 	 cross entropy loss: 0.0011 train error 0.000
Epoch 30 	 cross entropy loss: 0.0007 train error 0.000
Epoch 40 	 cross entropy loss: 0.0005 train error 0.000
Epoch 50 	 cross entropy loss: 0.0004 train error 0.000
Epoch 60 	 cross entropy loss: 0.0004 train error 0.000
Epoch 70 	 cross entropy loss: 0.0003 train error 0.000
Epoch 80 	 cross entropy loss: 0.0003 train error 0.000
Epoch 90 	 cross entropy loss: 0.0003 train error 0.000
1.0


Run with better hyperparameters.

In [14]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [42]:
batch_size = 100
epochs = 1000
verbose = True
lr = 0.01
lambda_ = 6

mlr = MultinomialLogisticRegressor(lambda_=lambda_, batch_size=batch_size, epochs=epochs, lr=lr, verbose=verbose)

# 1. Does the loss monotonically decrease? -> 
# - yes: increase learning rate until no. Move to 2.
# - no: decrease learning rate.
# 2. Does the model optimize?
# - yes: Move to 3.
# - no: Increase epochs.
# 3. Is there a lare gap between train acc and val acc?
# - yes: increase lambda. Move to 1.
# - no: Move to 4.
# Is the performance reasonable?
# - yes: You're done!
# - no: decrease lambda. Move to 1.

In [87]:
try:
    mlr.epochs = 2000
    mlr.lambda_ = 10000
    mlr.lr = 0.0003
    mlr.fit(X_tr, y_tr)
    y_pred = mlr.predict(X_val)

    val_acc = accuracy_score(y_val, y_pred)
    print("Validation accuracy: ", val_acc)
except KeyboardInterrupt:
    print('Graceful Exit')

Epoch 0 	 cross entropy loss: 0.6932 train error 0.491
Epoch 200 	 cross entropy loss: 0.6932 train error 0.495
Epoch 400 	 cross entropy loss: 0.6939 train error 0.509
Epoch 600 	 cross entropy loss: 0.7027 train error 0.491
Epoch 800 	 cross entropy loss: 0.6951 train error 0.491
Epoch 1000 	 cross entropy loss: 0.6925 train error 0.481
Epoch 1200 	 cross entropy loss: 0.7004 train error 0.491
Epoch 1400 	 cross entropy loss: 0.6967 train error 0.505
Epoch 1600 	 cross entropy loss: 0.7083 train error 0.507
Epoch 1800 	 cross entropy loss: 0.6968 train error 0.508
Validation accuracy:  0.528125


In [88]:
y_pred = mlr.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print("Test accuracy: ", test_acc)
print("Test f1: ", test_f1)

Test accuracy:  0.505
Test f1:  0.6710963455149502


In [90]:
pickle.dump(mlr, open("model.pkl", "wb"))

In [91]:
mlr = pickle.load(open("model.pkl", "rb"))

y_pred = mlr.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print("Test accuracy: ", test_acc)
print("Test f1: ", test_f1)

Test accuracy:  0.505
Test f1:  0.6710963455149502
