In [1]:
import numpy as np
import pandas as pd
import pickle
import glob
from sklearn.model_selection import train_test_split
from sklearn.exceptions import NotFittedError
from sklearn.metrics import accuracy_score, f1_score

Build vocabulary.

In [None]:
# neg_files = glob.glob("txt_sentoken/neg/*.txt")
# pos_files = glob.glob("txt_sentoken/neg/*.txt")
# files = neg_files + pos_files
# vocab = {}
# index = 0
# for file in files:
#     with open(file) as f:
#         raw_txt = f.read()
#     txt_arr = raw_txt.split()
#     for token in txt_arr:
#         if token not in vocab:
#             vocab[token] = index
#             index += 1
            
# pickle.dump(vocab, open("vocab.pkl", "wb"))

Generate word count feature vectors for each document.

In [53]:
# vocab_size = len(vocab)
# n = len(files)
# n_neg = len(neg_files)
# n_pos = len(pos_files)

# X = np.zeros((n, vocab_size), dtype=np.int8)
# y = np.concatenate((np.repeat(0, n_neg), np.repeat(1, n_pos))).astype(np.int8)

# for i, file in enumerate(files):
#     with open(file) as f:
#         raw_txt = f.read()
#     txt_arr = raw_txt.split()
#     for token in txt_arr:
#         X[i, vocab[token]] += 1
        
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=400)

# pickle.dump(X_train, open("X_train.pkl", "wb"))
# pickle.dump(X_test, open("X_test.pkl", "wb"))
# pickle.dump(y_train, open("y_train.pkl", "wb"))
# pickle.dump(y_test, open("y_test.pkl", "wb"))

NameError: name 'files' is not defined

Load train and test data.

In [60]:
X_train = pickle.load(open("X_train.pkl", "rb"))
X_test = pickle.load(open("X_test.pkl", "rb"))
y_train = pickle.load(open("y_train.pkl", "rb"))
y_test = pickle.load(open("y_test.pkl", "rb"))

vocab = pickle.load(open("vocab.pkl", "rb"))

print("Train size:", len(X_train))
print("Test size:", len(X_test))
print("Vocabulary size:", len(vocab))
print()
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Train size: 1600
Test size: 400
Vocabulary size: 34542

X_train shape: (1600, 34542)
X_test shape: (400, 34542)
y_train shape: (1600,)
y_test shape: (400,)


Run Multinomial Logistic Regression algorithm.

In [48]:
class MultinomialLogisticRegressor():
    
    def __init__(self, lambda_, batch_size, epochs, lr, verbose):
        self.lambda_ = lambda_
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.verbose = verbose
    
    def fit(self, X, y):
        
        n, d = X.shape
        X_appended = np.concatenate((np.ones((n, 1)), X), axis=1)
        self.weights = np.random.normal(size=d+1)
        checkpoint = epochs // 10
        
        for epoch in range(epochs):
            
            order = np.random.permutation(n)
            num_batch = n // batch_size
            
            for i in range(num_batch):
                indices = order[i:min(i + batch_size, n)]
                X_batch = X_appended[indices]
                y_batch = y[indices]
                
                self.gradient_step(X_batch, y_batch)
                
            # Compute training loss.
            if verbose and (epoch % checkpoint == 0):
                logits = np.dot(X_appended, self.weights)
                probs = self.sigmoid(logits)
                loss = -(y * np.log(probs) + (1 - y) * np.log(1 - probs)).sum() / n + 0.5 * self.lambda_ * np.dot(self.weights, self.weights)
                
                y_pred = (1 + np.sign(logits)) / 2
                train_error = np.mean(np.abs(y - y_pred))
                print("Epoch %d \t cross entropy loss: %0.4f train error %0.3f" % (epoch, loss, train_error))
            
                
        self.fitted = True
        return self
    
    def predict(self, X):
        
        if not hasattr(self, 'fitted'):
            raise NotFittedError("This MultinomialLogisticRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")
            
        n = len(X)
        X_appended = np.concatenate((np.ones((n, 1)), X), axis=1)
        
        return (1 + np.sign(np.dot(X_appended, self.weights))) / 2
    
    def gradient_step(self, X, y):
        
        n = len(X)
        probs = self.sigmoid(np.dot(X, self.weights))
        grad = (np.dot(X.T, (probs - y)) + self.lambda_ * self.weights) / n
        self.weights = self.weights - self.lr * grad
        
    def sigmoid(self, Z):
        Z_clipped = np.clip(Z, -10, 10)
        return 1 / (1 + np.exp(-Z_clipped))

Hyperparameter search.

In [None]:
num_cross_val = 100
batch_size = 32
epochs = 500
verbose = False

lambdas = np.power(10, np.random.uniform(low=-5, high=0.5, size=num_cross_val))
lrs = np.power(10, np.random.uniform(low=-5, high=0.5, size=num_cross_val))

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2)

best_acc = 0
best_lambda = 0
best_lr = 0

hyperparams = []

print('Iter \t lambda \t lr \t\t train acc \t val acc \t val f1 ')
for i in range(num_cross_val):
    lambda_ = lambdas[i]
    lr = lrs[i]
    
    mlr = MultinomialLogisticRegressor(lambda_=lambda_, batch_size=batch_size, epochs=epochs, lr=lr, verbose=verbose)
    mlr.fit(X_tr, y_tr)
    y_pred = mlr.predict(X_val)
    
    val_acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    
    train_acc = accuracy_score(y_tr, mlr.predict(X_tr))
        
    hyperparams.append(
        {
            'lambda' : lambda_,
            'lr' : lr,
            'train_acc' : train_acc,
            'val_acc' : val_acc,
            'f1' : f1
        }
    )    
    print('%d \t %0.5f \t %0.5f \t %0.2f \t\t %0.2f \t\t %0.2f' % (i, lambda_, lr, train_acc, val_acc, f1))
        
pickle.dump(hyperparams, open("hyperparams.pkl", "wb"))

In [4]:
hyperparams = pd.DataFrame(pickle.load(open("hyperparams.pkl", "rb")))

      lambda        lr  train_acc   val_acc        f1
0   0.000035  0.000269   0.543750  0.418750  0.449704
1   0.000199  0.000021   0.514844  0.496875  0.519403
2   0.000055  0.008396   0.632031  0.259375  0.354223
3   0.000372  0.028210   0.562500  0.365625  0.028708
4   0.746300  2.104295   0.487500  0.490625  0.000000
..       ...       ...        ...       ...       ...
95  0.002314  0.000045   0.507812  0.484375  0.535211
96  0.070781  0.050187   0.553125  0.428125  0.594235
97  0.000199  0.000172   0.534375  0.456250  0.452830
98  2.609784  0.000964   0.532813  0.396875  0.030151
99  0.283345  0.057510   0.529687  0.465625  0.633833

[100 rows x 5 columns]


In [11]:
hyperparams.sort_values('val_acc', inplace=True, ascending=False)
hyperparams.head()

Unnamed: 0,lambda,lr,train_acc,val_acc,f1
64,0.718564,0.058442,0.513281,0.509375,0.673597
68,0.011996,1.293417,0.513281,0.509375,0.673597
90,1.701879,0.165927,0.513281,0.509375,0.673597
37,0.450486,1.6e-05,0.5,0.509375,0.555241
19,0.021464,1.170003,0.514062,0.50625,0.672199


See if the code works on a toy example.

In [10]:
n = 1000
d = 10

mu = np.ones(d)
cov = np.eye(d)

X_neg = np.random.multivariate_normal(-mu, cov, size = n // 2)
y_neg = np.repeat(0, n // 2)
X_pos = np.random.multivariate_normal(mu, cov, size = n // 2)
y_pos = np.repeat(1, n // 2)

X = np.concatenate((X_neg, X_pos))
y = np.concatenate((y_neg, y_pos))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

batch_size = 32
epochs = 100
verbose = True
lr = 0.001
lambda_ = 0.001

mlr = MultinomialLogisticRegressor(lambda_=lambda_, batch_size=batch_size, epochs=epochs, lr=lr, verbose=verbose)
mlr.fit(X_train, y_train)
y_pred = mlr.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
print(test_acc)

Epoch 0 	 cross entropy loss: 0.4498
Epoch 10 	 cross entropy loss: 0.0609
Epoch 20 	 cross entropy loss: 0.0374
Epoch 30 	 cross entropy loss: 0.0281
Epoch 40 	 cross entropy loss: 0.0220
Epoch 50 	 cross entropy loss: 0.0195
Epoch 60 	 cross entropy loss: 0.0183
Epoch 70 	 cross entropy loss: 0.0173
Epoch 80 	 cross entropy loss: 0.0168
Epoch 90 	 cross entropy loss: 0.0164
1.0


Run with better hyperparameters.

In [59]:
batch_size = 64
epochs = 300
verbose = True
lr = 0.01
lambda_ = 1

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2)

try:
    mlr = MultinomialLogisticRegressor(lambda_=lambda_, batch_size=batch_size, epochs=epochs, lr=lr, verbose=verbose)
    mlr.fit(X_tr, y_tr)
    y_pred = mlr.predict(X_val)

    val_acc = accuracy_score(y_val, y_pred)
    print("Validation accuracy: ", val_acc)
except KeyboardInterrupt:
    print('Graceful Exit')

Epoch 0 	 cross entropy loss: 17354.1443 train error 0.497
Epoch 30 	 cross entropy loss: 14336.7710 train error 0.491
Epoch 60 	 cross entropy loss: 11853.7137 train error 0.473
Epoch 90 	 cross entropy loss: 9807.6763 train error 0.458
Epoch 120 	 cross entropy loss: 8117.4998 train error 0.460
Epoch 150 	 cross entropy loss: 6721.5702 train error 0.442
Epoch 180 	 cross entropy loss: 5569.0041 train error 0.459
Epoch 210 	 cross entropy loss: 4615.1826 train error 0.456
Epoch 240 	 cross entropy loss: 3825.6719 train error 0.444
Epoch 270 	 cross entropy loss: 3172.8950 train error 0.427
Validation accuracy:  0.38125


In [29]:
pickle.dump(mlr, open("model.pkl", "wb"))

In [32]:
mlr = pickle.load(open("model.pkl", "rb"))

y_pred = mlr.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print("Test accuracy: ", test_acc)
print("Test f1: ", test_f1)

Test accuracy:  0.4525
Test f1:  0.6191304347826087
