In [1]:
import numpy as np
import pandas as pd
import pickle
import glob
from sklearn.model_selection import train_test_split
from sklearn.exceptions import NotFittedError
from sklearn.metrics import accuracy_score, f1_score

Build vocabulary.

In [2]:
# neg_files = glob.glob("txt_sentoken/neg/*.txt")
# pos_files = glob.glob("txt_sentoken/neg/*.txt")
# files = neg_files + pos_files
# vocab = {}
# index = 0
# for file in files:
#     with open(file) as f:
#         raw_txt = f.read()
#     txt_arr = raw_txt.split()
#     for token in txt_arr:
#         if token not in vocab:
#             vocab[token] = index
#             index += 1
            
# pickle.dump(vocab, open("vocab.pkl", "wb"))

Generate word count feature vectors for each document.

In [3]:
# vocab_size = len(vocab)
# n = len(files)
# n_neg = len(neg_files)
# n_pos = len(pos_files)

# X = np.zeros((n, vocab_size), dtype=np.int8)
# y = np.concatenate((np.repeat(0, n_neg), np.repeat(1, n_pos))).astype(np.int8)

# for i, file in enumerate(files):
#     with open(file) as f:
#         raw_txt = f.read()
#     txt_arr = raw_txt.split()
#     for token in txt_arr:
#         X[i, vocab[token]] += 1
        
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=400)

# pickle.dump(X_train, open("X_train.pkl", "wb"))
# pickle.dump(X_test, open("X_test.pkl", "wb"))
# pickle.dump(y_train, open("y_train.pkl", "wb"))
# pickle.dump(y_test, open("y_test.pkl", "wb"))

Load train and test data.

In [4]:
X_train = pickle.load(open("X_train.pkl", "rb"))
X_test = pickle.load(open("X_test.pkl", "rb"))
y_train = pickle.load(open("y_train.pkl", "rb"))
y_test = pickle.load(open("y_test.pkl", "rb"))

vocab = pickle.load(open("vocab.pkl", "rb"))

print("Train size:", len(X_train))
print("Test size:", len(X_test))
print("Vocabulary size:", len(vocab))
print()
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Train size: 1600
Test size: 400
Vocabulary size: 34542

X_train shape: (1600, 34542)
X_test shape: (400, 34542)
y_train shape: (1600,)
y_test shape: (400,)


Run Multinomial Logistic Regression algorithm.

In [5]:
class MultinomialLogisticRegressor():
    
    def __init__(self, lambda_, batch_size, epochs, lr, verbose):
        self.lambda_ = lambda_
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.verbose = verbose
    
    def fit(self, X, y):
        
        n, d = X.shape
        X_appended = np.concatenate((np.ones((n, 1)), X), axis=1)
        self.weights = np.random.normal(size=d+1)
        checkpoint = epochs // 10
        
        for epoch in range(epochs):
            
            order = np.random.permutation(n)
            num_batch = n // batch_size
            
            for i in range(num_batch):
                indices = order[i:min(i + batch_size, n)]
                X_batch = X_appended[indices]
                y_batch = y[indices]
                
                self.gradient_step(X_batch, y_batch)
                
            # Compute training loss.
            if verbose and (epoch % checkpoint == 0):
                logits = np.dot(X_appended, self.weights)
                probs = self.sigmoid(logits)
                loss = -(y * np.log(probs) + (1 - y) * np.log(1 - probs)).sum() / n + 0.5 * self.lambda_ * np.dot(self.weights, self.weights)
                
                y_pred = (1 + np.sign(logits)) / 2
                train_error = np.mean(np.abs(y - y_pred))
                print("Epoch %d \t cross entropy loss: %0.4f train error %0.3f" % (epoch, loss, train_error))
            
                
        self.fitted = True
        return self
    
    def predict(self, X):
        
        if not hasattr(self, 'fitted'):
            raise NotFittedError("This MultinomialLogisticRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")
            
        n = len(X)
        X_appended = np.concatenate((np.ones((n, 1)), X), axis=1)
        
        return (1 + np.sign(np.dot(X_appended, self.weights))) / 2
    
    def gradient_step(self, X, y):
        
        n = len(X)
        probs = self.sigmoid(np.dot(X, self.weights))
        grad = (np.dot(X.T, (probs - y)) + self.lambda_ * self.weights) / n
        self.weights = self.weights - self.lr * grad
        
    def sigmoid(self, Z):
        Z_clipped = np.clip(Z, -10, 10)
        return 1 / (1 + np.exp(-Z_clipped))

Hyperparameter search.

In [6]:
num_cross_val = 100
batch_size = 32
epochs = 500
verbose = False

lambdas = np.power(10, np.random.uniform(low=-5, high=0.5, size=num_cross_val))
lrs = np.power(10, np.random.uniform(low=-5, high=0.5, size=num_cross_val))

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2)

best_acc = 0
best_lambda = 0
best_lr = 0

hyperparams = []

print('Iter \t lambda \t lr \t\t train acc \t val acc \t val f1 ')
for i in range(num_cross_val):
    lambda_ = lambdas[i]
    lr = lrs[i]
    
    mlr = MultinomialLogisticRegressor(lambda_=lambda_, batch_size=batch_size, epochs=epochs, lr=lr, verbose=verbose)
    mlr.fit(X_tr, y_tr)
    y_pred = mlr.predict(X_val)
    
    val_acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    
    train_acc = accuracy_score(y_tr, mlr.predict(X_tr))
        
    hyperparams.append(
        {
            'lambda' : lambda_,
            'lr' : lr,
            'train_acc' : train_acc,
            'val_acc' : val_acc,
            'f1' : f1
        }
    )    
    print('%d \t %0.5f \t %0.5f \t %0.2f \t\t %0.2f \t\t %0.2f' % (i, lambda_, lr, train_acc, val_acc, f1))
        
pickle.dump(hyperparams, open("hyperparams.pkl", "wb"))

Iter 	 lambda 	 lr 		 train acc 	 val acc 	 val f1 
0 	 0.18960 	 0.01412 	 0.64 		 0.25 		 0.37
1 	 0.08555 	 0.00412 	 0.61 		 0.29 		 0.27
2 	 0.00011 	 0.46173 	 0.67 		 0.21 		 0.30
3 	 0.00868 	 0.00136 	 0.54 		 0.42 		 0.43
4 	 0.00030 	 0.00002 	 0.52 		 0.48 		 0.47
5 	 0.00021 	 0.00002 	 0.52 		 0.42 		 0.45
6 	 0.11458 	 0.31950 	 0.55 		 0.41 		 0.55
7 	 0.43021 	 0.04702 	 0.54 		 0.41 		 0.19
8 	 0.00057 	 0.00020 	 0.51 		 0.48 		 0.45
9 	 0.01471 	 0.02499 	 0.60 		 0.33 		 0.09
10 	 0.19112 	 1.52390 	 0.53 		 0.46 		 0.63
11 	 0.00012 	 0.00455 	 0.61 		 0.29 		 0.23
12 	 0.00002 	 0.00971 	 0.58 		 0.37 		 0.51
13 	 2.00787 	 0.00004 	 0.50 		 0.46 		 0.44
14 	 0.00004 	 0.20228 	 0.64 		 0.23 		 0.07
15 	 0.03082 	 0.00832 	 0.65 		 0.23 		 0.29
16 	 0.00020 	 0.00031 	 0.51 		 0.45 		 0.46
17 	 0.00428 	 0.53488 	 0.48 		 0.54 		 0.00
18 	 0.00086 	 0.00004 	 0.50 		 0.49 		 0.45
19 	 0.00007 	 0.10718 	 0.65 		 0.22 		 0.33
20 	 0.00215 	 0.00015 	 0.49 		 0.49 

In [7]:
hyperparams = pd.DataFrame(pickle.load(open("hyperparams.pkl", "rb")))

In [8]:
hyperparams.sort_values('val_acc', inplace=True, ascending=False)
hyperparams.head()

Unnamed: 0,lambda,lr,train_acc,val_acc,f1
89,0.156648,2.8604,0.475781,0.55,0.04
76,0.330259,0.054199,0.473438,0.54375,0.0
56,1.614966,0.022752,0.477344,0.54375,0.051948
74,1.785102,1.571451,0.474219,0.54375,0.0
81,0.219275,0.2354,0.473438,0.54375,0.0


See if the code works on a toy example.

In [9]:
n = 1000
d = 10

mu = np.ones(d)
cov = np.eye(d)

X_neg = np.random.multivariate_normal(-mu, cov, size = n // 2)
y_neg = np.repeat(0, n // 2)
X_pos = np.random.multivariate_normal(mu, cov, size = n // 2)
y_pos = np.repeat(1, n // 2)

X = np.concatenate((X_neg, X_pos))
y = np.concatenate((y_neg, y_pos))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

batch_size = 32
epochs = 100
verbose = True
lr = 0.001
lambda_ = 0.001

mlr = MultinomialLogisticRegressor(lambda_=lambda_, batch_size=batch_size, epochs=epochs, lr=lr, verbose=verbose)
mlr.fit(X_train, y_train)
y_pred = mlr.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
print(test_acc)

Epoch 0 	 cross entropy loss: 0.0385 train error 0.010
Epoch 10 	 cross entropy loss: 0.0377 train error 0.010
Epoch 20 	 cross entropy loss: 0.0370 train error 0.010
Epoch 30 	 cross entropy loss: 0.0366 train error 0.010
Epoch 40 	 cross entropy loss: 0.0361 train error 0.010
Epoch 50 	 cross entropy loss: 0.0356 train error 0.010
Epoch 60 	 cross entropy loss: 0.0351 train error 0.009
Epoch 70 	 cross entropy loss: 0.0346 train error 0.009
Epoch 80 	 cross entropy loss: 0.0344 train error 0.009
Epoch 90 	 cross entropy loss: 0.0340 train error 0.009
0.9848484848484849


Run with better hyperparameters.

In [10]:
batch_size = 64
epochs = 300
verbose = True
lr = 0.01
lambda_ = 1

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2)

try:
    mlr = MultinomialLogisticRegressor(lambda_=lambda_, batch_size=batch_size, epochs=epochs, lr=lr, verbose=verbose)
    mlr.fit(X_tr, y_tr)
    y_pred = mlr.predict(X_val)

    val_acc = accuracy_score(y_val, y_pred)
    print("Validation accuracy: ", val_acc)
except KeyboardInterrupt:
    print('Graceful Exit')

Epoch 0 	 cross entropy loss: 10.2883 train error 0.129
Epoch 30 	 cross entropy loss: 9.0160 train error 0.050
Epoch 60 	 cross entropy loss: 8.2944 train error 0.034
Epoch 90 	 cross entropy loss: 7.6937 train error 0.015
Epoch 120 	 cross entropy loss: 7.1690 train error 0.011
Epoch 150 	 cross entropy loss: 6.7008 train error 0.009
Epoch 180 	 cross entropy loss: 6.2702 train error 0.006
Epoch 210 	 cross entropy loss: 5.8818 train error 0.002
Epoch 240 	 cross entropy loss: 5.5125 train error 0.002
Epoch 270 	 cross entropy loss: 5.1711 train error 0.002
Validation accuracy:  1.0


In [11]:
pickle.dump(mlr, open("model.pkl", "wb"))

In [12]:
mlr = pickle.load(open("model.pkl", "rb"))

y_pred = mlr.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print("Test accuracy: ", test_acc)
print("Test f1: ", test_f1)

Test accuracy:  0.996969696969697
Test f1:  0.9971181556195965
