In [1]:
import numpy as np
import pickle
import glob
from sklearn.model_selection import train_test_split
from sklearn.exceptions import NotFittedError
from sklearn.metrics import accuracy_score, f1_score

Build vocabulary.

In [2]:
neg_files = glob.glob("txt_sentoken/neg/*.txt")
pos_files = glob.glob("txt_sentoken/neg/*.txt")
files = neg_files + pos_files
vocab = {}
index = 0
for file in files:
    with open(file) as f:
        raw_txt = f.read()
    txt_arr = raw_txt.split()
    for token in txt_arr:
        if token not in vocab:
            vocab[token] = index
            index += 1
            
pickle.dump(vocab, open("vocab.pkl", "wb"))

Generate word count feature vectors for each document.

In [3]:
vocab_size = len(vocab)
n = len(files)
n_neg = len(neg_files)
n_pos = len(pos_files)

X = np.zeros((n, vocab_size), dtype=np.int8)
y = np.concatenate((np.repeat(0, n_neg), np.repeat(1, n_pos))).astype(np.int8)

for i, file in enumerate(files):
    with open(file) as f:
        raw_txt = f.read()
    txt_arr = raw_txt.split()
    for token in txt_arr:
        X[i, vocab[token]] += 1
        
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=400)

pickle.dump(X_train, open("X_train.pkl", "wb"))
pickle.dump(X_test, open("X_test.pkl", "wb"))
pickle.dump(y_train, open("y_train.pkl", "wb"))
pickle.dump(y_test, open("y_test.pkl", "wb"))

Load train and test data.

In [4]:
X_train = pickle.load(open("X_train.pkl", "rb"))
X_test = pickle.load(open("X_test.pkl", "rb"))
y_train = pickle.load(open("y_train.pkl", "rb"))
y_test = pickle.load(open("y_test.pkl", "rb"))

vocab = pickle.load(open("vocab.pkl", "rb"))

print("Train size:", len(X_train))
print("Test size:", len(X_test))
print("Vocabulary size:", len(vocab))
print()
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Train size: 1600
Test size: 400
Vocabulary size: 34542

X_train shape: (1600, 34542)
X_test shape: (400, 34542)
y_train shape: (1600,)
y_test shape: (400,)


Run Multinomial Logistic Regression algorithm.

In [5]:
class MultinomialLogisticRegressor():
    
    def __init__(self, lambda_, batch_size, epochs, lr, verbose):
        self.lambda_ = lambda_
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.verbose = verbose
    
    def fit(self, X, y):
        
        n, d = X.shape
        X_appended = np.concatenate((np.ones((n, 1)), X), axis=1)
        self.weights = np.random.normal(size=d+1)
        checkpoint = epochs // 10
        
        for epoch in range(epochs):
            
            order = np.random.permutation(n)
            num_batch = n // batch_size
            
            for i in range(num_batch):
                indices = order[i:min(i + batch_size, n)]
                X_batch = X_appended[indices]
                y_batch = y[indices]
                
                self.gradient_step(X_batch, y_batch)
                
            # Compute training loss.
            if verbose and (epoch % checkpoint == 0):
                probs = self.sigmoid(np.dot(X_appended, self.weights))
                loss = -(y * np.log(probs) + (1 - y) * np.log(1 - probs)).sum() / n + 0.5 * self.lambda_ * np.dot(self.weights, self.weights)
                print("Epoch %d \t cross entropy loss: %0.4f" % (epoch, loss))
            
                
        self.fitted = True
        return self
    
    def predict(self, X):
        
        if not hasattr(self, 'fitted'):
            raise NotFittedError("This MultinomialLogisticRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")
            
        n = len(X)
        X_appended = np.concatenate((np.ones((n, 1)), X), axis=1)
        
        return (1 + np.sign(np.dot(X_appended, self.weights))) / 2
    
    def gradient_step(self, X, y):
        
        n = len(X)
        probs = self.sigmoid(np.dot(X, self.weights))
        grad = np.dot(X.T, (probs - y)) + self.lambda_ * self.weights / n
        self.weights = self.weights - self.lr * grad
        
    def sigmoid(self, Z):
        Z_clipped = np.clip(Z, -10, 10)
        return 1 / (1 + np.exp(-Z_clipped))

Hyperparameter search.

In [6]:
num_cross_val = 100
batch_size = 32
epochs = 100
verbose = False

lambdas = np.power(10, np.random.uniform(low=-5, high=0.5, size=num_cross_val))
lrs = np.power(10, np.random.uniform(low=-5, high=0.5, size=num_cross_val))

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

best_acc = 0
best_lambda = 0
best_lr = 0

hyperparams = []

print('Iter \t lambda \t lr \t\t train acc \t val acc \t val f1 ')
for i in range(num_cross_val):
    lambda_ = lambdas[i]
    lr = lrs[i]
    
    mlr = MultinomialLogisticRegressor(lambda_=lambda_, batch_size=batch_size, epochs=epochs, lr=lr, verbose=verbose)
    mlr.fit(X_train, y_train)
    y_pred = mlr.predict(X_val)
    
    val_acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    
    train_acc = accuracy_score(y_train, mlr.predict(X_train))
        
    hyperparams.append(
        {
            'lambda' : lambda_,
            'lr' : lr,
            'train_acc' : train_acc,
            'val_acc' : val_acc,
            'f1' : f1
        }
    )    
    print('%d \t %0.5f \t %0.5f \t %0.2f \t\t %0.2f \t\t %0.2f' % (i, lambda_, lr, train_acc, val_acc, f1))
        
pickle.dump(hyperparams, open("hyperparams.pkl", "wb"))

Iter 	 lambda 	 lr 		 train acc 	 val acc 	 val f1 
0 	 0.00003 	 0.00027 	 0.54 		 0.42 		 0.45
1 	 0.00020 	 0.00002 	 0.51 		 0.50 		 0.52
2 	 0.00006 	 0.00840 	 0.63 		 0.26 		 0.35
3 	 0.00037 	 0.02821 	 0.56 		 0.37 		 0.03
4 	 0.74630 	 2.10430 	 0.49 		 0.49 		 0.00
5 	 0.00029 	 1.00526 	 0.55 		 0.42 		 0.59
6 	 1.04302 	 0.00049 	 0.57 		 0.40 		 0.50
7 	 0.15314 	 0.01068 	 0.55 		 0.43 		 0.60
8 	 1.72537 	 0.13175 	 0.49 		 0.49 		 0.00
9 	 0.00237 	 0.32985 	 0.50 		 0.47 		 0.00
10 	 0.00270 	 0.00003 	 0.48 		 0.50 		 0.48
11 	 0.00443 	 0.00054 	 0.57 		 0.37 		 0.48
12 	 0.00002 	 0.00036 	 0.56 		 0.37 		 0.39
13 	 0.02076 	 0.04759 	 0.50 		 0.47 		 0.00
14 	 0.00350 	 0.00823 	 0.52 		 0.49 		 0.66
15 	 0.01934 	 0.00031 	 0.56 		 0.42 		 0.47
16 	 0.88285 	 0.00031 	 0.54 		 0.44 		 0.56
17 	 0.00441 	 0.02128 	 0.52 		 0.45 		 0.00
18 	 0.00023 	 0.03929 	 0.61 		 0.28 		 0.39
19 	 0.02146 	 1.17000 	 0.51 		 0.51 		 0.67
20 	 0.00219 	 0.00066 	 0.55 		 0.38 