In [15]:
import numpy as np
from sklearn.metrics import f1_score
from scipy.sparse import csr_matrix, find
from data import Data

In [16]:
data = Data()
train_X, train_S, train_R = data.load("0*", True)
print train_X.shape
print train_S.shape
val_X, val_S, val_R = data.load("ff*")
print val_X.shape
print val_S.shape

(13575, 2638)
(13575, 1803)
(839, 2638)
(839, 1803)


In [17]:
from scipy.special import gammaln
from sklearn.utils.extmath import safe_sparse_dot

class MultinomialBayesianNaiveBayes():
    
    def __init__(self, class_pseudo_counts=1.0, feature_pseudo_counts=1.0):
        self.class_pseudo_counts = class_pseudo_counts
        self.feature_pseudo_counts = feature_pseudo_counts
        
    def betaln(self, x, axis):
        return np.sum(gammaln(x), axis=axis) - gammaln(np.sum(x, axis=axis))
    
    def fit(self, X, Y):
        """
        X: sparse n-samples x n-features
        Y: sparse n-samples x n-classes
        """
        
        self.W = safe_sparse_dot(Y.T, X) # n-classes x n-features
        self.C = Y.sum(axis=0).A         # 1 x n-classes
        
        self.lprior = np.log(self.C + self.class_pseudo_counts)
        
    def predict_log_proba(self, x):
        """
        x: sparse n-samples x n-features
        """
        lp = np.zeros((x.shape[0], self.C.size))
        for i in range(x.shape[0]):
            _,j,xj = find(x[i,:])
            wj = self.W[:,j].toarray() + self.feature_pseudo_counts
            lp[i,:] = self.lprior + self.betaln(wj + xj, axis=1) - self.betaln(wj, axis=1)

        return lp
        
    def predict(self, x):
        """
        x: sparse n-samples x n-features
        """
        return np.argmax(self.predict_log_proba(x), axis=1)
    
    def predict_proba(self, x):
        """
        x: sparse n-samples x n-features
        """
        lp = self.predict_log_proba(x)
        lp -= lp.max(axis=1, keepdims=True)
        p = np.exp(lp)
        return p/p.sum(axis=1, keepdims=True)

In [18]:
nbs = MultinomialBayesianNaiveBayes(class_pseudo_counts = 0, feature_pseudo_counts = 0.01)
nbs.fit(train_X, train_S)

nbr = MultinomialBayesianNaiveBayes(class_pseudo_counts = 0, feature_pseudo_counts = 0.01)
nbr.fit(train_X, train_R)

In [19]:
val_s = np.argmax(val_S.toarray(), axis=1)
val_r = np.argmax(val_R.toarray(), axis=1)

pred_s = nbs.predict(val_X)
pred_r = nbr.predict(val_X)

In [20]:
print f1_score(val_s, pred_s, average='micro')
print f1_score(val_s, pred_s, average='macro')
print f1_score(val_s, pred_s, average='weighted')
print "\n"
print f1_score(val_r, pred_r, average='micro')
print f1_score(val_r, pred_r, average='macro')
print f1_score(val_r, pred_r, average='weighted')

0.903456495828
0.814100610351
0.905691019681


0.749702026222
0.35988271189
0.764105435458
