In [53]:
import numpy as np
from sklearn.metrics import f1_score
from scipy.sparse import csr_matrix, find
from data import Data

In [54]:
# Load the two splits through the project-local Data helper and print the
# shape of each returned matrix. `load` hands back (X, S, R) in that order.
# NOTE(review): the meaning of the glob-like patterns and of the extra `True`
# flag on the training call is defined in data.py -- confirm there.
data = Data()

train_X, train_S, train_R = data.load("0*", True)
print(train_X.shape)
print(train_R.shape)
print(train_S.shape)

val_X, val_S, val_R = data.load("ff*")
print(val_X.shape)
print(val_R.shape)
print(val_S.shape)

(13575, 2638)
(13575, 529)
(13575, 1803)
(839, 2638)
(839, 529)
(839, 1803)


In [55]:
from scipy.special import gammaln
from sklearn.utils.extmath import safe_sparse_dot
from scipy.sparse import lil_matrix, csr_matrix

class CombinedMultinomialBayesianNaiveBayes():
    """
    Joint (receiver, sender) classifier with a Dirichlet-smoothed multinomial
    word likelihood.

    The unnormalised log-score of a pair (r, s) for a sample x is

        log(eta + count(r))
        + log P(s | r)                      (alpha-smoothed, normalised over s)
        + log B(w_rs + beta + x) - log B(w_rs + beta)

    where w_rs are the word counts accumulated for the pair during `fit` and
    B is the multivariate beta function computed by `betaln`.

    Parameters
    ----------
    eta   : additive constant applied to the per-receiver sample counts.
    alpha : additive smoothing for the receiver x sender co-occurrence counts.
    beta  : additive smoothing for the per-(receiver, sender) word counts.
    """

    def __init__(self, eta=0.0, alpha = 0.1, beta=0.01):
        self.eta = eta
        self.alpha = alpha
        self.beta = beta

    def betaln(self, x, axis):
        """Log of the multivariate beta function of `x` along `axis`."""
        return np.sum(gammaln(x), axis=axis) - gammaln(np.sum(x, axis=axis))

    def fit(self, X, R, S):
        """
        X: sparse n-samples x n-words
        R: sparse (one hot) n-samples x n-receivers
        S: sparse (one hot) n-samples x n-senders

        Sets:
          m    -- n-receivers x 1 column of per-receiver sample counts
          c    -- sparse n-receivers x n-senders co-occurrence counts
          lrp  -- log(eta + m), n-receivers x 1
          lrsp -- log of the alpha-smoothed, row-normalised c
          W    -- list (one entry per receiver) of CSR n-senders x n-words
                  word-count matrices
        """
        X = csr_matrix(X)
        n_samples, n_words = X.shape
        n_receivers = R.shape[1]
        n_senders = S.shape[1]

        # np.asarray works whether `sum` returns an np.matrix or an ndarray.
        self.m = np.asarray(R.sum(axis=0)).reshape(-1, 1)   # n-receivers x 1
        self.c = R.T.dot(S)                                 # n-receivers x n-senders

        # With eta == 0 a receiver that never occurs gets log(0) = -inf here.
        self.lrp = np.log(self.eta + self.m)                # n-receivers x 1
        k = self.alpha + self.c.toarray()
        self.lrsp = np.log(k) - np.log(np.sum(k, axis=1, keepdims=True))

        # Label index of every sample (an all-zero label row maps to index 0).
        r = np.argmax(R.toarray(), axis=1)
        s = np.argmax(S.toarray(), axis=1)

        # One-hot sender indicator built from `s`; for the samples of one
        # receiver, indicator.T . X sums the word vectors per sender in a
        # single sparse product instead of one row update per sample.
        sender_onehot = csr_matrix(
            (np.ones(n_samples), (np.arange(n_samples), s)),
            shape=(n_samples, n_senders))

        self.W = []                          # n-receivers x n-senders x n-words
        for l in range(n_receivers):
            rows = np.flatnonzero(r == l)
            if rows.size:
                counts = sender_onehot[rows].T.dot(X[rows])
            else:
                counts = csr_matrix((n_senders, n_words))
            self.W.append(csr_matrix(counts, dtype=np.float64))

    def predict_log_proba(self, x):
        """
        x: sparse n-samples x n-features

        Returns an (n-samples, n-receivers, n-senders) array of unnormalised
        joint log-scores.

        The likelihood term is evaluated only over the columns where the
        sample is non-zero. A sample with no non-zero feature contributes a
        log-likelihood of 0, so its scores are the prior terms alone (the
        beta function of an empty set would otherwise give -inf - -inf = NaN).

        NOTE(review): because the normalising gammaln(sum(.)) is also taken
        over the present words only, this differs from a full-vocabulary
        Dirichlet-multinomial -- confirm this restriction is intended.
        """
        x = csr_matrix(x)
        n_receivers, n_senders = self.lrsp.shape
        lp = np.zeros((x.shape[0], n_receivers, n_senders))
        for i in range(x.shape[0]):
            # Non-zero columns and values of sample i; identical for every
            # receiver, so extracted once per sample.
            _, j, xj = find(x[i, :])
            has_words = j.size > 0
            for l in range(n_receivers):
                if has_words:
                    wj = self.W[l][:, j].toarray() + self.beta
                    loglik = (self.betaln(wj + xj, axis=1)
                              - self.betaln(wj, axis=1))
                else:
                    loglik = 0.0
                lp[i, l, :] = self.lrp[l] + self.lrsp[l] + loglik

        return lp

    def predict(self, x):
        """
        x: sparse n-samples x n-features

        Returns a pair (receiver_indices, sender_indices) of integer arrays,
        one entry per sample, holding the highest-scoring joint pair.
        """
        lp = self.predict_log_proba(x)
        # Flatten the (receiver, sender) grid per sample, take the arg-max and
        # map the flat index back to its two coordinates.
        flat = lp.reshape(lp.shape[0], lp.shape[1] * lp.shape[2])
        best = flat.argmax(axis=1)
        return np.unravel_index(best, lp.shape[1:])

    def predict_proba(self, x):
        """
        x: sparse n-samples x n-features

        Returns the scores of `predict_log_proba` exponentiated and
        normalised along the receiver axis, i.e. result[i, :, s] sums to 1.

        NOTE(review): normalisation is over receivers only, not over the
        joint (receiver, sender) grid -- confirm this is what callers expect.
        """
        lp = self.predict_log_proba(x)
        # Subtract the per-column maximum before exponentiating for stability.
        lp -= lp.max(axis=1, keepdims=True)
        p = np.exp(lp)
        return p/p.sum(axis=1, keepdims=True)

In [56]:
# Build the classifier with its default eta / alpha / beta and fit it on the
# training split (arguments spelled out by name to match fit's signature).
nb = CombinedMultinomialBayesianNaiveBayes()
nb.fit(X=train_X, R=train_R, S=train_S)

In [57]:
# Decode the best (receiver, sender) pair for every validation sample;
# predict yields the receiver indices first and the sender indices second.
pred_r, pred_s = nb.predict(x=val_X)

In [58]:
# Turn the validation label matrices into integer class indices by taking the
# position of the largest entry in each row.
val_r = val_R.toarray().argmax(axis=1)
val_s = val_S.toarray().argmax(axis=1)

In [59]:
# Sender F1 under micro, macro and weighted averaging.
print(f1_score(val_s, pred_s, average='micro'))
print(f1_score(val_s, pred_s, average='macro'))
print(f1_score(val_s, pred_s, average='weighted'))
# Visual gap between the two groups of scores.
print("\n")
# Receiver F1 under the same three averaging modes.
print(f1_score(val_r, pred_r, average='micro'))
print(f1_score(val_r, pred_r, average='macro'))
print(f1_score(val_r, pred_r, average='weighted'))

0.909415971395
0.828563704997
0.907714549346


0.814064362336
0.439955234229
0.810424417683
