In [1]:
import shorttext
import numpy as np
import shorttext.utils.classification_exceptions as e

from keras.layers import Dense, Reshape
from keras.models import Sequential
from keras.regularizers import l2

Using Theano backend.


In [2]:
# abstract class
class StackedGeneralization(object):
    """Abstract base class for stacked generalization.

    Holds a dictionary of named intermediate classifiers, each exposing a
    ``score(shorttext)`` method that returns a ``{label: score}`` dict, and
    turns their outputs into a (classifier x label) feature matrix that a
    subclass feeds into its own final model.

    Subclasses must implement :meth:`train` and :meth:`score`.
    """

    def __init__(self, intermediate_classifiers=None):
        """Store the intermediate classifiers and build the classifier index.

        :param intermediate_classifiers: dict mapping a name to a classifier
            object; ``None`` (default) starts with no classifiers.
        """
        # A `{}` default would be one dict shared by every instance and then
        # mutated by add_classifier(); create a fresh dict per instance instead.
        # An explicitly passed dict is kept by reference, as before.
        if intermediate_classifiers is None:
            intermediate_classifiers = {}
        self.classifiers = intermediate_classifiers
        self.classlabels = []
        # Defined up front so that using the object before any explicit
        # register_*() call does not fail with AttributeError.
        self.labels2idx = {}
        self.trained = False
        self.register_classifiers()

    def register_classifiers(self):
        """(Re)build the name <-> row-index maps for the current classifiers."""
        self.classifier2idx = {}
        self.idx2classifier = {}
        for idx, key in enumerate(self.classifiers.keys()):
            self.classifier2idx[key] = idx
            self.idx2classifier[idx] = key

    def register_classlabels(self, labels):
        """Record the class labels and build the label -> column-index map.

        :param labels: iterable of class labels (iterating a dict yields its keys).
        """
        self.classlabels = list(labels)
        self.labels2idx = {}
        for idx, classlabel in enumerate(self.classlabels):
            self.labels2idx[classlabel] = idx

    def add_classifier(self, name, classifier):
        """Add (or replace) a classifier under ``name`` and re-index."""
        self.classifiers[name] = classifier
        self.register_classifiers()

    def delete_classifier(self, name):
        """Remove the classifier called ``name`` and re-index.

        :raises KeyError: if no classifier is registered under ``name``.
        """
        del self.classifiers[name]
        self.register_classifiers()

    def translate_shorttext_intfeature_matrix(self, shorttext):
        """Score ``shorttext`` with every intermediate classifier.

        :param shorttext: text passed unchanged to each classifier's ``score``.
        :return: numpy array of shape (number of classifiers, number of labels);
            entries for labels a classifier does not report stay 0.
        :raises KeyError: if a classifier reports a label that is not registered.
        """
        feature_matrix = np.zeros((len(self.classifier2idx), len(self.labels2idx)))
        for key in self.classifier2idx:
            scoredict = self.classifiers[key].score(shorttext)
            for label in scoredict:
                feature_matrix[self.classifier2idx[key], self.labels2idx[label]] = scoredict[label]
        return feature_matrix

    def convert_label_to_buckets(self, label):
        """Return the one-hot integer vector for ``label``.

        :raises KeyError: if ``label`` is not registered.
        """
        # builtin int instead of np.int: the alias was removed from NumPy.
        buckets = np.zeros(len(self.labels2idx), dtype=int)
        buckets[self.labels2idx[label]] = 1
        return buckets

    def convert_traindata_matrix(self, classdict, tobucket=True):
        """Yield ``(X, y)`` training pairs for every short text in ``classdict``.

        :param classdict: dict mapping a class label to an iterable of short texts.
        :param tobucket: if True, ``y`` is the one-hot vector of the label;
            otherwise it is the label's integer index.
        :return: generator of (feature matrix, target) tuples.
        """
        for label in classdict:
            y = self.convert_label_to_buckets(label) if tobucket else self.labels2idx[label]
            for shorttext in classdict[label]:
                X = self.translate_shorttext_intfeature_matrix(shorttext)
                yield X, y

    def train(self, classdict):
        """Train the final model; must be implemented by subclasses."""
        raise e.NotImplementedException()

    def score(self, shorttext):
        """Score a short text; must be implemented by subclasses."""
        raise e.NotImplementedException()

In [3]:
# Example training data from shorttext. It is used below as a classdict
# (class label -> short texts) for every classifier in this notebook.
subdict = shorttext.data.subjectkeywords()

In [4]:
# Location of the pre-trained word2vec binary. Set the WORD2VEC_MODEL_PATH
# environment variable to point at your own copy; the fallback is the path
# this notebook was originally run with.
import os
wvmodel_path = os.environ.get(
    'WORD2VEC_MODEL_PATH',
    '/Users/stephenhky/Data/word2vec/GoogleNews-vectors-negative300.bin')
wvmodel = shorttext.utils.load_word2vec_model(wvmodel_path)

In [5]:
# First intermediate classifier: a neural-network classifier built on the
# word2vec model. The CNN framework is constructed with len(subdict), i.e.
# the number of class labels in the training data.
cnn_caltor = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel)
cnnmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(subdict))
cnn_caltor.train(subdict, cnnmodel)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [6]:
# Topic modeler trained on the same data.
# NOTE(review): 128 is presumably the number of topics (cf. the name lda128) - confirm.
lda128 = shorttext.classifiers.GensimTopicModeler()
lda128.train(subdict, 128)

In [7]:
from sklearn.naive_bayes import GaussianNB

# Second intermediate classifier: a scikit-learn Gaussian naive Bayes model
# combined with the topic modeler trained above.
nblda_caltor = shorttext.classifiers.TopicVectorSkLearnClassifier(lda128, GaussianNB())
nblda_caltor.train(subdict)

In [8]:
# Sanity check: per-label scores of the CNN classifier for a sample phrase.
cnn_caltor.score('linear algebra')

{'mathematics': 0.99972397,
 'physics': 0.00022208734,
 'theology': 5.3945409e-05}

In [9]:
# Same phrase scored by the naive Bayes / topic-model classifier, for comparison.
nblda_caltor.score('linear algebra')

{'mathematics': 1.0, 'physics': 0.0, 'theology': 0.0}

In [55]:
class LogisticStackedGeneralization(StackedGeneralization):
    """Stacked generalization whose final model is one dense sigmoid layer.

    The (classifier x label) score matrix of the intermediate classifiers is
    flattened and fed to a single Keras ``Dense`` layer with one sigmoid unit
    per registered class label.
    """

    def train(self, classdict, optimizer='adam', l2reg=0.01, nb_epoch=1000):
        """Fit the final dense layer on the intermediate classifiers' scores.

        :param classdict: dict mapping a class label to an iterable of short texts.
        :param optimizer: optimizer passed to Keras ``compile``.
        :param l2reg: L2 regularization coefficient for the dense layer weights.
        :param nb_epoch: number of training epochs.
        """
        # If the caller skipped registration, derive it from what is available
        # instead of failing with AttributeError further down.
        if not getattr(self, 'classifier2idx', None):
            self.register_classifiers()
        if not getattr(self, 'labels2idx', None):
            self.register_classlabels(classdict.keys())

        nb_classifiers = len(self.classifier2idx)
        nb_labels = len(self.labels2idx)

        kmodel = Sequential()
        kmodel.add(Reshape((nb_classifiers * nb_labels,),
                           input_shape=(nb_classifiers, nb_labels)))
        # The output width must equal the number of registered labels: the
        # one-hot targets and score() are both indexed by them, so sizing the
        # layer by len(classdict) would break whenever the two differ.
        kmodel.add(Dense(output_dim=nb_labels,
                         activation='sigmoid',
                         W_regularizer=l2(l2reg)))
        kmodel.compile(loss='categorical_crossentropy', optimizer=optimizer)

        Xy = list(self.convert_traindata_matrix(classdict, tobucket=True))
        # List comprehensions rather than map(): np.array() over a lazy map
        # object does not produce the intended stacked array.
        X = np.array([xone for xone, _ in Xy])
        y = np.array([yone for _, yone in Xy])

        # Prints the same text under Python 2 and Python 3.
        print('%s %s' % (X.shape, y.shape))

        kmodel.fit(X, y, nb_epoch=nb_epoch)

        self.model = kmodel
        self.trained = True

    def score(self, shorttext):
        """Return ``{label: score}`` for ``shorttext`` from the trained model.

        The scores are independent sigmoid outputs and need not sum to 1.

        :raises ModelNotTrainedException: if :meth:`train` has not been run.
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        input_matrix = self.translate_shorttext_intfeature_matrix(shorttext)
        # The model expects a batch, so wrap the single matrix in an array.
        prediction = self.model.predict(np.array([input_matrix]))

        return {label: prediction[0][idx]
                for idx, label in enumerate(self.classlabels)}

In [56]:
# Stack the two intermediate classifiers. Classifiers and labels are
# registered explicitly because train() reads classifier2idx / labels2idx.
# Iterating subdict yields its keys, which become the class labels.
logit = LogisticStackedGeneralization(intermediate_classifiers={'cnn': cnn_caltor, 'nblda': nblda_caltor})
logit.register_classifiers()
logit.register_classlabels(subdict)

In [57]:
# Fit the final layer with the default settings (adam, l2reg=0.01, nb_epoch=1000).
logit.train(subdict)

(45, 2, 3) (45, 3)
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1

In [58]:
# Stacked-model scores for a sample phrase (sigmoid outputs, one per label).
logit.score('linear algebra')

{'mathematics': 0.5971843, 'physics': 0.048728578, 'theology': 0.052062999}

In [59]:
# Second sample phrase; its scores are reproduced by hand in the cells below.
logit.score('quantum mechanics')

{'mathematics': 0.051558491, 'physics': 0.56622142, 'theology': 0.046154667}

In [60]:
# Weights and bias of the Dense layer (the Reshape layer has no parameters).
W, b=logit.model.get_weights()
print W
print b

[[ 1.09985244 -0.87020779 -1.25024903]
 [-1.09337103  0.99620873 -0.80586994]
 [-1.12908328 -1.15591133  0.26295975]
 [ 0.41320395 -0.77839983 -0.81154251]
 [-0.70005041  0.59358555 -1.38240731]
 [-1.24424171 -0.9976837   0.29964894]]
[-1.11870229 -1.32332516 -0.84022224]


In [61]:
# Intermediate feature matrix: one row per classifier, one column per label.
x=logit.translate_shorttext_intfeature_matrix('quantum mechanics')
print x

[[  9.39190159e-06   9.99990344e-01   2.93100982e-07]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00]]


In [62]:
# Flatten to the 6-element vector shape that the Dense layer receives.
x.reshape((6,))

array([  9.39190159e-06,   9.99990344e-01,   2.93100982e-07,
         0.00000000e+00,   1.00000000e+00,   0.00000000e+00])

In [63]:
# Pre-activation of the Dense layer computed by hand (W^T x + b); applying
# the sigmoid to it should reproduce logit.score('quantum mechanics').
np.dot(W.transpose(), x.reshape((6,)))+b

array([-2.91210318,  0.26645099, -3.02850337])

In [64]:
# Layer-by-layer overview of the stacking model and its parameter counts.
logit.model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
reshape_4 (Reshape)              (None, 6)             0           reshape_input_4[0][0]            
____________________________________________________________________________________________________
dense_7 (Dense)                  (None, 3)             21          reshape_4[0][0]                  
Total params: 21
Trainable params: 21
Non-trainable params: 0
____________________________________________________________________________________________________


In [65]:
# Full Keras configuration of the model (layer types, shapes, regularizers).
logit.model.get_config()

[{'class_name': 'Reshape',
  'config': {'batch_input_shape': (None, 2, 3),
   'input_dtype': 'float32',
   'name': 'reshape_4',
   'target_shape': (6,),
   'trainable': True}},
 {'class_name': 'Dense',
  'config': {'W_constraint': None,
   'W_regularizer': {'l1': 0.0,
    'l2': 0.009999999776482582,
    'name': 'L1L2Regularizer'},
   'activation': 'sigmoid',
   'activity_regularizer': None,
   'b_constraint': None,
   'b_regularizer': None,
   'bias': True,
   'init': 'glorot_uniform',
   'input_dim': 6,
   'name': 'dense_7',
   'output_dim': 3,
   'trainable': True}}]