In [117]:
import os

import numpy as np
import shorttext
import shorttext.utils.classification_exceptions as e

from keras.layers import Dense, Reshape
from keras.models import Sequential
from keras.regularizers import l2

In [100]:
# abstract class
class StackedGeneralization:
    """Abstract base for stacking the scores of several short-text classifiers.

    Each intermediate classifier must expose ``score(shorttext)`` returning a
    dict that maps class label to score. The scores of all classifiers are
    arranged into a (classifier x label) matrix that subclasses feed to a
    final model in their own ``train`` / ``score`` implementations.
    """

    def __init__(self, intermediate_classifiers=None):
        """Store the intermediate classifiers and build the index lookups.

        :param intermediate_classifiers: dict mapping a name to a classifier
            object; when omitted, a new empty dict is created per instance.
        """
        # A literal `{}` default would be one dict shared by every instance and
        # mutated by add_classifier / delete_classifier, so use a None sentinel.
        if intermediate_classifiers is None:
            intermediate_classifiers = {}
        self.classifiers = intermediate_classifiers
        self.classlabels = []
        self.labels2idx = {}
        self.trained = False
        # Make classifier2idx / idx2classifier available right away; calling
        # register_classifiers() again later is harmless.
        self.register_classifiers()

    def register_classifiers(self):
        """Rebuild the name <-> row-index lookups from ``self.classifiers``."""
        self.classifier2idx = {}
        self.idx2classifier = {}
        for idx, key in enumerate(self.classifiers.keys()):
            self.classifier2idx[key] = idx
            self.idx2classifier[idx] = key

    def register_classlabels(self, labels):
        """Fix the class labels and their column indices.

        :param labels: iterable of class labels; iteration order defines the
            column order of the feature matrix and of the one-hot targets.
        """
        self.classlabels = list(labels)
        self.labels2idx = {}
        for idx, classlabel in enumerate(self.classlabels):
            self.labels2idx[classlabel] = idx

    def add_classifier(self, name, classifier):
        """Add (or replace) a classifier under ``name`` and re-index."""
        self.classifiers[name] = classifier
        self.register_classifiers()

    def delete_classifier(self, name):
        """Remove the classifier called ``name`` and re-index.

        Raises ``KeyError`` if no classifier with that name is registered.
        """
        del self.classifiers[name]
        self.register_classifiers()

    def translate_shorttext_intfeature_matrix(self, shorttext):
        """Score ``shorttext`` with every classifier.

        :return: float array of shape (number of classifiers, number of
            labels); entry [i, j] is the score classifier i gave label j.
            Labels a classifier does not return stay 0. A returned label that
            was never registered raises ``KeyError``.
        """
        feature_matrix = np.zeros((len(self.classifier2idx), len(self.labels2idx)))
        for key in self.classifier2idx:
            scoredict = self.classifiers[key].score(shorttext)
            for label in scoredict:
                feature_matrix[self.classifier2idx[key], self.labels2idx[label]] = scoredict[label]
        return feature_matrix

    def convert_label_to_buckets(self, label):
        """Return the one-hot integer vector for ``label``."""
        # Builtin `int` instead of the `np.int` alias, which NumPy removed.
        buckets = np.zeros(len(self.labels2idx), dtype=int)
        buckets[self.labels2idx[label]] = 1
        return buckets

    def convert_traindata_matrix(self, classdict, tobucket=True):
        """Yield one ``(X, y)`` training pair per short text in ``classdict``.

        :param classdict: dict mapping a class label to an iterable of short texts
        :param tobucket: if True ``y`` is the one-hot vector of the label,
            otherwise the integer index of the label
        """
        for label in classdict:
            y = self.convert_label_to_buckets(label) if tobucket else self.labels2idx[label]
            for shorttext in classdict[label]:
                X = self.translate_shorttext_intfeature_matrix(shorttext)
                yield X, y

    def train(self, classdict):
        """Fit the final model; must be implemented by subclasses."""
        raise e.NotImplementedException()

    def score(self, shorttext):
        """Score a short text with the final model; must be implemented by subclasses."""
        raise e.NotImplementedException()

In [77]:
# Example data set bundled with shorttext. It is used below as the training
# dictionary: its keys are registered as class labels and its values are the
# short texts iterated per label.
subdict = shorttext.data.subjectkeywords()

In [4]:
# Location of the pre-trained word2vec binary. Set the WORD2VEC_MODEL_PATH
# environment variable to point at your own copy; the fallback is the path
# this notebook was originally run with.
WORD2VEC_MODEL_PATH = os.environ.get(
    'WORD2VEC_MODEL_PATH',
    '/Users/hok/Data/Word2vec/GoogleNews-vectors-negative300.bin.gz'
)
wvmodel = shorttext.utils.load_word2vec_model(WORD2VEC_MODEL_PATH)

In [5]:
# First intermediate classifier: a neural-network classifier on top of the
# word2vec model. The CNNWordEmbed network is built from the number of classes
# in subdict (presumably its output width -- confirm against the shorttext
# docs) and then fitted on subdict.
cnn_caltor = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel)
cnnmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(subdict))
cnn_caltor.train(subdict, cnnmodel)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [6]:
# Topic model trained on the same data; it is the feature extractor for the
# second intermediate classifier.
# NOTE(review): 128 is presumably the number of topics -- confirm against the
# shorttext docs.
lda128 = shorttext.classifiers.GensimTopicModeler()
lda128.train(subdict, 128)

In [7]:
from sklearn.naive_bayes import GaussianNB

# Second intermediate classifier: a scikit-learn Gaussian naive Bayes model
# wrapped together with the lda128 topic modeler, trained on subdict.
nblda_caltor = shorttext.classifiers.TopicVectorSkLearnClassifier(lda128, GaussianNB())
nblda_caltor.train(subdict)

In [8]:
# Sanity check: label -> score dict produced by the CNN-based classifier.
cnn_caltor.score('linear algebra')

{'mathematics': 0.99984062,
 'physics': 0.00012793425,
 'theology': 3.1439002e-05}

In [9]:
# Sanity check: label -> score dict produced by the naive Bayes classifier.
nblda_caltor.score('linear algebra')

{'mathematics': 1.0, 'physics': 0.0, 'theology': 0.0}

In [10]:
# Instantiate the abstract base directly to try out its helper methods.
# Both register_* calls are needed before any translate/convert helper is
# used: they build the classifier and label index lookups.
stacker = StackedGeneralization(intermediate_classifiers={'cnn': cnn_caltor, 'nblda': nblda_caltor})
stacker.register_classifiers()
stacker.register_classlabels(subdict.keys())

In [11]:
# Feature matrix for one short text: one row per intermediate classifier,
# one column per registered class label.
stacker.translate_shorttext_intfeature_matrix('linear algebra')

array([[  9.99840617e-01,   1.27934254e-04,   3.14390018e-05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [12]:
# One-hot target vector for a class label.
stacker.convert_label_to_buckets('mathematics')

array([1, 0, 0])

In [14]:
# Same feature matrix for a second example text.
stacker.translate_shorttext_intfeature_matrix('quantum cohomology')

array([[  3.10719013e-03,   9.96733725e-01,   1.59094488e-04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00]])

In [129]:
class LogisticStackedGeneralization(StackedGeneralization):
    """Stacked generalization whose final model is a single dense Keras layer.

    The (classifier x label) score matrix of a short text is flattened and fed
    to one L2-regularised dense layer with one output per registered label.
    ``train`` and ``score`` rely on the ``classifier2idx`` / ``labels2idx``
    lookups built by ``register_classifiers()`` and ``register_classlabels()``.
    """

    def train(self, classdict, optimizer='adam', l2reg=0.01, nb_epoch=100):
        """Fit the final layer on the intermediate classifiers' scores.

        :param classdict: dict mapping a class label to an iterable of short texts
        :param optimizer: optimizer passed to ``compile``
        :param l2reg: coefficient of the L2 weight regulariser
        :param nb_epoch: number of training epochs
        """
        nb_classifiers = len(self.classifier2idx)
        nb_labels = len(self.labels2idx)

        kmodel = Sequential()
        kmodel.add(Reshape((nb_classifiers * nb_labels,),
                           input_shape=(nb_classifiers, nb_labels)))
        # The output width follows the registered labels, not len(classdict):
        # the one-hot targets and score() are both indexed by the registered
        # labels, so a classdict missing a label must not shrink the layer.
        # NOTE(review): the sigmoid activation is kept from the original, so
        # the per-label scores are not normalised to sum to one.
        kmodel.add(Dense(output_dim=nb_labels,
                         activation='sigmoid',
                         W_regularizer=l2(l2reg)))
        kmodel.compile(loss='categorical_crossentropy', optimizer=optimizer)

        # Materialise the generator once, then split into inputs and targets.
        # List comprehensions (rather than map) give real lists on Python 3 too.
        Xy = list(self.convert_traindata_matrix(classdict, tobucket=True))
        X = np.array([pair[0] for pair in Xy])
        y = np.array([pair[1] for pair in Xy])

        # Same text as the former `print X.shape, y.shape`, valid on Python 2 and 3.
        print('%s %s' % (X.shape, y.shape))

        kmodel.fit(X, y, nb_epoch=nb_epoch)

        self.model = kmodel
        self.trained = True

    def score(self, shorttext):
        """Return a dict mapping each registered label to the final model's score.

        Raises ``ModelNotTrainedException`` when called before ``train``.
        """
        if not self.trained:
            raise e.ModelNotTrainedException()

        input_matrix = self.translate_shorttext_intfeature_matrix(shorttext)
        # predict() works on batches: wrap the single matrix, read row 0 back.
        prediction = self.model.predict(np.array([input_matrix]))

        scoredict = {}
        for idx, label in enumerate(self.classlabels):
            scoredict[label] = prediction[0][idx]

        return scoredict

In [130]:
# Build the stacked model on the same two intermediate classifiers and
# register the index lookups that train() and score() read.
# Passing the dict itself registers its keys as the class labels.
logit = LogisticStackedGeneralization(intermediate_classifiers={'cnn': cnn_caltor, 'nblda': nblda_caltor})
logit.register_classifiers()
logit.register_classlabels(subdict)

In [131]:
# Fit the final layer with the default settings (adam, l2reg=0.01, 100 epochs);
# the shapes of the training inputs and targets are printed first.
logit.train(subdict)

(45, 2, 3) (45, 3)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Ep

In [132]:
# Stacked score for one short text. The output layer uses a sigmoid
# activation, and the recorded scores do not sum to 1.
logit.score('linear algebra')

{'mathematics': 0.76277208, 'physics': 0.26839256, 'theology': 0.35419792}

<keras.models.Sequential at 0x26372ca90>