In [1]:
import os

import numpy as np
import shorttext
import shorttext.utils.classification_exceptions as e

Using Theano backend.


In [22]:
# abstract class
class StackedGeneralization:
    """Abstract base class for stacking several intermediate classifiers.

    Each intermediate classifier must expose ``score(text)`` returning a
    mapping ``{class label: score}``. This class keeps two index registries:

    * ``classifier2idx`` / ``idx2classifier`` -- classifier name <-> row index
    * ``labels2idx`` -- class label -> column index

    They are used to lay the intermediate scores out as a
    (number of classifiers) x (number of class labels) matrix.
    ``train`` and ``score`` are left for subclasses to implement.
    """

    def __init__(self, intermediate_classifiers=None):
        """Store the intermediate classifiers and initialise the registries.

        :param intermediate_classifiers: dict mapping a name to a classifier
            object. The dict is stored by reference (not copied). ``None``
            (the default) means "start with no classifiers".
        """
        # A literal ``{}`` default is evaluated once at definition time and would
        # be shared (and mutated by add_classifier) across all instances.
        if intermediate_classifiers is None:
            intermediate_classifiers = {}
        self.classifiers = intermediate_classifiers
        # Build the registries up front so that the other methods work even if
        # the caller never invokes register_classifiers/register_classlabels.
        self.register_classifiers()
        self.register_classlabels([])

    def register_classifiers(self):
        """Rebuild the name <-> row-index maps from ``self.classifiers``."""
        self.classifier2idx = {}
        self.idx2classifier = {}
        for idx, key in enumerate(self.classifiers.keys()):
            self.classifier2idx[key] = idx
            self.idx2classifier[idx] = key

    def register_classlabels(self, labels):
        """Record the class labels and rebuild the label -> column-index map.

        :param labels: iterable of class labels; its iteration order fixes
            the column order.
        """
        self.classlabels = list(labels)
        self.labels2idx = {}
        for idx, classlabel in enumerate(self.classlabels):
            self.labels2idx[classlabel] = idx

    def add_classifier(self, name, classifier):
        """Add (or replace) the classifier stored under ``name`` and re-index."""
        self.classifiers[name] = classifier
        self.register_classifiers()

    def delete_classifier(self, name):
        """Remove the classifier stored under ``name`` and re-index.

        :raises KeyError: if no classifier is registered under ``name``.
        """
        del self.classifiers[name]
        self.register_classifiers()

    def translate_shorttext_intfeature_matrix(self, shorttext):
        """Score a text with every intermediate classifier.

        :param shorttext: the text to score. Note that inside this method the
            parameter shadows the ``shorttext`` module.
        :return: float array of shape (number of classifiers, number of class
            labels); entries for labels a classifier did not report stay 0.
        :raises KeyError: if a classifier reports a label that has not been
            registered through ``register_classlabels``.
        """
        feature_matrix = np.zeros((len(self.classifier2idx), len(self.labels2idx)))
        for key, row in self.classifier2idx.items():
            scoredict = self.classifiers[key].score(shorttext)
            for label, value in scoredict.items():
                feature_matrix[row, self.labels2idx[label]] = value
        return feature_matrix

    def convert_label_to_buckets(self, label):
        """Return an integer vector with 1 at the label's index and 0 elsewhere.

        :raises KeyError: if ``label`` has not been registered.
        """
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy; the builtin selects the same dtype.
        buckets = np.zeros(len(self.labels2idx), dtype=int)
        buckets[self.labels2idx[label]] = 1
        return buckets

    def train(self, classdict):
        """Train the stacked model; must be implemented by subclasses."""
        raise e.NotImplementedException()

    def score(self, shorttext):
        """Score a text with the stacked model; must be implemented by subclasses."""
        raise e.NotImplementedException()

In [4]:
# Fetch shorttext's subject-keywords example data. Later cells use its keys as
# the class labels and its length as the number of classes.
subdict = shorttext.data.subjectkeywords()

In [8]:
# Location of the pre-trained word2vec binary. Set the WORD2VEC_MODEL_PATH
# environment variable to run the notebook on another machine; when it is
# unset, the original hard-coded location is used.
word2vec_path = os.environ.get(
    'WORD2VEC_MODEL_PATH',
    '/Users/stephenhky/Data/word2vec/GoogleNews-vectors-negative300.bin'
)
wvmodel = shorttext.utils.load_word2vec_model(word2vec_path)

In [9]:
# First intermediate classifier: a neural classifier built on the word2vec
# model. CNNWordEmbed receives len(subdict), presumably the number of output
# classes (TODO confirm against the shorttext docs), and the resulting network
# is trained on subdict.
cnn_caltor = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel)
cnnmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(subdict))
cnn_caltor.train(subdict, cnnmodel)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Train a gensim-based topic modeler on subdict. The second argument (128) is
# presumably the number of topics -- TODO confirm against the shorttext docs.
lda128 = shorttext.classifiers.GensimTopicModeler()
lda128.train(subdict, 128)

In [11]:
# NOTE(review): this import would be easier to find in the first (imports) cell.
from sklearn.naive_bayes import GaussianNB

# Second intermediate classifier: wraps the topic modeler trained above
# together with a scikit-learn Gaussian naive Bayes estimator, then trains it
# on the same data.
nblda_caltor = shorttext.classifiers.TopicVectorSkLearnClassifier(lda128, GaussianNB())
nblda_caltor.train(subdict)

In [12]:
# Sanity check: score() returns a dict mapping each class label to its score.
cnn_caltor.score('linear algebra')

{'mathematics': 0.99978971,
 'physics': 0.00018459298,
 'theology': 2.5684361e-05}

In [13]:
# Same query against the topic-vector classifier, for comparison with the CNN scores.
nblda_caltor.score('linear algebra')

{'mathematics': 1.0, 'physics': 0.0, 'theology': 0.0}

In [23]:
# Assemble the stacker from the two trained classifiers, then build its index
# registries: classifier name -> matrix row, class label -> matrix column.
stacker = StackedGeneralization(intermediate_classifiers={'cnn': cnn_caltor, 'nblda': nblda_caltor})
stacker.register_classifiers()
stacker.register_classlabels(subdict.keys())

In [24]:
# Intermediate feature matrix: one row per registered classifier, one column
# per registered class label, each entry being that classifier's score.
stacker.translate_shorttext_intfeature_matrix('linear algebra')

array([[  9.99789715e-01,   1.84592980e-04,   2.56843614e-05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [25]:
# One-hot integer vector with a 1 at the index registered for 'mathematics'.
stacker.convert_label_to_buckets('mathematics')

array([1, 0, 0])

In [17]:
# Scratch check (unrelated to the stacker): list repetition yields three zeros.
[0]*3

[0, 0, 0]