In [1]:
import shorttext
import numpy as np
import shorttext.utils.classification_exceptions as e

Using Theano backend.


In [2]:
# abstract class
class StackedGeneralization:
    def __init__(self, intermediate_classifiers={}):
        self.classifiers = intermediate_classifiers
        self.classlabels = []

    def register_classifiers(self):
        self.classifier2idx = {}
        self.idx2classifier = {}
        for idx, key in enumerate(self.classifiers.keys()):
            self.classifier2idx[key] = idx
            self.idx2classifier[idx] = key

    def add_classifier(self, name, classifier):
        self.classifiers[name] = classifier
        self.register_classifiers()

    def delete_classifier(self, name):
        del self.classifiers[name]
        self.register_classifiers()

    def translate_shorttext_intfeatures(self, shorttext):
        feature_vec = np.zeros(len(self.classifier2idx))
        for key in self.classifier2idx:
            feature_vec[self.classifier2idx[key]] = self.classifiers[key].score(shorttext)
        return feature_vec

    def convert_traindata_vectors(self, classdict):
        self.classlabels = list(classdict.keys())
        self.labels2idx = {}
        for idx, classlabel in enumerate(self.classlabels):
            self.labels2idx[classlabel] = idx

        X = []
        y = []
        for label in classdict:
            topicvecs = map(self.translate_shorttext_intfeatures, classdict[label])
            X += topicvecs
            y += [self.labels2idx[label]]*len(topicvecs)

        return X, y

    def train(self, classdict):
        raise e.NotImplementedException()

    def score(self, shorttext):
        raise e.NotImplementedException()


In [3]:
subdict = shorttext.data.subjectkeywords()

In [4]:
wvmodel = shorttext.utils.load_word2vec_model('/Users/hok/Data/Word2Vec/GoogleNews-vectors-negative300.bin.gz')

In [5]:
cnn_caltor = shorttext.classifiers.VarNNEmbeddedVecClassifier(wvmodel)
cnnmodel = shorttext.classifiers.frameworks.CNNWordEmbed(len(subdict))
cnn_caltor.train(subdict, cnnmodel)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [6]:
lda128 = shorttext.classifiers.GensimTopicModeler()
lda128.train(subdict, 128)

In [7]:
from sklearn.naive_bayes import GaussianNB

nblda_caltor = shorttext.classifiers.TopicVectorSkLearnClassifier(lda128, GaussianNB())
nblda_caltor.train(subdict)

In [8]:
cnn_caltor.score('linear algebra')

{'mathematics': 0.99991071, 'physics': 7.257604e-05, 'theology': 1.6685246e-05}

In [9]:
nblda_caltor.score('linear algebra')

{'mathematics': 1.0, 'physics': 0.0, 'theology': 0.0}

In [13]:
stacker = StackedGeneralization(intermediate_classifiers={'cnn': cnn_caltor, 'nblda': nblda_caltor})
stacker.register_classifiers()

In [14]:
stacker.convert_traindata_vectors(subdict)

TypeError: float() argument must be a string or a number

In [19]:
stacker.classifiers['cnn'].score('linear')

{'mathematics': 0.65447074, 'physics': 0.33622575, 'theology': 0.0093035102}