In [1]:
import nltk
nltk.usage(nltk.classify.ClassifierI)

ClassifierI supports the following operations:
  - self.classify(featureset)
  - self.classify_many(featuresets)
  - self.labels()
  - self.prob_classify(featureset)
  - self.prob_classify_many(featuresets)


Let's try the NaiveBayesClassifier:

In [2]:
train = [
        (dict(a=1,b=1,c=1), 'y'),
        (dict(a=1,b=1,c=1), 'x'),
        (dict(a=1,b=1,c=0), 'y'),
        (dict(a=0,b=1,c=1), 'x'),
        (dict(a=0,b=1,c=1), 'y'),
        (dict(a=0,b=0,c=1), 'y'),
        (dict(a=0,b=1,c=0), 'x'),
        (dict(a=0,b=0,c=0), 'x'),
        (dict(a=0,b=1,c=1), 'y'),
        ]
test = [
        (dict(a=1,b=0,c=1)), # unseen
        (dict(a=1,b=0,c=0)), # unseen
        (dict(a=0,b=1,c=1)), # seen 3 times, labels=y,y,x
        (dict(a=0,b=1,c=0)), # seen 1 time, label=x
        ]
classifier = nltk.classify.NaiveBayesClassifier.train(train)
sorted(classifier.labels())

['x', 'y']

In [3]:
classifier.classify_many(test)

['y', 'x', 'y', 'x']

In [4]:
for pdist in classifier.prob_classify_many(test):
     print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))

0.3203 0.6797
0.5857 0.4143
0.3792 0.6208
0.6470 0.3530


In [5]:
classifier.show_most_informative_features()

Most Informative Features
                       c = 0                   x : y      =      2.0 : 1.0
                       c = 1                   y : x      =      1.5 : 1.0
                       a = 1                   y : x      =      1.4 : 1.0
                       b = 0                   x : y      =      1.2 : 1.0
                       a = 0                   x : y      =      1.2 : 1.0
                       b = 1                   y : x      =      1.1 : 1.0


Let's try the Decision Tree

In [8]:
classifier = nltk.classify.DecisionTreeClassifier.train(
        train, entropy_cutoff=0, support_cutoff=0)
sorted(classifier.labels())



['x', 'y']

In [9]:
print(classifier)

c=0? .................................................. x
  a=0? ................................................ x
  a=1? ................................................ y
c=1? .................................................. y



In [12]:
classifier.classify_many(test)

print(classifier)
'''for pdist in classifier.prob_classify_many(test):
    print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))'''

c=0? .................................................. x
  a=0? ................................................ x
  a=1? ................................................ y
c=1? .................................................. y



"for pdist in classifier.prob_classify_many(test):\n    print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))"

In [13]:
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
                  ({"a": 5, "b": 2, "c": 1}, "ham"),
                  ({"a": 0, "b": 3, "c": 4}, "spam"),
                  ({"a": 5, "b": 1, "c": 1}, "ham"),
                  ({"a": 1, "b": 4, "c": 3}, "spam")]

classif = SklearnClassifier(BernoulliNB()).train(train_data)
test_data = [{"a": 3, "b": 2, "c": 1},
             {"a": 0, "b": 3, "c": 7}]

classif.classify_many(test_data)
#print(classif.classify(gender_features('Frank')))
#classif.show_most_informative_features(5)
print(classif.labels())

classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
classif.classify_many(test_data)

print(classif.labels())

['ham', 'spam']
['ham', 'spam']
