# Classify text

In [1]:
import nltk
from nltk.util import ngrams

def get_features(word):
    """Turn one entry into a feature dict for an NLTK classifier.

    The entry is lower-cased first. The resulting dict contains:

    * ``'word(<entry>)'`` -> True, a marker for the entry itself,
    * ``'sufix1'`` -> the last character (``''`` for an empty entry),
    * ``"bigram(...)"`` / ``"trigram(...)"`` -> True for every run of two /
      three consecutive characters, keyed by the ``str()`` of the tuple
      yielded by ``ngrams``.

    Args:
        word: String to featurize. Any characters it contains (including
            spaces) take part in the n-grams.

    Returns:
        dict mapping feature names to their values.
    """
    lowered = word.lower()

    feature = {
        'word(' + lowered + ')': True,
        'sufix1': lowered[-1:],
    }

    # Bigrams are added before trigrams, same as two separate loops would do.
    for prefix, size in (('bigram', 2), ('trigram', 3)):
        for gram in ngrams(lowered, size):
            feature[prefix + str(gram)] = True

    return feature
        
def get_features_from_file(file):
    """Build one feature dict per line of a text file.

    Args:
        file: Path handed to ``open()`` unchanged. Every line, stripped of
            trailing whitespace, is treated as one entry.

    Returns:
        list of the dicts produced by ``get_features``, in file order.
    """
    # The `with` block guarantees the handle is closed even if featurizing a
    # line raises; a bare open() inside a comprehension leaves that to the
    # garbage collector.
    with open(file) as handle:
        return [get_features(line.rstrip()) for line in handle]

def get_features_from_sentenece(sentence):
    """Merge the per-word features of a sentence into a single dict.

    The sentence is split on single spaces and each piece is passed to
    ``get_features``. The results are merged with ``dict.update``, so when two
    words produce the same key (``'sufix1'`` always does) the value from the
    later word wins.

    Args:
        sentence: Text to featurize.

    Returns:
        dict holding the union of all per-word features.
    """
    merged = {}
    for token in sentence.split(' '):
        token_features = get_features(token)
        merged.update(token_features)
    return merged
        
    

In [2]:
# Pair every feature dict with its language label, giving the
# (features, label) tuples the classifier is trained and evaluated on.
# NOTE(review): the file names suggest top-1000 word lists per language --
# confirm against the data directory.
featuresets_de = [(f, 'de') for f in get_features_from_file('data/top1000de.txt')]
featuresets_en = [(f, 'en') for f in get_features_from_file('data/top1000en.txt')]

In [3]:
import random

# Seed the global RNG before shuffling so the order of both lists -- and
# therefore the train/test split sliced from them later -- is the same on
# every Restart & Run All, making the reported metrics reproducible.
random.seed(42)

# Both shuffles happen in place.
random.shuffle(featuresets_de)
random.shuffle(featuresets_en)

In [4]:
# Report how many labelled samples each language contributes; each line
# measures its own list.
print('German features', len(featuresets_de))
print('English features', len(featuresets_en))

('German features', 1000)
('English features', 1000)


In [5]:
# Show one (features, label) sample per language to eyeball the feature
# layout. The lists were shuffled in place earlier, so index 0 is an
# arbitrary entry.
print(featuresets_de[0])
print(featuresets_en[0])

({"trigram('d', 'e', 'r')": True, "bigram('r', ' ')": True, "trigram('r', ' ', 'w')": True, "trigram('e', 'd', 'e')": True, "bigram('m', 'e')": True, "trigram(' ', 'w', 'i')": True, "bigram('m', 'm')": True, "trigram('e', 'r', ' ')": True, "trigram('m', 'e', 'r')": True, "bigram(' ', 'w')": True, "bigram('i', 'e')": True, "trigram('m', 'm', 'e')": True, "trigram('i', 'm', 'm')": True, 'word(immer wieder)': True, 'sufix1': 'r', "bigram('e', 'r')": True, "bigram('w', 'i')": True, "trigram('i', 'e', 'd')": True, "trigram('w', 'i', 'e')": True, "bigram('i', 'm')": True, "bigram('d', 'e')": True, "bigram('e', 'd')": True}, 'de')
({'word(be)': True, 'sufix1': 'e', "bigram('b', 'e')": True}, 'en')


In [9]:
# Hold out the first 200 samples of each (already shuffled) language list for
# evaluation; everything from index 200 onwards is used for training.
test_feats = featuresets_de[:200] + featuresets_en[:200]
train_feats = featuresets_de[200:] + featuresets_en[200:]

# Fit a Naive Bayes model on the training portion only.
classifier = nltk.NaiveBayesClassifier.train(train_feats)

In [27]:
# Accuracy of the classifier on the held-out test samples (test_feats).
print('accuracy', nltk.classify.accuracy(classifier, test_feats))

('accuracy', 0.8425)


In [11]:
# Sanity check on whole sentences: the per-word features of each sentence are
# merged into one dict and classified as a single sample.
print(classifier.classify(get_features_from_sentenece('Mein Name ist Hugo')))
print(classifier.classify(get_features_from_sentenece('My name is Hugo')))

de
en


In [12]:
# Print the 10 most informative features of the trained classifier, each with
# the label it favours and the likelihood ratio between the two labels.
classifier.show_most_informative_features(10)

Most Informative Features
  trigram('i', 'c', 'h') = True               de : en     =     28.3 : 1.0
        bigram('e', 'a') = True               en : de     =     23.7 : 1.0
  trigram('s', 'c', 'h') = True               de : en     =     23.0 : 1.0
  trigram('c', 'h', 'e') = True               de : en     =     20.3 : 1.0
  trigram('e', 'i', 'n') = True               de : en     =     19.7 : 1.0
        bigram('e', 'i') = True               de : en     =     18.8 : 1.0
  trigram('e', 'i', 't') = True               de : en     =     18.3 : 1.0
        bigram('t', 'h') = True               en : de     =     17.4 : 1.0
        bigram('h', 'r') = True               de : en     =     16.3 : 1.0
        bigram('e', 'h') = True               de : en     =     15.7 : 1.0


In [26]:
import collections
import nltk.metrics

# For every label, gather the indices of the test samples that truly carry it
# (refsets) and the indices the classifier assigned to it (testsets). The
# nltk.metrics set-based scores compare these two index sets.
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
 
for i, (feats, label) in enumerate(test_feats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

# A single pre-formatted argument to print(...) behaves the same on Python 2
# and Python 3, unlike the `print a, b` statement form, which is a SyntaxError
# on Python 3. The loop replaces six copy-pasted lines, one triple per label.
for lang in ('de', 'en'):
    reference, predicted = refsets[lang], testsets[lang]
    tag = lang.upper()
    print('%s precision: %s' % (tag, nltk.metrics.precision(reference, predicted)))
    print('%s recall: %s' % (tag, nltk.metrics.recall(reference, predicted)))
    print('%s F-measure: %s' % (tag, nltk.metrics.f_measure(reference, predicted)))

DE precision: 0.8407960199
DE recall: 0.845
DE F-measure: 0.84289276808
EN precision: 0.844221105528
EN recall: 0.84
EN F-measure: 0.842105263158
