In [None]:
                                        ####    1.1 GENDER IDENTIFICATION   #####

In [None]:
# Names ending in a, e and i are likely to be female, while names ending in k, o, r, s and t are likely to be male

In [2]:
import nltk
from nltk.corpus import names
import random


In [3]:
def gender_features(word):
     return {'last_letter': word[-1]}

In [4]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])

In [5]:
labeled_names[1:10]

[('Aaron', 'male'),
 ('Abbey', 'male'),
 ('Abbie', 'male'),
 ('Abbot', 'male'),
 ('Abbott', 'male'),
 ('Abby', 'male'),
 ('Abdel', 'male'),
 ('Abdul', 'male'),
 ('Abdulkarim', 'male')]

In [6]:
random.shuffle(labeled_names)

In [7]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]


In [8]:
featuresets[1:10]

[({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'r'}, 'male'),
 ({'last_letter': 'h'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'l'}, 'male'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'a'}, 'female')]

In [9]:
train_set, test_set = featuresets[500:], featuresets[:500]
len(train_set)

7444

In [10]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [11]:
classifier.classify(gender_features('Neo'))

'male'

In [13]:
classifier.classify(gender_features('kausar'))

'male'

In [14]:
print(nltk.classify.accuracy(classifier, test_set))

0.79


In [15]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     36.9 : 1.0
             last_letter = 'k'              male : female =     32.2 : 1.0
             last_letter = 'f'              male : female =     25.5 : 1.0
             last_letter = 'p'              male : female =     12.6 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0


In [None]:
                                ########## 1.2  CHOOSING THE RIGHT FEATURES ########

In [None]:
# the interesting work in building a classifier is deciding what features might be relevant
# it's often possible to get decent performance by using a fairly simple and obvious set of features
# Typically, feature extractors are built through a process of trial-and-error.

In [16]:
def gender_features2(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [20]:
gender_features2('John')

{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [21]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))


0.794


In [22]:
# Training data
# Devtest data
# Testing data
#a very productive method for refining the feature set is error analysis
# The training set is used to train the model, and the dev-test set is used to perform error analysis
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [23]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.76


In [24]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [25]:
for (tag, guess, name) in sorted(errors):
    sets='correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name)

In [34]:
sets[1:2000]

'orrect=male     guess=female   name=Zane                          '

In [35]:
def gender_features(word):
    return {'suffix1': word[-1:],'suffix2': word[-2:]}

In [36]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.784


In [62]:
                            #################### 1.3 DOCUMENT CLASSIFICATION ########

In [37]:
#Using these corpora, we can build classifiers that will automatically tag new documents with appropriate category labels

from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)for category in movie_reviews.categories()for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [40]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [41]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 

In [59]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [60]:
print(nltk.classify.accuracy(classifier, test_set))

0.87


In [61]:
classifier.show_most_informative_features(5)

Most Informative Features
 contains(unimaginative) = True              neg : pos    =      8.3 : 1.0
    contains(schumacher) = True              neg : pos    =      7.0 : 1.0
     contains(atrocious) = True              neg : pos    =      7.0 : 1.0
        contains(turkey) = True              neg : pos    =      6.5 : 1.0
       contains(singers) = True              pos : neg    =      6.3 : 1.0


In [None]:
                            ##########   1.4 PART OF SPEECH TAGGING ########

In [62]:
from nltk.corpus import brown
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [63]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [64]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
        return features
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [65]:
featuresets[1:10]

[({'endswith(e)': False}, 'NP-TL'),
 ({'endswith(e)': False}, 'NN-TL'),
 ({'endswith(e)': False}, 'JJ-TL'),
 ({'endswith(e)': False}, 'NN-TL'),
 ({'endswith(e)': False}, 'VBD'),
 ({'endswith(e)': False}, 'NR'),
 ({'endswith(e)': False}, 'AT'),
 ({'endswith(e)': False}, 'NN'),
 ({'endswith(e)': False}, 'IN')]

In [66]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [67]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.17135753356539035

In [68]:
classifier.classify(pos_features('cats'))

'IN'

In [None]:
                                ######### 1.5 EXPLOITING CONTEXT ############

In [None]:
# A part-of-speech classifier whose feature detector examines the context in which a word appears 
#in order to determine which part of speech tag should be assigned. 
#In particular, the identity of the previous word is included as a feature

In [69]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],"suffix(2)": sentence[i][-2:],"suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [70]:
pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [71]:
tagged_sents = brown.tagged_sents(categories='news')

In [72]:
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append( (pos_features(untagged_sent, i), tag) )

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [None]:
                              ####### 1.6 Sequence Classification ############

In [None]:
#Sequence Classification can helps to capture the dependencies between related classification tasks
#which choose an appropriate labeling for a collection of related inputs.
#In the case of part-of-speech tagging, a variety of different 
#sequence classifier models can be used to jointly choose part-of-speech tags for all the words in a given sentence.

In [None]:
#One sequence classification strategy, known as consecutive classification or greedy sequence classification, 
#is to find the most likely class label for the first input,
#then to use that answer to help find the best label for the next input.
#The process can then be repeated until all of the inputs have been labeled.
#This is the approach that was taken by the bigram tagger