In [84]:
### Part-of-Speech (POS) Tagging, or word-category disambiguation: the process of classifying words into their parts
### of speech and labeling them accordingly.

In [85]:
# Requirements: nltk and scikit-learn packages
# word_tokenize: the NLTK tokenizer splits a sentence into words and punctuation
# pos_tag: tags each token with its part of speech using NLTK's pretrained model.

In [86]:
import nltk
from nltk import word_tokenize,pos_tag
# Tokenize a sample sentence, then tag every token with the pos_tag imported from nltk.
text=word_tokenize("They'd love to move house; it's just so difficult to sell right now.")
pos_tag(text)

[('They', 'PRP'),
 ("'d", 'MD'),
 ('love', 'VB'),
 ('to', 'TO'),
 ('move', 'VB'),
 ('house', 'NN'),
 (';', ':'),
 ('it', 'PRP'),
 ("'s", 'VBZ'),
 ('just', 'RB'),
 ('so', 'RB'),
 ('difficult', 'JJ'),
 ('to', 'TO'),
 ('sell', 'VB'),
 ('right', 'RB'),
 ('now', 'RB'),
 ('.', '.')]

In [87]:
# The tag set depends on the corpus that was used to train the tagger.
# The default tagger behind nltk.pos_tag() uses the Penn Treebank tag set.
# nltk.help.upenn_tagset('TAG'): query the documentation for a tag from the NLTK library.
nltk.help.upenn_tagset('VB')

VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...


In [88]:
# A tagged token is represented as a tuple consisting of the token and its tag.
# str2tuple() converts the standard string representation of a tagged token ('word/TAG') into that tuple.
tagged_token = nltk.tag.str2tuple('house/NN')
print(tagged_token)
print(tagged_token[0])  # the token
print(tagged_token[1])  # the tag

('house', 'NN')
house
NN


In [89]:
### Reading Tagged Corpora

In [90]:
# Reading the Brown Corpus: each word and its corresponding tag are joined by a slash, e.g.
# Jury/nn-tl said/vbd Friday/nr an/at investigation/nn
# tagged_words() exposes them as (word, tag) tuples.
nltk.corpus.brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [91]:
### Training own POS Tagger using scikit-learn

In [92]:
# Picking Corpus for training POS tagger

In [93]:
# Load the tagged sentences of NLTK's treebank corpus; each sentence is a list of (word, tag) tuples.
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print(tagged_sentences[0])  # first sentence, as a sample of the format
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 100676


In [94]:
# Features for the classifier, e.g. a 2-letter suffix, which can indicate past-tense verbs ("-ed").

In [95]:
def features(sentence, index):
    """Build the feature dict that describes one word of a sentence.

    sentence: [w1, w2, ...] list of word strings.
    index: the index of the word to describe.

    Returns a dict holding the word itself, position flags, capitalisation
    flags, prefixes and suffixes of up to three characters, the neighbouring
    words ('' at the sentence boundaries) and hyphen / digit indicators.

    An empty-string token is accepted: its prefixes and suffixes are '' and
    'is_capitalized' is False.

    Raises IndexError if index is out of range for sentence.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Guarded so an empty token gives False instead of raising IndexError.
        # Note: also True for tokens starting with a digit or punctuation,
        # because upper() leaves those characters unchanged.
        'is_capitalized': bool(word) and word[0].upper() == word[0],
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        # Slices instead of word[0] / word[-1]: identical for non-empty
        # tokens, '' (not an exception) for an empty one.
        'prefix-1': word[:1],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == last_index else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        # True when any character after the first one is upper-case.
        'capitals_inside': word[1:].lower() != word[1:]
    }
 
# Example: the features extracted for the third word ('a') of a toy sentence.
features(['Ram', 'was', 'a', 'brave' , 'warrior'], 2)

{'word': 'a',
 'is_first': False,
 'is_last': False,
 'is_capitalized': False,
 'is_all_caps': False,
 'is_all_lower': True,
 'prefix-1': 'a',
 'prefix-2': 'a',
 'prefix-3': 'a',
 'suffix-1': 'a',
 'suffix-2': 'a',
 'suffix-3': 'a',
 'prev_word': 'was',
 'next_word': 'brave',
 'has_hyphen': False,
 'is_numeric': False,
 'capitals_inside': False}

In [96]:
# Removing tags from tagged corpus

In [97]:
def Remove_tag(tagged_sentence):
    """Return only the words of a sentence given as (word, tag) pairs."""
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

In [98]:
# Transform the corpus sentences into per-word feature dicts (X) and their tags (y).

In [99]:
# Split the dataset for training and testing:
# the first 80% of the sentences (in corpus order, no shuffling) are used for training, the rest for testing.
splt = int(.80 * len(tagged_sentences))
training_sentences = tagged_sentences[:splt]
test_sentences = tagged_sentences[splt:]

print("Training size:",len(training_sentences))
print("Test size:",len(test_sentences))
 
def transform_to_dataset(tagged_sentences):
    """Turn tagged sentences into parallel lists of feature dicts and tags.

    tagged_sentences: iterable of sentences, each a sequence of (word, tag) pairs.

    Returns (X, y): X[i] is the features() dict of one word and y[i] is that
    word's tag, in corpus order.
    """
    X, y = [], []
    for tagged in tagged_sentences:
        # Strip the tags once per sentence; doing it inside the word loop
        # rebuilt the same list for every word (quadratic in sentence length).
        words = Remove_tag(tagged)
        for index, pair in enumerate(tagged):
            X.append(features(words, index))
            y.append(pair[1])
    return X, y
 
# Feature dicts (X) and tags (y) for every word of the training sentences.
X, y = transform_to_dataset(training_sentences)

Training size: 3131
Test size: 783


In [100]:
# Train the Classifier using Decision Tree

In [101]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
 
# Pipeline: one-hot/numeric encode the feature dicts, then classify them with an
# entropy-based decision tree. random_state is fixed so that the fitted tree,
# and therefore the accuracy printed below, is reproducible across runs.
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy', random_state=0))
])

# Use only the first 10K samples
# (presumably to keep the dense, sparse=False, feature matrix and training time manageable).
TRAIN_SAMPLE_LIMIT = 10000
clf.fit(X[:TRAIN_SAMPLE_LIMIT], y[:TRAIN_SAMPLE_LIMIT])

print('Training completed')

# Evaluate on the held-out sentences, transformed the same way as the training data.
X_test, y_test = transform_to_dataset(test_sentences)

print("Accuracy:", clf.score(X_test, y_test))

Training completed
Accuracy: 0.894206297719447


In [102]:
# Using our own classifier to tag words

In [103]:
def pos_tag(sentence):
    """Tag a tokenized sentence with the trained `clf` pipeline.

    sentence: list of word strings, e.g. the output of word_tokenize().

    Returns a zip iterator of (word, tag) pairs; it can be consumed only once,
    so wrap it in list() to keep the result. An empty sentence yields no pairs.

    NOTE: this definition shadows the `pos_tag` imported from nltk at the top
    of the notebook; after this cell runs, `pos_tag` refers to this tagger.
    """
    # The classifier cannot predict on zero samples, so short-circuit here
    # rather than passing an empty feature list to clf.predict().
    if len(sentence) == 0:
        return zip(sentence, [])
    word_features = [features(sentence, index) for index in range(len(sentence))]
    tags = clf.predict(word_features)
    return zip(sentence, tags)
# Tag a new sentence with the classifier-backed pos_tag defined above;
# list() materializes the zip it returns so the pairs can be printed.
text = word_tokenize("Let us learn NLP Tagging.")
print(list(pos_tag(text)))

[('Let', 'CC'), ('us', 'PRP'), ('learn', 'NN'), ('NLP', 'NNP'), ('Tagging', 'NNP'), ('.', '.')]
