# POS Tagging 

In [45]:
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
# English stop words, held in a set for quick membership tests below.
stop_words = set(stopwords.words('english'))

statement = (
    "Hello all, I am Dr. Chetana. "
    "Welcome to the lab session of Natural Language Processing(NLP). "
    "NLP is a very interesting area."
)

# sent_tokenize splits the text into sentences using NLTK's pre-trained
# Punkt sentence tokenizer (nltk.tokenize.punkt).
tokenized = sent_tokenize(statement)

for sentence in tokenized:
    # Break the sentence into word and punctuation tokens.
    wordsList = word_tokenize(sentence)

    # Keep only the tokens that are not in the stop-word set.
    # NOTE(review): the membership test is case-sensitive, so a capitalised
    # token such as 'I' is kept (it appears in the recorded output), and the
    # filter runs before tagging, so the tagger never sees the removed words.
    wordsList = [w for w in wordsList if w not in stop_words]

    # Assign a part-of-speech tag to every remaining token.
    tagged = nltk.pos_tag(wordsList)

    print(tagged)

[('Hello', 'NNP'), (',', ','), ('I', 'PRP'), ('Dr.', 'NNP'), ('Chetana', 'NNP'), ('.', '.')]
[('Welcome', 'JJ'), ('lab', 'NN'), ('session', 'NN'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('.', '.')]
[('NLP', 'NNP'), ('interesting', 'JJ'), ('area', 'NN'), ('.', '.')]


In [None]:
# NLTK includes a diverse set of corpora, which can be read using the nltk.corpus package.
# A corpus (plural: corpora) is a body of text.
# Most corpora consist of a set of files, each containing a document (or other piece of text).
# Each corpus reader provides a variety of methods for reading data from the corpus,
# depending on the format of the corpus.
# NLTK's data package also contains a wide variety of annotated corpora.
# For example, the Brown Corpus is annotated with part-of-speech tags, and the
# Indian Language POS-Tagged Corpus includes samples of Indian text annotated with part-of-speech tags.

In [37]:
# Train a unigram tagger on the Brown corpus ("news" category) and score it.

# `brown` must be imported in this cell: the only other import of it sits in a
# later cell, so on a fresh kernel (Restart & Run All) the name would be
# undefined here and the cell would fail with a NameError.
from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# We train a UnigramTagger by specifying tagged sentence data as a parameter
# when we initialize the tagger.
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)

# Tag one sample sentence to show what the tagger produces.
print(unigram_tagger.tag(brown_sents[2008]))

# NOTE: the tagger is scored on the same sentences it was trained on, so this
# figure is optimistic; it is not a held-out estimate.
print(unigram_tagger.evaluate(brown_tagged_sents))

[('Others', 'NNS'), (',', ','), ('which', 'WDT'), ('are', 'BER'), ('reached', 'VBN'), ('by', 'IN'), ('walking', 'VBG'), ('up', 'RP'), ('a', 'AT'), ('single', 'AP'), ('flight', 'NN'), ('of', 'IN'), ('stairs', 'NNS'), (',', ','), ('have', 'HV'), ('balconies', 'NNS'), ('.', '.')]
0.9349006503968017


In [44]:
from nltk import UnigramTagger
from nltk.corpus import brown
# Load the Brown "news" sentences with the universal tagset for readability
tagged_sentences = brown.tagged_sents(categories="news", tagset="universal")

# Hold out the first 20% of the sentences for testing; the remaining 80%
# are used for training
i = int(len(tagged_sentences) * 0.2)
test_sentences, train_sentences = tagged_sentences[:i], tagged_sentences[i:]

# Fit the tagger on the training portion only
unigram_tagger = UnigramTagger(train_sentences)

# Score on the held-out portion
# (accuracy is the default evaluation metric for nltk taggers)
accuracy = unigram_tagger.evaluate(test_sentences)
print("Accuracy:", accuracy)

Accuracy: 0.8630364649525858


In [42]:
# Other measures to evaluate the quality of POS tagging:
# per-tag precision, recall and F1 on the held-out sentences.
from sklearn import metrics

# Strip the gold tags so the tagger only sees the raw tokens, then re-tag them.
untagged_test_sentences = [[token for token, _ in sentence] for sentence in test_sentences]
tagged_test_sentences = unigram_tagger.tag_sents(untagged_test_sentences)

# Flatten both taggings into parallel lists of tag strings, one entry per token.
# str() turns every tag into a string before it reaches sklearn; the 'None'
# row with zero support in the recorded report shows that some predicted tags
# were None.
gold = []
pred = []
for gold_sentence, predicted_sentence in zip(test_sentences, tagged_test_sentences):
    gold.extend(str(tag) for _, tag in gold_sentence)
    pred.extend(str(tag) for _, tag in predicted_sentence)

print(metrics.classification_report(gold, pred))

              precision    recall  f1-score   support

           .       1.00      1.00      1.00      2107
         ADJ       0.89      0.79      0.84      1341
         ADP       0.97      0.92      0.94      2621
         ADV       0.93      0.79      0.86       573
        CONJ       1.00      1.00      1.00       453
         DET       1.00      0.99      1.00      2456
        NOUN       0.96      0.76      0.85      6265
         NUM       0.99      0.85      0.92       379
        None       0.00      0.00      0.00         0
        PRON       1.00      0.96      0.98       502
         PRT       0.69      0.96      0.80       481
        VERB       0.96      0.83      0.89      3274
           X       0.10      0.17      0.12         6

   micro avg       0.86      0.86      0.86     20458
   macro avg       0.81      0.77      0.78     20458
weighted avg       0.96      0.86      0.91     20458



  'recall', 'true', average, warn_for)
