# <font color='lightblue'>Demonstrate Parts of Speech Tagging with any sentence of your choice </font>

In [1]:
import nltk
nltk.download("punkt")
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import string
from nltk.tokenize import word_tokenize

### <font color='yellow'>Performing POS Using NLTK </font>

In [3]:
sentence= """US President Joe Biden said that he raised the importance of respecting human rights and free press during talks with PM Narendra Modi
            on the sidelines of the G20 Summit, reported PTI."""
tokens= word_tokenize(sentence)   # Tokenize the sentence
nltk_pos_tags= nltk.pos_tag(tokens)    # Perform POS tagging

print(f"Length of NLTK pos tags : {len(nltk_pos_tags)} \n")
for token, pos_tag in nltk_pos_tags[:10]:
    print(f"Token: {token} --> POS Tag: {pos_tag}")

# print(nltk_pos_tags)

Length of NLTK pos tags : 34 

Token: US --> POS Tag: NNP
Token: President --> POS Tag: NNP
Token: Joe --> POS Tag: NNP
Token: Biden --> POS Tag: NNP
Token: said --> POS Tag: VBD
Token: that --> POS Tag: IN
Token: he --> POS Tag: PRP
Token: raised --> POS Tag: VBD
Token: the --> POS Tag: DT
Token: importance --> POS Tag: NN


### <font color='yellow'> Performing POS Using TextBlob </font>

In [4]:
!pip install textblob
from textblob import TextBlob



In [5]:
blob = TextBlob(sentence)   # Create a TextBlob object
blob_pos_tags = blob.tags    # Perform POS tagging

print(f"Length of blob pos tags : {len(blob_pos_tags)} \n")
for word, pos_tag in blob_pos_tags[:11]:
    print(f"Word: {word}, POS Tag: {pos_tag}")

Length of blob pos tags : 32 

Word: US, POS Tag: NNP
Word: President, POS Tag: NNP
Word: Joe, POS Tag: NNP
Word: Biden, POS Tag: NNP
Word: said, POS Tag: VBD
Word: that, POS Tag: IN
Word: he, POS Tag: PRP
Word: raised, POS Tag: VBD
Word: the, POS Tag: DT
Word: importance, POS Tag: NN
Word: of, POS Tag: IN


### <font color='yellow'>POS tagging After removing punctuations and stopwords</font>

In [6]:
from nltk.corpus import stopwords as sw

stopwords_eng= sw.words('english')
words= word_tokenize(sentence)
words_nosw_nopunc = [w for w in words if w.isalpha() and w not in stopwords_eng]
print(len(sentence))
print(len(words_nosw_nopunc))
print(words_nosw_nopunc)

words_nosw_nopunc_pos_tags= nltk.pos_tag(words_nosw_nopunc)    # Perform POS tagging on words_nosw_nopunc tokens
words_nosw_nopunc_pos_tags

196
20
['US', 'President', 'Joe', 'Biden', 'said', 'raised', 'importance', 'respecting', 'human', 'rights', 'free', 'press', 'talks', 'PM', 'Narendra', 'Modi', 'sidelines', 'Summit', 'reported', 'PTI']


[('US', 'NNP'),
 ('President', 'NNP'),
 ('Joe', 'NNP'),
 ('Biden', 'NNP'),
 ('said', 'VBD'),
 ('raised', 'VBN'),
 ('importance', 'NN'),
 ('respecting', 'VBG'),
 ('human', 'JJ'),
 ('rights', 'NNS'),
 ('free', 'JJ'),
 ('press', 'NN'),
 ('talks', 'NNS'),
 ('PM', 'NNP'),
 ('Narendra', 'NNP'),
 ('Modi', 'NNP'),
 ('sidelines', 'VBZ'),
 ('Summit', 'NNP'),
 ('reported', 'VBD'),
 ('PTI', 'NNP')]

### <font color='yellow'>POS Using Spacy</font>

In [7]:
!pip install spacy
import spacy



In [8]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)
print(len(doc))

for token in doc[10:]:
  if(token.pos_ not in ["SPACE","PUNCT"]):
    print(token," | ", token.pos_, " | ", spacy.explain(token.pos_), " | ", token.tag_, " | ", spacy.explain(token.tag_))

35
of  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
respecting  |  VERB  |  verb  |  VBG  |  verb, gerund or present participle
human  |  ADJ  |  adjective  |  JJ  |  adjective (English), other noun-modifier (Chinese)
rights  |  NOUN  |  noun  |  NNS  |  noun, plural
and  |  CCONJ  |  coordinating conjunction  |  CC  |  conjunction, coordinating
free  |  ADJ  |  adjective  |  JJ  |  adjective (English), other noun-modifier (Chinese)
press  |  NOUN  |  noun  |  NN  |  noun, singular or mass
during  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
talks  |  NOUN  |  noun  |  NNS  |  noun, plural
with  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
PM  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Narendra  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Modi  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
on  |  ADP  |  adposition  |  IN  |  conjunction, subordinating o

# <font color='lightblue'>Implement Hidden Markov model along with Viterbi algorithm to generate sequence of tags & check the best possible path for it. [use treebank corpus/brown corpus] </font>

In [9]:
import numpy as np
from nltk.corpus import brown
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

Transition Matrix:
The transition matrix represents the probabilities of transitioning from one hidden state to another in a Markov chain. In the context of HMMs for part-of-speech tagging, hidden states typically correspond to different part-of-speech tags (e.g., noun, verb, adjective), and transitions represent the likelihood of moving from one tag to another in a sequence of tags.

In [10]:
# function to compute transition probabilities-takes a sequence of POS tags,
# counts the transitions between tags, and normalizes these counts to compute transition probabilities
def compute_transition_probabilities(tags):
    tag_counts = nltk.FreqDist(tags) #creates a frequency distribution object, tag_counts, which stores the count of each unique tag
    num_tags = len(tags) #to calculates the total number of tags in the input tags sequence.

    transition_probabilities = {} # Initialize an empty dictionary to store the transition probabilities between tags.
    #store all tags by iterating over each tag
    for i in range(num_tags):
        current_tag = tags[i]
        next_tag = tags[i + 1] if i + 1 < num_tags else 'END'

        #To deal with tags that are not in the corpus
        if current_tag not in transition_probabilities:
            transition_probabilities[current_tag] = {}

        #create an entry if the next tag is not already a key in the nested dictionary and initialize its count to 0
        if next_tag not in transition_probabilities[current_tag]:
            transition_probabilities[current_tag][next_tag] = 0

        transition_probabilities[current_tag][next_tag] += 1

    #normalize the transition probabilities
    for current_tag in transition_probabilities:
        total_count = sum(transition_probabilities[current_tag].values())
        for next_tag in transition_probabilities[current_tag]:
            transition_probabilities[current_tag][next_tag] /= total_count #Normalization Calculation

    return transition_probabilities

Emission Matrix:
The emission matrix, represents the probabilities of observing particular emissions (observations or symbols) when in a certain hidden state. In the context of part-of-speech tagging, emissions correspond to words in a sentence, and hidden states correspond to part-of-speech tags.

In [11]:
# function to compute emission probabilities of words given their corresponding part-of-speech tags
# calculates emission probabilities for words associated with POS tags in a given sequence. It counts the occurrences of words and
# normalizes the probabilities to represent the likelihood of observing a specific word given a particular POS tag.
def compute_emission_probabilities(words, tags):
    #Initialize Data Structures-word_counts and emission_probabilities, to store counts and probabilities
    word_counts = {}
    emission_probabilities = {}

    for word, tag in zip(words, tags):
        if tag not in emission_probabilities:
            emission_probabilities[tag] = {}

        #if tag is not a key in the emission_probabilities dictionary creates an empty dictionary for that tag.
        if word not in emission_probabilities[tag]:
            emission_probabilities[tag][word] = 0

        emission_probabilities[tag][word] += 1
        #if word is not a key in the nested dictionary associated with the tag creates an entry for the word and
        #initializes its count to 0
        if word not in word_counts:
            word_counts[word] = 0
        word_counts[word] += 1  # to get hold of no of times this word has been observed with this tag

    #Normalize Emission Probabilities
    for tag in emission_probabilities:
        total_count = sum(emission_probabilities[tag].values())
        for word in emission_probabilities[tag]:
            emission_probabilities[tag][word] /= total_count

    return emission_probabilities, word_counts


The Viterbi algorithm is a dynamic programming algorithm for obtaining the maximum a posteriori probability estimate of the most likely sequence of hidden states—called the Viterbi path—that results in a sequence of observed events, especially in the context of Markov information sources and hidden Markov models (HMM).

The Viterbi Algorithm not only helps us find the π(k) values, that is the cost values for all the sequences using the concept of dynamic programming, but it also helps us to find the most likely tag sequence given a start state and a sequence of observations.

In [12]:
# function to perform Viterbi decoding
def viterbi_decode(observation_sequence, states, start_probabilities, transition_probabilities, emission_probabilities, smoothing_factor=0.01):

    #Step1:-Initialization
    n = len(observation_sequence)
    m = len(states)

    # Initialize the Viterbi matrix and backpointer matrix
    viterbi = np.zeros((n, m)) #To store intermediate probabilities.
    backpointer = np.zeros((n, m), dtype=int) #To store backpointers for the best path

    #Step 2: Initialize the first column of the Viterbi matrix
    for i in range(m):
        tag = states[i]
        word = observation_sequence[0]
        start_prob = start_probabilities.get(tag, smoothing_factor) #Calculate the probability of starting with the state tag
        emission_prob = emission_probabilities.get(tag, {}).get(word, smoothing_factor) #emission probability of observing word given tag
        viterbi[0][i] = start_prob * emission_prob # calculate initial probabilities

    # Fill in the rest of the Viterbi matrix
    for t in range(1, n):   #length of observation_sequence
        for j in range(m):    #length of brown tag states
            max_prob = -1
            max_backpointer = -1
            word = observation_sequence[t]
            for i in range(m):
                tag = states[i]
                transition_prob = transition_probabilities.get(states[i], {}).get(states[j], smoothing_factor)
                prob = viterbi[t - 1][i] * transition_prob
                if prob > max_prob:
                    max_prob = prob
                    max_backpointer = i
            emission_prob = emission_probabilities.get(states[j], {}).get(word, smoothing_factor)
            viterbi[t][j] = max_prob * emission_prob #store and update maximum probability.
            backpointer[t][j] = max_backpointer #Update backpointer matrix with the index of the state that led to the maximum probability.

    # Backtrack to find the best path
    best_path = [0] * n #Initialize the best_path list to store the best sequence of states
    #Assign the last state (tag) in the best_path list
    best_path[n - 1] = np.argmax(viterbi[n - 1]) #np.argmax function returns the index of the maximum value.
    #Iterate fron second last step to first step(t=0)
    for t in range(n - 2, -1, -1):
        best_path[t] = backpointer[t + 1][best_path[t + 1]]

    #creates a list of best path tags (best_path_tags) by mapping these
    #indices to their corresponding part-of-speech tags in the states list
    best_path_tags = [states[i] for i in best_path]

    return best_path_tags

# <font color='yellow'>Starting point of HMM</font>

In [13]:
# Train the HMM on the Brown corpus
#Extracting all word-tag pairs from the 'news' category of the Brown corpus.
brown_tagged_sents = brown.tagged_sents(categories='news')
#extracts the part-of-speech tags (tag) from the word-tag pairs and stores them in the tags list
tags = [tag for word, tag in brown.tagged_words(categories='news')]
print(tags)
#Computing Transition Probabilities
transition_probabilities = compute_transition_probabilities(tags)
#Computing Emission Probabilities and Word Counts(frequency of each word's occurrence in the 'news' category)
emission_probabilities, word_counts = compute_emission_probabilities(brown.words(categories='news'), tags)

['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.', 'AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.', 'AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.', '``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.', 'AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '`

In [14]:
# Define the states (tags) and start probabilities
states = list(set(tags))
print(states,'\n')
start_probabilities = {}

#calculates the start probability for each tag based on its frequency in the 'news' category of the Brown corpus
for tag in tags:
    if tag not in start_probabilities:
        start_probabilities[tag] = tags.count(tag) / len(tags)

# Generate a sample sentence
sentence2 = '''The natural heritage division of Intach has got the job of doing a clean-up, particularly of the peripheral canal running around the garden,
            the 24 interconnected lakes, repairing the pump, the sluice gates and the inflow channel from the Hooghly to ensure the free flow of water from the river
            to the lakes and out.'''

# Tokenizes the sentence into a sequence of words
observation_sequence = nltk.word_tokenize(sentence2)

# Perform Viterbi decoding
best_path_tags = viterbi_decode(observation_sequence, states, start_probabilities, transition_probabilities, emission_probabilities)

# Print the best path (tag sequence) for the sentence
print("Sentence:", sentence2, '\n')
print("Tag Sequence after applying Viterbi algorithm in HMM model:", best_path_tags)

['NNS-HL', 'WPS', 'NP-HL', 'MD+HV', 'FW-NNS', 'PN', 'PPLS', 'BEZ-HL', ')', "'", 'JJS', 'UH', 'NPS', 'DO*', 'JJR', 'PPS+BEZ', 'FW-AT-HL', 'AP$', '*-HL', 'VBZ-HL', 'JJ-NC', '--', 'FW-AT-TL', 'RB$', 'MD-HL', 'NR', 'BEZ', 'RB-HL', 'VBG', 'JJ-HL', 'JJR-HL', 'HVZ', 'JJ', 'RB-TL', "''", 'ABX', 'AT-TL', 'QLP', '(-HL', 'BEN', '.-HL', 'HV', 'PP$$', 'NN$-HL', 'PPSS+HVD', 'IN-HL', 'OD', 'RB+BEZ', 'NNS-TL-HL', 'PPS', 'FW-CD', 'HVD*', 'RBT', 'NN-TL', 'FW-AT', 'RP-HL', 'MD*', ',', 'VBN', 'DTS', 'BEDZ*', '(', 'ABN-HL', ',-HL', 'NN-NC', 'NN-HL', 'JJR-TL', 'PPSS+MD', 'BEDZ-HL', 'BER-HL', 'QL', 'VBZ', 'NP+BEZ', 'CD', 'RBR', 'DO', 'NN$', 'VBD-HL', 'NNS', 'DOZ*', 'AT-HL', 'CC-TL', 'VBD', 'DT$', 'WPS+BEZ', 'BED*', 'WP$', 'NP$-TL', 'FW-IN-TL', '*', 'WQL', 'PPSS+BEM', 'NR$-TL', 'VBN-TL-HL', 'MD*-HL', 'HVN', 'PPSS', 'PPS+BEZ-HL', 'FW-IN+NN-TL', 'TO-HL', 'VBD-TL', 'FW-JJ-TL', 'PPS+HVZ', 'ABN', 'FW-NN', 'DT-HL', 'NPS$-TL', 'PN-HL', 'NNS-TL', 'NN', 'DTI', 'DT', 'BED', 'BEDZ', 'WRB', 'VB-HL', 'HVZ*', 'TO-TL', 'PP$

IN: Preposition or subordinating conjunction

CD: Cardinal number

NNS: Noun, plural

TO: to

BE: Verb 'to be'

VBN: Verb, past participle

AT: Article (e.g., 'the', 'an')

NN: Noun, singular or mass

CC: Coordinating conjunction

VB: Verb, base form

RB: Adverb

CS: Subordinating conjunction

PPS: Personal pronoun, nominative

PP$$: Possessive pronoun

'': Quotation mark

UH-TL: Interjection, topicalized

AP-TL: Adjective, topicalized

NPS-HL: Proper noun, plural, topicalized and with headline

VBN-TL-HL: Verb, past participle, topicalized, and with headline

NP-HL: Noun phrase with headline

,-HL: Comma with headline

`.": Period (full stop) with headline
