diff --git a/Pos Tagging NLP/Pos Tagger using HMM.ipynb b/Pos Tagging NLP/Pos Tagger using HMM.ipynb new file mode 100644 index 000000000..d48a7bc7b --- /dev/null +++ b/Pos Tagging NLP/Pos Tagger using HMM.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Design of a PoS tagger using an HMM.

" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Import liberaries\n", + "from collections import defaultdict\n", + "import nltk\n", + "import numpy as np\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8waH4sMDWgrD", + "outputId": "7207175d-6e0b-46bc-86ff-c1266482cafe" + }, + "outputs": [], + "source": [ + "# Class for pos tagging\n", + "class PosTagging:\n", + " def __init__(self, train_sent):\n", + " self.transition = defaultdict(int)\n", + " self.emission = defaultdict(int)\n", + " self.tag_set = set()\n", + " self.word_set = set()\n", + "\n", + " self.train(train_sent)\n", + "\n", + " def train(self, train_sent):\n", + " for sent in train_sent:\n", + " prev_tag = None\n", + " for word, tag in sent:\n", + " self.transition[(prev_tag, tag)] += 1\n", + " self.emission[(tag, word)] += 1\n", + " self.tag_set.add(tag)\n", + " self.word_set.add(word)\n", + " prev_tag = tag\n", + "\n", + " def tag(self, sentence):\n", + " tagged_sentence = []\n", + " for word in sentence:\n", + " max_prob = 0\n", + " best_tag = None\n", + " for tag in self.tag_set:\n", + " count_total_tag = sum(v for k, v in self.transition.items() if k[0] == tagged_sentence[-1][1]) if tagged_sentence else 1.0\n", + " transition_prob = self.transition[(tagged_sentence[-1][1], tag)] / count_total_tag if tagged_sentence else 1.0\n", + " emission_prob = self.emission[(tag, word)] / count_total_tag\n", + " prob = transition_prob * emission_prob\n", + " if prob > max_prob:\n", + " max_prob = prob\n", + " best_tag = tag\n", + " tagged_sentence.append((word, best_tag))\n", + " return tagged_sentence\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('I', 'PRP'), ('love', 'VBP'), ('nautue', None)]\n" + ] + } + ], + "source": [ + "#Expamle to understand ho this works \n", + "train_sent = [[('I', 'PRP'), ('love', 'VBP'), ('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN')]]\n", + "test_sents = \"I love nautue\".split()\n", + "\n", + "hmm_tagger = PosTagging(train_sent)\n", + "tags = hmm_tagger.tag(test_sents)\n", + "print(tags)\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Pos Tagging NLP/readme.md b/Pos Tagging NLP/readme.md new file mode 100644 index 000000000..446e11c2d --- /dev/null +++ b/Pos Tagging NLP/readme.md @@ -0,0 +1,58 @@ +# Part-of-Speech Tagging Using Hidden Markov Model (HMM) + +This project provides a Python class `PosTagging` for performing Part-of-Speech (POS) tagging using a Hidden Markov Model (HMM). The class trains on a set of tagged sentences and uses the learned model to tag new sentences. 
+
+## Table of Contents
+- [Introduction](#introduction)
+- [How It Works](#how-it-works)
+- [Implementation Details](#implementation-details)
+- [Advantages and Limitations](#advantages-and-limitations)
+
+## Introduction
+
+POS tagging is a fundamental task in Natural Language Processing (NLP) where each word in a sentence is labeled with its corresponding part of speech (e.g., noun, verb, adjective). This implementation uses an HMM-based approach to assign a likely tag to each word of a sentence.
+
+## How It Works
+
+The `PosTagging` class is designed to:
+
+1. **Train**: It counts tag-to-tag transitions and tag-to-word emissions in a provided training dataset of tagged sentences; transition and emission probabilities are derived from these counts.
+2. **Tag**: It assigns a POS tag to each word of an unseen sentence in a greedy, left-to-right pass, picking for every word the tag that maximizes the product of the learned transition and emission probabilities.
+
+## Implementation Details
+
+### Transition and Emission Probabilities
+
+- **Transition Probability**: The probability of a tag $T_i$ given the previous tag $T_{i-1}$. This is estimated from the tag-bigram counts collected during training:
+
+$$
+P(T_i \mid T_{i-1}) = \frac{\text{Count}(T_{i-1}, T_i)}{\sum_{T_j} \text{Count}(T_{i-1}, T_j)}
+$$
+
+- **Emission Probability**: The probability of a word $W_i$ given a tag $T_i$. This is estimated from how often the word is associated with the tag in the training data:
+
+$$
+P(W_i \mid T_i) = \frac{\text{Count}(T_i, W_i)}{\sum_{W_j} \text{Count}(T_i, W_j)}
+$$
+
+### Tagging Algorithm
+
+For each word in the input sentence, the class computes the product of the transition and emission probabilities for every possible tag and assigns the tag with the highest product. If no tag has a nonzero product (for example, when the word never appeared in the training data), the word is tagged `None`.
+
+### Key Attributes
+
+- **`transition`**: A dictionary of tag-bigram counts, from which transition probabilities are computed at tagging time.
+- **`emission`**: A dictionary of (tag, word) counts, from which emission probabilities are computed at tagging time.
+- **`tag_set`**: A set containing all the unique tags in the training data.
+- **`word_set`**: A set containing all the unique words in the training data.
+
+## Advantages and Limitations
+
+### Advantages
+- **Simplicity**: The HMM-based approach is straightforward and interpretable.
+- **Efficiency**: Tagging requires only a single left-to-right pass over the sentence.
+
+### Limitations
+- **Sparsity**: Words (or tags) absent from the training data receive zero probability, so unseen words come back tagged `None` (see the worked example below).
+- **Context**: The model assumes that the tag of a word depends only on the previous tag (a first-order Markov assumption), which limits its ability to capture long-range dependencies.
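+
+To make the estimates above concrete, the following small sketch (assuming only the single toy training sentence used in the notebook) reproduces the counting behind $P(T_i \mid T_{i-1})$ and $P(W_i \mid T_i)$, and shows why an unseen word such as "nature" gets zero probability under every tag:
+
+```python
+from collections import defaultdict
+
+# The same toy training data as in the notebook example
+train_sent = [[('I', 'PRP'), ('love', 'VBP'), ('natural', 'JJ'),
+               ('language', 'NN'), ('processing', 'NN')]]
+
+transition = defaultdict(int)   # Count(T_{i-1}, T_i)
+emission = defaultdict(int)     # Count(T_i, W_i)
+for sent in train_sent:
+    prev_tag = None
+    for word, tag in sent:
+        transition[(prev_tag, tag)] += 1
+        emission[(tag, word)] += 1
+        prev_tag = tag
+
+# P(VBP | PRP) = Count(PRP, VBP) / sum over T of Count(PRP, T) = 1 / 1
+p_trans = transition[('PRP', 'VBP')] / sum(v for k, v in transition.items() if k[0] == 'PRP')
+# P(love | VBP) = Count(VBP, love) / sum over W of Count(VBP, W) = 1 / 1
+p_emit = emission[('VBP', 'love')] / sum(v for k, v in emission.items() if k[0] == 'VBP')
+print(p_trans, p_emit)             # 1.0 1.0
+
+# An unseen word never contributes a count, so its emission count (and probability) is 0 for every tag
+print(emission[('NN', 'nature')])  # 0
+```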