Step 1: Setting Up
First, ensure you have NLTK installed and download the necessary resources.

In [None]:
!pip install nltk
import nltk
nltk.download('punkt')  # For tokenization
nltk.download('averaged_perceptron_tagger')  # For POS tagging pos_tag
nltk.download('maxent_ne_chunker')  # For chunking
nltk.download('words')  # For chunking
nltk.download('wordnet') # for lemmatization
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Step 2: Initializing Sample Data
Let's start with an array of sentences.

In [None]:
# Sample sentences: the input that the preprocessing step below normalizes and tokenizes.
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial Intelligence is transforming the world.",
    "Natural Language Processing is a part of AI."
]


In [None]:
# Demo of str.maketrans() + str.translate(): replace a->x, b->y, c->z.
x, y = "abc", "xyz"  # x: characters to be replaced, y: their replacements (same length)

# maketrans pairs x[i] with y[i] and returns a lookup table keyed by code point
translation_table = str.maketrans(x, y)

# translate() rewrites every character of the string that appears in the table
text = "I have a cat, a bat, and a cap."
translated_text = text.translate(translation_table)

print(translated_text)


I hxve x zxt, x yxt, xnd x zxp.


str.maketrans()
Purpose: maketrans() is a method used to create a translation table that maps characters from one set to another. It can also map characters to None, which means removing those characters.
Syntax: str.maketrans(x, y, z)
x: A string of characters to be replaced.
y: A string of characters to replace the ones in x (must have the same length as x).
z: A string of characters to be deleted (i.e., mapped to None).
In your code, str.maketrans('', '', string.punctuation):

It doesn’t replace any characters because the first two arguments are empty strings.
The third argument, string.punctuation, tells Python to map all punctuation characters to None, which effectively means "remove these characters."

translate()
Purpose: translate() is a string method that takes a translation table (created by maketrans()) and applies it to a string.
Syntax: string.translate(translation_table)
translation_table: The table created by maketrans() that tells Python which characters to replace or delete.

Step 3: Tokenization, Normalization, and Sentence Segmentation
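We'll normalize each sentence (lowercase it and remove punctuation) and then tokenize it into words. The sample data is already split into sentences, so `sent_tokenize` is imported but no explicit sentence-segmentation call is needed here.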

In [None]:
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Function to normalize text
def normalize_text(text):
    """Return ``text`` in lowercase with every ``string.punctuation`` character removed."""
    # str.maketrans('', '', chars): the first two (empty) arguments mean "replace nothing";
    # the third argument lists the characters to delete.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_remover)

# Tokenization, normalization, and sentence segmentation
def preprocess_sentences(sentences):
    """Normalize every sentence with ``normalize_text`` and split it into word tokens.

    Returns a list holding one token list per input sentence, in input order.
    """
    return [word_tokenize(normalize_text(sentence)) for sentence in sentences]

# Preprocess the sample sentences; `tokens_list` (one token list per sentence)
# is the input of the tagging, lemmatization, stemming and stop-word cells below.
tokens_list = preprocess_sentences(sentences)
for tokens in tokens_list:
    print("Tokens:", tokens)


Tokens: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Tokens: ['artificial', 'intelligence', 'is', 'transforming', 'the', 'world']
Tokens: ['natural', 'language', 'processing', 'is', 'a', 'part', 'of', 'ai']


Step 4: POS Tagging
We will perform POS tagging on the tokenized text.

In [None]:
# Function to perform POS tagging on tokenized text
def pos_tagging(tokens_list):
    """Run ``nltk.pos_tag`` on each token list.

    Returns one list of ``(token, tag)`` pairs per input token list, in input order.
    """
    return [nltk.pos_tag(tokens) for tokens in tokens_list]

# Perform POS tagging on the preprocessed tokens; `pos_tags_list` feeds the chunking step.
pos_tags_list = pos_tagging(tokens_list)
for pos_tags in pos_tags_list:
    print("POS Tags:", pos_tags)


POS Tags: [('the', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]
POS Tags: [('artificial', 'JJ'), ('intelligence', 'NN'), ('is', 'VBZ'), ('transforming', 'VBG'), ('the', 'DT'), ('world', 'NN')]
POS Tags: [('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('part', 'NN'), ('of', 'IN'), ('ai', 'NN')]


Step 5: Chunking
Chunking, or shallow parsing, groups words into chunks (such as noun phrases) based on their POS tags.



In [None]:
# Function to perform chunking
def chunking(pos_tags_list):
    """Chunk each POS-tagged sentence into noun phrases (NP).

    Args:
        pos_tags_list: list of sentences, each a list of ``(token, tag)`` pairs.

    Returns:
        A list with the parse result of ``nltk.RegexpParser.parse`` for every sentence.
    """
    # Grammar for a simple noun phrase:
    #   <DT>?  optional determiner (zero or one), e.g. "the", "a"
    #   <JJ>*  zero or more adjectives, e.g. "quick", "lazy"
    #   <NN>   exactly one singular noun, e.g. "fox", "dog"
    np_parser = nltk.RegexpParser("NP: {<DT>?<JJ>*<NN>}")
    return [np_parser.parse(tagged_sentence) for tagged_sentence in pos_tags_list]

# Perform chunking on the tagged sentences
chunks_list = chunking(pos_tags_list)
# Raw repr of all parse results, followed by an ASCII-art rendering of each one
print(chunks_list)
for tree in chunks_list:
    print("Chunks:")
    tree.pretty_print()


[Tree('S', [Tree('NP', [('the', 'DT'), ('quick', 'JJ'), ('brown', 'NN')]), Tree('NP', [('fox', 'NN')]), ('jumps', 'VBZ'), ('over', 'IN'), Tree('NP', [('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')])]), Tree('S', [Tree('NP', [('artificial', 'JJ'), ('intelligence', 'NN')]), ('is', 'VBZ'), ('transforming', 'VBG'), Tree('NP', [('the', 'DT'), ('world', 'NN')])]), Tree('S', [Tree('NP', [('natural', 'JJ'), ('language', 'NN')]), Tree('NP', [('processing', 'NN')]), ('is', 'VBZ'), Tree('NP', [('a', 'DT'), ('part', 'NN')]), ('of', 'IN'), Tree('NP', [('ai', 'NN')])])]
Chunks:
                                     S                                 
     ________________________________|______________________            
    |        |              NP               NP             NP         
    |        |       _______|________        |       _______|______     
jumps/VBZ over/IN the/DT quick/JJ brown/NN fox/NN the/DT lazy/JJ dog/NN

Chunks:
                              S                             

In [None]:
# Function to perform lemmatization on tokens
def lemmatization(tokens_list):
    """Lemmatize every token of every token list with ``WordNetLemmatizer``.

    ``lemmatize`` is called without a ``pos`` argument. Returns one list of lemmas
    per input token list, in input order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()  # needs nltk.download('wordnet')
    return [
        [wordnet_lemmatizer.lemmatize(token) for token in tokens]
        for tokens in tokens_list
    ]

# Perform lemmatization on the preprocessed tokens and show the result per sentence
lemmatized_tokens_list = lemmatization(tokens_list)
for lemmatized_tokens in lemmatized_tokens_list:
    print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog']
Lemmatized Tokens: ['artificial', 'intelligence', 'is', 'transforming', 'the', 'world']
Lemmatized Tokens: ['natural', 'language', 'processing', 'is', 'a', 'part', 'of', 'ai']


In [None]:
# Function to perform lemmatization on tokens
def lemmatization(tokens_list):
    """Return, for each token list, the list of its tokens' WordNet lemmas.

    NOTE(review): this repeats the ``lemmatization`` function defined in the previous
    cell; running this cell simply re-binds the same name.
    """
    lemmatize = WordNetLemmatizer().lemmatize  # needs nltk.download('wordnet')
    lemmatized_tokens_list = []
    for tokens in tokens_list:
        # lemmatize() is called with its default part of speech for every token
        lemmatized_tokens_list.append(list(map(lemmatize, tokens)))
    return lemmatized_tokens_list

# Perform lemmatization (same call and output as in the previous cell)
lemmatized_tokens_list = lemmatization(tokens_list)
for lemmatized_tokens in lemmatized_tokens_list:
    print("Lemmatized Tokens:", lemmatized_tokens)

# Function to perform stemming on tokens
def stemming(tokens_list):
    """Stem every token of every token list with ``PorterStemmer``.

    Returns one list of stems per input token list, in input order.
    """
    porter = PorterStemmer()
    return [[porter.stem(token) for token in tokens] for tokens in tokens_list]

# Perform stemming on the preprocessed tokens and show the stems per sentence
stemmed_tokens_list = stemming(tokens_list)
for stemmed_tokens in stemmed_tokens_list:
    print("Stemmed Tokens:", stemmed_tokens)

# Function to remove stop words from tokens
def remove_stop_words(tokens_list):
    """Drop English stop words from every token list.

    The comparison is done on the lowercased token; the surviving tokens are returned
    unchanged, one filtered list per input token list.
    """
    # Build the set once so each membership test is a cheap set lookup
    english_stop_words = set(stopwords.words('english'))

    def keep(token):
        return token.lower() not in english_stop_words

    return [list(filter(keep, tokens)) for tokens in tokens_list]

# Remove stop words from the preprocessed tokens and show what is left per sentence
filtered_tokens_list = remove_stop_words(tokens_list)
for filtered_tokens in filtered_tokens_list:
    print("Filtered Tokens (After Stop Word Removal):", filtered_tokens)

    # Note on the lemmatization above: when lemmatize() is called without a POS argument,
    # an ambiguous word is lemmatized as a noun.

Lemmatized Tokens: ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog']
Lemmatized Tokens: ['artificial', 'intelligence', 'is', 'transforming', 'the', 'world']
Lemmatized Tokens: ['natural', 'language', 'processing', 'is', 'a', 'part', 'of', 'ai']
Stemmed Tokens: ['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog']
Stemmed Tokens: ['artifici', 'intellig', 'is', 'transform', 'the', 'world']
Stemmed Tokens: ['natur', 'languag', 'process', 'is', 'a', 'part', 'of', 'ai']
Filtered Tokens (After Stop Word Removal): ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
Filtered Tokens (After Stop Word Removal): ['artificial', 'intelligence', 'transforming', 'world']
Filtered Tokens (After Stop Word Removal): ['natural', 'language', 'processing', 'part', 'ai']


Stemming with Porter and Snowball Stemmers
Stemming algorithms aim to remove affixes from words to get to their base forms. NLTK provides implementations of two popular stemming algorithms: Porter and Snowball (also known as Porter2).

1. Porter Stemmer
The Porter stemmer is one of the most widely used stemming algorithms. It follows a set of heuristic rules to remove suffixes from words.

In [None]:
from nltk.stem import PorterStemmer

# Porter stemmer demo: stem a handful of example words and print them next to the originals
porter_stemmer = PorterStemmer()

words = ["running", "ran", "cats", "trouble", "troubling", "friendship"]

# Apply the stemmer to each word in turn
stemmed_words_porter = list(map(porter_stemmer.stem, words))

print("Original words:", words)
print("Stemmed words (Porter):", stemmed_words_porter)


Original words: ['running', 'ran', 'cats', 'trouble', 'troubling', 'friendship']
Stemmed words (Porter): ['run', 'ran', 'cat', 'troubl', 'troubl', 'friendship']


2. Snowball Stemmer (Porter2 Stemmer)
The Snowball stemmer (or Porter2 stemmer) is an improved version of the Porter stemmer and supports stemming in multiple languages.

In [None]:
from nltk.stem import SnowballStemmer

# Snowball stemmer demo (English rules) on the same example words as the Porter demo
snowball_stemmer = SnowballStemmer("english")

words = ["running", "ran", "cats", "trouble", "troubling", "friendship"]

# Apply the stemmer to each word in turn
stemmed_words_snowball = list(map(snowball_stemmer.stem, words))

print("Original words:", words)
print("Stemmed words (Snowball):", stemmed_words_snowball)


Original words: ['running', 'ran', 'cats', 'trouble', 'troubling', 'friendship']
Stemmed words (Snowball): ['run', 'ran', 'cat', 'troubl', 'troubl', 'friendship']


The main difference between lemmatization with and without POS tagging lies in how accurately the lemma (base form) of a word is determined based on its part of speech (POS).

Lemmatization Without POS Tagging
When lemmatizing without POS tagging:

The lemmatizer assumes that every word is a noun by default.
For example, without POS tagging:
"running" → "running"
"cats" → "cat"
"better" → "better"
Lemmatization With POS Tagging
When lemmatizing with POS tagging:

Each word is first assigned a specific POS tag (part of speech) using a POS tagger.
Based on the POS tag, the lemmatizer can accurately determine the lemma of the word.
For example, with POS tagging:
"running" (verb) → "run"
"cats" (noun) → "cat"
"better" (adjective) → "good"
Importance of POS Tagging in Lemmatization
POS tagging is important in lemmatization because:

Words can have multiple meanings and can function as different parts of speech (e.g., "run" can be a noun or a verb).
The lemma of a word differs depending on its POS. For instance, the lemma of "better" is "good" when it is used as an adjective, but remains "better" when used as an adverb or verb.
By tagging words with their POS before lemmatization, the lemmatizer can select the appropriate lemma form from a dictionary (such as WordNet) based on the word's part of speech, resulting in more accurate and meaningful base forms.
Example
Consider the word "better":

Without POS tagging, lemmatization would default to "better" → "better" (assuming it's a noun).
With POS tagging:
If tagged as an adjective, "better" → "good".
If tagged as an adverb or verb, "better" → "better".
Here's a Python example demonstrating the difference:

Passing the POS tag to the lemmatizer gives better results than lemmatizing without it:
`lemmatize(word, pos)` → correct root form

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Shared objects for this cell: the lemmatizer, the example sentence, its tokens and POS tags
lemmatizer = WordNetLemmatizer()
sentence = "He is running faster than before and likes cats."
tokens = word_tokenize(sentence)
pos_tags = pos_tag(tokens)

print("Tokens:", tokens)
print("POS Tags:", pos_tags, end="\n\n")  # trailing blank line separates the sections

# Baseline: lemmatize every token without telling the lemmatizer its part of speech
lemmatized_words_without_pos = list(map(lemmatizer.lemmatize, tokens))
print("Lemmatized words without POS tagging:", lemmatized_words_without_pos, end="\n\n")

# Function to convert NLTK POS tags to WordNet POS tags
def nltk_to_wordnet_pos(nltk_tag):
    """Map a POS tag produced by ``nltk.pos_tag`` to the WordNet POS code for lemmatization.

    Args:
        nltk_tag: tag string such as ``'JJ'``, ``'VBG'``, ``'NNS'`` or ``'RB'``.

    Returns:
        ``'a'`` for adjectives, ``'v'`` for verbs, ``'n'`` for nouns, ``'r'`` for adverbs,
        or ``None`` for any other tag so the caller can fall back to the lemmatizer's
        default part of speech.
    """
    if nltk_tag.startswith('J'):
        return 'a'  # Adjective
    elif nltk_tag.startswith('V'):
        # Every verb tag (VB, VBD, VBG, VBN, VBP, VBZ), not only VBZ
        return 'v'  # Verb
    elif nltk_tag.startswith('N'):
        return 'n'  # Noun
    elif nltk_tag.startswith('R'):
        return 'r'  # Adverb (WordNet's adverb code is 'r', not 'n')
    else:
        return None  # No WordNet equivalent; caller uses the default

# Lemmatization with POS tagging
lemmatized_words_with_pos = []
for word, tag in pos_tags:
    # Convert the tag from nltk.pos_tag into the WordNet POS code the lemmatizer expects
    wordnet_pos = nltk_to_wordnet_pos(tag)
    if wordnet_pos:
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
    else:
        # No WordNet equivalent for this tag: call lemmatize() without a pos argument
        lemma = lemmatizer.lemmatize(word)  # default to noun
    lemmatized_words_with_pos.append(lemma)

print("Lemmatized words with POS tagging:", lemmatized_words_with_pos)
# Example: "is" is tagged as a verb, so it is lemmatized with pos='v' and becomes "be";
# without the POS argument it is treated as a noun and stays "is".

Tokens: ['He', 'is', 'running', 'faster', 'than', 'before', 'and', 'likes', 'cats', '.']
POS Tags: [('He', 'PRP'), ('is', 'VBZ'), ('running', 'VBG'), ('faster', 'RBR'), ('than', 'IN'), ('before', 'RB'), ('and', 'CC'), ('likes', 'JJ'), ('cats', 'NNS'), ('.', '.')]

Lemmatized words without POS tagging: ['He', 'is', 'running', 'faster', 'than', 'before', 'and', 'like', 'cat', '.']

Lemmatized words with POS tagging: ['He', 'be', 'run', 'faster', 'than', 'before', 'and', 'likes', 'cat', '.']


Using Pretrained Taggers


In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Sentences used to demonstrate NLTK's pretrained POS tagger
sentences = [
    "He walked quickly to the store.",
    "She sings beautifully every morning.",
    "The old man greeted us warmly.",
    "Their ideas seemed quite innovative.",
    "I saw a man with a telescope."
]

# Tokenize each sentence and tag it with the pretrained tagger behind nltk.pos_tag
tagged_sentences = []
for sentence in sentences:
    tokens = word_tokenize(sentence)
    tagged_sentences.append(nltk.pos_tag(tokens))

# One numbered block per sentence: header line, the (token, tag) list, then a blank line
for i, tags in enumerate(tagged_sentences, start=1):
    print(f"Sentence {i}:\n{tags}\n")


Sentence 1:
[('He', 'PRP'), ('walked', 'VBD'), ('quickly', 'RB'), ('to', 'TO'), ('the', 'DT'), ('store', 'NN'), ('.', '.')]

Sentence 2:
[('She', 'PRP'), ('sings', 'VBZ'), ('beautifully', 'RB'), ('every', 'DT'), ('morning', 'NN'), ('.', '.')]

Sentence 3:
[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('greeted', 'VBD'), ('us', 'PRP'), ('warmly', 'RB'), ('.', '.')]

Sentence 4:
[('Their', 'PRP$'), ('ideas', 'NNS'), ('seemed', 'VBD'), ('quite', 'RB'), ('innovative', 'JJ'), ('.', '.')]

Sentence 5:
[('I', 'PRP'), ('saw', 'VBD'), ('a', 'DT'), ('man', 'NN'), ('with', 'IN'), ('a', 'DT'), ('telescope', 'NN'), ('.', '.')]



Additional Rules for Custom Tagging
To demonstrate how additional rules can improve tagging accuracy, consider the following rules:



Let's break down each regular expression pattern and its corresponding POS tag:

1. **`(r'\w+ly$', 'RB')`**:
   - **Pattern**: `\w+ly$`
     - `\w+` matches one or more word characters (letters, digits, or underscores).
     - `ly` matches the characters "ly" literally.
     - `$` asserts position at the end of the string.
   - **Explanation**: This pattern matches adverbs ending with "ly", such as "quickly", "beautifully", "normally".
   - **POS Tag**: `'RB'`
     - **Explanation**: `'RB'` is the POS tag for adverbs in English. Adverbs modify verbs, adjectives, or other adverbs by providing information about manner, place, time, degree, etc.

2. **`(r'[A-Z].*', 'NNP')`**:
   - **Pattern**: `[A-Z].*`
     - `[A-Z]` matches any uppercase letter.
     - `.*` matches any character (except for line terminators) zero or more times.
   - **Explanation**: This pattern matches proper nouns that start with an uppercase letter, such as "John", "London", "Microsoft".
   - **POS Tag**: `'NNP'`
     - **Explanation**: `'NNP'` is the POS tag for proper nouns in English. Proper nouns refer to specific names of people, places, organizations, etc.

3. **`(r'\w+ing$', 'VBG')`**:
   - **Pattern**: `\w+ing$`
     - `\w+` matches one or more word characters.
     - `ing` matches the characters "ing" literally.
     - `$` asserts position at the end of the string.
   - **Explanation**: This pattern matches gerunds (verb forms ending in "ing"), such as "walking", "talking", "swimming".
   - **POS Tag**: `'VBG'`
     - **Explanation**: `'VBG'` is the POS tag for gerunds (present participles) in English. Gerunds are verb forms that function as nouns in sentences.

4. **`(r'\w+\'s$', 'POS')`**:
   - **Pattern**: `\w+\'s$`
     - `\w+` matches one or more word characters.
     - `\'s` matches the possessive form ending with "'s", such as "dog's", "John's".
     - `$` asserts position at the end of the string.
   - **Explanation**: This pattern matches possessive nouns ending in "'s".
   - **POS Tag**: `'POS'`
     - **Explanation**: `'POS'` is the POS tag for possessive endings in English. It denotes possession or association with the noun that precedes it.

5. **`(r'and|but|or', 'CC')`**:
   - **Pattern**: `and|but|or`
     - Matches specific coordinating conjunctions: "and", "but", "or".
   - **Explanation**: This pattern matches coordinating conjunctions that join words, phrases, or clauses of equal grammatical rank.
   - **POS Tag**: `'CC'`
     - **Explanation**: `'CC'` is the POS tag for coordinating conjunctions in English. Coordinating conjunctions connect words, phrases, or clauses together.

### Summary:
These regular expression patterns and their corresponding POS tags are used to enhance POS tagging in natural language processing tasks. They help identify specific linguistic patterns that may not be covered comprehensively by standard pretrained taggers, thereby improving accuracy in identifying adverbs, proper nouns, gerunds, possessive nouns, and coordinating conjunctions in text data.

In [None]:
from nltk.tag import RegexpTagger

# Define additional rules for custom tagging using RegexpTagger.
# Rules are tried in order and the first matching pattern wins. RegexpTagger applies each
# pattern with re.match, which anchors only at the START of the token, so a pattern must
# carry its own '$' if it is meant to cover the whole token.
additional_rules = [
    (r'\w+ly$', 'RB'),            # Adverbs ending in 'ly'
    (r'[A-Z].*', 'NNP'),          # Proper nouns starting with uppercase
    (r'\w+ing$', 'VBG'),          # Gerunds ending in 'ing'
    (r'\w+\'s$', 'POS'),          # Possessive nouns ending in "'s"
    (r'(?:and|but|or)$', 'CC')    # Exactly "and", "but" or "or" (not "order", "butter", ...)
]

# Create a RegexpTagger with additional rules
regexp_tagger = RegexpTagger(additional_rules)

# Apply the custom tagger to each sentence.
# The sentence is tokenized as written: lowercasing it first (as normalize_text does)
# would make the capital-letter rule for proper nouns impossible to match, and the
# result would depend on which normalize_text definition is currently bound.
custom_tagged_sentences = []
for sentence in sentences:
    tokens = word_tokenize(sentence)  # Tokenize the original, case-preserving sentence
    custom_tags = regexp_tagger.tag(tokens)  # Tokens matched by no rule get the tag None
    custom_tagged_sentences.append(custom_tags)

# Show the rule-based tags: a title, then one numbered block per sentence
print("Custom POS Tagging with Additional Rules:")
for i, tags in enumerate(custom_tagged_sentences, start=1):
    # header line, the (token, tag) list, and a blank separator line
    print(f"Sentence {i}:\n{tags}\n")

Custom POS Tagging with Additional Rules:
Sentence 1:
[('he', None), ('walked', None), ('quickly', 'RB'), ('to', None), ('the', None), ('store', None)]

Sentence 2:
[('she', None), ('sings', None), ('beautifully', 'RB'), ('every', None), ('morning', 'VBG')]

Sentence 3:
[('the', None), ('old', None), ('man', None), ('greeted', None), ('us', None), ('warmly', 'RB')]

Sentence 4:
[('their', None), ('ideas', None), ('seemed', None), ('quite', None), ('innovative', None)]

Sentence 5:
[('i', None), ('saw', None), ('a', None), ('man', None), ('with', None), ('a', None), ('telescope', None)]



Using Different Corpora with NLTK
NLTK provides access to various corpora for different languages and purposes. Here’s how you can access and work with some common corpora:

Brown Corpus:

Description: The Brown Corpus is a general corpus of English text, created in 1961 at Brown University.
Access: You can access it using NLTK's corpus module:

In [None]:
from nltk.corpus import brown


Gutenberg Corpus:

Description: The Gutenberg Corpus includes a selection of literary works from Project Gutenberg.
Access: Use NLTK to access it:

In [None]:
from nltk.corpus import gutenberg

Inaugural Corpus:

Description: The Inaugural Corpus includes U.S. presidential inaugural addresses.
Access: Available through NLTK

In [None]:
from nltk.corpus import inaugural


Web Text Corpus:

Description: The Web Text Corpus contains text from a Firefox discussion forum.
Access: Accessible through NLTK

In [None]:
from nltk.corpus import webtext


r'[^\w\s]': This is the regular expression pattern being used. Let's break it down:

[^\w\s]:
[]: Denotes a character class (i.e., a set of characters to match).
^: Inside a character class, the caret (^) negates the class, meaning "match any character not in this set."
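\w: Matches any word character (equivalent to [a-zA-Z0-9_] for ASCII text; for Python 3 string patterns it also matches Unicode letters and digits unless the re.ASCII flag is used).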
\s: Matches any whitespace character (spaces, tabs, newlines).

In [None]:
import nltk
from nltk.corpus import brown
nltk.download('brown')

# Access the Brown Corpus
words = brown.words()  # Get all words from the corpus
text = ' '.join(words)  # Convert list of words into a single string (if needed)

# Tokenization
# NOTE(review): `words` already holds the corpus as individual tokens, so joining them
# and tokenizing the whole corpus again looks redundant and slow; consider using `words`
# (or a slice of it) directly. TODO confirm the re-tokenization is intended.
tokens = nltk.word_tokenize(text)

# Cleaning (Normalization)
def normalize_text(text):
    """Lowercase ``text`` and delete every character that is neither a word character nor whitespace.

    NOTE(review): this re-binds the ``normalize_text`` defined earlier in the notebook,
    which removed ``string.punctuation`` instead; unlike that version, underscores are
    kept here because ``\\w`` matches them.
    """
    import re  # imported locally, as in the original cell

    return re.sub(r'[^\w\s]', '', text.lower())

# Keep only purely alphabetic tokens and normalize them. Because of the isalpha() filter
# the kept tokens contain no punctuation, so normalize_text effectively just lowercases them.
cleaned_tokens = [normalize_text(token) for token in tokens if token.isalpha()]

# Example: Print first 10 cleaned tokens
print("Cleaned Tokens:")
print(cleaned_tokens[:10])


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Cleaned Tokens:
['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of']


The Treebank corpus is a widely used annotated corpus in computational linguistics and natural language processing (NLP). It is a collection of parsed and tagged sentences from the Wall Street Journal (WSJ) portion of the Penn Treebank, which is a large annotated corpus of English texts.

Characteristics of the Treebank Corpus:
Annotation: The Treebank corpus provides syntactic (parse trees) and morphological (POS tags) annotations for each sentence. Each word in the corpus is annotated with its part-of-speech tag, and sentences are parsed to show their syntactic structure.

Size: The Treebank corpus is substantial, containing thousands of sentences from the WSJ. It is large enough to support the training and evaluation of various NLP models, particularly those related to POS tagging and syntactic parsing.

Standardization: The annotations in the Treebank corpus adhere to standardized conventions and guidelines, making it a reliable resource for linguistic research and development of NLP algorithms.

Applications: It is used extensively for training and evaluating POS taggers, syntactic parsers, and other NLP tools that require labeled data. Researchers and developers often use subsets of the Treebank corpus for specific experiments or tasks.

Usage in Natural Language Processing:
Training POS Taggers: Many POS taggers, such as UnigramTagger, BigramTagger, and ClassifierBasedPOSTagger in NLTK, are trained on subsets of the Treebank corpus to learn the statistical patterns of word-tag associations.

Training Syntactic Parsers: The parsed sentences in the Treebank corpus are used to train and evaluate syntactic parsers that analyze the grammatical structure of sentences.

Research and Development: Researchers and developers use the Treebank corpus as a benchmark dataset for testing the performance of new algorithms and techniques in NLP.

In [None]:
import nltk
from nltk.corpus import treebank
from nltk.tag.sequential import ClassifierBasedPOSTagger
from nltk.classify import NaiveBayesClassifier

# Download the Treebank corpus data (it is read through `treebank` in the cells below)
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [None]:
# Tagged Treebank sentences: each sentence is a list of (word, tag) pairs
sentences = treebank.tagged_sents()
print(sentences)

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]


Let's go deeper into what happens during training for the **Naive Bayes classifier** in POS tagging.

### 1. **Feature Extraction and Counting**

During training, the classifier gathers data about the **features** and their associated **POS tags** from the training set. This involves counting how often each feature appears with each POS tag.

For example, let's say the model is training on the sentence `"She is running quickly."` with the following POS tags:

- `"She"` → `PRP` (Personal pronoun)
- `"is"` → `VBZ` (Verb, 3rd person singular present)
- `"running"` → `VBG` (Verb, gerund/present participle)
- `"quickly"` → `RB` (Adverb)

The features for each word might be extracted as follows (using your `pos_features` function):

```python
word: "running"
features = {
    'suffix(1)': 'g',           # Last character
    'suffix(2)': 'ng',          # Last two characters
    'suffix(3)': 'ing',         # Last three characters
    'prev_word': 'is',          # Previous word
    'next_word': 'quickly'      # Next word
}
```

For each POS tag, the model counts:
- How often **each feature** (e.g., suffixes like "ing" or previous/next word) appears **with a particular tag**.

### 2. **Building Probability Distributions**
Naive Bayes needs to estimate probabilities for each feature given a tag. To do this, it computes:

#### - **P(feature | tag)**:
For each POS tag, the classifier calculates the probability of seeing specific features. For example:
- How often does the suffix "ing" appear with the tag `VBG` (present participle)?
- How often is the word "is" followed by a word tagged as `VBG`?

These probabilities are estimated using relative frequencies:
\[
P(\text{suffix}(3) = 'ing' | \text{tag} = \text{VBG}) = \frac{\text{Count of 'ing' with VBG}}{\text{Total occurrences of VBG}}
\]
\[
P(\text{prev_word} = 'is' | \text{tag} = \text{VBG}) = \frac{\text{Count of 'is' preceding VBG}}{\text{Total occurrences of VBG}}
\]

For example, if:
- The tag `VBG` occurs 200 times in the training data, and 150 of those tagged words end with the suffix `"ing"`, then:
  \[
  P(\text{suffix}(3) = 'ing' | \text{tag} = \text{VBG}) = \frac{150}{200} = 0.75
  \]
- If the word **"is"** precedes a word tagged as `VBG` 50 times, and there are 200 instances of `VBG`, then:
  \[
  P(\text{prev_word} = 'is' | \text{tag} = \text{VBG}) = \frac{50}{200} = 0.25
  \]

#### - **P(tag)** (Prior Probability):
The classifier also estimates the prior probability of each tag. For example, if `VBG` appears in 10% of all the tags in the training set:
\[
P(\text{tag} = \text{VBG}) = 0.10
\]
This prior helps adjust for the relative frequency of each tag (some tags are more common than others).

### 3. **Combining the Probabilities (During Prediction)**
After the classifier has been trained, it uses the learned probabilities to predict the tag for a word in a new sentence.

Given the features for the word **"running"**, the model calculates the probability of each possible tag by multiplying the probabilities of the features given that tag (based on the counts from training) and the prior probability of the tag:

For `VBG`:
\[
P(\text{VBG} | \text{features}) \propto P(\text{features} | \text{VBG}) \times P(\text{VBG})
\]
This includes:
\[
P(\text{suffix}(3) = 'ing' | \text{VBG}) \times P(\text{prev_word} = 'is' | \text{VBG}) \times P(\text{VBG})
\]

For each other possible tag (e.g., `NN`, `VBZ`), the classifier calculates a similar probability:
\[
P(\text{NN} | \text{features}), P(\text{VBZ} | \text{features}), \dots
\]
Then, it picks the tag with the highest probability.

### 4. **Smoothing**:
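Sometimes, a feature might not appear with a specific tag in the training set. For example, there may be no examples where the suffix "ing" occurs with `NN` (noun). In such cases, the probability `P(suffix(3) = 'ing' | NN)` would be estimated as zero. Because the per-feature probabilities are multiplied together, a single zero forces the whole score for that tag to zero no matter what the other features say, which leads to incorrect predictions.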

To handle this, Naive Bayes typically uses **smoothing** (such as Laplace smoothing), which assigns a small non-zero probability to unseen events to avoid zero probabilities in the calculation.

### Example of Feature-Tag Probabilities (Hypothetical):

| Feature                  | Tag  | Count   | Probability $P(\text{feature} \mid \text{tag})$ |
|--------------------------|------|---------|----------------------------------------------|
| `suffix(3) = 'ing'`       | VBG  | 150     | 0.75                                         |
| `suffix(3) = 'ing'`       | NN   | 10      | 0.05                                         |
| `prev_word = 'is'`        | VBG  | 50      | 0.25                                         |
| `next_word = 'quickly'`   | VBG  | 20      | 0.10                                         |
| `suffix(1) = 's'`         | NNS  | 120     | 0.60                                         |

Using these probabilities, the classifier combines them to determine the most likely tag.

### Conclusion:

- During training, the Naive Bayes classifier calculates **feature frequencies** for each tag from the training data.
- It builds **probability distributions** from these counts, estimating the likelihood of each feature occurring with each tag.
- The classifier then uses these probabilities to make predictions on new data by combining the probabilities for each feature and selecting the tag that maximizes the likelihood.

In [None]:
import nltk
from nltk.corpus import treebank
from nltk.tag.sequential import ClassifierBasedPOSTagger
from nltk.classify import NaiveBayesClassifier

# Download the Treebank corpus and read it as a sequence of tagged sentences
nltk.download('treebank')
sentences = treebank.tagged_sents()

# Train on the first 80% of the sentences and hold out the remaining 20% for testing
split_point = int(len(sentences) * 0.8)
train_data = sentences[:split_point]
test_data = sentences[split_point:]


# Define features for the custom tagger
def pos_features(sentence, index):
    """Build the feature dictionary for one token of an untagged sentence.

    Args:
        sentence: Sequence of word strings.
        index: Position of the token to describe.

    Returns:
        A dict with the token's last 1, 2 and 3 characters under
        'suffix(1)', 'suffix(2)' and 'suffix(3)', plus the neighbouring
        words under 'prev_word' and 'next_word'. A neighbour is the empty
        string when the token is the first or last one in the sentence.
    """
    token = sentence[index]
    is_first = index == 0
    is_last = index == len(sentence) - 1

    # Sentence boundaries have no neighbour, so fall back to ''.
    previous = '' if is_first else sentence[index - 1]
    following = '' if is_last else sentence[index + 1]

    return {
        'suffix(1)': token[-1:],
        'suffix(2)': token[-2:],
        'suffix(3)': token[-3:],
        'prev_word': previous,
        'next_word': following,
    }

# Prepare training data: one (featureset, gold_tag) pair per token, where the
# featureset is computed by pos_features() on the untagged sentence.
train_data_features = []
for sent in train_data:
    untagged_sent = nltk.tag.untag(sent)
    for i, (word, tag) in enumerate(sent):
        featureset = (pos_features(untagged_sent, i), tag)
        train_data_features.append(featureset)

# Train a Naive Bayes classifier on the custom featuresets
classifier = NaiveBayesClassifier.train(train_data_features)

# Wrap the classifier in a tagger that extracts the SAME features at tagging time.
# ClassifierBasedPOSTagger overrides feature_detector() with its own built-in
# feature names, so a classifier trained on pos_features() would only ever see
# unseen features and fall back to the tag prior (hence the very low accuracy).
# The base ClassifierBasedTagger accepts a custom detector with the signature
# (tokens, index, history); pos_features() does not use the tag history.
custom_tagger = nltk.tag.ClassifierBasedTagger(
    feature_detector=lambda tokens, index, history: pos_features(tokens, index),
    classifier=classifier,
)

# Evaluate the custom tagger on the held-out sentences.
# accuracy() replaces the deprecated evaluate() and returns the same score.
accuracy = custom_tagger.accuracy(test_data)
print(f"Accuracy: {accuracy:.2%}")


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy = custom_tagger.evaluate(test_data)


Accuracy: 14.48%


In [None]:
import nltk
from nltk.corpus import treebank
from nltk.tag.sequential import ClassifierBasedPOSTagger
from nltk.classify import NaiveBayesClassifier

# Download the Treebank corpus and read it as a sequence of tagged sentences
nltk.download('treebank')
sentences = treebank.tagged_sents()

# Peek at the first two sentences to see the (word, tag) pair format
first_two = sentences[:2]
print(first_two)

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')]]


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [None]:
# Show the first 5 sentences of the held-out test split.
# These are the tagged sentences stored in test_data, not the custom tagger's output.
preview_sents = test_data[:5]
for sent in preview_sents:
    print(sent)


[('The', 'DT'), ('latest', 'JJS'), ('10-year', 'JJ'), ('notes', 'NNS'), ('were', 'VBD'), ('quoted', 'VBN'), ('at', 'IN'), ('100', 'CD'), ('22\\/32', 'CD'), ('*-1', '-NONE-'), ('to', 'TO'), ('yield', 'VB'), ('7.88', 'CD'), ('%', 'NN'), ('compared', 'VBN'), ('with', 'IN'), ('100', 'CD'), ('16\\/32', 'CD'), ('*', '-NONE-'), ('to', 'TO'), ('yield', 'VB'), ('7.90', 'CD'), ('%', 'NN'), ('.', '.')]
[('The', 'DT'), ('discount', 'NN'), ('rate', 'NN'), ('on', 'IN'), ('three-month', 'JJ'), ('Treasury', 'NNP'), ('bills', 'NNS'), ('was', 'VBD'), ('essentially', 'RB'), ('unchanged', 'JJ'), ('at', 'IN'), ('7.79', 'CD'), ('%', 'NN'), (',', ','), ('while', 'IN'), ('the', 'DT'), ('rate', 'NN'), ('on', 'IN'), ('six-month', 'JJ'), ('bills', 'NNS'), ('was', 'VBD'), ('slightly', 'RB'), ('lower', 'JJR'), ('at', 'IN'), ('7.52', 'CD'), ('%', 'NN'), ('compared', 'VBN'), ('with', 'IN'), ('7.60', 'CD'), ('%', 'NN'), ('Tuesday', 'NNP'), ('.', '.')]
[('Corporate', 'NNP'), ('Issues', 'NNPS')]
[('IBM', 'NNP'), ("'s",