In [None]:
# Import Libraries
# Purpose: Import necessary libraries for NLP tasks
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Download necessary NLTK resources
nltk.download('punkt')  # Downloads the Punkt sentence tokenizer, used for sentence tokenization.
nltk.download('stopwords')  # Downloads the stop words corpus, used for removing common words.
nltk.download('wordnet')  # Downloads the WordNet lemmatizer, used for lemmatization.
nltk.download('averaged_perceptron_tagger')  # Downloads the Averaged Perceptron tagger, used for part-of-speech tagging.
nltk.download('maxent_ne_chunker')  # Downloads the Maximum Entropy chunker, used for named entity recognition.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [None]:
# Input Text
# Purpose: Define the text data for analysis
text="""Dr. Sir Allama Muhammad Iqbal (9 November 1877 – 21 April 1938) was a Muslim poet, philosopher, political thinker, and politician from Punjab, British India (now
Pakistan), whose poetry in Urdu and Persian is considered to be among the greatest of the modern era, whose vision of an independent state for the Muslims of British
India was to inspire the creation of Pakistan, and who is thus revered by Pakistanis and recognized internationally as Pakistan’s spiritual father of the nation.
Iqbal was born in Sialkot, now in Pakistan’s Punjab province. His father, Sheikh Noor Muhammad, was a tailor by profession and a pious individual with a mystic bent – he
had received no formal education but could read Urdu and Persian books and treasured the company of scholars and mystics, some of whom called him an
“unlettered philosopher”. Iqbal’s mother, Imam Bibi, was illiterate but was highly respected in the family as a wise and generous woman who quietly gave financial
help to the poor and needy and arbitrated in neighbours’ disputes. A few days before the birth of Iqbal, his father had a dream: “I saw a big crowd
gathered in a large field. A magnificent coloured bird was flying over our heads and everyone was admiring it and trying to catch it, but no one succeeded, and, at last, it
got tired of its flight and fell into my lap.” He understood this to be a message that God was about to bless him with a world-famous son. Hence, the “unlettered
philosopher” gave his son the name Muhammad Iqbal – the word Iqbal, whose origins lie in the Arabic language, means recognition, stature, respect, and fortune.
About four hundred years before Iqbal’s birth, his Brahmin ancestors, who lived in Kashmir (Northern India), had converted to Islam. In the late eighteenth or early
nineteenth century, when Afghan rule in Kashmir was being replaced by Sikh rule, Iqbal’s great grandfather emigrated from Kashmir to Sialkot. """

In [None]:
# Tokenization - Sentence Tokenization
# Purpose: Split text into individual sentences
sentences = sent_tokenize(text)
sentences

['Dr. Sir Allama Muhammad Iqbal (9 November 1877 – 21 April 1938) was a Muslim poet, philosopher, political thinker, and politician from Punjab, British India (now\nPakistan), whose poetry in Urdu and Persian is considered to be among the greatest of the modern era, whose vision of an independent state for the Muslims of British\nIndia was to inspire the creation of Pakistan, and who is thus revered by Pakistanis and recognized internationally as Pakistan’s spiritual father of the nation.',
 'Iqbal was born in Sialkot, now in Pakistan’s Punjab province.',
 'His father, Sheikh Noor Muhammad, was a tailor by profession and a pious individual with a mystic bent – he\nhad received no formal education but could read Urdu and Persian books and treasured the company of scholars and mystics, some of whom called him an\n“unlettered philosopher”.',
 'Iqbal’s mother, Imam Bibi, was illiterate but was highly respected in the family as a wise and generous woman who quietly gave financial\nhelp to t

In [None]:
# Tokenization - Word Tokenization
# Purpose: Split each sentence into individual words
words = []
for sentence in sentences:
    words.extend(word_tokenize(sentence))
words

['Dr.',
 'Sir',
 'Allama',
 'Muhammad',
 'Iqbal',
 '(',
 '9',
 'November',
 '1877',
 '–',
 '21',
 'April',
 '1938',
 ')',
 'was',
 'a',
 'Muslim',
 'poet',
 ',',
 'philosopher',
 ',',
 'political',
 'thinker',
 ',',
 'and',
 'politician',
 'from',
 'Punjab',
 ',',
 'British',
 'India',
 '(',
 'now',
 'Pakistan',
 ')',
 ',',
 'whose',
 'poetry',
 'in',
 'Urdu',
 'and',
 'Persian',
 'is',
 'considered',
 'to',
 'be',
 'among',
 'the',
 'greatest',
 'of',
 'the',
 'modern',
 'era',
 ',',
 'whose',
 'vision',
 'of',
 'an',
 'independent',
 'state',
 'for',
 'the',
 'Muslims',
 'of',
 'British',
 'India',
 'was',
 'to',
 'inspire',
 'the',
 'creation',
 'of',
 'Pakistan',
 ',',
 'and',
 'who',
 'is',
 'thus',
 'revered',
 'by',
 'Pakistanis',
 'and',
 'recognized',
 'internationally',
 'as',
 'Pakistan',
 '’',
 's',
 'spiritual',
 'father',
 'of',
 'the',
 'nation',
 '.',
 'Iqbal',
 'was',
 'born',
 'in',
 'Sialkot',
 ',',
 'now',
 'in',
 'Pakistan',
 '’',
 's',
 'Punjab',
 'province',
 '.'

In [None]:
# Remove Punctuation and Lowercasing
# Purpose: Remove punctuation and convert text to lowercase
# This is done for case-insensitive processing and to simplify analysis
cleaned_words = []
for word in words:
    cleaned_word = re.sub('[^a-zA-Z]', '', word).lower() # Remove punctuation and convert to lowercase
    if cleaned_word:  # Add the word to the list only if it's not empty
        cleaned_words.append(cleaned_word)
cleaned_words

['dr',
 'sir',
 'allama',
 'muhammad',
 'iqbal',
 'november',
 'april',
 'was',
 'a',
 'muslim',
 'poet',
 'philosopher',
 'political',
 'thinker',
 'and',
 'politician',
 'from',
 'punjab',
 'british',
 'india',
 'now',
 'pakistan',
 'whose',
 'poetry',
 'in',
 'urdu',
 'and',
 'persian',
 'is',
 'considered',
 'to',
 'be',
 'among',
 'the',
 'greatest',
 'of',
 'the',
 'modern',
 'era',
 'whose',
 'vision',
 'of',
 'an',
 'independent',
 'state',
 'for',
 'the',
 'muslims',
 'of',
 'british',
 'india',
 'was',
 'to',
 'inspire',
 'the',
 'creation',
 'of',
 'pakistan',
 'and',
 'who',
 'is',
 'thus',
 'revered',
 'by',
 'pakistanis',
 'and',
 'recognized',
 'internationally',
 'as',
 'pakistan',
 's',
 'spiritual',
 'father',
 'of',
 'the',
 'nation',
 'iqbal',
 'was',
 'born',
 'in',
 'sialkot',
 'now',
 'in',
 'pakistan',
 's',
 'punjab',
 'province',
 'his',
 'father',
 'sheikh',
 'noor',
 'muhammad',
 'was',
 'a',
 'tailor',
 'by',
 'profession',
 'and',
 'a',
 'pious',
 'individ

In [None]:
# Stop Word Removal
# Purpose: Remove common words that don't carry much meaning
# These words are often removed to focus on more important terms
stop_words = set(stopwords.words('english'))  # Get the set of English stop words
filtered_words = [word for word in cleaned_words if word not in stop_words] # Remove stop words from the cleaned words list
filtered_words

['dr',
 'sir',
 'allama',
 'muhammad',
 'iqbal',
 'november',
 'april',
 'muslim',
 'poet',
 'philosopher',
 'political',
 'thinker',
 'politician',
 'punjab',
 'british',
 'india',
 'pakistan',
 'whose',
 'poetry',
 'urdu',
 'persian',
 'considered',
 'among',
 'greatest',
 'modern',
 'era',
 'whose',
 'vision',
 'independent',
 'state',
 'muslims',
 'british',
 'india',
 'inspire',
 'creation',
 'pakistan',
 'thus',
 'revered',
 'pakistanis',
 'recognized',
 'internationally',
 'pakistan',
 'spiritual',
 'father',
 'nation',
 'iqbal',
 'born',
 'sialkot',
 'pakistan',
 'punjab',
 'province',
 'father',
 'sheikh',
 'noor',
 'muhammad',
 'tailor',
 'profession',
 'pious',
 'individual',
 'mystic',
 'bent',
 'received',
 'formal',
 'education',
 'could',
 'read',
 'urdu',
 'persian',
 'books',
 'treasured',
 'company',
 'scholars',
 'mystics',
 'called',
 'unlettered',
 'philosopher',
 'iqbal',
 'mother',
 'imam',
 'bibi',
 'illiterate',
 'highly',
 'respected',
 'family',
 'wise',
 'ge

In [None]:
# Lemmatization
# Purpose: Reduce words to their base form (lemma)
# This helps group similar words together (e.g., 'running' -> 'run')
lemmatizer = WordNetLemmatizer()  # Initialize the lemmatizer
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words] # Lemmatize the filtered words
lemmatized_words

['dr',
 'sir',
 'allama',
 'muhammad',
 'iqbal',
 'november',
 'april',
 'muslim',
 'poet',
 'philosopher',
 'political',
 'thinker',
 'politician',
 'punjab',
 'british',
 'india',
 'pakistan',
 'whose',
 'poetry',
 'urdu',
 'persian',
 'considered',
 'among',
 'greatest',
 'modern',
 'era',
 'whose',
 'vision',
 'independent',
 'state',
 'muslim',
 'british',
 'india',
 'inspire',
 'creation',
 'pakistan',
 'thus',
 'revered',
 'pakistani',
 'recognized',
 'internationally',
 'pakistan',
 'spiritual',
 'father',
 'nation',
 'iqbal',
 'born',
 'sialkot',
 'pakistan',
 'punjab',
 'province',
 'father',
 'sheikh',
 'noor',
 'muhammad',
 'tailor',
 'profession',
 'pious',
 'individual',
 'mystic',
 'bent',
 'received',
 'formal',
 'education',
 'could',
 'read',
 'urdu',
 'persian',
 'book',
 'treasured',
 'company',
 'scholar',
 'mystic',
 'called',
 'unlettered',
 'philosopher',
 'iqbal',
 'mother',
 'imam',
 'bibi',
 'illiterate',
 'highly',
 'respected',
 'family',
 'wise',
 'generou

In [None]:
# Stemming
# Purpose: Reduce words to their root form (stem)
# This is another way to group similar words (e.g., 'studies' -> 'studi')
stemmer = PorterStemmer()  # Initialize the stemmer
stemmed_words = [stemmer.stem(word) for word in filtered_words] # Stem the filtered words
stemmed_words

['dr',
 'sir',
 'allama',
 'muhammad',
 'iqbal',
 'novemb',
 'april',
 'muslim',
 'poet',
 'philosoph',
 'polit',
 'thinker',
 'politician',
 'punjab',
 'british',
 'india',
 'pakistan',
 'whose',
 'poetri',
 'urdu',
 'persian',
 'consid',
 'among',
 'greatest',
 'modern',
 'era',
 'whose',
 'vision',
 'independ',
 'state',
 'muslim',
 'british',
 'india',
 'inspir',
 'creation',
 'pakistan',
 'thu',
 'rever',
 'pakistani',
 'recogn',
 'intern',
 'pakistan',
 'spiritu',
 'father',
 'nation',
 'iqbal',
 'born',
 'sialkot',
 'pakistan',
 'punjab',
 'provinc',
 'father',
 'sheikh',
 'noor',
 'muhammad',
 'tailor',
 'profess',
 'piou',
 'individu',
 'mystic',
 'bent',
 'receiv',
 'formal',
 'educ',
 'could',
 'read',
 'urdu',
 'persian',
 'book',
 'treasur',
 'compani',
 'scholar',
 'mystic',
 'call',
 'unlett',
 'philosoph',
 'iqbal',
 'mother',
 'imam',
 'bibi',
 'illiter',
 'highli',
 'respect',
 'famili',
 'wise',
 'gener',
 'woman',
 'quietli',
 'gave',
 'financi',
 'help',
 'poor',
 

In [None]:
# Word Frequency Analysis using CountVectorizer
# Purpose: Calculate and display the frequency of each word
vectorizer = CountVectorizer() # Initialize the CountVectorizer
X = vectorizer.fit_transform(lemmatized_words)  # Fit and transform the lemmatized words
word_freq = dict(zip(vectorizer.get_feature_names_out(), X.toarray().sum(axis=0))) # Create a dictionary of word frequencies
word_freq

{'admiring': 1,
 'afghan': 1,
 'allama': 1,
 'among': 1,
 'ancestor': 1,
 'april': 1,
 'arabic': 1,
 'arbitrated': 1,
 'bent': 1,
 'bibi': 1,
 'big': 1,
 'bird': 1,
 'birth': 2,
 'bless': 1,
 'book': 1,
 'born': 1,
 'brahmin': 1,
 'british': 2,
 'called': 1,
 'catch': 1,
 'century': 1,
 'coloured': 1,
 'company': 1,
 'considered': 1,
 'converted': 1,
 'could': 1,
 'creation': 1,
 'crowd': 1,
 'day': 1,
 'dispute': 1,
 'dr': 1,
 'dream': 1,
 'early': 1,
 'education': 1,
 'eighteenth': 1,
 'emigrated': 1,
 'era': 1,
 'everyone': 1,
 'family': 1,
 'father': 3,
 'fell': 1,
 'field': 1,
 'financial': 1,
 'flight': 1,
 'flying': 1,
 'formal': 1,
 'fortune': 1,
 'four': 1,
 'gathered': 1,
 'gave': 2,
 'generous': 1,
 'god': 1,
 'got': 1,
 'grandfather': 1,
 'great': 1,
 'greatest': 1,
 'head': 1,
 'help': 1,
 'hence': 1,
 'highly': 1,
 'hundred': 1,
 'illiterate': 1,
 'imam': 1,
 'independent': 1,
 'india': 3,
 'individual': 1,
 'inspire': 1,
 'internationally': 1,
 'iqbal': 8,
 'islam': 1,
 

In [None]:
# Part-of-Speech (POS) Tagging
# Purpose: Assign grammatical tags to words (e.g., noun, verb, adjective)
pos_tags = nltk.pos_tag(lemmatized_words) # Perform POS tagging on the lemmatized words
pos_tags

[('dr', 'NN'),
 ('sir', 'NN'),
 ('allama', 'NN'),
 ('muhammad', 'NN'),
 ('iqbal', 'JJ'),
 ('november', 'NNP'),
 ('april', 'NN'),
 ('muslim', 'NN'),
 ('poet', 'NN'),
 ('philosopher', 'RB'),
 ('political', 'JJ'),
 ('thinker', 'NN'),
 ('politician', 'NN'),
 ('punjab', 'NN'),
 ('british', 'JJ'),
 ('india', 'NN'),
 ('pakistan', 'NN'),
 ('whose', 'WP$'),
 ('poetry', 'NN'),
 ('urdu', 'JJ'),
 ('persian', 'JJ'),
 ('considered', 'VBN'),
 ('among', 'IN'),
 ('greatest', 'JJS'),
 ('modern', 'JJ'),
 ('era', 'NN'),
 ('whose', 'WP$'),
 ('vision', 'NN'),
 ('independent', 'JJ'),
 ('state', 'NN'),
 ('muslim', 'NNS'),
 ('british', 'JJ'),
 ('india', 'VBP'),
 ('inspire', 'NN'),
 ('creation', 'NN'),
 ('pakistan', 'NN'),
 ('thus', 'RB'),
 ('revered', 'VBD'),
 ('pakistani', 'JJ'),
 ('recognized', 'VBN'),
 ('internationally', 'RB'),
 ('pakistan', 'JJ'),
 ('spiritual', 'JJ'),
 ('father', 'NN'),
 ('nation', 'NN'),
 ('iqbal', 'JJ'),
 ('born', 'VBN'),
 ('sialkot', 'JJ'),
 ('pakistan', 'NN'),
 ('punjab', 'NN'),
 ('p

In [None]:
# Named Entity Recognition (NER)
# Purpose: Identify and classify named entities in the text (e.g., person, organization, location)
ner_tags = nltk.ne_chunk(pos_tags)  # Perform NER on the POS-tagged words
ner_tags

ModuleNotFoundError: No module named 'svgling'

Tree('S', [('dr', 'NN'), ('sir', 'NN'), ('allama', 'NN'), ('muhammad', 'NN'), ('iqbal', 'JJ'), ('november', 'NNP'), ('april', 'NN'), ('muslim', 'NN'), ('poet', 'NN'), ('philosopher', 'RB'), ('political', 'JJ'), ('thinker', 'NN'), ('politician', 'NN'), ('punjab', 'NN'), ('british', 'JJ'), ('india', 'NN'), ('pakistan', 'NN'), ('whose', 'WP$'), ('poetry', 'NN'), ('urdu', 'JJ'), ('persian', 'JJ'), ('considered', 'VBN'), ('among', 'IN'), ('greatest', 'JJS'), ('modern', 'JJ'), ('era', 'NN'), ('whose', 'WP$'), ('vision', 'NN'), ('independent', 'JJ'), ('state', 'NN'), ('muslim', 'NNS'), ('british', 'JJ'), ('india', 'VBP'), ('inspire', 'NN'), ('creation', 'NN'), ('pakistan', 'NN'), ('thus', 'RB'), ('revered', 'VBD'), ('pakistani', 'JJ'), ('recognized', 'VBN'), ('internationally', 'RB'), ('pakistan', 'JJ'), ('spiritual', 'JJ'), ('father', 'NN'), ('nation', 'NN'), ('iqbal', 'JJ'), ('born', 'VBN'), ('sialkot', 'JJ'), ('pakistan', 'NN'), ('punjab', 'NN'), ('province', 'NN'), ('father', 'NN'), ('she

In [None]:
# Regular Expressions with 're'
# Purpose: Demonstrate using regular expressions for text manipulation
import re

In [None]:
# Example 1: Finding all occurrences of a specific word
# Purpose: Find and count occurrences of a word
word_to_find = input("Enter the word to find: ")
matches = re.findall(word_to_find, text)  # Find all occurrences of the word
print(f"Occurrences of '{word_to_find}': {len(matches)}")  # Print the number of occurrences

Enter the word to find: Iqbal
Occurrences of 'Iqbal': 8


In [None]:
# Example 2: Replacing a pattern in the text
# Purpose: Replace a pattern in the text using regex
pattern = r"\d{4}"  # Matches 4-digit numbers (e.g., years)
replacement = "XXXX"
modified_text = re.sub(pattern, replacement, text)  # Replace the pattern with the replacement string
#print(modified_text) # To see the output, run the code.
modified_text

'Dr. Sir Allama Muhammad Iqbal (9 November XXXX – 21 April XXXX) was a Muslim poet, philosopher, political thinker, and politician from Punjab, British India (now\nPakistan), whose poetry in Urdu and Persian is considered to be among the greatest of the modern era, whose vision of an independent state for the Muslims of British\nIndia was to inspire the creation of Pakistan, and who is thus revered by Pakistanis and recognized internationally as Pakistan’s spiritual father of the nation.\nIqbal was born in Sialkot, now in Pakistan’s Punjab province. His father, Sheikh Noor Muhammad, was a tailor by profession and a pious individual with a mystic bent – he\nhad received no formal education but could read Urdu and Persian books and treasured the company of scholars and mystics, some of whom called him an\n“unlettered philosopher”. Iqbal’s mother, Imam Bibi, was illiterate but was highly respected in the family as a wise and generous woman who quietly gave financial\nhelp to the poor and 

In [None]:
# Example 3: Extracting specific information using groups
# Purpose: Extract specific information using regex groups
pattern = r"(\d{1,2} \w+ \d{4})"  # Matches dates (e.g., 9 November 1877)
dates = re.findall(pattern, text)  # Find all matches of the pattern
#print("Dates found:", dates) # To see the output, run the code.
dates

['9 November 1877', '21 April 1938']