In [None]:
# Import necessary libraries for Natural Language Processing (NLP) tasks
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup  # For removing HTML tags
from spellchecker import SpellChecker  # For spelling correction

In [None]:
# Download the NLTK resources used by the preprocessing pipeline.
# Recent NLTK releases (3.9 and later) load the 'punkt_tab' tables for
# word_tokenize instead of the pickled 'punkt' models, while older releases
# still use 'punkt', so both are requested to keep tokenization working on
# either version.
for resource in (
    'punkt',      # Tokenizer models (older NLTK releases)
    'punkt_tab',  # Tokenizer tables (NLTK 3.9+)
    'stopwords',  # Stop word list
    'wordnet',    # Dictionary used by WordNetLemmatizer
):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Input Text
# Purpose: Define the text data for analysis.
# The sample is a multi-line English passage (a short biography of Muhammad
# Iqbal). Besides ASCII punctuation it contains typographic characters
# (curly quotes, apostrophes and en dashes) and embedded newlines.
text="""Dr. Sir Allama Muhammad Iqbal (9 November 1877 – 21 April 1938) was a Muslim poet, philosopher, political thinker, and politician from Punjab, British India (now
Pakistan), whose poetry in Urdu and Persian is considered to be among the greatest of the modern era, whose vision of an independent state for the Muslims of British
India was to inspire the creation of Pakistan, and who is thus revered by Pakistanis and recognized internationally as Pakistan’s spiritual father of the nation.
Iqbal was born in Sialkot, now in Pakistan’s Punjab province. His father, Sheikh Noor Muhammad, was a tailor by profession and a pious individual with a mystic bent – he
had received no formal education but could read Urdu and Persian books and treasured the company of scholars and mystics, some of whom called him an
“unlettered philosopher”. Iqbal’s mother, Imam Bibi, was illiterate but was highly respected in the family as a wise and generous woman who quietly gave financial
help to the poor and needy and arbitrated in neighbours’ disputes. A few days before the birth of Iqbal, his father had a dream: “I saw a big crowd
gathered in a large field. A magnificent coloured bird was flying over our heads and everyone was admiring it and trying to catch it, but no one succeeded, and, at last, it
got tired of its flight and fell into my lap.” He understood this to be a message that God was about to bless him with a world-famous son. Hence, the “unlettered
philosopher” gave his son the name Muhammad Iqbal – the word Iqbal, whose origins lie in the Arabic language, means recognition, stature, respect, and fortune.
About four hundred years before Iqbal’s birth, his Brahmin ancestors, who lived in Kashmir (Northern India), had converted to Islam. In the late eighteenth or early
nineteenth century, when Afghan rule in Kashmir was being replaced by Sikh rule, Iqbal’s great grandfather emigrated from Kashmir to Sialkot. """

In [None]:
def preprocess_text(text):
    """
    Preprocesses text data by applying cleaning and normalization steps.

    The steps run in this order: strip HTML markup, lowercase, remove
    punctuation, tokenize, drop English stop words, spell-correct,
    lemmatize, and finally stem.

    Args:
        text (str): The input text to be preprocessed.

    Returns:
        list: A list of preprocessed tokens. Every element is a string;
            a token the spell checker cannot correct is kept as-is rather
            than being replaced by None.
    """
    # 1. Remove HTML tags (if any). This has to run before punctuation is
    #    stripped: once '<', '>' and '/' are gone the parser can no longer
    #    recognise tags, and the tag names would survive as ordinary words.
    text = BeautifulSoup(text, 'html.parser').get_text()

    # 2. Convert to lowercase (the stop word list is lowercase)
    text = text.lower()

    # 3. Remove punctuation (anything that is not a word char or whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # 4. Tokenize into words
    tokens = word_tokenize(text)

    # 5. Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 6. Spelling correction (using SpellChecker). It is applied to the
    #    surface words, before stemming: stems are frequently not dictionary
    #    words, so "correcting" them rewrites valid tokens into unrelated
    #    ones. correction() returns None when it has no candidate, so fall
    #    back to the original token. Each distinct token is looked up once.
    spell = SpellChecker()
    corrections = {word: spell.correction(word) or word for word in set(tokens)}
    tokens = [corrections[word] for word in tokens]

    # 7. Perform lemmatization (using WordNetLemmatizer). Done before
    #    stemming because the lemmatizer looks words up in WordNet and
    #    therefore needs real word forms, not stems.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # 8. Perform stemming (using PorterStemmer)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    return tokens

In [None]:
# Run the preprocessing pipeline on the sample passage and keep the result
# in `processed_tokens`.
processed_tokens = preprocess_text(text)
# Print the full token list as plain text so the result can be inspected.
print(processed_tokens)

['do', 'sir', 'llama', None, 'equal', '9', 'novel', '1877', '21', 'aril', '1938', 'muslin', 'poet', 'philosophy', 'polite', 'thinker', 'politician', 'unjam', 'brutish', 'india', 'pakistan', 'whose', 'poetry', 'rude', 'person', 'conoid', 'among', 'greatest', 'modern', 'era', 'whose', 'vision', 'indeed', 'state', 'muslin', 'brutish', 'india', 'inspire', 'creation', 'pakistan', 'the', 'never', 'pakistan', 'recon', 'intern', 'pakistan', 'spirit', 'father', 'nation', 'equal', 'born', 'shallot', 'pakistan', 'unjam', 'proving', 'father', 'sheikh', 'door', None, 'tailor', 'profess', 'pious', 'individual', 'mystic', 'bent', 'receive', 'formal', 'educe', 'could', 'read', 'rude', 'person', 'book', 'treasure', 'company', 'scholar', 'mystic', 'call', 'unless', 'philosophy', 'equal', 'mother', 'imam', 'bib', 'liter', 'highly', 'respect', 'family', 'wise', 'gene', 'woman', 'quietly', 'gave', 'finance', 'help', 'poor', 'need', 'arbiter', 'neighbor', 'dispute', 'day', 'birth', 'equal', 'father', 'dream