In [1]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re

In [2]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Filler words
filler_words = {"um", "uh", "like", "you know", "actually", "basically", "literally", "so", "well"}

# Stopwords + filler
stop_words = set(stopwords.words("english")).union(filler_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Suchana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Suchana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Suchana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Suchana\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# POS mapping for lemmatizer
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [4]:
def text_filteration(text):
    text = text.lower()
    sentences = sent_tokenize(text)

    lemmatizer = WordNetLemmatizer()
    result = []

    for sent in sentences:
        # Keep only period, remove all other special characters
        sent_cleaned = re.sub(r"[^\w\s.]", "", sent)
        tokens = word_tokenize(sent_cleaned)
        tagged = pos_tag(tokens)

        filtered = []
        for word, tag in tagged:
            if word == ".":
                filtered.append(".")  # keep period
            elif word not in stop_words and len(word) > 2:
                lemma = lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                filtered.append(lemma)

        # Reconstruct sentence
        cleaned_sent = " ".join(filtered).replace(" .", ".")
        result.append(cleaned_sent)

    return " ".join(result)

In [8]:
# Example
text = "Nepal is a beautiful country located in South Asia, nestled between the giant nations of China and India. Despite its relatively small size, Nepal is incredibly diverse, both geographically and culturally. It is home to eight of the world’s ten highest peaks, including Mount Everest, the tallest mountain on Earth, which attracts climbers and trekkers from all over the globe. Nepal’s landscape ranges from snow-capped mountains and lush forests to flat plains and deep valleys, offering breathtaking views and abundant natural beauty. The country is also deeply spiritual, being the birthplace of Siddhartha Gautama, known as the Buddha, and is dotted with ancient temples, monasteries, and shrines that reflect its strong religious traditions in Hinduism and Buddhism. Nepalese culture is rich with colorful festivals, traditional music, dance, and crafts that have been passed down through generations. The people of Nepal are known for their warmth, resilience, and hospitality, making visitors feel at home. Despite facing challenges such as political instability and natural disasters, Nepal continues to move forward with optimism and pride. Tourism, agriculture, and remittances are key parts of its economy. Overall, Nepal is a land of natural wonders and deep traditions, offering a unique and unforgettable experience to anyone who visits."
print(text_filteration(text))

nepal beautiful country locate south asia nestle giant nation china india. despite relatively small size nepal incredibly diverse geographically culturally. home eight world ten high peak include mount everest tall mountain earth attract climber trekker globe. nepal landscape range snowcapped mountain lush forest flat plain deep valley offer breathtaking view abundant natural beauty. country also deeply spiritual birthplace siddhartha gautama know buddha dot ancient temple monastery shrine reflect strong religious tradition hinduism buddhism. nepalese culture rich colorful festival traditional music dance craft pass generation. people nepal know warmth resilience hospitality making visitor feel home. despite face challenge political instability natural disaster nepal continue move forward optimism pride. tourism agriculture remittance key part economy. overall nepal land natural wonder deep tradition offer unique unforgettable experience anyone visit.


In [6]:
text1="Science is the pursuit and application of knowledge and understanding of the natural and social world following a systematic methodology based on evidence. Scientific methodology includes the following: Objective observation: Measurement and data (possibly although not necessarily using mathematics as a tool) Evidence."
print(text_filteration(text1))

science pursuit application knowledge understanding natural social world follow systematic methodology base evidence. scientific methodology include following objective observation measurement data possibly although necessarily use mathematics tool evidence.
