In [None]:
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Tokenise a short paragraph, drop English stopwords and plot the most frequent
# remaining words.
# NOTE: this cell needs the NLTK 'punkt' and 'stopwords' resources to be
# installed already; it does not download them itself.
paragraph = """My favorite book is a captivating mystery thriller that blends fiction with suspense and intrigue. Each chapter unravels secrets and twists that keep me guessing. The characters are complex, the plot is unpredictable, and the tension never fades. It’s a thrilling journey that I couldn’t put down until the end."""

# string.punctuation only contains ASCII punctuation, so the typographic quotes
# used in the paragraph ("It’s", "couldn’t") are added explicitly; otherwise
# they survive cleaning and leak into the tokens and the frequency plot.
punctuation_to_strip = string.punctuation + "‘’“”"

lower_text = paragraph.lower()
clean_text = lower_text.translate(str.maketrans('', '', punctuation_to_strip))

word_tokens = word_tokenize(clean_text)
# Sentence splitting depends on end-of-sentence punctuation, so it has to run on
# the original paragraph: the cleaned text has no full stops left and would
# always come back as one single "sentence".
sentence_tokens = sent_tokenize(paragraph)

stop_words = set(stopwords.words('english'))
filtered_words = [word for word in word_tokens if word not in stop_words]

fdist = FreqDist(filtered_words)

print("\nOriginal Text:\n", paragraph)
print("\nCleaned Text:\n", clean_text)
print("\nSentence Tokens:\n", sentence_tokens)
print("\nWord Tokens:\n", word_tokens)
print("\nFiltered Words (No Stopwords):\n", filtered_words)
print("\nWord Frequency:\n")
# Plot the 10 most frequent non-stopword tokens.
fdist.plot(10, title="Word Frequency Distribution (Excluding Stopwords)")

In [None]:
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK data this cell relies on (tokenizer model, stopword list,
# WordNet and its multilingual index).
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of
# 'punkt'; request both so word_tokenize works on old and new installs.
# On an NLTK version that does not know this package the call only reports
# the failure and carries on.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

paragraph = """My favorite book is a captivating mystery thriller that blends fiction with suspense and intrigue. Each chapter unravels secrets and twists that keep me guessing. The characters are complex, the plot is unpredictable, and the tension never fades. It’s a thrilling journey that I couldn’t put down until the end."""

# string.punctuation is ASCII-only; add the typographic quotes that appear in
# the paragraph ("It’s", "couldn’t") so they are stripped like any other
# punctuation instead of ending up in the comparison table.
punctuation_to_strip = string.punctuation + "‘’“”"

lower_text = paragraph.lower()
clean_text = lower_text.translate(str.maketrans('', '', punctuation_to_strip))
word_tokens = word_tokenize(clean_text)

stop_words = set(stopwords.words('english'))
filtered_words = [word for word in word_tokens if word not in stop_words]

porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

# One comma-separated row per remaining word: the word itself, its two stems
# and its lemma.
print("Word, PorterStemmer, LancasterStemmer, Lemmatizer")
print("-" * 70)
for word in filtered_words:
    porter_stem = porter.stem(word)
    lancaster_stem = lancaster.stem(word)
    # No part-of-speech tag is passed, so the lemmatizer falls back to its
    # default POS (noun, per the NLTK documentation); verb forms may therefore
    # come back unchanged.
    lemma = lemmatizer.lemmatize(word)
    print(", ".join([word, porter_stem, lancaster_stem, lemma]))


In [None]:
import re
# Regex-based queries over a sample paragraph: long words, numbers,
# capitalised words, purely alphabetic words and words starting with a vowel.
text = """My favorite book is a captivating mystery thriller that blends fiction with suspense and intrigue. Each chapter unravels secrets and twists that keep me guessing. The characters are complex, the plot is unpredictable, and the tension never fades. It’s a thrilling journey that I couldn’t put down until the end."""

# A word is a run of word characters that may contain internal apostrophes
# (straight or typographic). Matching whole words first keeps contractions such
# as "It’s" and "couldn’t" in one piece; plain \b-based patterns would split
# them at the apostrophe and report fragments like "couldn", "s" and "t".
word_pattern = re.compile(r"\w+(?:['’]\w+)*")
words = word_pattern.findall(text)

# Count alphabetic characters only, so digits, underscores and apostrophes do
# not contribute to the "more than 5 letters" threshold.
words_more_than_5 = [w for w in words if sum(ch.isalpha() for ch in w) > 5]

numbers = re.findall(r'\b\d+\b', text)

# An initial capital followed by lowercase letters, optionally with lowercase
# apostrophe suffixes ("It’s").
capitalized_words = [w for w in words if re.fullmatch(r"[A-Z][a-z]*(?:['’][a-z]+)*", w)]

# Whole words made of ASCII letters only; contractions are excluded entirely
# because they contain an apostrophe.
only_alpha_words = [w for w in words if re.fullmatch(r'[a-zA-Z]+', w)]

words_starting_with_vowels = [w for w in words if w[0] in 'aeiouAEIOU']

print("Words with more than 5 letters:", words_more_than_5)
print("Numbers in text:", numbers if numbers else "None found")
print("Capitalized words:", capitalized_words)
print("Words with only alphabets:", only_alpha_words)
print("Words starting with vowels:", words_starting_with_vowels)


In [None]:
import re
# Sample input for the cleaning/tokenising demo below: the paragraph plus one
# e-mail address, one URL and two phone numbers. The line breaks (and the
# space before each of them) are part of the string.
text = """My favorite book is a captivating mystery thriller that blends fiction with suspense and intrigue. 
Each chapter unravels secrets and twists that keep me guessing. The characters are complex, the plot is unpredictable, 
and the tension never fades. It’s a thrilling journey that I couldn’t put down until the end. 
Contact me at example@mail.com or visit https://www.example.com. Call 123-456-7890 or +91 9876543210 for more info."""

def custom_tokenizer(text):
    """Split ``text`` into a list of token strings using one regular expression.

    Tokens are tried in this order at every position:

    1. Placeholders of the form ``<NAME>`` (upper-case letters/underscores),
       kept with their angle brackets.
    2. Decimal numbers such as ``3.14``, kept as a single token.
    3. Words: runs of word characters that may be joined by internal hyphens
       or apostrophes (straight ``'`` or typographic ``’``), e.g.
       ``well-known``, ``don't``, ``It’s``.

    Anything else (whitespace, other punctuation) is skipped.

    Args:
        text: The string to tokenise.

    Returns:
        The tokens in order of appearance; an empty list for an empty string.
    """
    # Order matters: the generic word alternative also matches digits, so the
    # decimal alternative has to come before it or "3.14" would be split into
    # "3" and "14".
    pattern = r"<[A-Z_]+>|\b\d+\.\d+\b|\w+(?:[-'’]\w+)*"
    return re.findall(pattern, text)

def clean_text(text):
    """Replace e-mail addresses, URLs and phone numbers with placeholders.

    The substitutions run in this order: e-mail addresses become ``<EMAIL>``,
    URLs (``http://``, ``https://`` or ``www.`` prefixes) become ``<URL>``, and
    phone numbers become ``<PHONE>``. A phone number here is ten digits grouped
    3-3-4 with optional space/hyphen separators, optionally preceded by a
    one-to-three digit country code with an optional leading ``+``.

    Args:
        text: The string to clean.

    Returns:
        A new string with the matches replaced; all other text is unchanged.
    """
    text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>', text)
    # Match lazily and stop before any trailing punctuation that is followed by
    # whitespace or the end of the text, so a sentence-ending "." or "," after
    # a URL stays in the output instead of being swallowed by the placeholder.
    text = re.sub(r'(?:https?://|www\.)\S+?(?=[.,;:!?)\]]*(?:\s|$))', '<URL>', text)
    # The digit look-arounds keep the pattern from rewriting a slice of a
    # longer digit run (for example an ID or card number).
    text = re.sub(
        r'(?<!\d)(?:\+?\d{1,3}[\s-]?)?\d{3}[-\s]?\d{3}[-\s]?\d{4}(?!\d)',
        '<PHONE>',
        text,
    )
    return text
# Mask e-mail addresses, URLs and phone numbers first, then tokenise the result.
cleaned_text = clean_text(text)
tokens = custom_tokenizer(cleaned_text)

# Show the masked text followed by the tokens extracted from it.
print(f"Cleaned Text:\n {cleaned_text}")
print(f"\nCustom Tokens:\n {tokens}")
