In [None]:
Q1. Write a unique paragraph (5-6 sentences) about your favorite topic (e.g., sports, technology, food, books, etc.).
1. Convert text to lowercase and remove punctuation.
2. Tokenize the text into words and sentences.
3. Remove stopwords (using NLTK's stopwords list).
4. Display word frequency distribution (excluding stopwords).

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist

# Tokenizer data: NLTK 3.9 and later load the 'punkt_tab' tables instead of
# the older pickled 'punkt' models, so fetch both to run on either release.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Paragraph
original_text = """Mental health is just as important as physical health—it affects how we think, feel, 
and act. Taking care of our mind helps us handle stress, build strong relationships, and make good decisions. 
Regular self-care, rest, and open conversations can go a long way in maintaining emotional well-being. 
It's okay to seek help when things feel overwhelming. Prioritizing mental health is a sign of strength, not weakness."""

# 1. Lowercase and remove punctuation
text_lower = original_text.lower()
# string.punctuation covers ASCII only, so the em dash in "health—it" would
# survive the strip and leave the two words fused. Typographic dashes are
# therefore turned into a space, while ASCII punctuation is deleted outright
# (a None value in the table means "drop this character").
punctuation_table = str.maketrans({
    **dict.fromkeys(string.punctuation),
    '\u2014': ' ',  # em dash
    '\u2013': ' ',  # en dash
})
text_clean = text_lower.translate(punctuation_table)

# 2. Tokenize
# Words come from the cleaned text; sentences need the original text because
# the sentence splitter relies on the punctuation removed above.
words = word_tokenize(text_clean)
sentences = sent_tokenize(original_text)

# 3. Remove stopwords
# A set gives constant-time membership checks inside the comprehension.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

# 4. Word frequency distribution
freq_dist = FreqDist(filtered_words)
print("Filtered Words:", filtered_words)
print("Word Frequency:")
for word, freq in freq_dist.items():
    print(f"{word}: {freq}")


Q2. Stemming and Lemmatization
1. Take the tokenized words from Question 1 (after stopword removal).
2. Apply stemming using NLTK's PorterStemmer and LancasterStemmer.
3. Apply lemmatization using NLTK's WordNetLemmatizer.
4. Compare and display results of both techniques.

from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
nltk.download('wordnet')

# One instance of each normaliser, reused for every token.
ps = PorterStemmer()
ls = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

# Run the stopword-free tokens from Question 1 through each technique.
# NOTE: lemmatize() is called without a part-of-speech tag, so it falls back
# to NLTK's default POS (noun, per the NLTK docs) for every word.
porter_stems = list(map(ps.stem, filtered_words))
lancaster_stems = list(map(ls.stem, filtered_words))
lemmas = list(map(lemmatizer.lemmatize, filtered_words))

# Print the three result lists one after another for comparison.
for label, result in (
    ("Porter Stemmer:", porter_stems),
    ("Lancaster Stemmer:", lancaster_stems),
    ("Lemmatized:", lemmas),
):
    print(label, result)


Q3. Regular Expressions and Text Splitting
1. Take the original text from Question 1.
2. Use regular expressions to:
a. Extract all words with more than 5 letters.
b. Extract all numbers (if any exist).
c. Extract all capitalized words.
3. Use text splitting techniques to:
a. Split the text into words containing only alphabetic characters.
b. Extract words starting with a vowel.

import re


def _matches(pattern):
    """Return every non-overlapping match of *pattern* in the Q1 paragraph."""
    return [hit.group() for hit in re.finditer(pattern, original_text)]


# 2a. Words of six or more ASCII letters
long_words = _matches(r'\b[a-zA-Z]{6,}\b')

# 2b. Runs of digits (the sample paragraph may contain none)
numbers = _matches(r'\d+')

# 2c. Words made of one uppercase letter followed only by lowercase letters
capitalized_words = _matches(r'\b[A-Z][a-z]*\b')

# 3a. Maximal runs of ASCII letters
alphabet_words = _matches(r'\b[a-zA-Z]+\b')

# 3b. Letter runs whose first letter is a vowel (either case)
vowel_words = _matches(r'\b[aeiouAEIOU][a-zA-Z]*\b')

print("Words > 5 letters:", long_words)
print("Numbers:", numbers)
print("Capitalized Words:", capitalized_words)
print("Alphabet-only Words:", alphabet_words)
print("Words Starting with Vowel:", vowel_words)


Q4. Custom Tokenization & Regex-based Text Cleaning
1. Take original text from Question 1.
2. Write a custom tokenization function that:
a. Removes punctuation and special symbols but keeps contractions.
b. Keeps hyphenated words as a single token.
c. Tokenizes numbers separately but keeps decimal numbers intact.
3. Use Regex Substitutions (re.sub) to:
a. Replace email addresses with <EMAIL>.
b. Replace URLs with <URL>.
c. Replace phone numbers with <PHONE>.

def custom_tokenize(text):
    """Mask contact details in *text*, then split it into tokens.

    Email addresses, URLs and phone numbers are first replaced with the
    placeholders ``<EMAIL>``, ``<URL>`` and ``<PHONE>``. The masked text is
    then scanned for tokens; anything that is not part of a token
    (punctuation, symbols, whitespace) is simply skipped.

    Token kinds, in matching priority:
      * a placeholder, kept whole including its angle brackets;
      * a decimal number such as ``3.14``, kept intact;
      * a run of word characters, optionally joined by single hyphens or
        apostrophes, so ``well-being`` and ``It's`` stay single tokens.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of token strings in order of appearance; empty for empty or
        punctuation-only input.
    """
    # Emails are masked before URLs so an address like "x@www.site.com" is
    # not partly consumed by the "www." URL alternative.
    text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '<EMAIL>', text)
    text = re.sub(r'https?://\S+|www\.\S+', '<URL>', text)
    # The digit lookarounds stop the pattern from masking a 10-digit window
    # inside a longer run of digits (IDs, card numbers, ...).
    text = re.sub(
        r'(?<!\d)(?:\+?\d{1,3}[ -]?)?\d{3}[- ]?\d{3}[- ]?\d{4}(?!\d)',
        '<PHONE>',
        text,
    )

    # Tokens are extracted straight from the masked text. Scrubbing
    # punctuation first would destroy the "." inside decimals and the angle
    # brackets of the placeholders before they could be matched.
    return re.findall(
        r"<(?:EMAIL|URL|PHONE)>"       # placeholders inserted above
        r"|\d+\.\d+"                   # decimal numbers
        r"|\w+(?:[-'\u2019]\w+)*",     # words, hyphenated words, contractions
        text,
    )

# Tokenize the Question 1 paragraph with the custom tokenizer and show the result.
custom_tokens = custom_tokenize(text=original_text)
print("Custom Tokens:", custom_tokens)


