In [1]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

In [2]:
text = """Technology has drastically changed the way we live, work, and communicate. 
From the internet to smartphones, technology is constantly evolving and shaping our daily lives. 
The future of technology holds immense potential with advancements in AI, robotics, and quantum computing. 
We are only scratching the surface of what technology can achieve."""

In [3]:
# Q1.1: Convert text to lowercase and remove punctuation
text_lower = text.lower()
text_clean = text_lower.translate(str.maketrans('', '', string.punctuation))

In [4]:

# Q1.2: Tokenize the text into words and sentences
words = word_tokenize(text_clean)
sentences = sent_tokenize(text_clean)

In [5]:
# Q1.3: Remove stopwords using NLTK's stopwords list
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prabh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [6]:

# Q1.4: Word frequency distribution (excluding stopwords)
word_freq = Counter(filtered_words)
print("Q1.4 - Word Frequency Distribution:", word_freq)

Q1.4 - Word Frequency Distribution: Counter({'technology': 4, 'drastically': 1, 'changed': 1, 'way': 1, 'live': 1, 'work': 1, 'communicate': 1, 'internet': 1, 'smartphones': 1, 'constantly': 1, 'evolving': 1, 'shaping': 1, 'daily': 1, 'lives': 1, 'future': 1, 'holds': 1, 'immense': 1, 'potential': 1, 'advancements': 1, 'ai': 1, 'robotics': 1, 'quantum': 1, 'computing': 1, 'scratching': 1, 'surface': 1, 'achieve': 1})


In [7]:
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem import WordNetLemmatizer

In [8]:
# Q2.1: Take tokenized words after stopword removal
# (filtered_words already done in Q1)

# Q2.2: Apply stemming using NLTK's PorterStemmer and LancasterStemmer
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

porter_stemmed = [porter_stemmer.stem(word) for word in filtered_words]
lancaster_stemmed = [lancaster_stemmer.stem(word) for word in filtered_words]

In [9]:
# Q2.2: Apply stemming using NLTK's PorterStemmer and LancasterStemmer
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

porter_stemmed = [porter_stemmer.stem(word) for word in filtered_words]
lancaster_stemmed = [lancaster_stemmer.stem(word) for word in filtered_words]

In [10]:
# Q2.3: Apply lemmatization using NLTK's WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prabh\AppData\Roaming\nltk_data...


In [11]:
# Q2.4: Compare and display results
print("Q2.4 - Original Words:", filtered_words[:10])
print("Q2.4 - Porter Stemmed:", porter_stemmed[:10])
print("Q2.4 - Lancaster Stemmed:", lancaster_stemmed[:10])
print("Q2.4 - Lemmatized Words:", lemmatized_words[:10])

Q2.4 - Original Words: ['technology', 'drastically', 'changed', 'way', 'live', 'work', 'communicate', 'internet', 'smartphones', 'technology']
Q2.4 - Porter Stemmed: ['technolog', 'drastic', 'chang', 'way', 'live', 'work', 'commun', 'internet', 'smartphon', 'technolog']
Q2.4 - Lancaster Stemmed: ['technolog', 'drast', 'chang', 'way', 'liv', 'work', 'commun', 'internet', 'smartphon', 'technolog']
Q2.4 - Lemmatized Words: ['technology', 'drastically', 'changed', 'way', 'live', 'work', 'communicate', 'internet', 'smartphones', 'technology']


In [12]:
import re

# Q3.1: Original text from Q1 (text_clean)
# (Using text_clean from Q1)

# Q3.2a: Extract words with more than 5 letters
words_longer_than_5 = re.findall(r'\b\w{6,}\b', text_clean)

# Q3.2b: Extract all numbers (if any exist)
numbers = re.findall(r'\b\d+\b', text_clean)

# Q3.2c: Extract all capitalized words
capitalized_words = re.findall(r'\b[A-Z][a-z]*\b', text_clean)

# Q3.3a: Split text into words containing only alphabets (remove digits and special characters)
words_only_alpha = re.findall(r'\b[a-zA-Z]+\b', text_clean)

# Q3.3b: Extract words starting with a vowel
words_starting_with_vowel = re.findall(r'\b[aeiouAEIOU]\w*', text_clean)

# Display the results
print("Q3.2a - Words longer than 5 letters:", words_longer_than_5)
print("Q3.2b - Numbers:", numbers)
print("Q3.2c - Capitalized words:", capitalized_words)
print("Q3.3a - Words with only alphabets:", words_only_alpha)
print("Q3.3b - Words starting with a vowel:", words_starting_with_vowel)

Q3.2a - Words longer than 5 letters: ['technology', 'drastically', 'changed', 'communicate', 'internet', 'smartphones', 'technology', 'constantly', 'evolving', 'shaping', 'future', 'technology', 'immense', 'potential', 'advancements', 'robotics', 'quantum', 'computing', 'scratching', 'surface', 'technology', 'achieve']
Q3.2b - Numbers: []
Q3.2c - Capitalized words: []
Q3.3a - Words with only alphabets: ['technology', 'has', 'drastically', 'changed', 'the', 'way', 'we', 'live', 'work', 'and', 'communicate', 'from', 'the', 'internet', 'to', 'smartphones', 'technology', 'is', 'constantly', 'evolving', 'and', 'shaping', 'our', 'daily', 'lives', 'the', 'future', 'of', 'technology', 'holds', 'immense', 'potential', 'with', 'advancements', 'in', 'ai', 'robotics', 'and', 'quantum', 'computing', 'we', 'are', 'only', 'scratching', 'the', 'surface', 'of', 'what', 'technology', 'can', 'achieve']
Q3.3b - Words starting with a vowel: ['and', 'internet', 'is', 'evolving', 'and', 'our', 'of', 'immense

In [13]:
# Q4.1: Original text from Q1
# (Using text from Q1)

# Q4.2: Custom Tokenization Function
def custom_tokenizer(text):
    text = re.sub(r"([a-zA-Z])'([a-zA-Z])", r"\1'\2", text)  # Keeps contractions
    text = re.sub(r"\b[\w'-]+(?:-\w+)*\b", r" \g<0> ", text)  # Keeps hyphenated words as single tokens
    text = re.sub(r'\d+\.\d+', r'NUMBER', text)  # Keeps decimal numbers intact
    text_clean = re.sub(r'[^a-zA-Z0-9\'- ]', ' ', text)  # Remove other punctuation
    return text_clean.split()

In [None]:
# Q4.3: Regex Substitutions - using the random URL from the text
def regex_substitutions(text):
   
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '<EMAIL>', text)  
   
    text = re.sub(r'https?://\S+', '<URL>', text)  
    # Replace phone numbers (formats: 123-456-7890 or +91 9876543210)
    text = re.sub(r'\+?\d{1,3}\s?\(?\d{1,4}\)?\s?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,4}', '<PHONE>', text)  
    return text

# Use the functions to process the text
custom_tokens = custom_tokenizer(text)
cleaned_text = regex_substitutions(text)

# Display the results
print("Q4.2 - Custom Tokenized Text:", custom_tokens[:10])  # First 10 tokens for brevity
print("Q4.3 - Cleaned Text:", cleaned_text)