# Task 2: Tokenization using NLTK:

In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Punkt tokenizer models used by word_tokenize / sent_tokenize.
nltk.download('punkt')
# Newer NLTK releases (3.8.2+) load the Punkt models from the separate
# 'punkt_tab' resource; without it the tokenizers below raise LookupError on a
# fresh environment. Fetched quietly so the download log stays the same as
# before on setups where 'punkt' alone was sufficient.
nltk.download('punkt_tab', quiet=True)

text = "Tokenization is the process of breaking down a text into smaller units, typically words or sentences, called tokens. In NLP, tokenization is a fundamental step in preprocessing text data."

# Word tokenization: split the text into word and punctuation tokens.
word_tokens = word_tokenize(text)
print("Word Tokens:")
print(word_tokens)

# Sentence tokenization: split the same text into sentences.
sentence_tokens = sent_tokenize(text)
print("\nSentence Tokens:")
print(sentence_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Word Tokens:
['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'a', 'text', 'into', 'smaller', 'units', ',', 'typically', 'words', 'or', 'sentences', ',', 'called', 'tokens', '.', 'In', 'NLP', ',', 'tokenization', 'is', 'a', 'fundamental', 'step', 'in', 'preprocessing', 'text', 'data', '.']

Sentence Tokens:
['Tokenization is the process of breaking down a text into smaller units, typically words or sentences, called tokens.', 'In NLP, tokenization is a fundamental step in preprocessing text data.']


# Task 3: Stop Words Removal using NLTK:

In [3]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the stop-word corpus read by stopwords.words() below.
# NOTE: relies on `nltk` having been imported by an earlier cell.
nltk.download('stopwords')

text = "Tokenization is the process of breaking down a text into smaller units, typically words or sentences, called tokens."

# Split the sentence into word-level tokens
tokens = word_tokenize(text)

# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Keep only tokens whose lowercased form is not a stop word
# (the comparison is case-insensitive; the kept token retains its original case)
filtered_tokens = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)

print("Filtered Tokens after Stop Words Removal:", filtered_tokens, sep="\n")


Filtered Tokens after Stop Words Removal:
['Tokenization', 'process', 'breaking', 'text', 'smaller', 'units', ',', 'typically', 'words', 'sentences', ',', 'called', 'tokens', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Task 4: Stemming and Lemmatization using NLTK:

In [6]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# WordNet data backs the lemmatizer used below.
# NOTE: relies on `nltk` having been imported by an earlier cell.
nltk.download('wordnet')

text = "Tokenization is the process of breaking down a text into smaller units, typically words or sentences, called tokens."

# Break the sentence into word tokens
tokens = word_tokenize(text)

# One stemmer and one lemmatizer instance, reused for every token
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# Stemming: apply the Porter stemmer to each token in order
stemmed_words = list(map(porter_stemmer.stem, tokens))
print("Stemmed Words:", stemmed_words, sep="\n")

# Lemmatization: no part-of-speech argument is passed, so each token is
# lemmatized with the lemmatizer's default POS
lemmatized_words = list(map(wordnet_lemmatizer.lemmatize, tokens))
print("\nLemmatized Words:", lemmatized_words, sep="\n")


[nltk_data] Downloading package wordnet to /root/nltk_data...


Stemmed Words:
['token', 'is', 'the', 'process', 'of', 'break', 'down', 'a', 'text', 'into', 'smaller', 'unit', ',', 'typic', 'word', 'or', 'sentenc', ',', 'call', 'token', '.']

Lemmatized Words:
['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'a', 'text', 'into', 'smaller', 'unit', ',', 'typically', 'word', 'or', 'sentence', ',', 'called', 'token', '.']
