# Task 2: Tokenization using NLTK

In [12]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# word_tokenize / sent_tokenize load the Punkt sentence model at call time.
# Older NLTK releases read the pickled 'punkt' resource; newer releases look up
# 'punkt_tab' instead and raise LookupError when it is missing. Requesting both
# keeps this cell runnable regardless of the installed NLTK version.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

text = "Natural Language Processing is a branch of artificial intelligence that focuses on the interaction between computers and humans through natural language. Text Classification: Automatically categorizing text documents into predefined categories."
# Word Tokenization: split the text into word and punctuation tokens
word_tokens = word_tokenize(text)
print("Word Tokens:")
print(word_tokens)

# Sentence Tokenization: split the text into sentence strings
sentence_tokens = sent_tokenize(text)
print("\nSentence Tokens:")
print(sentence_tokens)


Word Tokens:
['Natural', 'Language', 'Processing', 'is', 'a', 'branch', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'natural', 'language', '.', 'Text', 'Classification', ':', 'Automatically', 'categorizing', 'text', 'documents', 'into', 'predefined', 'categories', '.']

Sentence Tokens:
['Natural Language Processing is a branch of artificial intelligence that focuses on the interaction between computers and humans through natural language.', 'Text Classification: Automatically categorizing text documents into predefined categories.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Task 3: Stop Words Removal using NLTK

In [14]:
# Import nltk here as well: this cell calls nltk.download(), so without the
# import it only works if an earlier cell already put `nltk` in the kernel.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')

text = "Stop words are common words that are often filtered out during the preprocessing of text data because they do not carry significant meaning."

# Tokenize the text
tokens = word_tokenize(text)

# Load English stop words into a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Remove stop words. Each token is lower-cased only for the comparison, so
# surviving tokens keep their original casing. Punctuation tokens are not
# stop words and therefore pass through unchanged.
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

print("Filtered Tokens after Stop Words Removal:")
print(filtered_tokens)


Filtered Tokens after Stop Words Removal:
['Stop', 'words', 'common', 'words', 'often', 'filtered', 'preprocessing', 'text', 'data', 'carry', 'significant', 'meaning', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Task 4: Stemming and Lemmatization using NLTK

In [8]:
# Import nltk here as well: this cell calls nltk.download(), so without the
# import it only works if an earlier cell already put `nltk` in the kernel.
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
text = "Stemming and lemmatization are both techniques used in NLP for reducing words to their base or root form."

# Tokenize the text
tokens = word_tokenize(text)

# Initialize Stemmer and Lemmatizer
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# Perform Stemming: rule-based suffix stripping, so results need not be
# dictionary words.
stemmed_words = [porter_stemmer.stem(word) for word in tokens]

print("Stemmed Words:")
print(stemmed_words)

# Perform Lemmatization. lemmatize() is called without a part-of-speech
# argument, so every token is looked up as a noun (the default pos); verb
# forms are therefore returned unchanged unless pos='v' is supplied.
lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for word in tokens]

print("\nLemmatized Words:")
print(lemmatized_words)


[nltk_data] Downloading package wordnet to /root/nltk_data...


Stemmed Words:
['stem', 'and', 'lemmat', 'are', 'both', 'techniqu', 'use', 'in', 'nlp', 'for', 'reduc', 'word', 'to', 'their', 'base', 'or', 'root', 'form', '.']

Lemmatized Words:
['Stemming', 'and', 'lemmatization', 'are', 'both', 'technique', 'used', 'in', 'NLP', 'for', 'reducing', 'word', 'to', 'their', 'base', 'or', 'root', 'form', '.']
