In [3]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag
import os

# Register a project-local NLTK data directory (./nltk_data under the current
# working directory) so resources stored there are found by nltk.data.find().
nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)

# Fetch the tokenizer tables and the English tagger model only when they are
# not already installed somewhere on nltk.data.path. An unconditional
# nltk.download() contacts the download index on every run even when the data
# is already present.
# NOTE(review): these appear to be the resource names newer NLTK releases look
# up; the legacy names are handled separately further down — confirm against
# the installed NLTK version.
for _package, _resource in (
    ('punkt_tab', 'tokenizers/punkt_tab'),
    ('averaged_perceptron_tagger_eng', 'taggers/averaged_perceptron_tagger_eng'),
):
    try:
        nltk.data.find(_resource)
    except LookupError:
        # Same destination as before: NLTK's default download directory.
        nltk.download(_package)

# Function to check and download NLTK resources if missing
def ensure_nltk_resource(resource_name, resource_path):
    """Make sure an NLTK resource is installed, downloading it if absent.

    Args:
        resource_name: Package identifier handed to ``nltk.download``
            (for example ``'stopwords'``).
        resource_path: Lookup path handed to ``nltk.data.find``
            (for example ``'corpora/stopwords'``).

    Returns:
        ``True`` when the resource was already present or the download
        reported success, ``False`` when the download reported failure.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Not installed: fetch into the project-local data directory.
        # The download outcome is passed back to the caller instead of being
        # discarded, so a failed fetch no longer goes unnoticed.
        return bool(nltk.download(resource_name, download_dir=nltk_data_path))
    return True

# Resources this script relies on, as (download package id, lookup path)
# pairs; each one is checked in order and downloaded only if it is missing.
for _package_id, _lookup_path in (
    ('punkt', 'tokenizers/punkt'),
    ('stopwords', 'corpora/stopwords'),
    ('averaged_perceptron_tagger', 'taggers/averaged_perceptron_tagger'),
):
    ensure_nltk_resource(_package_id, _lookup_path)

# Input: two sentences used to demonstrate each preprocessing step.
text = "NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources."

# Step 1: split the text into word-level tokens.
words = word_tokenize(text)
print("Words Tokenization:", words, "", sep="\n")

# Step 2: split the text into sentences.
sentences = sent_tokenize(text)
print("Sentence Tokenization:", sentences, "", sep="\n")

# Step 3: drop English stop words (compared case-insensitively).
stop_words = set(stopwords.words('english'))
filtered_words = list(filter(lambda token: token.lower() not in stop_words, words))
print("After Stopword Removal:", filtered_words, "", sep="\n")

# Step 4: reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
print("After Stemming:", stemmed_words, "", sep="\n")

# Step 5: part-of-speech tag the stop-word-filtered tokens.
# NOTE(review): the tagger sees the sequence without stop words, so it has
# less surrounding context than it would on `words` — confirm this is intended.
pos_tags = pos_tag(filtered_words)
print("Parts of Speech Tagging:", pos_tags, sep="\n")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Words Tokenization:
['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data', '.', 'It', 'provides', 'easy-to-use', 'interfaces', 'to', 'over', '50', 'corpora', 'and', 'lexical', 'resources', '.']

Sentence Tokenization:
['NLTK is a leading platform for building Python programs to work with human language data.', 'It provides easy-to-use interfaces to over 50 corpora and lexical resources.']

After Stopword Removal:
['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human', 'language', 'data', '.', 'provides', 'easy-to-use', 'interfaces', '50', 'corpora', 'lexical', 'resources', '.']

After Stemming:
['nltk', 'lead', 'platform', 'build', 'python', 'program', 'work', 'human', 'languag', 'data', '.', 'provid', 'easy-to-us', 'interfac', '50', 'corpora', 'lexic', 'resourc', '.']

Parts of Speech Tagging:
[('NLTK', 'NNP'), ('leading', 'VBG'), ('platform', 'NN'), ('building', 'NN'), ('Python'