In [1]:
import nltk
import string
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download required resources
nltk.download('punkt', download_dir='./nltk_data')
nltk.download('stopwords', download_dir='./nltk_data')
nltk.download('averaged_perceptron_tagger', download_dir='./nltk_data')
nltk.download('wordnet', download_dir='./nltk_data')
nltk.download('omw-1.4', download_dir='./nltk_data')  # for lemmatizer

# Input text
text = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife."
text = text.lower()
print("Lowercased Text:\n", text)

# Remove punctuation
print("Punctuation characters:\n", string.punctuation)
text_p = "".join([char for char in text if char not in string.punctuation])
print("\nText without punctuation:\n", text_p)

# Tokenization
words = word_tokenize(text_p)
sentences = sent_tokenize(text_p)
print("\nWord Tokens:\n", words)
print("\nSentence Tokens:\n", sentences)

# Stopwords removal
stop_words = stopwords.words('english')
filtered_words = [word for word in words if word not in stop_words]
print("\nFiltered Words (no stopwords):\n", filtered_words)

# Stemming
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in filtered_words]
print("\nStemmed Words:\n", stemmed)

# POS tagging
pos = pos_tag(filtered_words)
print("\nPOS Tags:\n", pos)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in filtered_words]
print("\nLemmatized Words:\n", lemmatized)

# TF-IDF Calculation
corpus = [text]
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

terms = tfidf_vectorizer.get_feature_names_out()
print("\nTF-IDF Terms:\n", terms)

tfidf_values = tfidf_matrix.toarray()
print("\nTF-IDF Values:\n", tfidf_values)


[nltk_data] Downloading package punkt to ./nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     ./nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to ./nltk_data...
[nltk_data] Downloading package omw-1.4 to ./nltk_data...


Lowercased Text:
 it is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.
Punctuation characters:
 !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

Text without punctuation:
 it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife

Word Tokens:
 ['it', 'is', 'a', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man', 'in', 'possession', 'of', 'a', 'good', 'fortune', 'must', 'be', 'in', 'want', 'of', 'a', 'wife']

Sentence Tokens:
 ['it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife']

Filtered Words (no stopwords):
 ['truth', 'universally', 'acknowledged', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife']

Stemmed Words:
 ['truth', 'univers', 'acknowledg', 'singl', 'man', 'possess', 'good', 'fortun', 'must', 'want', 'wife']

POS Tags:
 [('truth', 'NN'), ('universally', 'RB'), ('a

In [2]:
import nltk
import string
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Sample text
text = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife."
text = text.lower()

# Remove punctuation
text_p = "".join([char for char in text if char not in string.punctuation])

# Tokenize
words = word_tokenize(text_p)
sentences = sent_tokenize(text_p)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

# Stemming
stemmer = PorterStemmer()
stemmed = [stemmer.stem(word) for word in filtered_words]

# POS tagging
pos_tags = pos_tag(filtered_words)

# Function to map POS tags to WordNet format
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

# Lemmatization with POS
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]

# TF-IDF
corpus = [text]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
terms = vectorizer.get_feature_names_out()

# Output
print("Original Text:\n", text)
print("\nFiltered Words:\n", filtered_words)
print("\nStemmed Words:\n", stemmed)
print("\nPOS Tags:\n", pos_tags)
print("\nLemmatized Words:\n", lemmatized)
print("\nTF-IDF Terms:\n", terms)
print("\nTF-IDF Values:\n", X.toarray())


[nltk_data] Downloading package punkt to C:\Users\Rahul
[nltk_data]     Wanjare\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Rahul
[nltk_data]     Wanjare\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rahul Wanjare\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Rahul
[nltk_data]     Wanjare\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Rahul
[nltk_data]     Wanjare\AppData\Roaming\nltk_data...


Original Text:
 it is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.

Filtered Words:
 ['truth', 'universally', 'acknowledged', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife']

Stemmed Words:
 ['truth', 'univers', 'acknowledg', 'singl', 'man', 'possess', 'good', 'fortun', 'must', 'want', 'wife']

POS Tags:
 [('truth', 'NN'), ('universally', 'RB'), ('acknowledged', 'VBD'), ('single', 'JJ'), ('man', 'NN'), ('possession', 'NN'), ('good', 'JJ'), ('fortune', 'NN'), ('must', 'MD'), ('want', 'VB'), ('wife', 'NN')]

Lemmatized Words:
 ['truth', 'universally', 'acknowledge', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife']

TF-IDF Terms:
 ['acknowledged' 'fortune' 'good' 'man' 'possession' 'single' 'truth'
 'universally' 'want' 'wife']

TF-IDF Values:
 [[0.31622777 0.31622777 0.31622777 0.31622777 0.31622777 0.31622777
  0.31622777 0.31622777 0.31622777 0.31622777]]
