In [25]:
!pip install nltk
!pip install scikit-learn



In [26]:
import nltk

In [27]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Srushti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [35]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Srushti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Srushti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [41]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Srushti\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [42]:
doc = "Srushti is learning text analytics. She loves working with data and solving problems."

In [43]:
# Tokenization
tokens = word_tokenize(doc)
print("Tokens:", tokens)

Tokens: ['Srushti', 'is', 'learning', 'text', 'analytics', '.', 'She', 'loves', 'working', 'with', 'data', 'and', 'solving', 'problems', '.']


In [44]:
# POS Tagging
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('Srushti', 'NNP'), ('is', 'VBZ'), ('learning', 'VBG'), ('text', 'JJ'), ('analytics', 'NNS'), ('.', '.'), ('She', 'PRP'), ('loves', 'VBZ'), ('working', 'VBG'), ('with', 'IN'), ('data', 'NNS'), ('and', 'CC'), ('solving', 'VBG'), ('problems', 'NNS'), ('.', '.')]


In [45]:
# Stop Words Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("After Stop Words Removal:", filtered_tokens)

After Stop Words Removal: ['Srushti', 'learning', 'text', 'analytics', '.', 'loves', 'working', 'data', 'solving', 'problems', '.']


In [46]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['srushti', 'learn', 'text', 'analyt', '.', 'love', 'work', 'data', 'solv', 'problem', '.']


In [54]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: ['Srushti', 'learning', 'text', 'analytics', '.', 'love', 'working', 'data', 'solving', 'problem', '.']


In [55]:
# TF & IDF (Term Frequency & Inverse Document Frequency)
# corpus is a list or collection of text documents
corpus = [
    "Srushti is learning text analytics.",
    "She loves working with data.",
    "Analytics helps in solving problems."
]

In [62]:
# vectorizer is a tool that changes text into numbers (vectors) so machines can work with the text.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [63]:
# Show TF-IDF scores
print("\nTF-IDF Matrix:")
print(X.toarray())


TF-IDF Matrix:
[[0.35543247 0.         0.         0.         0.46735098 0.46735098
  0.         0.         0.         0.         0.46735098 0.46735098
  0.         0.        ]
 [0.         0.4472136  0.         0.         0.         0.
  0.4472136  0.         0.4472136  0.         0.         0.
  0.4472136  0.4472136 ]
 [0.35543247 0.         0.46735098 0.46735098 0.         0.
  0.         0.46735098 0.         0.46735098 0.         0.
  0.         0.        ]]


In [64]:
# returns the list of all unique words (features) that the vectorizer has learned from your text data.
print("Feature Names (Words):", vectorizer.get_feature_names_out())

Feature Names (Words): ['analytics' 'data' 'helps' 'in' 'is' 'learning' 'loves' 'problems' 'she'
 'solving' 'srushti' 'text' 'with' 'working']
