In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer , WordNetLemmatizer
from nltk import pos_tag

import string

In [2]:
sample_doc = "Text analytics is the process of analyzing unstructured text data to derive meaningful insights and patterns."

In [3]:
tokens = word_tokenize(sample_doc)

In [4]:
tokens

['Text',
 'analytics',
 'is',
 'the',
 'process',
 'of',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'to',
 'derive',
 'meaningful',
 'insights',
 'and',
 'patterns',
 '.']

In [5]:
pos_tags= pos_tag(tokens) 

In [6]:
pos_tags

[('Text', 'NN'),
 ('analytics', 'NNS'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('process', 'NN'),
 ('of', 'IN'),
 ('analyzing', 'VBG'),
 ('unstructured', 'JJ'),
 ('text', 'NN'),
 ('data', 'NNS'),
 ('to', 'TO'),
 ('derive', 'VB'),
 ('meaningful', 'JJ'),
 ('insights', 'NNS'),
 ('and', 'CC'),
 ('patterns', 'NNS'),
 ('.', '.')]

In [7]:
stopWords = set(stopwords.words('english'))

In [15]:
filtered_tokens =  [word for word in tokens if word.lower() not in stopWords ]

In [16]:
filtered_tokens

['Text',
 'analytics',
 'process',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'derive',
 'meaningful',
 'insights',
 'patterns',
 '.']

In [21]:
stemmer = PorterStemmer()

In [22]:
stemmed_words = [ stemmer.stem(word) for word in filtered_tokens]

In [23]:
stemmed_words

['text',
 'analyt',
 'process',
 'analyz',
 'unstructur',
 'text',
 'data',
 'deriv',
 'meaning',
 'insight',
 'pattern',
 '.']

In [26]:
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

In [28]:
lemmatized_tokens

['Text',
 'analytics',
 'process',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'derive',
 'meaningful',
 'insight',
 'pattern',
 '.']

In [30]:
processed_doc = ''.join(lemmatized_tokens)

In [31]:
processed_doc

'Textanalyticsprocessanalyzingunstructuredtextdataderivemeaningfulinsightpattern.'

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([processed_doc])

In [34]:
tfidf_matrix

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [35]:
feature_names = tfidf_vectorizer.get_feature_names_out()

In [36]:
feature_names

array(['textanalyticsprocessanalyzingunstructuredtextdataderivemeaningfulinsightpattern'],
      dtype=object)

In [45]:
for col in tfidf_matrix.nonzero()[1]:
    print(f"{feature_names[col]}: {tfidf_matrix[0, col]}")

textanalyticsprocessanalyzingunstructuredtextdataderivemeaningfulinsightpattern: 1.0
