In [1]:
import string

import nltk
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Sample document: read the whole file into a single string.
# The encoding is pinned so the text decodes identically on every platform
# instead of depending on the locale default.
# NOTE(review): assumes sample.txt is UTF-8 (or plain ASCII) -- confirm.
file_path = 'sample.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

In [4]:
# Wrap the raw document in a one-row DataFrame with a single 'text' column.
df = pd.DataFrame({'text': [text]})

In [5]:
# Pull the document string back out of the frame: row label 0, column 'text'.
sample_doc = df.loc[0, 'text']

In [6]:
# Tokenization: split the document into word and punctuation tokens.
tokens = nltk.word_tokenize(sample_doc)
tokens

['Text',
 'analytics',
 'is',
 'the',
 'process',
 'of',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'to',
 'derive',
 'meaningful',
 'insights',
 '.',
 'It',
 'involves',
 'various',
 'preprocessing',
 'steps',
 'such',
 'as',
 'tokenization',
 ',',
 'POS',
 'tagging',
 ',',
 'stop',
 'words',
 'removal',
 ',',
 'stemming',
 ',',
 'and',
 'lemmatization',
 '.',
 'Once',
 'the',
 'text',
 'is',
 'preprocessed',
 ',',
 'we',
 'can',
 'calculate',
 'term',
 'frequency',
 'and',
 'inverse',
 'document',
 'frequency',
 'to',
 'represent',
 'the',
 'document',
 '.']

In [7]:
# POS tagging: pair every token with its part-of-speech tag.
pos_tags = nltk.pos_tag(tokens)
pos_tags

[('Text', 'NN'),
 ('analytics', 'NNS'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('process', 'NN'),
 ('of', 'IN'),
 ('analyzing', 'VBG'),
 ('unstructured', 'JJ'),
 ('text', 'NN'),
 ('data', 'NNS'),
 ('to', 'TO'),
 ('derive', 'VB'),
 ('meaningful', 'JJ'),
 ('insights', 'NNS'),
 ('.', '.'),
 ('It', 'PRP'),
 ('involves', 'VBZ'),
 ('various', 'JJ'),
 ('preprocessing', 'VBG'),
 ('steps', 'NNS'),
 ('such', 'JJ'),
 ('as', 'IN'),
 ('tokenization', 'NN'),
 (',', ','),
 ('POS', 'NNP'),
 ('tagging', 'NN'),
 (',', ','),
 ('stop', 'VB'),
 ('words', 'NNS'),
 ('removal', 'JJ'),
 (',', ','),
 ('stemming', 'VBG'),
 (',', ','),
 ('and', 'CC'),
 ('lemmatization', 'NN'),
 ('.', '.'),
 ('Once', 'IN'),
 ('the', 'DT'),
 ('text', 'NN'),
 ('is', 'VBZ'),
 ('preprocessed', 'VBN'),
 (',', ','),
 ('we', 'PRP'),
 ('can', 'MD'),
 ('calculate', 'VB'),
 ('term', 'NN'),
 ('frequency', 'NN'),
 ('and', 'CC'),
 ('inverse', 'JJ'),
 ('document', 'NN'),
 ('frequency', 'NN'),
 ('to', 'TO'),
 ('represent', 'VB'),
 ('the', 'DT'),
 ('document', 'NN'),
 ('.', '.')]

In [8]:
# Stop-word removal: keep only tokens whose lowercase form is not an English
# stop word. Punctuation tokens are not stop words, so they pass through.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)
filtered_tokens

['Text',
 'analytics',
 'process',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'derive',
 'meaningful',
 'insights',
 '.',
 'involves',
 'various',
 'preprocessing',
 'steps',
 'tokenization',
 ',',
 'POS',
 'tagging',
 ',',
 'stop',
 'words',
 'removal',
 ',',
 'stemming',
 ',',
 'lemmatization',
 '.',
 'text',
 'preprocessed',
 ',',
 'calculate',
 'term',
 'frequency',
 'inverse',
 'document',
 'frequency',
 'represent',
 'document',
 '.']

In [9]:
# Stemming: reduce every filtered token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
stemmed_tokens

['text',
 'analyt',
 'process',
 'analyz',
 'unstructur',
 'text',
 'data',
 'deriv',
 'meaning',
 'insight',
 '.',
 'involv',
 'variou',
 'preprocess',
 'step',
 'token',
 ',',
 'po',
 'tag',
 ',',
 'stop',
 'word',
 'remov',
 ',',
 'stem',
 ',',
 'lemmat',
 '.',
 'text',
 'preprocess',
 ',',
 'calcul',
 'term',
 'frequenc',
 'invers',
 'document',
 'frequenc',
 'repres',
 'document',
 '.']

In [10]:
# Lemmatization: map every filtered token to its WordNet lemma.
# lemmatize() is called without a POS argument here, and in the displayed
# result plural nouns are reduced ('insights' -> 'insight') while verb forms
# such as 'analyzing' and 'involves' come back unchanged.
# NOTE(review): pos_tags is computed earlier but not used here; passing a POS
# per token would presumably lemmatize the verbs as well -- confirm if wanted.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
lemmatized_tokens

['Text',
 'analytics',
 'process',
 'analyzing',
 'unstructured',
 'text',
 'data',
 'derive',
 'meaningful',
 'insight',
 '.',
 'involves',
 'various',
 'preprocessing',
 'step',
 'tokenization',
 ',',
 'POS',
 'tagging',
 ',',
 'stop',
 'word',
 'removal',
 ',',
 'stemming',
 ',',
 'lemmatization',
 '.',
 'text',
 'preprocessed',
 ',',
 'calculate',
 'term',
 'frequency',
 'inverse',
 'document',
 'frequency',
 'represent',
 'document',
 '.']

In [11]:
# Join the lemmatized tokens into a single space-separated string, which is
# the document form handed to the vectorizer in the TF-IDF step.
# Punctuation tokens are still present, so they appear in the joined text.
preprocessed_text = " ".join(lemmatized_tokens)
preprocessed_text

'Text analytics process analyzing unstructured text data derive meaningful insight . involves various preprocessing step tokenization , POS tagging , stop word removal , stemming , lemmatization . text preprocessed , calculate term frequency inverse document frequency represent document .'

In [12]:
# TF-IDF: fit the vectorizer on a one-document corpus and transform it.
# With a single document every term shares the same IDF, so the resulting
# weights are simply the L2-normalised term counts (a term occurring three
# times gets three times the weight of a term occurring once).
vectorizer = TfidfVectorizer()
corpus = [preprocessed_text]
tfidf_matrix = vectorizer.fit_transform(corpus)
tfidf_matrix

<1x28 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [13]:
# Get the feature names (vocabulary terms) learned by the fitted vectorizer.
# Position i in this array is used as column i of tfidf_matrix when the
# weights are printed further down.
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['analytics', 'analyzing', 'calculate', 'data', 'derive',
       'document', 'frequency', 'insight', 'inverse', 'involves',
       'lemmatization', 'meaningful', 'pos', 'preprocessed',
       'preprocessing', 'process', 'removal', 'represent', 'stemming',
       'step', 'stop', 'tagging', 'term', 'text', 'tokenization',
       'unstructured', 'various', 'word'], dtype=object)

In [14]:
# Print the preprocessed text (the space-joined lemmatized tokens) with a label
print("Preprocessed Text:", preprocessed_text)

Preprocessed Text: Text analytics process analyzing unstructured text data derive meaningful insight . involves various preprocessing step tokenization , POS tagging , stop word removal , stemming , lemmatization . text preprocessed , calculate term frequency inverse document frequency represent document .


In [15]:
# Print the TF-IDF representation: one "term: weight" line per vocabulary term.
print("\nTF-IDF Representation:")
# Densify the single document row once, then walk terms and weights together
# instead of indexing the sparse matrix element by element.
weights = tfidf_matrix.toarray()[0]
for feature, weight in zip(feature_names, weights):
    print(f"{feature}: {weight}")


TF-IDF Representation:
analytics: 0.1543033499620919
analyzing: 0.1543033499620919
calculate: 0.1543033499620919
data: 0.1543033499620919
derive: 0.1543033499620919
document: 0.3086066999241838
frequency: 0.3086066999241838
insight: 0.1543033499620919
inverse: 0.1543033499620919
involves: 0.1543033499620919
lemmatization: 0.1543033499620919
meaningful: 0.1543033499620919
pos: 0.1543033499620919
preprocessed: 0.1543033499620919
preprocessing: 0.1543033499620919
process: 0.1543033499620919
removal: 0.1543033499620919
represent: 0.1543033499620919
stemming: 0.1543033499620919
step: 0.1543033499620919
stop: 0.1543033499620919
tagging: 0.1543033499620919
term: 0.1543033499620919
text: 0.4629100498862757
tokenization: 0.1543033499620919
unstructured: 0.1543033499620919
various: 0.1543033499620919
word: 0.1543033499620919
