In [None]:
# Text Analytics
    # 1. Extract Sample document and apply following document preprocessing methods:
    # Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
    # 2. Create representation of document by calculating Term Frequency and Inverse Document
    # Frequency.

In [None]:
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nkolh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nkolh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [6]:
text = """Text analytics is the process of deriving useful insights from text data. 
It involves techniques like Tokenization, POS Tagging, Stop Words Removal, Stemming, and Lemmatization."""

In [24]:
sentence = sent_tokenize(text)
print("Tokenized sentence:", sentence)

Tokenized sentence: ['Text analytics is the process of deriving useful insights from text data.', 'It involves techniques like Tokenization, POS Tagging, Stop Words Removal, Stemming, and Lemmatization.']


In [25]:
words = word_tokenize(text)
print("Tokenized Words:", words)


Tokenized Words: ['Text', 'analytics', 'is', 'the', 'process', 'of', 'deriving', 'useful', 'insights', 'from', 'text', 'data', '.', 'It', 'involves', 'techniques', 'like', 'Tokenization', ',', 'POS', 'Tagging', ',', 'Stop', 'Words', 'Removal', ',', 'Stemming', ',', 'and', 'Lemmatization', '.']


In [26]:
pos_tags = pos_tag(words)
print("POS Tags:", pos_tags)


POS Tags: [('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('deriving', 'VBG'), ('useful', 'JJ'), ('insights', 'NNS'), ('from', 'IN'), ('text', 'NN'), ('data', 'NNS'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('techniques', 'NNS'), ('like', 'IN'), ('Tokenization', 'NNP'), (',', ','), ('POS', 'NNP'), ('Tagging', 'NNP'), (',', ','), ('Stop', 'NNP'), ('Words', 'NNP'), ('Removal', 'NNP'), (',', ','), ('Stemming', 'NNP'), (',', ','), ('and', 'CC'), ('Lemmatization', 'NNP'), ('.', '.')]


In [27]:
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in words if word.lower() not in stop_words and word not in string.punctuation]
print("Tokens after Stopwords Removal:", filtered_tokens)

Tokens after Stopwords Removal: ['Text', 'analytics', 'process', 'deriving', 'useful', 'insights', 'text', 'data', 'involves', 'techniques', 'like', 'Tokenization', 'POS', 'Tagging', 'Stop', 'Words', 'Removal', 'Stemming', 'Lemmatization']


In [28]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed Words:", stemmed_words)


Stemmed Words: ['text', 'analyt', 'process', 'deriv', 'use', 'insight', 'text', 'data', 'involv', 'techniqu', 'like', 'token', 'po', 'tag', 'stop', 'word', 'remov', 'stem', 'lemmat']


In [29]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Words:", lemmatized_words)


Lemmatized Words: ['Text', 'analytics', 'process', 'deriving', 'useful', 'insight', 'text', 'data', 'involves', 'technique', 'like', 'Tokenization', 'POS', 'Tagging', 'Stop', 'Words', 'Removal', 'Stemming', 'Lemmatization']


In [31]:
vectorizer = TfidfVectorizer()
new_sentence=[' '.join(sentence)]
tfidf_matrix = vectorizer.fit_transform(new_sentence)
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:")
print(df_tfidf)


TF-IDF Matrix:
   analytics      and     data  deriving     from  insights  involves  \
0    0.19245  0.19245  0.19245   0.19245  0.19245   0.19245   0.19245   

        is       it  lemmatization  ...  removal  stemming     stop  tagging  \
0  0.19245  0.19245        0.19245  ...  0.19245   0.19245  0.19245  0.19245   

   techniques    text      the  tokenization   useful    words  
0     0.19245  0.3849  0.19245       0.19245  0.19245  0.19245  

[1 rows x 24 columns]
