7 Text Analytics

1. Extract a sample document and apply the following document preprocessing methods:
tokenization, POS tagging, stop-word removal, stemming, and lemmatization.

2. Create a representation of the documents by calculating Term Frequency and Inverse
Document Frequency.

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# Fetch the NLTK resources used below (tokenizer model, stop-word list,
# WordNet data, POS tagger model).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# NLTK >= 3.9 loads the tokenizer and tagger models from renamed, pickle-free
# packages; fetch those too so the cell runs on both old and new releases.
# quiet=True keeps the download log unchanged.
for resource in ('punkt_tab', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource, quiet=True)


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (as returned by pos_tag) to a WordNet POS letter.

    Tags starting with J/V/R map to adjective ('a'), verb ('v') and adverb
    ('r'); everything else, including punctuation tags, falls back to noun
    ('n'), which is also WordNetLemmatizer's own default.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


# Sample document
document = "Text analytics is the process of deriving meaningful information from natural language text. It involves several steps such as tokenization, part-of-speech tagging, removing stop words, stemming, and lemmatization."

# Tokenization
tokens = word_tokenize(document)

# POS Tagging (done on the full token sequence so the tagger sees the context)
pos_tags = pos_tag(tokens)

# Stop Words Removal
# The filter runs over (word, tag) pairs so each surviving token keeps its POS
# tag for lemmatization; filtered_tokens itself is the plain list of words.
stop_words = set(stopwords.words('english'))
filtered_tagged = [(word, tag) for word, tag in pos_tags if word.lower() not in stop_words]
filtered_tokens = [word for word, _ in filtered_tagged]

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

# Lemmatization
# lemmatize() treats every word as a noun unless told otherwise, which leaves
# verb forms such as "deriving" or "involves" untouched; pass the tagged POS.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
    for word, tag in filtered_tagged
]

print("Original Tokens:", tokens)
print("POS Tags:", pos_tags)
print("Filtered Tokens (Stop Words Removal):", filtered_tokens)
print("Stemmed Tokens:", stemmed_tokens)
print("Lemmatized Tokens:", lemmatized_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Original Tokens: ['Text', 'analytics', 'is', 'the', 'process', 'of', 'deriving', 'meaningful', 'information', 'from', 'natural', 'language', 'text', '.', 'It', 'involves', 'several', 'steps', 'such', 'as', 'tokenization', ',', 'part-of-speech', 'tagging', ',', 'removing', 'stop', 'words', ',', 'stemming', ',', 'and', 'lemmatization', '.']
POS Tags: [('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('deriving', 'VBG'), ('meaningful', 'JJ'), ('information', 'NN'), ('from', 'IN'), ('natural', 'JJ'), ('language', 'NN'), ('text', 'NN'), ('.', '.'), ('It', 'PRP'), ('involves', 'VBZ'), ('several', 'JJ'), ('steps', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('tokenization', 'NN'), (',', ','), ('part-of-speech', 'JJ'), ('tagging', 'NN'), (',', ','), ('removing', 'VBG'), ('stop', 'NN'), ('words', 'NNS'), (',', ','), ('stemming', 'VBG'), (',', ','), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.')]
Filtered Tokens (Stop Words Removal): ['Text', 'an

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd

# Two-sentence corpus: each string is treated as one document.
documents = [
    "Text analytics is the process of deriving meaningful information from natural language text.",
    "It involves several steps such as tokenization, part-of-speech tagging, removing stop words, stemming, and lemmatization."
]

# Learn the vocabulary and IDF weights from the corpus and, in the same call,
# produce the document-term TF-IDF matrix (vectorizer left at its defaults).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Vocabulary terms, in the same order as the matrix columns.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Tabulate the weights: one row per document, one column per term.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
print(tfidf_df)


   analytics       and        as  deriving      from  information  involves  \
0   0.262556  0.000000  0.000000  0.262556  0.262556     0.262556  0.000000   
1   0.000000  0.246136  0.246136  0.000000  0.000000     0.000000  0.246136   

         is        it  language  ...    speech  stemming     steps      stop  \
0  0.262556  0.000000  0.262556  ...  0.000000  0.000000  0.000000  0.000000   
1  0.000000  0.246136  0.000000  ...  0.246136  0.246136  0.246136  0.246136   

       such   tagging      text       the  tokenization     words  
0  0.000000  0.000000  0.525113  0.262556      0.000000  0.000000  
1  0.246136  0.246136  0.000000  0.000000      0.246136  0.246136  

[2 rows x 28 columns]
