#  Text Analytics
1. Extract Sample document and apply following document preprocessing methods:
Tokenization, POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document 
Frequency

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [17]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\balendu
[nltk_data]     singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\balendu
[nltk_data]     singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\balendu singh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to C:\Users\balendu
[nltk_data]     singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
#sample document

document="MS Dhoni is a legendary Indian cricketer, known for his calm demeanor and exceptional wicket-keeping skills. He is also known as Captain Cool."

In [19]:
# 1. Tokenization
tokens = word_tokenize(document)
print("Tokenization:")
print(tokens)


Tokenization:
['MS', 'Dhoni', 'is', 'a', 'legendary', 'Indian', 'cricketer', ',', 'known', 'for', 'his', 'calm', 'demeanor', 'and', 'exceptional', 'wicket-keeping', 'skills', '.', 'He', 'is', 'also', 'known', 'as', 'Captain', 'Cool', '.']


In [20]:
# 2. POS Tagging
pos_tags = pos_tag(tokens)
print("POS Tagging:")
print(pos_tags)


POS Tagging:
[('MS', 'NNP'), ('Dhoni', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('legendary', 'JJ'), ('Indian', 'JJ'), ('cricketer', 'NN'), (',', ','), ('known', 'VBN'), ('for', 'IN'), ('his', 'PRP$'), ('calm', 'NN'), ('demeanor', 'NN'), ('and', 'CC'), ('exceptional', 'JJ'), ('wicket-keeping', 'NN'), ('skills', 'NNS'), ('.', '.'), ('He', 'PRP'), ('is', 'VBZ'), ('also', 'RB'), ('known', 'VBN'), ('as', 'IN'), ('Captain', 'NNP'), ('Cool', 'NNP'), ('.', '.')]


In [21]:
# 3. Stop Words Removal
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word not in punctuation]
print("After Stop Words Removal:")
print(filtered_tokens)


After Stop Words Removal:
['MS', 'Dhoni', 'legendary', 'Indian', 'cricketer', 'known', 'calm', 'demeanor', 'exceptional', 'wicket-keeping', 'skills', 'also', 'known', 'Captain', 'Cool']


In [22]:
# 4. Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("Stemming Results:")
print(stemmed_tokens)


Stemming Results:
['ms', 'dhoni', 'legendari', 'indian', 'cricket', 'known', 'calm', 'demeanor', 'except', 'wicket-keep', 'skill', 'also', 'known', 'captain', 'cool']


In [23]:
# 5. Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = []
for word, tag in pos_tags:
    if word.lower() not in stop_words and word not in punctuation:
        pos = tag[0].lower()
        if pos in ['a', 'r', 'n', 'v']:
            lemma = lemmatizer.lemmatize(word, pos=pos)
        else:
            lemma = lemmatizer.lemmatize(word)
        lemmatized_tokens.append(lemma)
print("Lemmatization Results:")
print(lemmatized_tokens)

Lemmatization Results:
['MS', 'Dhoni', 'legendary', 'Indian', 'cricketer', 'know', 'calm', 'demeanor', 'exceptional', 'wicket-keeping', 'skill', 'also', 'know', 'Captain', 'Cool']


In [24]:
# Create a small corpus (3 documents)
corpus = [
    document,
    "Virat Kohli is another great Indian cricketer known for his aggressive batting style.",
    "Captain Cool is a nickname given to MS Dhoni for his calm leadership."
]

In [25]:
# Create TF-IDF representation
vectorizer = TfidfVectorizer(stop_words='english', 
                            lowercase=True,
                            tokenizer=word_tokenize,
                            token_pattern=None)
tfidf_matrix = vectorizer.fit_transform(corpus)

In [26]:
# Get feature names
feature_names = vectorizer.get_feature_names_out()

In [27]:
# Convert to DataFrame for better visualization
import pandas as pd
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), 
                        columns=feature_names,
                        index=['Document 1', 'Document 2', 'Document 3'])

print("\nTF-IDF Representation:")
print(df_tfidf.round(2))


TF-IDF Representation:
               ,     .  aggressive  batting  calm  captain  cool  cricketer  \
Document 1  0.27  0.32        0.00     0.00  0.21     0.21  0.21       0.21   
Document 2  0.00  0.21        0.35     0.35  0.00     0.00  0.00       0.27   
Document 3  0.00  0.24        0.00     0.00  0.30     0.30  0.30       0.00   

            demeanor  dhoni  ...  known  kohli  leadership  legendary    ms  \
Document 1      0.27   0.21  ...   0.41   0.00         0.0       0.27  0.21   
Document 2      0.00   0.00  ...   0.27   0.35         0.0       0.00  0.00   
Document 3      0.00   0.30  ...   0.00   0.00         0.4       0.00  0.30   

            nickname  skills  style  virat  wicket-keeping  
Document 1       0.0    0.27   0.00   0.00            0.27  
Document 2       0.0    0.00   0.35   0.35            0.00  
Document 3       0.4    0.00   0.00   0.00            0.00  

[3 rows x 24 columns]
