# Text Analytics

1. Extract Sample document and apply following document preprocessing methods: Tokenization,POS Tagging, stop words removal, Stemming and Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document Frequency.


In [45]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download required NLTK data (run once)
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Tokenzation of words , sentences :- 
- we use method word_tokenise() for spliting words from sentences


In [18]:
text = "Text analytics includes preprocessing steps like tokenization, stemming, and lemmatization."

# Tokenization
tokens = word_tokenize(text)
print(tokens)

['Text', 'analytics', 'includes', 'preprocessing', 'steps', 'like', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', '.']


In [19]:
from nltk.tokenize import sent_tokenize
sentence = 'my name is Abhijit . I am from India'
print(sent_tokenize(sentence))

['my name is Abhijit .', 'I am from India']


# POS Tagging

In [20]:
tags = pos_tag(tokens)
tags

[('Text', 'NN'),
 ('analytics', 'NNS'),
 ('includes', 'VBZ'),
 ('preprocessing', 'VBG'),
 ('steps', 'NNS'),
 ('like', 'IN'),
 ('tokenization', 'NN'),
 (',', ','),
 ('stemming', 'VBG'),
 (',', ','),
 ('and', 'CC'),
 ('lemmatization', 'NN'),
 ('.', '.')]

# Stemming 
- cutting  down the words that have samw origin but different formats
- wait -> waiting , waited , waits
- so convert all words to root word wait

In [27]:
# simple stemmign example 
my_words = ['wait','waiting','waited','go','going','here','there']
ps = PorterStemmer()
for i in my_words:
    rootword = ps.stem(i)
    print(f"stemming for {i} is : {rootword}")

stemming for wait is : wait
stemming for waiting is : wait
stemming for waited is : wait
stemming for go is : go
stemming for going is : go
stemming for here is : here
stemming for there is : there


# Lemmatization 
- Why is lemmatization better than stemming
  because stemmign does suffix removal but lemmatization does the morphological
  analysis of the words

In [35]:
my_words = ['wait','waiting','waited','go','go','here','there']
ls = WordNetLemmatizer()
for i in my_words:
    rootword = ls.lemmatize(i)
    print(f"lemmatization for {i} is : {rootword}")

lemmatization for wait is : wait
lemmatization for waiting is : waiting
lemmatization for waited is : waited
lemmatization for go is : go
lemmatization for go is : go
lemmatization for here is : here
lemmatization for there is : there


# Stop words 
- A sentence can have variou stopwords in it like - is , are , the etc
- We remove them because they not carry significant meaning

In [50]:
stopword = set(stopwords.words('english'))
text = "A Text is analytics includes preprocessing steps like tokenization, stemming, and lemmatization."
token = word_tokenize(text)
filter_words = []

for i in token:
    if i not in stopword:
        filter_words.append(i)

print(filter_words)


['A', 'Text', 'analytics', 'includes', 'preprocessing', 'steps', 'like', 'tokenization', ',', 'stemming', ',', 'lemmatization', '.']


# TF and IDF

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Sample documents
docs = [
    "Text analytics is the process of analyzing text data.",
    "It includes steps like tokenization and stemming.",
    "Lemmatization is another important preprocessing step."
]

# TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform
tfidf_matrix = vectorizer.fit_transform(docs)

# Show results
print("\nTF-IDF Matrix:")
print(tfidf_matrix)

print("\nFeature Names (terms):")
print(vectorizer.get_feature_names_out())



TF-IDF Matrix:
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 21 stored elements and shape (3, 20)>
  Coords	Values
  (0, 17)	0.6149219764307087
  (0, 0)	0.30746098821535434
  (0, 7)	0.2338320064840948
  (0, 18)	0.30746098821535434
  (0, 13)	0.30746098821535434
  (0, 11)	0.30746098821535434
  (0, 1)	0.30746098821535434
  (0, 4)	0.30746098821535434
  (1, 8)	0.37796447300922725
  (1, 6)	0.37796447300922725
  (1, 16)	0.37796447300922725
  (1, 10)	0.37796447300922725
  (1, 19)	0.37796447300922725
  (1, 2)	0.37796447300922725
  (1, 14)	0.37796447300922725
  (2, 7)	0.3220024178194947
  (2, 9)	0.4233944834119594
  (2, 3)	0.4233944834119594
  (2, 5)	0.4233944834119594
  (2, 12)	0.4233944834119594
  (2, 15)	0.4233944834119594

Feature Names (terms):
['analytics' 'analyzing' 'and' 'another' 'data' 'important' 'includes'
 'is' 'it' 'lemmatization' 'like' 'of' 'preprocessing' 'process'
 'stemming' 'step' 'steps' 'text' 'the' 'tokenization']
