Text Analytics

1. Extract a sample document and apply the following document preprocessing methods:
Tokenization, POS tagging, stop-word removal, stemming, and lemmatization.
2. Create a representation of the documents by calculating Term Frequency and Inverse
Document Frequency.

In [1]:
import nltk # Natural Language Toolkit
# Fetch the NLTK data packages used by the cells below; the downloader reports
# "already up-to-date" for packages that are already installed locally.
# NOTE(review): newer NLTK releases reportedly look up "punkt_tab" and
# "averaged_perceptron_tagger_eng" instead - confirm against the installed version.
nltk.download("punkt")  # presumably the tokenizer models behind word_tokenize / sent_tokenize
nltk.download("stopwords")  # stop-word lists read through nltk.corpus.stopwords
nltk.download("wordnet")  # WordNet data, presumably required by WordNetLemmatizer
nltk.download("averaged_perceptron_tagger")  # presumably the tagger model behind pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sahilraina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sahilraina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sahilraina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sahilraina/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

TOKENIZATION






In [2]:
from nltk import word_tokenize, sent_tokenize

In [3]:
# Sample document used throughout the notebook: three short sentences that
# share most of their vocabulary.
x = (
    "Sachin was the GOAT of the previous generation. "
    "Virat is the GOAT of this generation. "
    "Shubman will be the GOAT of the next generation"
)
     

In [4]:
# Split the sample text into word-level tokens; punctuation marks come out as
# separate tokens.
word_tokenize(x)

['Sachin',
 'was',
 'the',
 'GOAT',
 'of',
 'the',
 'previous',
 'generation',
 '.',
 'Virat',
 'is',
 'the',
 'GOAT',
 'of',
 'this',
 'generation',
 '.',
 'Shubman',
 'will',
 'be',
 'the',
 'GOAT',
 'of',
 'the',
 'next',
 'generation']

In [5]:
# Split the sample text into a list of sentences.
sent_tokenize(x)

['Sachin was the GOAT of the previous generation.',
 'Virat is the GOAT of this generation.',
 'Shubman will be the GOAT of the next generation']

POS TAGGING 

In [6]:
from nltk import pos_tag
# Tokenize the sample text, then tag every token with its part of speech.
# The displayed result is a list of (token, tag) pairs.
y = word_tokenize(x)
pos_tag(y)

[('Sachin', 'NNP'),
 ('was', 'VBD'),
 ('the', 'DT'),
 ('GOAT', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('previous', 'JJ'),
 ('generation', 'NN'),
 ('.', '.'),
 ('Virat', 'NNP'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('GOAT', 'NNP'),
 ('of', 'IN'),
 ('this', 'DT'),
 ('generation', 'NN'),
 ('.', '.'),
 ('Shubman', 'NNP'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('the', 'DT'),
 ('GOAT', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('next', 'JJ'),
 ('generation', 'NN')]

STOP WORD REMOVAL

In [7]:
from nltk.corpus import stopwords
# Hold the English stop words in a set so the filtering cell can do
# membership tests against it.
s = set(stopwords.words("english"))

In [8]:
# Re-tokenize the sample so this cell does not rely on `y` from the POS-tagging cell.
y = word_tokenize(x)
# Drop stop words. NLTK's English stop-word list is lowercase, so each token is
# lowercased for the membership test only; without this, capitalised stop words
# (e.g. a sentence-initial "The") would slip through. The kept tokens retain
# their original casing, and punctuation tokens are intentionally left in.
cleaned_tokens = [token for token in y if token.lower() not in s]
print(cleaned_tokens)

['Sachin', 'GOAT', 'previous', 'generation', '.', 'Virat', 'GOAT', 'generation', '.', 'Shubman', 'GOAT', 'next', 'generation']


STEMMING

In [9]:
from nltk.stem import PorterStemmer

In [10]:
# Porter stemmer instance, reused for every token in the stemming cell.
stem = PorterStemmer()

In [11]:
# Reduce each stop-word-filtered token to its Porter stem and show the result.
stem_tokens = [stem.stem(word) for word in cleaned_tokens]
print(stem_tokens)

['sachin', 'goat', 'previou', 'gener', '.', 'virat', 'goat', 'gener', '.', 'shubman', 'goat', 'next', 'gener']


LEMMATIZATION

In [12]:
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4') # presumably required by the WordNet lemmatizer; downloaded here in case lemmatization fails with a missing-resource error - TODO confirm

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sahilraina/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
# WordNet lemmatizer instance, reused for every token in the lemmatization cell.
lemmatizer = WordNetLemmatizer()

In [14]:
# Lemmatize each stop-word-filtered token. lemmatize() is called without a
# part-of-speech argument, so the library's default POS applies to every token.
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in cleaned_tokens]
print(lemmatized_tokens)

['Sachin', 'GOAT', 'previous', 'generation', '.', 'Virat', 'GOAT', 'generation', '.', 'Shubman', 'GOAT', 'next', 'generation']


TF - IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [16]:
# Corpus for TF-IDF: a list holding a single document (the sample text again).
# NOTE(review): with only one document every term has the same document
# frequency, so the IDF factor cannot tell terms apart; consider supplying
# several documents if IDF is meant to matter.
xy = [
    "Sachin was the GOAT of the previous generation. "
    "Virat is the GOAT of this generation. "
    "Shubman will be the GOAT of the next generation"
]


In [17]:
# TF-IDF vectorizer constructed with no arguments, i.e. library defaults.
vectorizer = TfidfVectorizer()

In [18]:
# Learn the vocabulary from the corpus.
# NOTE(review): despite its name, `matrix` is presumably the fitted vectorizer
# returned by fit() (it exposes vocabulary_), not a TF-IDF matrix.
matrix = vectorizer.fit(xy)
# Display the mapping from each learned term to its column index.
matrix.vocabulary_

{'sachin': 7,
 'was': 12,
 'the': 9,
 'goat': 2,
 'of': 5,
 'previous': 6,
 'generation': 1,
 'virat': 11,
 'is': 3,
 'this': 10,
 'shubman': 8,
 'will': 13,
 'be': 0,
 'next': 4}

In [20]:
# Encode the corpus with the fitted vectorizer; printing the result lists each
# stored entry as a (row, column) position followed by its weight.
tfidf_matrix = vectorizer.transform(xy)
print(tfidf_matrix)

  (0, 13)	0.1270001270001905
  (0, 12)	0.1270001270001905
  (0, 11)	0.1270001270001905
  (0, 10)	0.1270001270001905
  (0, 9)	0.6350006350009525
  (0, 8)	0.1270001270001905
  (0, 7)	0.1270001270001905
  (0, 6)	0.1270001270001905
  (0, 5)	0.3810003810005715
  (0, 4)	0.1270001270001905
  (0, 3)	0.1270001270001905
  (0, 2)	0.3810003810005715
  (0, 1)	0.3810003810005715
  (0, 0)	0.1270001270001905


In [21]:
# Print the learned terms in column order, so position i labels column i of
# the TF-IDF matrix.
print(vectorizer.get_feature_names_out())

['be' 'generation' 'goat' 'is' 'next' 'of' 'previous' 'sachin' 'shubman'
 'the' 'this' 'virat' 'was' 'will']
