In [3]:
! pip install nltk scikit-learn spacy



In [23]:
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fetch the NLTK data packages (tokenizer models, stop-word lists, POS tagger,
# WordNet). Packages that are already present locally are simply reported as
# up-to-date.
for package_name in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(package_name)
# Kept as the final bare expression so the cell still displays its return value.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Extra tokenizer data, fetched before the first word_tokenize call.
# NOTE(review): presumably the installed NLTK release needs 'punkt_tab' in
# addition to 'punkt' - confirm.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
# English perceptron tagger data, fetched before nltk.pos_tag is first used.
# NOTE(review): presumably this is the resource pos_tag loads in the installed
# NLTK release - confirm.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [5]:
# Sample text that every later step (tokenizing, tagging, stop-word removal,
# stemming, TF-IDF) operates on.
# NOTE: 'documnent' near the end is misspelled; the recorded outputs (tokens,
# stems, TF-IDF vocabulary) all reflect that spelling.
doc = "Lorem ipsum dolor set Hello how are you this document is for text analytics and we will perform various operations on this text documnent, Thank you"

In [8]:
# Split the sample text into word and punctuation tokens.
tokens = word_tokenize(doc)

In [9]:
# Show the token list; note the comma comes out as a token of its own.
tokens

['Lorem',
 'ipsum',
 'dolor',
 'set',
 'Hello',
 'how',
 'are',
 'you',
 'this',
 'document',
 'is',
 'for',
 'text',
 'analytics',
 'and',
 'we',
 'will',
 'perform',
 'various',
 'operations',
 'on',
 'this',
 'text',
 'documnent',
 ',',
 'Thank',
 'you']

In [12]:
# Tag every token with its part of speech using the tagger's default tagset
# (fine-grained tags such as NNP, VBZ, PRP).
pos_tags = nltk.pos_tag(tokens)

In [13]:
# Show the (token, tag) pairs produced with the default tagset.
pos_tags

[('Lorem', 'NNP'),
 ('ipsum', 'JJ'),
 ('dolor', 'NN'),
 ('set', 'VBN'),
 ('Hello', 'NNP'),
 ('how', 'WRB'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('this', 'DT'),
 ('document', 'NN'),
 ('is', 'VBZ'),
 ('for', 'IN'),
 ('text', 'JJ'),
 ('analytics', 'NNS'),
 ('and', 'CC'),
 ('we', 'PRP'),
 ('will', 'MD'),
 ('perform', 'VB'),
 ('various', 'JJ'),
 ('operations', 'NNS'),
 ('on', 'IN'),
 ('this', 'DT'),
 ('text', 'NN'),
 ('documnent', 'NN'),
 (',', ','),
 ('Thank', 'NNP'),
 ('you', 'PRP')]

In [15]:
# Tag documentation data (unzipped under help/tagsets).
# NOTE(review): nothing in the visible notebook reads this resource -
# presumably intended for looking up tag meanings; confirm or drop.
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

In [17]:
# Tag-mapping data, fetched before pos_tag is called with tagset='universal'.
# NOTE(review): presumably required for that mapping - confirm.
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [18]:
# Tag the same tokens again, this time reported in the coarse universal tagset
# (NOUN, VERB, ADJ, ...) instead of the fine-grained default tags.
pos_tags_with_tagset = nltk.pos_tag(tokens, tagset='universal')

In [19]:
# Show the (token, universal tag) pairs; punctuation is tagged '.'.
pos_tags_with_tagset

[('Lorem', 'NOUN'),
 ('ipsum', 'ADJ'),
 ('dolor', 'NOUN'),
 ('set', 'VERB'),
 ('Hello', 'NOUN'),
 ('how', 'ADV'),
 ('are', 'VERB'),
 ('you', 'PRON'),
 ('this', 'DET'),
 ('document', 'NOUN'),
 ('is', 'VERB'),
 ('for', 'ADP'),
 ('text', 'ADJ'),
 ('analytics', 'NOUN'),
 ('and', 'CONJ'),
 ('we', 'PRON'),
 ('will', 'VERB'),
 ('perform', 'VERB'),
 ('various', 'ADJ'),
 ('operations', 'NOUN'),
 ('on', 'ADP'),
 ('this', 'DET'),
 ('text', 'NOUN'),
 ('documnent', 'NOUN'),
 (',', '.'),
 ('Thank', 'NOUN'),
 ('you', 'PRON')]

In [20]:
# Build the English stop-word set under its own name. Binding it to `stopwords`
# would replace the imported nltk.corpus.stopwords reader with a plain set, so
# re-running this cell would raise AttributeError ('set' has no 'words').
english_stopwords = set(stopwords.words('english'))
# Compare in lower case so capitalised stop words are removed too; the kept
# tokens retain their original casing. Punctuation tokens are not filtered here.
filtered_tokens = [word for word in tokens if word.lower() not in english_stopwords]

In [21]:
# Show the tokens that survived stop-word removal (the comma is still present).
filtered_tokens

['Lorem',
 'ipsum',
 'dolor',
 'set',
 'Hello',
 'document',
 'text',
 'analytics',
 'perform',
 'various',
 'operations',
 'text',
 'documnent',
 ',',
 'Thank']

In [22]:
# Reduce each remaining token to its Porter stem. The stemmer also lower-cases
# its input, and the stems are not necessarily dictionary words
# (e.g. 'analytics' -> 'analyt', 'various' -> 'variou').
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_tokens))
stemmed_words

['lorem',
 'ipsum',
 'dolor',
 'set',
 'hello',
 'document',
 'text',
 'analyt',
 'perform',
 'variou',
 'oper',
 'text',
 'documn',
 ',',
 'thank']

In [24]:
# Fit a TF-IDF model on the raw sample text (not on filtered_tokens or
# stemmed_words, so stop words such as 'and' and 'are' stay in the vocabulary).
# NOTE: the corpus holds a single document, so every term gets the same IDF and
# the row is effectively L2-normalised term counts - terms occurring twice
# ('text', 'this', 'you') score exactly double the rest.
tf_idf_vectorizer = TfidfVectorizer()
tf_idf_matrix = tf_idf_vectorizer.fit_transform([doc])
# Densify the sparse result so the single row can be displayed.
tf_idf_matrix.toarray()

array([[0.1767767 , 0.1767767 , 0.1767767 , 0.1767767 , 0.1767767 ,
        0.1767767 , 0.1767767 , 0.1767767 , 0.1767767 , 0.1767767 ,
        0.1767767 , 0.1767767 , 0.1767767 , 0.1767767 , 0.1767767 ,
        0.1767767 , 0.35355339, 0.1767767 , 0.35355339, 0.1767767 ,
        0.1767767 , 0.1767767 , 0.35355339]])

In [27]:
# Show the learned vocabulary (lower-cased, alphabetical); each entry names the
# matching column of the TF-IDF matrix.
tf_idf_vectorizer.get_feature_names_out()

array(['analytics', 'and', 'are', 'document', 'documnent', 'dolor', 'for',
       'hello', 'how', 'ipsum', 'is', 'lorem', 'on', 'operations',
       'perform', 'set', 'text', 'thank', 'this', 'various', 'we', 'will',
       'you'], dtype=object)