In [10]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer ,  WordNetLemmatizer
from nltk import pos_tag,word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Two-sentence sample text that every later step of the pipeline works on.
doc = (
    "Cats are running faster than the dogs. "
    "The dogs were chasing the cats all night long!"
)

In [14]:
doc

'Cats are running faster than the dogs. The dogs were chasing the cats all night long!'

In [20]:
# Fetch the Punkt tokenizer tables that word_tokenize relies on.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

## 1. Tokenization

In [47]:
# `string` supplies the punctuation characters used by the clean-up step further down.
import string
# Split the raw text into word and punctuation tokens; original casing is kept.
tokens = word_tokenize(doc)
tokens


['Cats',
 'are',
 'running',
 'faster',
 'than',
 'the',
 'dogs',
 '.',
 'The',
 'dogs',
 'were',
 'chasing',
 'the',
 'cats',
 'all',
 'night',
 'long',
 '!']

## 2. POS tagging

In [50]:
from nltk import pos_tag

In [52]:
 # Fetch the English averaged-perceptron tagger model used by pos_tag.
 nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [54]:
# Tag every token (punctuation included) with its Penn Treebank part-of-speech label.
p=pos_tag(tokens)

In [56]:
p

[('Cats', 'NNS'),
 ('are', 'VBP'),
 ('running', 'VBG'),
 ('faster', 'RBR'),
 ('than', 'IN'),
 ('the', 'DT'),
 ('dogs', 'NNS'),
 ('.', '.'),
 ('The', 'DT'),
 ('dogs', 'NNS'),
 ('were', 'VBD'),
 ('chasing', 'VBG'),
 ('the', 'DT'),
 ('cats', 'NNS'),
 ('all', 'DT'),
 ('night', 'NN'),
 ('long', 'RB'),
 ('!', '.')]

## 3. Remove punctuation and stop words

In [59]:
# Drop tokens that consist solely of punctuation characters.
# A plain `token not in string.punctuation` is a *substring* test against one
# long string, so multi-character punctuation tokens such as '...', '--' or the
# quote tokens '``' / "''" would slip through; checking character by character
# removes them as well. Tokens containing any letter or digit are kept.
tokens = [
    token
    for token in tokens
    if not all(ch in string.punctuation for ch in token)
]

In [61]:
tokens

['Cats',
 'are',
 'running',
 'faster',
 'than',
 'the',
 'dogs',
 'The',
 'dogs',
 'were',
 'chasing',
 'the',
 'cats',
 'all',
 'night',
 'long']

In [63]:
from nltk.corpus import stopwords

In [67]:
# Fetch NLTK's stop-word corpus, from which the English list is read.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [69]:
# English stop words held in a set for fast membership tests. The set gets its
# own descriptive name: a throw-away single-letter name is easily rebound by a
# later cell, after which re-running this filter would fail.
stop_words = set(stopwords.words('english'))
# Compare case-insensitively so capitalised stop words such as "The" are dropped,
# while the surviving tokens keep their original casing.
filtered = [token for token in tokens if token.lower() not in stop_words]

In [75]:
filtered

['Cats',
 'running',
 'faster',
 'dogs',
 'dogs',
 'chasing',
 'cats',
 'night',
 'long']

## 4. Stemming

In [79]:
from nltk.stem import PorterStemmer

In [81]:
# Porter stemmer instance; its stem() method is applied to each filtered token.
s= PorterStemmer()

In [83]:
# Stem every remaining token; the stemmer output is lower-cased ('Cats' -> 'cat').
sd = list(map(s.stem, filtered))

In [85]:
sd

['cat', 'run', 'faster', 'dog', 'dog', 'chase', 'cat', 'night', 'long']

## 5. Lemmatization

In [88]:
from nltk.stem import WordNetLemmatizer

In [90]:
# Fetch the WordNet corpus that backs WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


True

In [94]:
# WordNet-based lemmatizer instance used to reduce each token to its dictionary form.
l= WordNetLemmatizer()

In [96]:
# Lemmatize the filtered tokens. The lookup is case-sensitive, so tokens are
# lower-cased first; otherwise 'Cats' is returned untouched while 'cats'
# becomes 'cat', leaving two different lemmas for the same word.
# No POS is passed, so words are treated as nouns and verb forms such as
# 'running' / 'chasing' are left as they are.
ld = [l.lemmatize(w.lower()) for w in filtered]
ld

['cat', 'running', 'faster', 'dog', 'dog', 'chasing', 'cat', 'night', 'long']

## 6. TF-IDF

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [105]:
# Re-join the cleaned tokens into one string for the vectorizer.
# The corpus therefore holds exactly one document, so the IDF factor is the same
# for every term and the resulting scores are just L2-normalised term counts.
corpus = [' '.join(filtered)]

In [107]:
# TF-IDF vectorizer with default settings (the text is lower-cased before the vocabulary is built).
v= TfidfVectorizer()

In [109]:
# Learn the vocabulary, compute the TF-IDF matrix and convert it to a dense
# array in a single statement. Doing the conversion in a separate
# `tf = tf.toarray()` cell is not re-runnable: on a second execution `tf` is
# already a NumPy array, which has no .toarray() method.
# A dense array is fine here because the matrix is tiny.
tf = v.fit_transform(corpus).toarray()

In [121]:
v.get_feature_names_out()

array(['cats', 'chasing', 'dogs', 'faster', 'long', 'night', 'running'],
      dtype=object)

In [123]:
df=pd.DataFrame(tf)

In [125]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,0.5547,0.27735,0.5547,0.27735,0.27735,0.27735,0.27735
