In [2]:
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [5]:
doc = "It is truth universally acknowledged, that a single man in possession of good fortune, must be in want of a wife"
doc = doc.lower()
import string
doc = "".join([s for s in doc if s not in string.punctuation])
print(doc)

it is truth universally acknowledged that a single man in possession of good fortune must be in want of a wife


In [7]:
# tokenization

tokens = word_tokenize(doc)
print(tokens)

['it', 'is', 'truth', 'universally', 'acknowledged', 'that', 'a', 'single', 'man', 'in', 'possession', 'of', 'good', 'fortune', 'must', 'be', 'in', 'want', 'of', 'a', 'wife']


In [8]:
# stopwords removal

sw = stopwords.words('english')
filtered_words = [s for s in tokens if s not in sw]
print(filtered_words)

['truth', 'universally', 'acknowledged', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife']


In [12]:
# stemming

porter = PorterStemmer()
stemmed = [porter.stem(s) for s in filtered_words] 

print(stemmed)

['truth', 'univers', 'acknowledg', 'singl', 'man', 'possess', 'good', 'fortun', 'must', 'want', 'wife']


In [15]:
# lemmatization

lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(s) for s in filtered_words] 

print(lemmatized)

['truth', 'universally', 'acknowledged', 'single', 'man', 'possession', 'good', 'fortune', 'must', 'want', 'wife']


In [18]:
# pos tagging

pos = pos_tag(filtered_words)
print(pos)

[('truth', 'NN'), ('universally', 'RB'), ('acknowledged', 'VBD'), ('single', 'JJ'), ('man', 'NN'), ('possession', 'NN'), ('good', 'JJ'), ('fortune', 'NN'), ('must', 'MD'), ('want', 'VB'), ('wife', 'NN')]


In [19]:
tfidf = TfidfVectorizer()

doc2 = [
    "single man",
    "single women"
] 
result = tfidf.fit_transform(doc2)
print("\ntf-idf Values are : \n")
print(result)


tf-idf Values are : 

  (0, 0)	0.8148024746671689
  (0, 1)	0.5797386715376657
  (1, 2)	0.8148024746671689
  (1, 1)	0.5797386715376657
