In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [9]:
# Fetch the NLTK data packages needed by this notebook; packages that are
# already present are simply reported as up-to-date.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
# Running example: one sentence that every later cell processes.
sample_document = "Hello this is the most famous place Agra where the monument Taj Mahal is loacted"

# Break the sentence into word-level tokens and show them.
tokens = word_tokenize(text=sample_document)
print("Tokenization:", tokens)

Tokenization: ['Hello', 'this', 'is', 'the', 'most', 'famous', 'place', 'Agra', 'where', 'the', 'monument', 'Taj', 'Mahal', 'is', 'loacted']


In [11]:
# Attach a part-of-speech tag to each token; the result is a list of
# (token, tag) pairs in the original token order.
pos_tags = pos_tag(tokens)
print(
    "\nPOS Tagging:",
    pos_tags,
)


POS Tagging: [('Hello', 'NNP'), ('this', 'DT'), ('is', 'VBZ'), ('the', 'DT'), ('most', 'RBS'), ('famous', 'JJ'), ('place', 'NN'), ('Agra', 'NNP'), ('where', 'WRB'), ('the', 'DT'), ('monument', 'NN'), ('Taj', 'NNP'), ('Mahal', 'NNP'), ('is', 'VBZ'), ('loacted', 'VBN')]


In [12]:
# English stopword list as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep each token whose lowercased form is not a stopword; the original
# casing of the kept tokens is preserved.
filtered_tokens = []
for token in tokens:
    if token.lower() in stop_words:
        continue
    filtered_tokens.append(token)

print("\nStopwords Removal: ", filtered_tokens)


Stopwords Removal:  ['Hello', 'famous', 'place', 'Agra', 'monument', 'Taj', 'Mahal', 'loacted']


In [13]:
# Reduce every stopword-filtered token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print("\nStemming:", stemmed_tokens)


Stemming: ['hello', 'famou', 'place', 'agra', 'monument', 'taj', 'mahal', 'loact']


In [14]:
def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to the single-letter POS code WordNet expects.

    Tags starting with J, V or R map to adjective ('a'), verb ('v') and
    adverb ('r'). Every other tag falls back to noun ('n'), which is also
    the default used by ``WordNetLemmatizer.lemmatize``.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


# lemmatize() assumes a noun unless a POS is supplied, so verbs, adjectives
# and adverbs would otherwise pass through unchanged. Tag the filtered tokens
# and hand each word's mapped POS to the lemmatizer.
# Note: tagging happens on the stopword-filtered sequence, so the output stays
# aligned one-to-one with filtered_tokens.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
    for word, tag in pos_tag(filtered_tokens)
]
print("\nLemmatization: ", lemmatized_tokens)


Lemmatization:  ['Hello', 'famous', 'place', 'Agra', 'monument', 'Taj', 'Mahal', 'loacted']


In [15]:
# Treat the sample sentence as a one-document corpus and vectorize it.
corpus = [sample_document]
tfidf_vectorizer = TfidfVectorizer()
tfidf_representation = tfidf_vectorizer.fit_transform(corpus)

# Printing the sparse result lists (row, column) index pairs with their weights.
print("\nTF-IDF Representation")
print(tfidf_representation)


TF-IDF Representation
  (0, 4)	0.22941573387056174
  (0, 5)	0.22941573387056174
  (0, 9)	0.22941573387056174
  (0, 6)	0.22941573387056174
  (0, 12)	0.22941573387056174
  (0, 0)	0.22941573387056174
  (0, 8)	0.22941573387056174
  (0, 1)	0.22941573387056174
  (0, 7)	0.22941573387056174
  (0, 10)	0.4588314677411235
  (0, 3)	0.4588314677411235
  (0, 11)	0.22941573387056174
  (0, 2)	0.22941573387056174
