In [2]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...


In [3]:
document="In 1983, India won their First Cricket World Cup under the captaincy of Kapil Dev, making a historic moment in Indian cricket history."

In [4]:
tokens=word_tokenize(document)

In [5]:
print(tokens)

['In', '1983', ',', 'India', 'won', 'their', 'First', 'Cricket', 'World', 'Cup', 'under', 'the', 'captaincy', 'of', 'Kapil', 'Dev', ',', 'making', 'a', 'historic', 'moment', 'in', 'Indian', 'cricket', 'history', '.']


In [6]:
pos_tags=nltk.pos_tag(tokens)
print(pos_tags)

[('In', 'IN'), ('1983', 'CD'), (',', ','), ('India', 'NNP'), ('won', 'VBD'), ('their', 'PRP$'), ('First', 'JJ'), ('Cricket', 'NNP'), ('World', 'NNP'), ('Cup', 'NNP'), ('under', 'IN'), ('the', 'DT'), ('captaincy', 'NN'), ('of', 'IN'), ('Kapil', 'NNP'), ('Dev', 'NNP'), (',', ','), ('making', 'VBG'), ('a', 'DT'), ('historic', 'JJ'), ('moment', 'NN'), ('in', 'IN'), ('Indian', 'JJ'), ('cricket', 'NN'), ('history', 'NN'), ('.', '.')]


In [7]:
stop_words=set(stopwords.words('english'))
filtered_tokens=[word for word in tokens if word.lower() not in stop_words]
print(filtered_tokens)

['1983', ',', 'India', 'First', 'Cricket', 'World', 'Cup', 'captaincy', 'Kapil', 'Dev', ',', 'making', 'historic', 'moment', 'Indian', 'cricket', 'history', '.']


In [9]:
Stemmer=PorterStemmer()
stemmed_tokens=[Stemmer.stem(word) for word in filtered_tokens]

In [10]:
print(stemmed_tokens)

['1983', ',', 'india', 'first', 'cricket', 'world', 'cup', 'captainci', 'kapil', 'dev', ',', 'make', 'histor', 'moment', 'indian', 'cricket', 'histori', '.']


In [11]:
lemmatizer=WordNetLemmatizer()
lemmatized_tokens=[lemmatizer.lemmatize(word) for word in filtered_tokens]
print(lemmatized_tokens)

['1983', ',', 'India', 'First', 'Cricket', 'World', 'Cup', 'captaincy', 'Kapil', 'Dev', ',', 'making', 'historic', 'moment', 'Indian', 'cricket', 'history', '.']


In [13]:
tfidf_vectorizer=TfidfVectorizer()
tfidf_matrix=tfidf_vectorizer.fit_transform([' '.join(lemmatized_tokens)])

In [14]:
from collections import Counter
import math

In [15]:
words=document.split()

In [16]:
word_count=Counter(words)
total_words=len(words)
tf={word:Count/total_words for word,Count in word_count.items()}

In [17]:
print('Term Frequency for each word:')
for word,tf_value in tf.items():
    print(f"{word}:{tf_value}")

Term Frequency for each word:
In:0.043478260869565216
1983,:0.043478260869565216
India:0.043478260869565216
won:0.043478260869565216
their:0.043478260869565216
First:0.043478260869565216
Cricket:0.043478260869565216
World:0.043478260869565216
Cup:0.043478260869565216
under:0.043478260869565216
the:0.043478260869565216
captaincy:0.043478260869565216
of:0.043478260869565216
Kapil:0.043478260869565216
Dev,:0.043478260869565216
making:0.043478260869565216
a:0.043478260869565216
historic:0.043478260869565216
moment:0.043478260869565216
in:0.043478260869565216
Indian:0.043478260869565216
cricket:0.043478260869565216
history.:0.043478260869565216


In [18]:
words=set(document.split())

In [19]:
doc_containing_word=Counter()
for word in words:
    doc_containing_word[word]+=1

In [21]:
total_documents=1
idf={word:math.log(total_documents/count) for word ,count in doc_containing_word.items()}

In [23]:
print("Inverse document Frequency for each word")
for word,idf_value in idf.items():
    print(f"{word}:{idf_value}")

Inverse document Frequency for each word
Dev,:0.0
Cup:0.0
making:0.0
a:0.0
Cricket:0.0
captaincy:0.0
moment:0.0
In:0.0
of:0.0
the:0.0
Indian:0.0
history.:0.0
under:0.0
First:0.0
won:0.0
Kapil:0.0
India:0.0
in:0.0
World:0.0
cricket:0.0
historic:0.0
their:0.0
1983,:0.0


In [24]:
print("TF-IDF Matrix")
print(tfidf_matrix.toarray())
print("Feature names:",tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix
[[0.24253563 0.24253563 0.48507125 0.24253563 0.24253563 0.24253563
  0.24253563 0.24253563 0.24253563 0.24253563 0.24253563 0.24253563
  0.24253563 0.24253563]]
Feature names: ['1983' 'captaincy' 'cricket' 'cup' 'dev' 'first' 'historic' 'history'
 'india' 'indian' 'kapil' 'making' 'moment' 'world']
