In [1]:
# Third-party imports for the whole notebook.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages (tokenizer model, POS tagger, stop-word list,
# WordNet) in the same order as before; nltk.download logs its own progress.
for resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ravir\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ravir\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravir\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ravir\AppData\Roaming\nltk_data...


In [17]:
# Single-sentence sample text that the cells below tokenize and analyse.
# Adjacent string literals are concatenated by Python into one string.
document = (
    " In 1983 , India won their first Cricket World Cup"
    " under the captaincy of Kapil Dev ,"
    " marking a historic movement in Indian Cricket history"
)

In [18]:
# Break the sample sentence into word and punctuation tokens with NLTK
# ('english' is the tokenizer's default language, spelled out here).
tokens = word_tokenize(document, language='english')
print(tokens)

['In', '1983', ',', 'India', 'won', 'their', 'first', 'Cricket', 'World', 'Cup', 'under', 'the', 'captaincy', 'of', 'Kapil', 'Dev', ',', 'marking', 'a', 'historic', 'movement', 'in', 'Indian', 'Cricket', 'history']


In [51]:
# Build a set of English stop words for O(1) membership checks, then keep
# only the tokens whose lower-cased form is not a stop word. The original
# casing of the surviving tokens is preserved.
stop_words = set(stopwords.words('english'))
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
]
print(filtered_tokens)

['1983', ',', 'India', 'first', 'Cricket', 'World', 'Cup', 'captaincy', 'Kapil', 'Dev', ',', 'marking', 'historic', 'movement', 'Indian', 'Cricket', 'history']


In [52]:
# Reduce every stop-word-filtered token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print(stemmed_tokens)

['1983', ',', 'india', 'first', 'cricket', 'world', 'cup', 'captainci', 'kapil', 'dev', ',', 'mark', 'histor', 'movement', 'indian', 'cricket', 'histori']


In [53]:
# Lemmatize every stop-word-filtered token with WordNet. No part-of-speech
# tag is passed, so the lemmatizer's default is used for each token.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(token)
    for token in filtered_tokens
]
print(lemmatized_tokens)

['1983', ',', 'India', 'first', 'Cricket', 'World', 'Cup', 'captaincy', 'Kapil', 'Dev', ',', 'marking', 'historic', 'movement', 'Indian', 'Cricket', 'history']


In [22]:
# Imports used by the manual TF / IDF cells further down.
from collections import Counter
import math

# Fit scikit-learn's TF-IDF model on a one-document corpus rebuilt from the
# lemmatized tokens. The tokens must be joined with a single space: joining
# with '' glues neighbouring words together ("IndiafirstCricket..."), so the
# vectorizer can no longer recover the individual terms.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(lemmatized_tokens)])

In [23]:
# Whitespace-split the raw sentence (sep=None is the default: runs of
# whitespace are one separator and leading/trailing whitespace is dropped).
words = document.split(sep=None)

In [24]:
# Term frequency: occurrences of each distinct word divided by the total
# number of words in the document.
total_words = len(words)
word_count = Counter(words)
tf = {
    term: occurrences / total_words
    for term, occurrences in word_count.items()
}

In [25]:
# Report the term frequency of every distinct word, one per line.
print("Term Frequency for each word : ")
for word in tf:
    tf_value = tf[word]
    print(f"{word}: {tf_value}")

Term Frequency for each word : 
In: 0.04
1983: 0.04
,: 0.08
India: 0.04
won: 0.04
their: 0.04
first: 0.04
Cricket: 0.08
World: 0.04
Cup: 0.04
under: 0.04
the: 0.04
captaincy: 0.04
of: 0.04
Kapil: 0.04
Dev: 0.04
marking: 0.04
a: 0.04
historic: 0.04
movement: 0.04
in: 0.04
Indian: 0.04
history: 0.04


In [26]:
# Document frequency: the number of documents each word appears in.
# There is only one document, so every distinct word is counted exactly once;
# building the Counter straight from the set gives each word a count of 1.
words = set(document.split())
documents_containing_word = Counter(words)

In [30]:
# Inverse document frequency: log(total documents / documents containing the
# word). With a single document every ratio is 1/1, so every IDF is log(1) = 0.
total_documents = 1
idf = dict(
    (term, math.log(total_documents / doc_freq))
    for term, doc_freq in documents_containing_word.items()
)

In [31]:
# Report the inverse document frequency of every distinct word, one per line.
print("Inverse Document Frequency for each word: ")
for word in idf:
    idf_value = idf[word]
    print(f"{word}: {idf_value}")

Inverse Document Frequency for each word: 
Indian: 0.0
a: 0.0
marking: 0.0
history: 0.0
Dev: 0.0
in: 0.0
their: 0.0
Cup: 0.0
the: 0.0
of: 0.0
historic: 0.0
first: 0.0
1983: 0.0
won: 0.0
Cricket: 0.0
,: 0.0
In: 0.0
India: 0.0
World: 0.0
captaincy: 0.0
under: 0.0
Kapil: 0.0
movement: 0.0
