<h2><center><font color='red'>Importing Libraries</font></center></h2>

In [12]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

<h2><center><font color='red'>Tokenization</font></center></h2>

In [31]:
# Sample two-sentence passage used by the cells that follow.
text = 'Sachin is considered as one of the greatest cricket players. Virat is the captain of the Indian cricket team.'

# Split into sentences BEFORE lower-casing: the Punkt sentence tokenizer uses
# capitalisation as one of its boundary cues, so it should see the original text.
sentences = [sentence.lower() for sentence in nltk.sent_tokenize(text)]

# Word-level tokens of the lower-cased passage (punctuation is kept as tokens).
words = nltk.word_tokenize(text.lower())

print(f"Word Tokenization: {words}\n\nSentence Tokenization: {sentences}")

Word Tokenization: ['sachin', 'is', 'considered', 'as', 'one', 'of', 'the', 'greatest', 'cricket', 'players', '.', 'virat', 'is', 'the', 'captain', 'of', 'the', 'indian', 'cricket', 'team', '.']

Sentence Tokenization: ['sachin is considered as one of the greatest cricket players.', 'virat is the captain of the indian cricket team.']


<h2><center><font color='red'>Stop Words Removal</font></center></h2>

In [32]:
# NLTK's English stop-word list is lower-case, so tokens must be lower-cased
# BEFORE the membership test; otherwise capitalised stop words such as
# "The" or "Is" at the start of a sentence would survive the filter.
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = set(stopwords)  # set lookup instead of scanning the list per token

# Keep purely alphabetic, non-stop-word tokens (this also drops punctuation).
text_cleaned = [
    token
    for token in map(str.lower, nltk.word_tokenize(text))
    if token.isalpha() and token not in stopword_set
]

print(text_cleaned)

['sachin', 'considered', 'one', 'greatest', 'cricket', 'players', 'virat', 'captain', 'indian', 'cricket', 'team']


<h2><center><font color='red'>Stemming</font></center></h2>

In [33]:
# Reduce every cleaned token to its Porter stem (e.g. 'considered' -> 'consid').
stemmer = nltk.stem.PorterStemmer()
stemmed_text = list(map(stemmer.stem, text_cleaned))

print(stemmed_text)

['sachin', 'consid', 'one', 'greatest', 'cricket', 'player', 'virat', 'captain', 'indian', 'cricket', 'team']


<h2><center><font color='red'>POS Tagging</font></center></h2>

In [35]:
# Attach a part-of-speech tag to each token in `text_cleaned`, producing a
# list of (token, tag) pairs.
# NOTE(review): the tagger is given lower-cased tokens with stop words already
# removed, so it has no capitalisation or function-word context; this may be
# why 'virat' is tagged VBP in the recorded output. Consider tagging the raw
# tokens of the sentence instead -- confirm before changing.
pos_tagged = nltk.pos_tag(text_cleaned)
print(pos_tagged)

[('sachin', 'NN'), ('considered', 'VBD'), ('one', 'CD'), ('greatest', 'JJS'), ('cricket', 'NN'), ('players', 'NNS'), ('virat', 'VBP'), ('captain', 'NN'), ('indian', 'JJ'), ('cricket', 'NN'), ('team', 'NN')]


<h2><center><font color='red'>TF-IDF</font></center></h2>

In [41]:
# Corpus for the vectorizer: a list holding the passage as a single document.
# Note that `text` is rebound here from a plain string to a list of strings.
text = [
    'Sachin is considered as one of the greatest cricket players. Virat is the captain of the Indian cricket team.'
]

# Word-level TF-IDF with smoothed inverse document frequencies.
vectorizer = TfidfVectorizer(
    analyzer='word',
    use_idf=True,
    smooth_idf=True,
)

# Fitting learns the vocabulary: a mapping from each term to its column index.
out = vectorizer.fit(text)
print(out.vocabulary_)

{'sachin': 10, 'is': 6, 'considered': 2, 'as': 0, 'one': 8, 'of': 7, 'the': 12, 'greatest': 4, 'cricket': 3, 'players': 9, 'virat': 13, 'captain': 1, 'indian': 5, 'team': 11}


In [42]:
# Fit the vectorizer on `text` again and return the TF-IDF weights as a
# sparse matrix; printing it lists one "(row, column)  weight" entry per
# stored value.
tfid_out = vectorizer.fit_transform(text)
print(tfid_out)

  (0, 11)	0.1796053020267749
  (0, 5)	0.1796053020267749
  (0, 1)	0.1796053020267749
  (0, 13)	0.1796053020267749
  (0, 9)	0.1796053020267749
  (0, 3)	0.3592106040535498
  (0, 4)	0.1796053020267749
  (0, 12)	0.5388159060803247
  (0, 7)	0.3592106040535498
  (0, 8)	0.1796053020267749
  (0, 0)	0.1796053020267749
  (0, 2)	0.1796053020267749
  (0, 6)	0.3592106040535498
  (0, 10)	0.1796053020267749


In [43]:
# Show the learned terms in column order (position i labels column i of the
# TF-IDF matrix).
print(vectorizer.get_feature_names_out())

['as' 'captain' 'considered' 'cricket' 'greatest' 'indian' 'is' 'of' 'one'
 'players' 'sachin' 'team' 'the' 'virat']


In [46]:
# Present the TF-IDF weights as a table: one row per document, one column
# per vocabulary term.
feature_names = vectorizer.get_feature_names_out()
dense_weights = tfid_out.toarray()  # sparse matrix -> dense 2-D array
data = pd.DataFrame(dense_weights, columns=feature_names)

print(data)

         as   captain  considered   cricket  greatest    indian        is  \
0  0.179605  0.179605    0.179605  0.359211  0.179605  0.179605  0.359211   

         of       one   players    sachin      team       the     virat  
0  0.359211  0.179605  0.179605  0.179605  0.179605  0.538816  0.179605  
