In [7]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
from nltk import word_tokenize, sent_tokenize

sent = "Sachin is considered to be one of the greatest cricket players. Virat is the captain of the Indian cricket team Which consists of 11 players"
print(word_tokenize(sent))
print(sent_tokenize(sent))

['Sachin', 'is', 'considered', 'to', 'be', 'one', 'of', 'the', 'greatest', 'cricket', 'players', '.', 'Virat', 'is', 'the', 'captain', 'of', 'the', 'Indian', 'cricket', 'team', 'Which', 'consists', 'of', '11', 'players']
['Sachin is considered to be one of the greatest cricket players.', 'Virat is the captain of the Indian cricket team Which consists of 11 players']


# **Stop Words Removal**

In [16]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
token = word_tokenize(sent)
cleaned_token = []

for word in token :
  if word not in stop_words :
    cleaned_token.append(word)

print("This is the unclean version : ", token)
print("This is the cleaned version : ", cleaned_token)

This is the unclean version :  ['Sachin', 'is', 'considered', 'to', 'be', 'one', 'of', 'the', 'greatest', 'cricket', 'players', '.', 'Virat', 'is', 'the', 'captain', 'of', 'the', 'Indian', 'cricket', 'team', 'Which', 'consists', 'of', '11', 'players']
This is the cleaned version :  ['Sachin', 'considered', 'one', 'greatest', 'cricket', 'players', '.', 'Virat', 'captain', 'Indian', 'cricket', 'team', 'Which', 'consists', '11', 'players']


In [18]:
words = [cleaned_token.lower() for cleaned_token in cleaned_token if cleaned_token.isalpha()]
print(words)

['sachin', 'considered', 'one', 'greatest', 'cricket', 'players', 'virat', 'captain', 'indian', 'cricket', 'team', 'which', 'consists', 'players']


# **Stemming**

In [21]:
from nltk.stem  import PorterStemmer
stemmer = PorterStemmer()
port_stemmmer_output = [stemmer.stem(words) for words in words]
print(port_stemmmer_output)

['sachin', 'consid', 'one', 'greatest', 'cricket', 'player', 'virat', 'captain', 'indian', 'cricket', 'team', 'which', 'consist', 'player']


# **Lemmatization**

In [22]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
lemmatizer_output = [lemmatizer.lemmatize(words) for words in words]

print(lemmatizer_output)

[nltk_data] Downloading package wordnet to /root/nltk_data...


['sachin', 'considered', 'one', 'greatest', 'cricket', 'player', 'virat', 'captain', 'indian', 'cricket', 'team', 'which', 'consists', 'player']


In [24]:
from nltk import pos_tag

nltk.download('averaged_perceptron_tagger')

token = word_tokenize(sent)
cleaned_token = []

for word in token:
    if word not in stop_words:
        cleaned_token.append(word)

tagged = pos_tag(cleaned_token)
print(tagged)

[('Sachin', 'NNP'), ('considered', 'VBD'), ('one', 'CD'), ('greatest', 'JJS'), ('cricket', 'NN'), ('players', 'NNS'), ('.', '.'), ('Virat', 'NNP'), ('captain', 'NN'), ('Indian', 'JJ'), ('cricket', 'NN'), ('team', 'NN'), ('Which', 'NNP'), ('consists', 'VBZ'), ('11', 'CD'), ('players', 'NNS')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# **TF And IDF**

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [26]:
docs = [ "Sachin is considered to be one of the greatest cricket players",
         "Federer is considered one of the greatest tennis players",
         "Nadal is considered one of the greatest tennis players",
         "Virat is the captain of the Indian cricket team"]

In [27]:
vectorizer = TfidfVectorizer(analyzer = "word", norm = None, use_idf = True, smooth_idf = True)
Mat = vectorizer.fit(docs)
print(Mat.vocabulary_)

{'sachin': 12, 'is': 7, 'considered': 2, 'to': 16, 'be': 0, 'one': 10, 'of': 9, 'the': 15, 'greatest': 5, 'cricket': 3, 'players': 11, 'federer': 4, 'tennis': 14, 'nadal': 8, 'virat': 17, 'captain': 1, 'indian': 6, 'team': 13}


In [28]:
tfidMat = vectorizer.fit_transform(docs)
print(tfidMat)

  (0, 11)	1.2231435513142097
  (0, 3)	1.5108256237659907
  (0, 5)	1.2231435513142097
  (0, 15)	1.0
  (0, 9)	1.0
  (0, 10)	1.2231435513142097
  (0, 0)	1.916290731874155
  (0, 16)	1.916290731874155
  (0, 2)	1.2231435513142097
  (0, 7)	1.0
  (0, 12)	1.916290731874155
  (1, 14)	1.5108256237659907
  (1, 4)	1.916290731874155
  (1, 11)	1.2231435513142097
  (1, 5)	1.2231435513142097
  (1, 15)	1.0
  (1, 9)	1.0
  (1, 10)	1.2231435513142097
  (1, 2)	1.2231435513142097
  (1, 7)	1.0
  (2, 8)	1.916290731874155
  (2, 14)	1.5108256237659907
  (2, 11)	1.2231435513142097
  (2, 5)	1.2231435513142097
  (2, 15)	1.0
  (2, 9)	1.0
  (2, 10)	1.2231435513142097
  (2, 2)	1.2231435513142097
  (2, 7)	1.0
  (3, 13)	1.916290731874155
  (3, 6)	1.916290731874155
  (3, 1)	1.916290731874155
  (3, 17)	1.916290731874155
  (3, 3)	1.5108256237659907
  (3, 15)	2.0
  (3, 9)	1.0
  (3, 7)	1.0


In [29]:
features_names = vectorizer.get_feature_names_out()
print(features_names)

['be' 'captain' 'considered' 'cricket' 'federer' 'greatest' 'indian' 'is'
 'nadal' 'of' 'one' 'players' 'sachin' 'team' 'tennis' 'the' 'to' 'virat']


In [30]:
dense = tfidMat.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns = features_names)

In [31]:
df

Unnamed: 0,be,captain,considered,cricket,federer,greatest,indian,is,nadal,of,one,players,sachin,team,tennis,the,to,virat
0,1.916291,0.0,1.223144,1.510826,0.0,1.223144,0.0,1.0,0.0,1.0,1.223144,1.223144,1.916291,0.0,0.0,1.0,1.916291,0.0
1,0.0,0.0,1.223144,0.0,1.916291,1.223144,0.0,1.0,0.0,1.0,1.223144,1.223144,0.0,0.0,1.510826,1.0,0.0,0.0
2,0.0,0.0,1.223144,0.0,0.0,1.223144,0.0,1.0,1.916291,1.0,1.223144,1.223144,0.0,0.0,1.510826,1.0,0.0,0.0
3,0.0,1.916291,0.0,1.510826,0.0,0.0,1.916291,1.0,0.0,1.0,0.0,0.0,0.0,1.916291,0.0,2.0,0.0,1.916291
