In [1]:
# Short five-sentence text used as input for the NLTK preprocessing demo.
# Adjacent string literals are concatenated into a single string.
sample_document = (
    "Tokenization is the process of splitting a document into individual words or tokens. "
    "POS tagging assigns grammatical tags to tokens. "
    "Stop words are common words that are often removed. "
    "Stemming reduces words to their root form. "
    "Lemmatization is similar to stemming but considers the meaning of words."
)


In [3]:
pip install nltk

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.4.16-cp311-cp311-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ----------------------------- ---------- 30.7/42.0 kB 1.3 MB/s eta 0:00:01
     ----------------------------- ---------- 30.7/42.0 kB 1.3 MB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 253.9 kB/s eta 0:00:00
Collecting tqdm (from nltk)
  Using cached tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.5 MB 4.8 MB/s eta 0:00:01
   -- ------------------------------------- 0.1/1.5 MB 4.8 MB/s eta 0:00:01
   -- ------------------------------------- 0.1/1.

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# Download necessary resources.
# Older NLTK releases (e.g. 3.8.1) load the tokenizer and tagger models from
# 'punkt' and 'averaged_perceptron_tagger'; recent releases (3.9 and later)
# look for 'punkt_tab' and 'averaged_perceptron_tagger_eng' instead. Fetching
# both sets keeps this cell working whichever version pip installed.
for resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (as produced by pos_tag) to a WordNet POS code.

    Returns 'a' (adjective), 'v' (verb) or 'r' (adverb) for tags starting with
    J, V or R respectively, and 'n' (noun, the lemmatizer's default) otherwise.
    """
    if treebank_tag.startswith('J'):
        return 'a'
    if treebank_tag.startswith('V'):
        return 'v'
    if treebank_tag.startswith('R'):
        return 'r'
    return 'n'


# Tokenization
tokens = word_tokenize(sample_document)

# POS tagging (done on the full token sequence so the tagger sees whole sentences)
pos_tags = pos_tag(tokens)

# Stop words removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
# Same filter applied to the (token, tag) pairs so each kept token retains its tag.
filtered_tagged = [(token, tag) for token, tag in pos_tags if token.lower() not in stop_words]

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

# Lemmatization
# lemmatize() treats every word as a noun unless told otherwise, which leaves
# verb forms such as 'considers' or 'removed' unchanged; pass the POS derived
# from the tagger so verbs, adjectives and adverbs are reduced as well.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(token, pos=_wordnet_pos(tag))
    for token, tag in filtered_tagged
]

# Print results
print("Original Document:")
print(sample_document)
print("\nTokenization:")
print(tokens)
print("\nPOS Tagging:")
print(pos_tags)
print("\nStop Words Removal:")
print(filtered_tokens)
print("\nStemming:")
print(stemmed_tokens)
print("\nLemmatization:")
print(lemmatized_tokens)


[nltk_data] Downloading package punkt to C:\Users\Vivobook
[nltk_data]     Pro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\Vivobook
[nltk_data]     Pro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\Vivobook
[nltk_data]     Pro\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Vivobook Pro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


Original Document:
Tokenization is the process of splitting a document into individual words or tokens. POS tagging assigns grammatical tags to tokens. Stop words are common words that are often removed. Stemming reduces words to their root form. Lemmatization is similar to stemming but considers the meaning of words.

Tokenization:
['Tokenization', 'is', 'the', 'process', 'of', 'splitting', 'a', 'document', 'into', 'individual', 'words', 'or', 'tokens', '.', 'POS', 'tagging', 'assigns', 'grammatical', 'tags', 'to', 'tokens', '.', 'Stop', 'words', 'are', 'common', 'words', 'that', 'are', 'often', 'removed', '.', 'Stemming', 'reduces', 'words', 'to', 'their', 'root', 'form', '.', 'Lemmatization', 'is', 'similar', 'to', 'stemming', 'but', 'considers', 'the', 'meaning', 'of', 'words', '.']

POS Tagging:
[('Tokenization', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('splitting', 'VBG'), ('a', 'DT'), ('document', 'NN'), ('into', 'IN'), ('individual', 'JJ'), ('words

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Toy corpus for the TF-IDF / cosine-similarity demo: four one-sentence documents.
docs = [
    "Sachin is considered to be one of the greatest cricket players",
    "Federer is considered one of the greatest tennis players",
    "Nadal is considered one of the greatest tennis players",
    "Virat is the captain of the Indian cricket team",
]


In [8]:
# Word-level TF-IDF with smoothed IDF and no row normalisation (norm=None),
# so the matrix holds raw tf * idf weights.
vectorizer = TfidfVectorizer(
    analyzer="word",
    norm=None,
    use_idf=True,
    smooth_idf=True,
)
# fit() learns the vocabulary and IDF weights and returns the fitted
# vectorizer itself, so `Mat` refers to the same object as `vectorizer`.
Mat = vectorizer.fit(docs)
# Mapping of each term to its column index in the TF-IDF matrix.
print(Mat.vocabulary_)

{'sachin': 12, 'is': 7, 'considered': 2, 'to': 16, 'be': 0, 'one': 10, 'of': 9, 'the': 15, 'greatest': 5, 'cricket': 3, 'players': 11, 'federer': 4, 'tennis': 14, 'nadal': 8, 'virat': 17, 'captain': 1, 'indian': 6, 'team': 13}


In [9]:
# Fit on the corpus again and build the sparse document-term TF-IDF matrix.
tfidfMat = vectorizer.fit_transform(raw_documents=docs)

In [10]:
# Sparse text form: one "(row, column)  weight" line per non-zero entry.
print(tfidfMat)

  (0, 11)	1.2231435513142097
  (0, 3)	1.5108256237659907
  (0, 5)	1.2231435513142097
  (0, 15)	1.0
  (0, 9)	1.0
  (0, 10)	1.2231435513142097
  (0, 0)	1.916290731874155
  (0, 16)	1.916290731874155
  (0, 2)	1.2231435513142097
  (0, 7)	1.0
  (0, 12)	1.916290731874155
  (1, 14)	1.5108256237659907
  (1, 4)	1.916290731874155
  (1, 11)	1.2231435513142097
  (1, 5)	1.2231435513142097
  (1, 15)	1.0
  (1, 9)	1.0
  (1, 10)	1.2231435513142097
  (1, 2)	1.2231435513142097
  (1, 7)	1.0
  (2, 8)	1.916290731874155
  (2, 14)	1.5108256237659907
  (2, 11)	1.2231435513142097
  (2, 5)	1.2231435513142097
  (2, 15)	1.0
  (2, 9)	1.0
  (2, 10)	1.2231435513142097
  (2, 2)	1.2231435513142097
  (2, 7)	1.0
  (3, 13)	1.916290731874155
  (3, 6)	1.916290731874155
  (3, 1)	1.916290731874155
  (3, 17)	1.916290731874155
  (3, 3)	1.5108256237659907
  (3, 15)	2.0
  (3, 9)	1.0
  (3, 7)	1.0


In [11]:
# Vocabulary terms listed in the column order of the TF-IDF matrix.
features_names = vectorizer.get_feature_names_out()
print(features_names)

['be' 'captain' 'considered' 'cricket' 'federer' 'greatest' 'indian' 'is'
 'nadal' 'of' 'one' 'players' 'sachin' 'team' 'tennis' 'the' 'to' 'virat']


In [12]:
# Expand the sparse TF-IDF matrix into a nested Python list, then wrap it in
# a DataFrame whose columns are labelled with the vocabulary terms.
dense = tfidfMat.todense()
denselist = dense.tolist()
df = pd.DataFrame(data=denselist, columns=features_names)

In [13]:
# Display the dense TF-IDF table: one row per document, one column per term.
df

Unnamed: 0,be,captain,considered,cricket,federer,greatest,indian,is,nadal,of,one,players,sachin,team,tennis,the,to,virat
0,1.916291,0.0,1.223144,1.510826,0.0,1.223144,0.0,1.0,0.0,1.0,1.223144,1.223144,1.916291,0.0,0.0,1.0,1.916291,0.0
1,0.0,0.0,1.223144,0.0,1.916291,1.223144,0.0,1.0,0.0,1.0,1.223144,1.223144,0.0,0.0,1.510826,1.0,0.0,0.0
2,0.0,0.0,1.223144,0.0,0.0,1.223144,0.0,1.0,1.916291,1.0,1.223144,1.223144,0.0,0.0,1.510826,1.0,0.0,0.0
3,0.0,1.916291,0.0,1.510826,0.0,0.0,1.916291,1.0,0.0,1.0,0.0,0.0,0.0,1.916291,0.0,2.0,0.0,1.916291


In [16]:
# Row labels, given in the same order as the documents in `docs`.
docList = ['Doc 1', 'Doc 2', 'Doc 3', 'Doc 4']
# Use docList as-is for the index: the matrix rows follow the order of `docs`,
# whereas sorting the labels lexicographically would mislabel rows as soon as
# there are ten or more documents ('Doc 10' sorts before 'Doc 2').
skDocsIfIdfdf = pd.DataFrame(tfidfMat.todense(), index=docList, columns=features_names)
print(skDocsIfIdfdf)

             be   captain  considered   cricket   federer  greatest    indian  \
Doc 1  1.916291  0.000000    1.223144  1.510826  0.000000  1.223144  0.000000   
Doc 2  0.000000  0.000000    1.223144  0.000000  1.916291  1.223144  0.000000   
Doc 3  0.000000  0.000000    1.223144  0.000000  0.000000  1.223144  0.000000   
Doc 4  0.000000  1.916291    0.000000  1.510826  0.000000  0.000000  1.916291   

        is     nadal   of       one   players    sachin      team    tennis  \
Doc 1  1.0  0.000000  1.0  1.223144  1.223144  1.916291  0.000000  0.000000   
Doc 2  1.0  0.000000  1.0  1.223144  1.223144  0.000000  0.000000  1.510826   
Doc 3  1.0  1.916291  1.0  1.223144  1.223144  0.000000  0.000000  1.510826   
Doc 4  1.0  0.000000  1.0  0.000000  0.000000  0.000000  1.916291  0.000000   

       the        to     virat  
Doc 1  1.0  1.916291  0.000000  
Doc 2  1.0  0.000000  0.000000  
Doc 3  1.0  0.000000  0.000000  
Doc 4  2.0  0.000000  1.916291  


In [17]:
# Pairwise cosine similarity between every pair of document TF-IDF vectors.
csim = cosine_similarity(X=tfidfMat, Y=tfidfMat)

In [18]:
# Label both axes with docList in its given order, which matches the row order
# of the similarity matrix. Sorting the labels would misalign them once there
# are ten or more documents ('Doc 10' sorts before 'Doc 2').
csimDf = pd.DataFrame(csim, index=docList, columns=docList)

In [19]:
# Show the labelled document-by-document cosine similarity matrix.
print(csimDf)

          Doc 1     Doc 2     Doc 3     Doc 4
Doc 1  1.000000  0.492416  0.492416  0.277687
Doc 2  0.492416  1.000000  0.754190  0.215926
Doc 3  0.492416  0.754190  1.000000  0.215926
Doc 4  0.277687  0.215926  0.215926  1.000000
