In [1]:
# Demo text for the NLP cells below: one sentence per preprocessing step.
# Adjacent string literals are joined by Python into a single string.
sample_document = (
    "Tokenization is the process of splitting a document into individual words or tokens. "
    "POS tagging assigns grammatical tags to tokens. "
    "Stop words are common words that are often removed. "
    "Stemming reduces words to their root form. "
    "Lemmatization is similar to stemming but considers the meaning of words."
)


In [3]:
pip install nltk

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.4.16-cp311-cp311-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ----------------------------- ---------- 30.7/42.0 kB 1.3 MB/s eta 0:00:01
     ----------------------------- ---------- 30.7/42.0 kB 1.3 MB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 253.9 kB/s eta 0:00:00
Collecting tqdm (from nltk)
  Using cached tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.5 MB 4.8 MB/s eta 0:00:01
   -- ------------------------------------- 0.1/1.5 MB 4.8 MB/s eta 0:00:01
   -- ------------------------------------- 0.1/1.

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# Fetch the NLTK data packages the steps below rely on: tokenizer models,
# the stop-word lists, WordNet, and the POS tagger model.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Tokenization: split the raw text into word and punctuation tokens.
tokens = word_tokenize(sample_document)

# POS tagging: run on the full token list (before stop-word removal) so the
# tagger sees each word in its original sentence context.
pos_tags = pos_tag(tokens)

# Stop words removal: case-insensitive match against the English list.
# Punctuation tokens are not in that list and therefore stay in the result.
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

# Stemming: rule-based suffix stripping with the Porter algorithm.
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

# Lemmatization: WordNetLemmatizer treats every word as a noun unless told
# otherwise, which leaves verb forms such as "assigns" untouched. Feed it the
# part of speech found by the tagger instead.
# First letter of the tagger's tag -> WordNet POS code
# ('a' adjective, 'v' verb, 'n' noun, 'r' adverb); anything else falls back
# to 'n', the lemmatizer's own default.
_TAG_PREFIX_TO_WORDNET_POS = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    lemmatizer.lemmatize(token, pos=_TAG_PREFIX_TO_WORDNET_POS.get(tag[:1], 'n'))
    for token, tag in pos_tags
    if token.lower() not in stop_words  # same filter as filtered_tokens
]

# The lemmas must line up one-to-one with the filtered tokens.
assert len(lemmatized_tokens) == len(filtered_tokens)

# Print results
print("Original Document:")
print(sample_document)
print("\nTokenization:")
print(tokens)
print("\nPOS Tagging:")
print(pos_tags)
print("\nStop Words Removal:")
print(filtered_tokens)
print("\nStemming:")
print(stemmed_tokens)
print("\nLemmatization:")
print(lemmatized_tokens)


[nltk_data] Downloading package punkt to C:\Users\Vivobook
[nltk_data]     Pro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\Vivobook
[nltk_data]     Pro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\Vivobook
[nltk_data]     Pro\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Vivobook Pro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


Original Document:
Tokenization is the process of splitting a document into individual words or tokens. POS tagging assigns grammatical tags to tokens. Stop words are common words that are often removed. Stemming reduces words to their root form. Lemmatization is similar to stemming but considers the meaning of words.

Tokenization:
['Tokenization', 'is', 'the', 'process', 'of', 'splitting', 'a', 'document', 'into', 'individual', 'words', 'or', 'tokens', '.', 'POS', 'tagging', 'assigns', 'grammatical', 'tags', 'to', 'tokens', '.', 'Stop', 'words', 'are', 'common', 'words', 'that', 'are', 'often', 'removed', '.', 'Stemming', 'reduces', 'words', 'to', 'their', 'root', 'form', '.', 'Lemmatization', 'is', 'similar', 'to', 'stemming', 'but', 'considers', 'the', 'meaning', 'of', 'words', '.']

POS Tagging:
[('Tokenization', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('splitting', 'VBG'), ('a', 'DT'), ('document', 'NN'), ('into', 'IN'), ('individual', 'JJ'), ('words

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Text to vectorize. It lives under its own name so that this cell does not
# overwrite the longer `sample_document` read by the NLTK preprocessing cell;
# re-running that cell after this one then still processes the same input.
tfidf_document = "Tokenization is the process of splitting a document into individual words or tokens. POS tagging assigns grammatical tags to tokens."

# Create a TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Fit on a one-document corpus and transform it into a 1 x n_terms matrix.
# NOTE: with a single document every term occurs in every document, so the
# scores differ only by how often a term appears in the text (see the printed
# values: all equal except the repeated term). Add more documents to the list
# to obtain weights that reflect document frequency.
tfidf_matrix = vectorizer.fit_transform([tfidf_document])

# Vocabulary terms, in the same column order as the matrix.
feature_names = vectorizer.get_feature_names_out()

# Dense weights for the only row of the matrix.
tfidf_values = tfidf_matrix.toarray()[0]

# Pair each term with its weight.
tfidf_dict = dict(zip(feature_names, tfidf_values))

# Print the TF-IDF value for each term.
for term, tfidf in tfidf_dict.items():
    print(f"{term}: {tfidf}")



assigns: 0.2182178902359924
document: 0.2182178902359924
grammatical: 0.2182178902359924
individual: 0.2182178902359924
into: 0.2182178902359924
is: 0.2182178902359924
of: 0.2182178902359924
or: 0.2182178902359924
pos: 0.2182178902359924
process: 0.2182178902359924
splitting: 0.2182178902359924
tagging: 0.2182178902359924
tags: 0.2182178902359924
the: 0.2182178902359924
to: 0.2182178902359924
tokenization: 0.2182178902359924
tokens: 0.4364357804719848
words: 0.2182178902359924
