**Importing necessary libraries**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import math

**Downloading necessary resources**

In [None]:
# NLTK data packages required by the steps in this notebook:
#   punkt / punkt_tab                      -> word_tokenize, sent_tokenize
#   averaged_perceptron_tagger(_eng)       -> nltk.pos_tag
#   stopwords                              -> stopwords.words('english')
#   wordnet                                -> WordNetLemmatizer
# NLTK 3.9+ looks up the 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# packages instead of the older pickled ones, so both generations are
# requested to keep the notebook runnable on old and new NLTK releases.
nltk_resources = [
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
]
# nltk.download returns True/False per package rather than raising, so collect
# every result (no short-circuiting) and show whether all of them succeeded.
download_results = [nltk.download(resource) for resource in nltk_resources]
all(download_results)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

**Sample Document**

In [None]:
# Sample passage about lions; this single string is the input to every
# later step (tokenisation, POS tagging, stop-word removal, stemming,
# lemmatization and the sentence-level TF-IDF).
text = '''The lion (Panthera leo) is a large cat of the genus Panthera,
native to Africa and India. It has a muscular, broad-chested body; a short,
rounded head; round ears; and a hairy tuft at the end of its tail.
It is a social species, forming groups called prides.
A lion's pride consists of a few adult males, related females, and cubs.
Groups of female lions usually hunt together, preying mostly on large ungulates.
The lion inhabits grasslands, savannahs, and shrublands.
It is usually more diurnal than other wild cats, but when persecuted,
it adapts to being active at night and at twilight.
It has been listed as Vulnerable on the IUCN Red List since 1996
because populations in African countries have declined by about 43%
since the early 1990s.
Lion populations are untenable outside designated protected areas.
Although the cause of the decline is not fully understood,
habitat loss and conflicts with humans are the greatest causes for concern.
One of the most widely recognised animal symbols in human culture,
the lion has been extensively depicted in sculptures and paintings,
on national flags, and in literature and films.'''

**Tokenization**

In [None]:
# Split the raw text into word-level tokens; punctuation marks such as
# '(' and ',' become tokens of their own.
tokens = word_tokenize(text)

In [None]:
# Bare expression: the notebook renders the token list as the cell output.
tokens

['The',
 'lion',
 '(',
 'Panthera',
 'leo',
 ')',
 'is',
 'a',
 'large',
 'cat',
 'of',
 'the',
 'genus',
 'Panthera',
 ',',
 'native',
 'to',
 'Africa',
 'and',
 'India',
 '.',
 'It',
 'has',
 'a',
 'muscular',
 ',',
 'broad-chested',
 'body',
 ';',
 'a',
 'short',
 ',',
 'rounded',
 'head',
 ';',
 'round',
 'ears',
 ';',
 'and',
 'a',
 'hairy',
 'tuft',
 'at',
 'the',
 'end',
 'of',
 'its',
 'tail',
 '.',
 'It',
 'is',
 'a',
 'social',
 'species',
 ',',
 'forming',
 'groups',
 'called',
 'prides',
 '.',
 'A',
 'lion',
 "'s",
 'pride',
 'consists',
 'of',
 'a',
 'few',
 'adult',
 'males',
 ',',
 'related',
 'females',
 ',',
 'and',
 'cubs',
 '.',
 'Groups',
 'of',
 'female',
 'lions',
 'usually',
 'hunt',
 'together',
 ',',
 'preying',
 'mostly',
 'on',
 'large',
 'ungulates',
 '.',
 'The',
 'lion',
 'inhabits',
 'grasslands',
 ',',
 'savannahs',
 ',',
 'and',
 'shrublands',
 '.',
 'It',
 'is',
 'usually',
 'more',
 'diurnal',
 'than',
 'other',
 'wild',
 'cats',
 ',',
 'but',
 'when'

**POS tagging**

In [None]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs in the same order as `tokens`.
pos_tags = nltk.pos_tag(tokens)

In [None]:
# Bare expression: show the (token, tag) pairs as the cell output.
pos_tags

[('The', 'DT'),
 ('lion', 'NN'),
 ('(', '('),
 ('Panthera', 'NNP'),
 ('leo', 'NN'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('large', 'JJ'),
 ('cat', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('genus', 'NN'),
 ('Panthera', 'NNP'),
 (',', ','),
 ('native', 'JJ'),
 ('to', 'TO'),
 ('Africa', 'NNP'),
 ('and', 'CC'),
 ('India', 'NNP'),
 ('.', '.'),
 ('It', 'PRP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('muscular', 'JJ'),
 (',', ','),
 ('broad-chested', 'JJ'),
 ('body', 'NN'),
 (';', ':'),
 ('a', 'DT'),
 ('short', 'JJ'),
 (',', ','),
 ('rounded', 'JJ'),
 ('head', 'NN'),
 (';', ':'),
 ('round', 'JJ'),
 ('ears', 'NNS'),
 (';', ':'),
 ('and', 'CC'),
 ('a', 'DT'),
 ('hairy', 'JJ'),
 ('tuft', 'NN'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('end', 'NN'),
 ('of', 'IN'),
 ('its', 'PRP$'),
 ('tail', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('social', 'JJ'),
 ('species', 'NNS'),
 (',', ','),
 ('forming', 'VBG'),
 ('groups', 'NNS'),
 ('called', 'VBD'),
 ('prides', 'NNS'),
 ('.', '.'),
 ('A

**Stop-word removal**

In [None]:
# English stop-word list from the NLTK corpus, stored as a set so the
# membership test used when filtering tokens is a hash lookup.
stop_words = set(stopwords.words('english'))

In [None]:
# Bare expression: show the stop-word set as the cell output.
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [None]:
# Keep only tokens that are not stop words. The comparison is done on the
# lower-cased token, but surviving tokens keep their original casing.
filtered_tokens = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)

In [None]:
# Bare expression: show the tokens that survived stop-word removal.
filtered_tokens

['lion',
 '(',
 'Panthera',
 'leo',
 ')',
 'large',
 'cat',
 'genus',
 'Panthera',
 ',',
 'native',
 'Africa',
 'India',
 '.',
 'muscular',
 ',',
 'broad-chested',
 'body',
 ';',
 'short',
 ',',
 'rounded',
 'head',
 ';',
 'round',
 'ears',
 ';',
 'hairy',
 'tuft',
 'end',
 'tail',
 '.',
 'social',
 'species',
 ',',
 'forming',
 'groups',
 'called',
 'prides',
 '.',
 'lion',
 "'s",
 'pride',
 'consists',
 'adult',
 'males',
 ',',
 'related',
 'females',
 ',',
 'cubs',
 '.',
 'Groups',
 'female',
 'lions',
 'usually',
 'hunt',
 'together',
 ',',
 'preying',
 'mostly',
 'large',
 'ungulates',
 '.',
 'lion',
 'inhabits',
 'grasslands',
 ',',
 'savannahs',
 ',',
 'shrublands',
 '.',
 'usually',
 'diurnal',
 'wild',
 'cats',
 ',',
 'persecuted',
 ',',
 'adapts',
 'active',
 'night',
 'twilight',
 '.',
 'listed',
 'Vulnerable',
 'IUCN',
 'Red',
 'List',
 'since',
 '1996',
 'populations',
 'African',
 'countries',
 'declined',
 '43',
 '%',
 'since',
 'early',
 '1990s',
 '.',
 'Lion',
 'popula

**Stemming**

In [None]:
# Reduce each remaining token to its Porter stem. Stems are produced by
# suffix stripping, so they are not always dictionary words.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
stemmed_tokens

['lion',
 '(',
 'panthera',
 'leo',
 ')',
 'larg',
 'cat',
 'genu',
 'panthera',
 ',',
 'nativ',
 'africa',
 'india',
 '.',
 'muscular',
 ',',
 'broad-chest',
 'bodi',
 ';',
 'short',
 ',',
 'round',
 'head',
 ';',
 'round',
 'ear',
 ';',
 'hairi',
 'tuft',
 'end',
 'tail',
 '.',
 'social',
 'speci',
 ',',
 'form',
 'group',
 'call',
 'pride',
 '.',
 'lion',
 "'s",
 'pride',
 'consist',
 'adult',
 'male',
 ',',
 'relat',
 'femal',
 ',',
 'cub',
 '.',
 'group',
 'femal',
 'lion',
 'usual',
 'hunt',
 'togeth',
 ',',
 'prey',
 'mostli',
 'larg',
 'ungul',
 '.',
 'lion',
 'inhabit',
 'grassland',
 ',',
 'savannah',
 ',',
 'shrubland',
 '.',
 'usual',
 'diurnal',
 'wild',
 'cat',
 ',',
 'persecut',
 ',',
 'adapt',
 'activ',
 'night',
 'twilight',
 '.',
 'list',
 'vulner',
 'iucn',
 'red',
 'list',
 'sinc',
 '1996',
 'popul',
 'african',
 'countri',
 'declin',
 '43',
 '%',
 'sinc',
 'earli',
 '1990',
 '.',
 'lion',
 'popul',
 'unten',
 'outsid',
 'design',
 'protect',
 'area',
 '.',
 'althou

**Lemmatization**

In [None]:
def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag from nltk.pos_tag to a WordNet POS letter.

    Args:
        treebank_tag: tag string such as 'JJ', 'VBZ', 'RB' or 'NNS'.

    Returns:
        'a' for adjectives, 'v' for verbs, 'r' for adverbs, and 'n' (the
        lemmatizer's own default) for everything else.
    """
    if treebank_tag.startswith('J'):
        return 'a'
    if treebank_tag.startswith('V'):
        return 'v'
    if treebank_tag.startswith('R'):
        return 'r'
    return 'n'


lemmatizer = WordNetLemmatizer()

# WordNetLemmatizer treats every word as a noun unless told otherwise, which
# leaves verbs such as 'forming' or 'consists' untouched. Reuse the tags that
# were computed on the full text (more reliable than re-tagging the
# stop-word-free list) and apply the same stop-word filter to them.
filtered_pos_tags = [
    (word, tag) for word, tag in pos_tags if word.lower() not in stop_words
]
assert [word for word, _ in filtered_pos_tags] == filtered_tokens, (
    "pos_tags and filtered_tokens are out of sync; re-run the tokenisation, "
    "POS tagging and stop-word cells in order"
)

lemmatized_tokens = [
    lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
    for word, tag in filtered_pos_tags
]
lemmatized_tokens

['lion',
 '(',
 'Panthera',
 'leo',
 ')',
 'large',
 'cat',
 'genus',
 'Panthera',
 ',',
 'native',
 'Africa',
 'India',
 '.',
 'muscular',
 ',',
 'broad-chested',
 'body',
 ';',
 'short',
 ',',
 'rounded',
 'head',
 ';',
 'round',
 'ear',
 ';',
 'hairy',
 'tuft',
 'end',
 'tail',
 '.',
 'social',
 'specie',
 ',',
 'forming',
 'group',
 'called',
 'pride',
 '.',
 'lion',
 "'s",
 'pride',
 'consists',
 'adult',
 'male',
 ',',
 'related',
 'female',
 ',',
 'cub',
 '.',
 'Groups',
 'female',
 'lion',
 'usually',
 'hunt',
 'together',
 ',',
 'preying',
 'mostly',
 'large',
 'ungulate',
 '.',
 'lion',
 'inhabits',
 'grassland',
 ',',
 'savannah',
 ',',
 'shrublands',
 '.',
 'usually',
 'diurnal',
 'wild',
 'cat',
 ',',
 'persecuted',
 ',',
 'adapts',
 'active',
 'night',
 'twilight',
 '.',
 'listed',
 'Vulnerable',
 'IUCN',
 'Red',
 'List',
 'since',
 '1996',
 'population',
 'African',
 'country',
 'declined',
 '43',
 '%',
 'since',
 'early',
 '1990s',
 '.',
 'Lion',
 'population',
 'untena

**TF-IDF representation**

In [None]:
# Split the text into sentences; each sentence is treated as one "document"
# by the TF-IDF computation that follows.
sentences = nltk.sent_tokenize(text)
sentences

['The lion (Panthera leo) is a large cat of the genus Panthera, \nnative to Africa and India.',
 'It has a muscular, broad-chested body; a short, \nrounded head; round ears; and a hairy tuft at the end of its tail.',
 'It is a social species, forming groups called prides.',
 "A lion's pride consists of a few adult males, related females, and cubs.",
 'Groups of female lions usually hunt together, preying mostly on large ungulates.',
 'The lion inhabits grasslands, savannahs, and shrublands.',
 'It is usually more diurnal than other wild cats, but when persecuted, \nit adapts to being active at night and at twilight.',
 'It has been listed as Vulnerable on the IUCN Red List since 1996 \nbecause populations in African countries have declined by about 43% \nsince the early 1990s.',
 'Lion populations are untenable outside designated protected areas.',
 'Although the cause of the decline is not fully understood, \nhabitat loss and conflicts with humans are the greatest causes for concern.'

In [None]:
# Reference TF-IDF from scikit-learn, one row per sentence. With its default
# settings TfidfVectorizer lower-cases the text, ignores punctuation and
# single-character tokens, smooths the IDF and L2-normalises each row, so its
# values are not expected to equal the hand-computed scores below.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
feature_names = tfidf_vectorizer.get_feature_names_out()

# --- Manual TF-IDF, treating every sentence as one document ---------------
# Tokenize each sentence exactly once. A dedicated name is used so that the
# document-level `tokens` list from the tokenisation step is not overwritten.
sentence_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
num_sentences = len(sentences)

# Document frequency: the number of sentences that contain each token.
# set(...) makes a token count once per sentence, and comparing whole tokens
# avoids the false hits of a substring search on the raw sentence
# (e.g. 'a' inside 'large', or 'at' inside 'cat').
document_frequency = Counter(
    token for sent_tokens in sentence_tokens for token in set(sent_tokens)
)

# tfidf_scores_by_sentence[i] maps token -> score for sentence i.
# tfidf_scores is the flat token -> score view that is printed below; when a
# token occurs in several sentences it keeps the score from the last one.
tfidf_scores = {}
tfidf_scores_by_sentence = []
for sent_tokens in sentence_tokens:
    token_counter = Counter(sent_tokens)
    total_tokens = len(sent_tokens)
    sentence_scores = {}
    for token, count in token_counter.items():
        tf = count / total_tokens  # relative frequency within the sentence
        # document_frequency[token] >= 1 because the token comes from this
        # very sentence, so the division is always defined.
        idf = math.log(num_sentences / document_frequency[token])
        sentence_scores[token] = tf * idf
    tfidf_scores_by_sentence.append(sentence_scores)
    tfidf_scores.update(sentence_scores)
print()
print("TF-IDF Scores:")
for word, score in tfidf_scores.items():
    print(f"{word}: {score}")


TF-IDF Scores:
The: 0.17047480922384253
lion: 0.023892647283765767
(: 0.11418548918087479
Panthera: 0.22837097836174958
leo: 0.11418548918087479
): 0.11418548918087479
is: 0.01883271348929405
a: 0.0
large: 0.12176772087417323
cat: 0.08117848058278215
of: 0.018367751623342893
the: 0.02739303780260953
genus: 0.11418548918087479
,: 0.018242790496559205
native: 0.11418548918087479
to: 0.0404640364671392
Africa: 0.08117848058278215
and: 0.041089556703914294
India: 0.11418548918087479
.: 0.0
It: 0.034882790057878615
has: 0.03937221164031094
muscular: 0.08563911688565609
broad-chested: 0.08563911688565609
body: 0.08563911688565609
;: 0.25691735065696825
short: 0.08563911688565609
rounded: 0.08563911688565609
head: 0.08563911688565609
round: 0.08563911688565609
ears: 0.08563911688565609
hairy: 0.08563911688565609
tuft: 0.08563911688565609
at: 0.0160536556369721
end: 0.08563911688565609
its: 0.060883860437086615
tail: 0.08563911688565609
social: 0.21799047934530644
species: 0.21799047934530644