In [1]:
# Sample input for the TF-IDF demo below: four short biographical paragraphs
# separated by blank lines. The bracketed tokens such as [44] or [a] look like
# citation markers carried over from the source text; they are left in here
# and are expected to be stripped by the preprocessing step.
paragraph = """Narendra Damodardas Modi was born on 17 September 1950 to a Gujarati Hindu family of oil presser (Modh-Ghanchi) which is an Other Backward Class (OBC) category[44][45] in Vadnagar, Mehsana district, Bombay State (present-day Gujarat). He was the third of six children born to Damodardas Mulchand Modi (c. 1915–1989) and Hiraben Modi (1923–2022).[46][a][47]

Modi had infrequently worked as a child in his father's tea business on the Vadnagar railway station platform, according to Modi and his neighbours.[48][49][50]

Modi completed his higher secondary education in Vadnagar in 1967; his teachers described him as an average student and a keen, gifted debater with an interest in theatre.[51] He preferred playing larger-than-life characters in theatrical productions, which has influenced his political image.[52][53]

When Modi was eight years old, he was introduced to the Rashtriya Swayamsevak Sangh (RSS) and began attending its local shakhas (training sessions). There, he met Lakshmanrao Inamdar, who inducted Modi as a balswayamsevak (junior cadet) in the RSS and became his political mentor.[54] While Modi was training with the RSS, he also met Vasant Gajendragadkar and Nathalal Jaghda, Bharatiya Jana Sangh leaders who in 1980 helped found the BJP's Gujarat unit.[55] As a teenager, he was enrolled in the National Cadet Corps.[56]"""

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

def clean_up_text(paragraph):
    """Split a paragraph into sentences and normalise each one for vectorisation.

    Every detected sentence is lower-cased, has all characters other than
    the letters a-z replaced by spaces, is word-tokenised, filtered against
    NLTK's English stop-word list and lemmatised with ``WordNetLemmatizer``
    (using its default part of speech).

    Args:
        paragraph: Raw text containing one or more sentences.

    Returns:
        A list with one space-joined string of lemmas per detected sentence,
        in the original sentence order. A sentence that consists only of
        stop words or non-letter characters yields an empty string.

    Note:
        The NLTK sentence-tokenizer model, stop-word list and WordNet data
        must already be available locally.
    """
    # Detect sentence boundaries on the original-case text. The Punkt
    # tokenizer uses capitalisation as evidence for where a sentence starts
    # (e.g. after an abbreviation), so lower-casing has to happen only after
    # the split, not before it.
    sentences = nltk.sent_tokenize(paragraph)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    corpus = []
    for sentence in sentences:
        # Digits, punctuation and citation brackets all become spaces.
        letters_only = re.sub('[^a-z]', ' ', sentence.lower())
        tokens = nltk.word_tokenize(letters_only)
        # Stop words are dropped before lemmatisation, so only the surviving
        # tokens are looked up in WordNet.
        lemmas = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words]
        corpus.append(' '.join(lemmas))

    return corpus
    
def tf_idf(paragraph):
    """Print the TF-IDF representation of the sentences in ``paragraph``.

    The paragraph is preprocessed with ``clean_up_text`` and the resulting
    sentence strings are fitted with a default ``TfidfVectorizer``. The
    learned vocabulary mapping, the feature names and the dense TF-IDF
    matrix are written to standard output; nothing is returned.

    Args:
        paragraph: Raw text to analyse.
    """
    sentences = clean_up_text(paragraph)

    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(sentences)

    # Report the fitted model: term -> column index, column labels, then the
    # matrix converted to a dense array.
    print("Vocabulary :", vectorizer.vocabulary_)
    print("\nFeature Names :", vectorizer.get_feature_names_out())
    print("\nTF-IDF Matrix:", weights.toarray())
    

In [3]:
# Run the TF-IDF demo on the sample paragraph; tf_idf prints its results.
tf_idf(paragraph)

Vocabulary : {'narendra': 62, 'damodardas': 20, 'modi': 60, 'born': 11, 'september': 80, 'gujarati': 35, 'hindu': 38, 'family': 28, 'oil': 67, 'presser': 74, 'modh': 59, 'ghanchi': 32, 'backward': 4, 'class': 17, 'obc': 66, 'category': 14, 'vadnagar': 96, 'mehsana': 56, 'district': 24, 'bombay': 10, 'state': 84, 'present': 73, 'day': 21, 'gujarat': 34, 'third': 93, 'six': 83, 'child': 16, 'mulchand': 61, 'hiraben': 39, 'infrequently': 44, 'worked': 98, 'father': 29, 'tea': 88, 'business': 12, 'railway': 76, 'station': 85, 'platform': 69, 'according': 0, 'neighbour': 65, 'completed': 18, 'higher': 37, 'secondary': 79, 'education': 25, 'teacher': 89, 'described': 23, 'average': 3, 'student': 86, 'keen': 50, 'gifted': 33, 'debater': 22, 'interest': 45, 'theatre': 91, 'preferred': 72, 'playing': 70, 'larger': 52, 'life': 54, 'character': 15, 'theatrical': 92, 'production': 75, 'influenced': 43, 'political': 71, 'image': 40, 'eight': 26, 'year': 99, 'old': 68, 'introduced': 46, 'rashtriya':