In [None]:
#installing nlp library
!pip install nltk

In [None]:
# Sample input text for the tokenization and vectorization steps: a
# two-paragraph biography excerpt. It still contains raw artifacts (IPA
# pronunciation, the "ⓘ" symbol, footnote markers such as [c][d]); nothing is
# cleaned at this point.
# Note: the triple-quoted literal starts and ends with a newline character.
paragraph = """
Rahul Rajiv Gandhi (Hindi pronunciation: [ˈraːɦʊl raːdʒiːʋ ˈɡaːndʱiː] ⓘ; born 19 June 1970) is an Indian politician. A member of the Indian National Congress (INC), he has served as the 12th Leader of the Opposition in Lok Sabha and as the member of the Lok Sabha for Rae Bareli, Uttar Pradesh, since June 2024.[c][d] He previously represented the constituency of Wayanad, Kerala, from 2019 to 2024, and Amethi, Uttar Pradesh, from 2004 to 2019. Gandhi served as the party president of the INC from December 2017 to July 2019 and is the chairperson of the Indian Youth Congress, the National Students Union of India, and a trustee of the Rajiv Gandhi Foundation and Rajiv Gandhi Charitable Trust. He is a member of the Nehru–Gandhi political family.

Born in New Delhi, Gandhi spent his early childhood between Delhi and Dehradun and stayed away from the public sphere for much of his childhood and early youth. He received primary education in New Delhi and then attended the elite all-boys' boarding The Doon School in Dehradun. Due to security concerns, he was later home-schooled. Gandhi commenced his undergraduate degree at St. Stephen's College before moving to Harvard University. The following year, due to security threats following the assassination of his father, he moved to Rollins College in Florida, completing his degree in 1994. The next year, he obtained his M.Phil. from Cambridge. After completing his post-graduation, he initiated his professional career with the Monitor Group, a management consulting firm in London. Soon thereafter, he returned to India and founded Backops Services Private Ltd, a technology outsourcing firm based in Mumbai.
"""

In [None]:
#import libraries

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Sentence tokenization.
# sent_tokenize relies on the Punkt sentence-boundary models. Recent NLTK
# releases load them from the 'punkt_tab' resource while older releases use
# 'punkt', so both are fetched to keep this cell working regardless of which
# NLTK version the (unpinned) install pulled in.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

# Split the raw paragraph into a list of sentence strings.
sentence = nltk.sent_tokenize(paragraph)

In [None]:
# Display the sentences produced by nltk.sent_tokenize for a visual check.
sentence

In [None]:
# Sanity check: container type of the tokenizer result and the number of sentences found.
type(sentence), len(sentence)

In [47]:
# Fetch the NLTK corpora required for stop-word filtering and lemmatization.
for nltk_resource in ('stopwords', 'wordnet'):
  nltk.download(nltk_resource)

# Word normalizers made available to the rest of the notebook.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [53]:
import re

# Build the stop-word set ONCE. Re-creating it inside the comprehension would
# reload the English stop-word list for every single token.
stop_words = set(stopwords.words('english'))

# Anything that is not an ASCII letter becomes a space. This also removes
# digits and non-ASCII characters, which is why the IPA pronunciation in the
# first sentence collapses into fragments such as 'ra', 'l' and 'nd'.
non_letters = re.compile('[^a-zA-Z]')

# corpus[k] is the cleaned form of sentence[k]: letters only, lower-cased,
# stop words removed, remaining words lemmatized and re-joined with spaces.
corpus = []
for raw_sentence in sentence:
    tokens = non_letters.sub(' ', raw_sentence).lower().split()
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(" ".join(kept))

In [54]:
# Inspect the cleaned sentences (lower-cased, stop words removed, lemmatized).
corpus

['rahul rajiv gandhi hindi pronunciation ra l ra nd born june indian politician',
 'member indian national congress inc served th leader opposition lok sabha member lok sabha rae bareli uttar pradesh since june',
 'c previously represented constituency wayanad kerala amethi uttar pradesh',
 'gandhi served party president inc december july chairperson indian youth congress national student union india trustee rajiv gandhi foundation rajiv gandhi charitable trust',
 'member nehru gandhi political family',
 'born new delhi gandhi spent early childhood delhi dehradun stayed away public sphere much childhood early youth',
 'received primary education new delhi attended elite boy boarding doon school dehradun',
 'due security concern later home schooled',
 'gandhi commenced undergraduate degree st stephen college moving harvard university',
 'following year due security threat following assassination father moved rollins college florida completing degree',
 'next year obtained phil',
 'cambr

In [55]:
# Bag of words: represent each cleaned sentence by its token counts.

from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer is created with its default settings.
cv = CountVectorizer()

In [57]:
# Learn the vocabulary from the cleaned sentences and build the sparse
# document-term count matrix (one row per entry of `corpus`).
voc = cv.fit_transform(corpus)

In [62]:
# Mapping from each learned term to its column index in the count matrix.
cv.vocabulary_

{'rahul': 86,
 'rajiv': 87,
 'gandhi': 38,
 'hindi': 42,
 'pronunciation': 82,
 'ra': 84,
 'nd': 64,
 'born': 8,
 'june': 49,
 'indian': 46,
 'politician': 74,
 'member': 57,
 'national': 63,
 'congress': 19,
 'inc': 44,
 'served': 96,
 'th': 107,
 'leader': 52,
 'opposition': 69,
 'lok': 53,
 'sabha': 92,
 'rae': 85,
 'bareli': 5,
 'uttar': 115,
 'pradesh': 76,
 'since': 98,
 'previously': 78,
 'represented': 89,
 'constituency': 20,
 'wayanad': 116,
 'kerala': 50,
 'amethi': 0,
 'party': 71,
 'president': 77,
 'december': 22,
 'july': 48,
 'chairperson': 12,
 'youth': 118,
 'student': 105,
 'union': 113,
 'india': 45,
 'trustee': 111,
 'foundation': 36,
 'charitable': 13,
 'trust': 110,
 'nehru': 65,
 'political': 73,
 'family': 31,
 'new': 66,
 'delhi': 25,
 'spent': 100,
 'early': 28,
 'childhood': 14,
 'dehradun': 24,
 'stayed': 103,
 'away': 3,
 'public': 83,
 'sphere': 101,
 'much': 61,
 'received': 88,
 'primary': 79,
 'education': 29,
 'attended': 2,
 'elite': 30,
 'boy': 9,
 

In [65]:
# Dense view of the first sentence's count vector; an entry of 2 marks a term
# that occurs twice in that sentence (e.g. 'ra' in corpus[0]).
voc[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [66]:
# Cleaned text of the first sentence, for comparison with its count vector.
corpus[0]

'rahul rajiv gandhi hindi pronunciation ra l ra nd born june indian politician'

**TF-IDF Practice**

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(3,3) restricts the features to word trigrams only
# (no unigrams or bigrams); each trigram is weighted by TF-IDF.
tf = TfidfVectorizer(ngram_range=(3,3))

In [68]:
# Fit the TF-IDF vectorizer on the cleaned sentences and build the sparse
# TF-IDF matrix (one row per entry of `corpus`).
ch = tf.fit_transform(corpus)

In [70]:
# Dense view of the first sentence's TF-IDF vector.
ch[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.31622777, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.31622777, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.31622777, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.31622777, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

<14x131 sparse matrix of type '<class 'numpy.float64'>'
	with 131 stored elements in Compressed Sparse Row format>