In [3]:
import re
import nltk
from nltk.corpus import stopwords

In [2]:
paragraph = """Narendra Damodardas Modi was born on 17 September 1950 to a Gujarati Hindu family of oil presser (Modh-Ghanchi) which is an Other Backward Class (OBC) category[44][45] in Vadnagar, Mehsana district, Bombay State (present-day Gujarat). He was the third of six children born to Damodardas Mulchand Modi (c. 1915–1989) and Hiraben Modi (1923–2022).[46][a][47]

Modi had infrequently worked as a child in his father's tea business on the Vadnagar railway station platform, according to Modi and his neighbours.[48][49][50]

Modi completed his higher secondary education in Vadnagar in 1967; his teachers described him as an average student and a keen, gifted debater with an interest in theatre.[51] He preferred playing larger-than-life characters in theatrical productions, which has influenced his political image.[52][53]

When Modi was eight years old, he was introduced to the Rashtriya Swayamsevak Sangh (RSS) and began attending its local shakhas (training sessions). There, he met Lakshmanrao Inamdar, who inducted Modi as a balswayamsevak (junior cadet) in the RSS and became his political mentor.[54] While Modi was training with the RSS, he also met Vasant Gajendragadkar and Nathalal Jaghda, Bharatiya Jana Sangh leaders who in 1980 helped found the BJP's Gujarat unit.[55] As a teenager, he was enrolled in the National Cadet Corps.[56]"""

In [77]:
# Sentence Tokenization
documents = nltk.sent_tokenize(paragraph)

['Narendra Damodardas Modi was born on 17 September 1950 to a Gujarati Hindu family of oil presser (Modh-Ghanchi) which is an Other Backward Class (OBC) category[44][45] in Vadnagar, Mehsana district, Bombay State (present-day Gujarat).',
 'He was the third of six children born to Damodardas Mulchand Modi (c.\u20091915–1989) and Hiraben Modi (1923–2022).',
 "[46][a][47]\n\nModi had infrequently worked as a child in his father's tea business on the Vadnagar railway station platform, according to Modi and his neighbours.",
 '[48][49][50]\n\nModi completed his higher secondary education in Vadnagar in 1967; his teachers described him as an average student and a keen, gifted debater with an interest in theatre.',
 '[51] He preferred playing larger-than-life characters in theatrical productions, which has influenced his political image.',
 '[52][53]\n\nWhen Modi was eight years old, he was introduced to the Rashtriya Swayamsevak Sangh (RSS) and began attending its local shakhas (training se

In [70]:
# Stemming
stemmer = nltk.stem.PorterStemmer()
stemmer.stem('running')

'work'

In [53]:
stemmer.stem('history')

'histori'

In [71]:
# Lammatization
lem = nltk.stem.WordNetLemmatizer()
lem.lemmatize('history')

'worked'

In [4]:
# Sentence Tokenization
documents = nltk.sent_tokenize(paragraph)

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

lem = nltk.stem.WordNetLemmatizer()
corpus = []

for text in documents:
    text = text.lower()
    # Remove special chars and numbers
    text = re.sub('[^a-z]', ' ', text)
    # Tokenize sentences to words
    words = nltk.tokenize.word_tokenize(text)
    # Filter out stop words
    words = [word for word in words if word not in stop_words]
    # Lemmatize words
    words = [lem.lemmatize(word) for word in words]
    corpus.append(words)

corpus

[['narendra',
  'damodardas',
  'modi',
  'born',
  'september',
  'gujarati',
  'hindu',
  'family',
  'oil',
  'presser',
  'modh',
  'ghanchi',
  'backward',
  'class',
  'obc',
  'category',
  'vadnagar',
  'mehsana',
  'district',
  'bombay',
  'state',
  'present',
  'day',
  'gujarat'],
 ['third',
  'six',
  'child',
  'born',
  'damodardas',
  'mulchand',
  'modi',
  'c',
  'hiraben',
  'modi'],
 ['modi',
  'infrequently',
  'worked',
  'child',
  'father',
  'tea',
  'business',
  'vadnagar',
  'railway',
  'station',
  'platform',
  'according',
  'modi',
  'neighbour'],
 ['modi',
  'completed',
  'higher',
  'secondary',
  'education',
  'vadnagar',
  'teacher',
  'described',
  'average',
  'student',
  'keen',
  'gifted',
  'debater',
  'interest',
  'theatre'],
 ['preferred',
  'playing',
  'larger',
  'life',
  'character',
  'theatrical',
  'production',
  'influenced',
  'political',
  'image'],
 ['modi',
  'eight',
  'year',
  'old',
  'introduced',
  'rashtriya',
  '

In [22]:
# Part of speech tagging: (POS Tagging)
tags = nltk.pos_tag(corpus[0])
tags

[('narendra', 'JJ'),
 ('damodardas', 'NNS'),
 ('modi', 'VBP'),
 ('born', 'JJ'),
 ('september', 'NNP'),
 ('gujarati', 'NN'),
 ('hindu', 'NN'),
 ('family', 'NN'),
 ('oil', 'NN'),
 ('presser', 'NN'),
 ('modh', 'NN'),
 ('ghanchi', 'FW'),
 ('backward', 'JJ'),
 ('class', 'NN'),
 ('obc', 'NN'),
 ('category', 'NN'),
 ('vadnagar', 'NN'),
 ('mehsana', 'NNP'),
 ('district', 'NN'),
 ('bombay', 'NN'),
 ('state', 'NN'),
 ('present', 'JJ'),
 ('day', 'NN'),
 ('gujarat', 'NN')]

In [23]:
# Named Entity Recognition
nltk.ne_chunk(tags).draw()